upgini 1.2.6a1__py3-none-any.whl → 1.2.7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.6a1"
1
+ __version__ = "1.2.7"
@@ -1,3 +1,4 @@
1
+ from copy import deepcopy
1
2
  from typing import Dict
2
3
 
3
4
  from upgini.autofe.binary import (
@@ -83,4 +84,4 @@ ALL_OPERANDS: Dict[str, Operand] = {
83
84
 
84
85
 
85
86
  def find_op(name):
86
- return ALL_OPERANDS.get(name)
87
+ return deepcopy(ALL_OPERANDS.get(name))
upgini/autofe/feature.py CHANGED
@@ -22,6 +22,9 @@ class Column:
22
22
  def set_op_params(self, params: Dict[str, str]) -> "Column":
23
23
  return self
24
24
 
25
+ def get_op_params(self, **kwargs):
26
+ return dict()
27
+
25
28
  def rename_columns(self, mapping: Dict[str, str]) -> "Column":
26
29
  self.name = self._unhash(mapping.get(self.name) or self.name)
27
30
  return self
@@ -44,6 +47,10 @@ class Column:
44
47
  def get_columns(self, **kwargs) -> List[str]:
45
48
  return [self.name]
46
49
 
50
+ @property
51
+ def children(self) -> List[Union["Feature", "Column"]]:
52
+ return []
53
+
47
54
  def infer_type(self, data: pd.DataFrame) -> DtypeObj:
48
55
  return data[self.name].dtype
49
56
 
@@ -88,9 +95,30 @@ class Feature:
88
95
  self.op.set_params(params)
89
96
 
90
97
  for child in self.children:
91
- child.set_op_params(params)
98
+ child_params = {
99
+ k[len(child.get_display_name()) + 1 :]: v
100
+ for k, v in params.items()
101
+ if k.startswith(child.get_display_name())
102
+ }
103
+ if not child_params:
104
+ child_params = params
105
+ child.set_op_params(child_params)
92
106
  return self
93
107
 
108
+ def get_op_params(self, **kwargs) -> Dict[str, str]:
109
+ return {
110
+ k: str(v)
111
+ for k, v in dict(
112
+ (
113
+ (f"{child.get_display_name(**kwargs)}_{k}", v)
114
+ for child in self.children
115
+ for k, v in child.get_op_params(**kwargs).items()
116
+ ),
117
+ **(self.op.get_params() or {}),
118
+ ).items()
119
+ if v is not None
120
+ }
121
+
94
122
  def get_hash(self) -> str:
95
123
  return hashlib.sha256(
96
124
  "_".join([self.op.name] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
@@ -326,24 +354,26 @@ class FeatureGroup:
326
354
  return names
327
355
 
328
356
  def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
329
- main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
330
357
  if isinstance(self.op, PandasOperand):
331
- columns = self.get_columns()
332
- lower_order_children = [
358
+ main_column = None if self.main_column_node is None else self.main_column_node.get_display_name()
359
+ lower_order_children = []
360
+ if self.main_column_node is not None:
361
+ lower_order_children.append(self.main_column_node)
362
+ lower_order_children.extend(
333
363
  ch for f in self.children for ch in f.children if ch.get_display_name() != main_column
334
- ]
364
+ )
335
365
  lower_order_names = [ch.get_display_name() for ch in lower_order_children]
336
- if any(isinstance(f, Feature) for f in lower_order_children):
337
- child_data = pd.concat(
338
- [data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
339
- axis=1,
340
- )
341
- child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
342
- else:
343
- child_data = data[columns]
366
+ child_data = pd.concat(
367
+ [ch.calculate(data) for ch in lower_order_children],
368
+ axis=1,
369
+ )
370
+ child_data.columns = lower_order_names
344
371
 
345
372
  new_data = self.op.calculate_group(child_data, main_column=main_column)
346
- new_data.rename(columns=dict(zip(lower_order_names, self.get_display_names())), inplace=True)
373
+ new_data.rename(
374
+ columns=dict(zip((n for n in lower_order_names if n != main_column), self.get_display_names())),
375
+ inplace=True,
376
+ )
347
377
  else:
348
378
  raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
349
379
 
@@ -64,6 +64,7 @@ class DataSourcePublisher:
64
64
  date_features: Optional[List[str]] = None,
65
65
  date_vector_features: Optional[List[str]] = None,
66
66
  generate_runtime_embeddings: Optional[List[str]] = None,
67
+ exclude_raw: Optional[List[str]] = None,
67
68
  _force_generation=False,
68
69
  _silent=False,
69
70
  ) -> str:
@@ -88,6 +89,8 @@ class DataSourcePublisher:
88
89
  features_for_embeddings - optional list of str - list of features that should be used for GPT features
89
90
  generation
90
91
 
92
+ exclude_raw - optional list of str - list of features that should NOT be used as raw features
93
+
91
94
  ...
92
95
 
93
96
  data_table_id_to_replace - optional str - id of registered ADS that should be replaced by new table
@@ -166,6 +169,8 @@ class DataSourcePublisher:
166
169
  request["dateVectorFeatures"] = date_vector_features
167
170
  if generate_runtime_embeddings is not None:
168
171
  request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
172
+ if exclude_raw is not None:
173
+ request["excludeRaw"] = exclude_raw
169
174
  self.logger.info(f"Start registering data table {request}")
170
175
 
171
176
  task_id = self._rest_client.register_ads(request, trace_id)
@@ -281,6 +286,7 @@ class DataSourcePublisher:
281
286
  date_vector_features: Optional[List[str]] = None,
282
287
  exclude_from_autofe_generation: Optional[List[str]] = None,
283
288
  generate_runtime_embeddings: Optional[List[str]] = None,
289
+ exclude_raw: Optional[List[str]] = None,
284
290
  ):
285
291
  trace_id = str(uuid.uuid4())
286
292
  with MDC(trace_id=trace_id):
@@ -336,6 +342,8 @@ class DataSourcePublisher:
336
342
  request["excludeFromGenerationFeatures"] = exclude_from_autofe_generation
337
343
  if generate_runtime_embeddings is not None:
338
344
  request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
345
+ if exclude_raw is not None:
346
+ request["excludeRaw"] = exclude_raw
339
347
  self.logger.info(f"Activating data tables with request {request}")
340
348
 
341
349
  self._rest_client.activate_datatables(request, trace_id)
@@ -378,7 +386,6 @@ class DataSourcePublisher:
378
386
  search_keys = [k.value.value for k in search_keys] if search_keys else None
379
387
  request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
380
388
  task_id = self._rest_client.upload_online(request, trace_id)
381
- print(f"Start polling management task_id={task_id} with trace_id={trace_id}")
382
389
  with Spinner():
383
390
  status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
384
391
  while status_response["status"] not in self.FINAL_STATUSES:
@@ -3235,7 +3235,6 @@ class FeaturesEnricher(TransformerMixin):
3235
3235
  ]
3236
3236
  )
3237
3237
 
3238
- # TODO some columns not exists
3239
3238
  all_other_columns = sorted_other_keys + other_columns
3240
3239
 
3241
3240
  search_keys_hash = "search_keys_hash"
upgini/utils/ip_utils.py CHANGED
@@ -57,17 +57,17 @@ class IpSearchKeyConverter:
57
57
  except ValueError:
58
58
  pass
59
59
 
60
- # @staticmethod
61
- # def _is_ipv4(ip: Optional[_BaseAddress]):
62
- # return ip is not None and (
63
- # isinstance(ip, IPv4Address) or (isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None)
64
- # )
65
-
66
- # @staticmethod
67
- # def _to_ipv4(ip: Optional[_BaseAddress]) -> Optional[IPv4Address]:
68
- # if isinstance(ip, IPv4Address):
69
- # return ip
70
- # return None
60
+ @staticmethod
61
+ def _is_ipv4(ip: Optional[_BaseAddress]):
62
+ return ip is not None and (
63
+ isinstance(ip, IPv4Address) or (isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None)
64
+ )
65
+
66
+ @staticmethod
67
+ def _to_ipv4(ip: Optional[_BaseAddress]) -> Optional[IPv4Address]:
68
+ if isinstance(ip, IPv4Address):
69
+ return ip
70
+ return None
71
71
 
72
72
  @staticmethod
73
73
  def _to_ipv6(ip: Optional[_BaseAddress]) -> Optional[IPv6Address]:
@@ -87,10 +87,10 @@ class IpSearchKeyConverter:
87
87
  raise ValidationError(self.bundle.get("invalid_ip").format(self.ip_column))
88
88
 
89
89
  # legacy support
90
- # ipv4 = self.ip_column + "_v4"
91
- # df[ipv4] = df[self.ip_column].apply(self._to_ipv4).apply(self._ip_to_int).astype("Int64")
92
- # self.search_keys[ipv4] = SearchKey.IP
93
- # self.columns_renaming[ipv4] = original_ip
90
+ ipv4 = self.ip_column + "_v4"
91
+ df[ipv4] = df[self.ip_column].apply(self._to_ipv4).apply(self._ip_to_int).astype("Int64")
92
+ self.search_keys[ipv4] = SearchKey.IP
93
+ self.columns_renaming[ipv4] = original_ip
94
94
 
95
95
  ipv6 = self.ip_column + "_v6"
96
96
  df[ipv6] = (
@@ -104,7 +104,7 @@ class IpSearchKeyConverter:
104
104
  del self.search_keys[self.ip_column]
105
105
  del self.columns_renaming[self.ip_column]
106
106
  self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
107
- self.columns_renaming[ipv6] = original_ip # could be upgini_ip_unnest...
107
+ self.columns_renaming[ipv6] = original_ip # could be __unnest_ip...
108
108
 
109
109
  return df
110
110
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.6a1
3
+ Version: 1.2.7
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=iKaF643F-CtlunSR-fjGeg8d8_wX6p_P6BtFkRK6sEA,24
1
+ upgini/__about__.py,sha256=49prCLbE3fFzLfxem5rd2dr1iV4_L-bN0N4J7jxU5yA,22
2
2
  upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=NHlf_ib70UiaDTzWvA30Gz2HANzpf61Ql5EDFZqQzk0,188120
6
+ upgini/features_enricher.py,sha256=twH4qdl91iHZF_AraLk0aIbRDw61S_DYtCWCZ34Yjjg,188077
7
7
  upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
@@ -14,16 +14,16 @@ upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
17
+ upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
18
18
  upgini/autofe/binary.py,sha256=TRjEdxsfyPY5E8ksYfdKMmU6GtvALfGFPNVIG7DBhzM,7520
19
19
  upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
20
- upgini/autofe/feature.py,sha256=gwGWY2UcX_0wHAvfEiu1rRU7GFZyzMWZIaPVcf6kD80,14223
20
+ upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
21
21
  upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
22
22
  upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
23
23
  upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
24
  upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
- upgini/data_source/data_source_publisher.py,sha256=6ZiT36flJHioh1J3F9lv6vSyqjRnQ_enZ6Mmb3GI2T8,22209
26
+ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
28
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
29
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -49,7 +49,7 @@ upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5
49
49
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
50
50
  upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
52
- upgini/utils/ip_utils.py,sha256=fB9jhabRr8mtZzNNscXIHQ5-QJqIZkAw3FO06eQ9jO8,5176
52
+ upgini/utils/ip_utils.py,sha256=ZZj_uQFTHhagzt-MRew__ZBOp2DdnkMrachS7PElkSE,5143
53
53
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
54
54
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
55
55
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.6a1.dist-info/METADATA,sha256=h6NV4SfVmBdj8hXvweTrWrOCid84grVTAGPVFukvqiI,48609
61
- upgini-1.2.6a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.2.6a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.6a1.dist-info/RECORD,,
60
+ upgini-1.2.7.dist-info/METADATA,sha256=WvOelDedurBhIekxyFlENU8uZxq7ZcQhhEQCrc2TL04,48607
61
+ upgini-1.2.7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.2.7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.7.dist-info/RECORD,,