upgini 1.2.127__py3-none-any.whl → 1.2.128__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/utils/track_info.py +29 -1
- {upgini-1.2.127.dist-info → upgini-1.2.128.dist-info}/METADATA +118 -118
- {upgini-1.2.127.dist-info → upgini-1.2.128.dist-info}/RECORD +6 -6
- {upgini-1.2.127.dist-info → upgini-1.2.128.dist-info}/WHEEL +0 -0
- {upgini-1.2.127.dist-info → upgini-1.2.128.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.
|
|
1
|
+
__version__ = "1.2.128"
|
upgini/utils/track_info.py
CHANGED
|
@@ -5,6 +5,7 @@ import sys
|
|
|
5
5
|
from functools import lru_cache
|
|
6
6
|
from getpass import getuser
|
|
7
7
|
from hashlib import sha256
|
|
8
|
+
from threading import Event, Lock
|
|
8
9
|
from typing import Optional
|
|
9
10
|
from uuid import getnode
|
|
10
11
|
|
|
@@ -51,8 +52,12 @@ def _get_execution_ide() -> str:
|
|
|
51
52
|
return "other"
|
|
52
53
|
|
|
53
54
|
|
|
55
|
+
_inflight_lock = Lock()
|
|
56
|
+
_inflight_events = {}
|
|
57
|
+
|
|
58
|
+
|
|
54
59
|
@lru_cache
|
|
55
|
-
def
|
|
60
|
+
def _compute_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
|
|
56
61
|
# default values
|
|
57
62
|
track = {"ide": _get_execution_ide()}
|
|
58
63
|
ident_res = "https://api64.ipify.org"
|
|
@@ -164,3 +169,26 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
164
169
|
track["ip"] = "0.0.0.0"
|
|
165
170
|
|
|
166
171
|
return track
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
|
|
175
|
+
key = (client_ip, client_visitorid)
|
|
176
|
+
with _inflight_lock:
|
|
177
|
+
event = _inflight_events.get(key)
|
|
178
|
+
if event is None:
|
|
179
|
+
event = Event()
|
|
180
|
+
_inflight_events[key] = event
|
|
181
|
+
is_owner = True
|
|
182
|
+
else:
|
|
183
|
+
is_owner = False
|
|
184
|
+
|
|
185
|
+
if not is_owner:
|
|
186
|
+
event.wait()
|
|
187
|
+
return _compute_track_metrics(client_ip, client_visitorid)
|
|
188
|
+
|
|
189
|
+
try:
|
|
190
|
+
return _compute_track_metrics(client_ip, client_visitorid)
|
|
191
|
+
finally:
|
|
192
|
+
with _inflight_lock:
|
|
193
|
+
event.set()
|
|
194
|
+
_inflight_events.pop(key, None)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.128
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -51,7 +51,7 @@ Description-Content-Type: text/markdown
|
|
|
51
51
|
<!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : Free automated data enrichment library for machine learning: </br>only the accuracy improving features in 2 minutes </h2> -->
|
|
52
52
|
<!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> • Free production-ready automated data enrichment library for machine learning</h2>-->
|
|
53
53
|
<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning and AI</a></h2>
|
|
54
|
-
<p align="center"> <b>Easily find and add relevant features to your ML & AI pipeline from</br> hundreds of public, community and premium external data sources, </br>including open & commercial LLMs</b> </p>
|
|
54
|
+
<p align="center"> <b>Easily find and add relevant features to your ML & AI pipeline from</br> hundreds of public, community, and premium external data sources, </br>including open & commercial LLMs</b> </p>
|
|
55
55
|
<p align="center">
|
|
56
56
|
<br />
|
|
57
57
|
<a href="https://colab.research.google.com/github/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb"><strong>Quick Start in Colab »</strong></a> |
|
|
@@ -59,7 +59,7 @@ Description-Content-Type: text/markdown
|
|
|
59
59
|
<a href="https://profile.upgini.com">Register / Sign In</a> |
|
|
60
60
|
<!-- <a href="https://gitter.im/upgini/community?utm_source=share-link&utm_medium=link&utm_campaign=share-link">Gitter Community</a> | -->
|
|
61
61
|
<a href="https://4mlg.short.gy/join-upgini-community">Slack Community</a> |
|
|
62
|
-
<a href="https://forms.gle/pH99gb5hPxBEfNdR7"><strong>Propose new
|
|
62
|
+
<a href="https://forms.gle/pH99gb5hPxBEfNdR7"><strong>Propose a new data source</strong></a>
|
|
63
63
|
</p>
|
|
64
64
|
<p align=center>
|
|
65
65
|
<a href="/LICENSE"><img alt="BSD-3 license" src="https://img.shields.io/badge/license-BSD--3%20Clause-green"></a>
|
|
@@ -75,19 +75,19 @@ Description-Content-Type: text/markdown
|
|
|
75
75
|
[](https://gitter.im/upgini/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) -->
|
|
76
76
|
## ❔ Overview
|
|
77
77
|
|
|
78
|
-
**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of
|
|
78
|
+
**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of ML features using large language models (LLMs), GNNs (graph neural networks), and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).
|
|
79
79
|
|
|
80
|
-
**Motivation:** for most supervised ML models external data & features boost accuracy significantly better than any hyperparameters tuning. But lack of automated and time-efficient enrichment tools for external data blocks massive adoption of external features in ML pipelines. We want radically simplify
|
|
80
|
+
**Motivation:** for most supervised ML models external data & features boost accuracy significantly better than any hyperparameters tuning. But lack of automated and time-efficient enrichment tools for external data blocks massive adoption of external features in ML pipelines. We want to radically simplify feature search and enrichment to make external data a standard approach. Like hyperparameter tuning in machine learning today.
|
|
81
81
|
|
|
82
82
|
**Mission:** Democratize access to data sources for data science community.
|
|
83
83
|
|
|
84
84
|
## 🚀 Awesome features
|
|
85
|
-
⭐️ Automatically find only relevant features that *
|
|
86
|
-
⭐️ Automated feature generation from the sources: feature generation with
|
|
87
|
-
⭐️ Automatic search key augmentation from all connected sources. If you do not have all search keys in your search request, such as postal/
|
|
88
|
-
⭐️ Calculate accuracy metrics and
|
|
89
|
-
⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
|
|
90
|
-
⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
|
|
85
|
+
⭐️ Automatically find only relevant features that *improve your model’s accuracy*. Not just correlated with the target variable, which in 9 out of 10 cases yields zero accuracy improvement
|
|
86
|
+
⭐️ Automated feature generation from the sources: feature generation with LLM‑based data augmentation, RNNs, and GraphNNs; ensembling across multiple data sources
|
|
87
|
+
⭐️ Automatic search key augmentation from all connected sources. If you do not have all search keys in your search request, such as postal/ZIP code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
|
|
88
|
+
⭐️ Calculate accuracy metrics and uplift after enriching an existing ML model with external features
|
|
89
|
+
⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate the risks of unstable external data dependencies in the ML pipeline
|
|
90
|
+
⭐️ Easy to use - a single request to enrich the training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
|
|
91
91
|
<table>
|
|
92
92
|
<tr>
|
|
93
93
|
<td> date / datetime </td>
|
|
@@ -103,7 +103,7 @@ Description-Content-Type: text/markdown
|
|
|
103
103
|
</tr>
|
|
104
104
|
</table>
|
|
105
105
|
|
|
106
|
-
⭐️ Scikit-learn
|
|
106
|
+
⭐️ Scikit-learn-compatible interface for quick data integration with existing ML pipelines
|
|
107
107
|
⭐️ Support for most common supervised ML tasks on tabular data:
|
|
108
108
|
<table>
|
|
109
109
|
<tr>
|
|
@@ -112,7 +112,7 @@ Description-Content-Type: text/markdown
|
|
|
112
112
|
</tr>
|
|
113
113
|
<tr>
|
|
114
114
|
<td><a href="https://en.wikipedia.org/wiki/Regression_analysis">☑️ regression</a></td>
|
|
115
|
-
<td><a href="https://en.wikipedia.org/wiki/Time_series#Prediction_and_forecasting">☑️ time
|
|
115
|
+
<td><a href="https://en.wikipedia.org/wiki/Time_series#Prediction_and_forecasting">☑️ time-series prediction</a></td>
|
|
116
116
|
</tr>
|
|
117
117
|
</table>
|
|
118
118
|
|
|
@@ -124,13 +124,13 @@ Description-Content-Type: text/markdown
|
|
|
124
124
|
|
|
125
125
|
## 🌎 Connected data sources and coverage
|
|
126
126
|
|
|
127
|
-
- **Public data
|
|
128
|
-
- **Community
|
|
127
|
+
- **Public data**: public sector, academic institutions, other sources through open data portals. Curated and updated by the Upgini team
|
|
128
|
+
- **Community‑shared data**: royalty- or license-free datasets or features from the data science community (our users). This includes both public and scraped data
|
|
129
129
|
- **Premium data providers**: commercial data sources verified by the Upgini team in real-world use cases
|
|
130
130
|
|
|
131
|
-
👉 [**Details on
|
|
131
|
+
👉 [**Details on datasets and features**](https://upgini.com/#data_sources)
|
|
132
132
|
#### 📊 Total: **239 countries** and **up to 41 years** of history
|
|
133
|
-
|Data sources|Countries|History
|
|
133
|
+
|Data sources|Countries|History (years)|# sources for ensembling|Update frequency|Search keys|API Key required
|
|
134
134
|
|--|--|--|--|--|--|--|
|
|
135
135
|
|Historical weather & Climate normals | 68 |22|-|Monthly|date, country, postal/ZIP code|No
|
|
136
136
|
|Location/Places/POI/Area/Proximity information from OpenStreetMap | 221 |2|-|Monthly|date, country, postal/ZIP code|No
|
|
@@ -138,7 +138,7 @@ Description-Content-Type: text/markdown
|
|
|
138
138
|
|Consumer Confidence index| 44 |22|-|Monthly|date, country|No
|
|
139
139
|
|World economic indicators|191 |41|-|Monthly|date, country|No
|
|
140
140
|
|Markets data|-|17|-|Monthly|date, datetime|No
|
|
141
|
-
|World mobile & fixed
|
|
141
|
+
|World mobile & fixed-broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
|
|
142
142
|
|World demographic data |90|-|2|Annual|country, postal/ZIP code|No
|
|
143
143
|
|World house prices |44|-|3|Annual|country, postal/ZIP code|No
|
|
144
144
|
|Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
|
|
@@ -153,8 +153,8 @@ Description-Content-Type: text/markdown
|
|
|
153
153
|
|
|
154
154
|
### [Search of relevant external features & Automated feature generation for Salary prediction task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
|
|
155
155
|
|
|
156
|
-
* The goal is to predict salary for data science job
|
|
157
|
-
* Following this guide, you'll learn how to **search
|
|
156
|
+
* The goal is to predict salary for a data science job posting based on information about the employer and job description.
|
|
157
|
+
* Following this guide, you'll learn how to **search and auto‑generate new relevant features with the Upgini library**
|
|
158
158
|
* The evaluation metric is [Mean Absolute Error (MAE)](https://en.wikipedia.org/wiki/Mean_absolute_error).
|
|
159
159
|
|
|
160
160
|
Run [Feature search & generation notebook](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb) inside your browser:
|
|
@@ -169,7 +169,7 @@ Run [Feature search & generation notebook](https://github.com/upgini/upgini/blob
|
|
|
169
169
|
### ❓ [Simple sales prediction for retail stores](https://github.com/upgini/upgini/blob/main/notebooks/kaggle_example.ipynb)
|
|
170
170
|
|
|
171
171
|
* The goal is to **predict future sales of different goods in stores** based on a 5-year history of sales.
|
|
172
|
-
* Kaggle Competition [Store Item Demand Forecasting Challenge](https://www.kaggle.com/c/demand-forecasting-kernels-only) is a product sales forecasting. The evaluation metric is [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
|
|
172
|
+
* Kaggle Competition [Store Item Demand Forecasting Challenge](https://www.kaggle.com/c/demand-forecasting-kernels-only) is a product sales forecasting competition. The evaluation metric is [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
|
|
173
173
|
|
|
174
174
|
Run [Simple sales prediction for retail stores](https://github.com/upgini/upgini/blob/main/notebooks/kaggle_example.ipynb) inside your browser:
|
|
175
175
|
|
|
@@ -181,25 +181,25 @@ Run [Simple sales prediction for retail stores](https://github.com/upgini/upgini
|
|
|
181
181
|
[](https://gitpod.io/#/github.com/upgini/upgini)
|
|
182
182
|
-->
|
|
183
183
|
|
|
184
|
-
### ❓ [How to boost ML model accuracy for Kaggle
|
|
184
|
+
### ❓ [How to boost ML model accuracy for Kaggle Top-1 leaderboard in 10 minutes](https://www.kaggle.com/code/romaupgini/more-external-features-for-top1-private-lb-4-54/notebook)
|
|
185
185
|
|
|
186
|
-
* The goal is **
|
|
187
|
-
* [Kaggle Competition](https://www.kaggle.com/competitions/tabular-playground-series-jan-2022/) is a product sales forecasting
|
|
186
|
+
* The goal is **to improve a Top‑1 winning Kaggle solution** by adding new relevant external features and data.
|
|
187
|
+
* [Kaggle Competition](https://www.kaggle.com/competitions/tabular-playground-series-jan-2022/) is a product sales forecasting competition; the evaluation metric is [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
|
|
188
188
|
|
|
189
189
|
### ❓ [How to do low-code feature engineering for AutoML tools](https://www.kaggle.com/code/romaupgini/zero-feature-engineering-with-upgini-pycaret/notebook)
|
|
190
190
|
|
|
191
191
|
* **Save time on feature search and engineering**. Use ready-to-use external features and data sources to maximize overall AutoML accuracy, right out of the box.
|
|
192
192
|
* [Kaggle Competition](https://www.kaggle.com/competitions/tabular-playground-series-jan-2022/) is a product sales forecasting, evaluation metric is [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
|
|
193
|
-
* Low-code AutoML
|
|
193
|
+
* Low-code AutoML frameworks: [Upgini](https://github.com/upgini/upgini) and [PyCaret](https://github.com/pycaret/pycaret)
|
|
194
194
|
|
|
195
|
-
### ❓ [How to improve accuracy of Multivariate
|
|
195
|
+
### ❓ [How to improve accuracy of Multivariate time-series forecast from external features & data](https://www.kaggle.com/code/romaupgini/guide-external-data-features-for-multivariatets/notebook)
|
|
196
196
|
|
|
197
|
-
* The goal is **accuracy
|
|
197
|
+
* The goal is **to improve the accuracy of multivariate time‑series forecasting** using new relevant external features and data. The main challenge is the data and feature enrichment strategy, in which a component of a multivariate time series depends not only on its past values but also on other components.
|
|
198
198
|
* [Kaggle Competition](https://www.kaggle.com/competitions/tabular-playground-series-jan-2022/) is a product sales forecasting, evaluation metric is [RMSLE](https://www.kaggle.com/code/carlmcbrideellis/store-sales-using-the-average-of-the-last-16-days#Note-regarding-calculating-the-average).
|
|
199
199
|
|
|
200
200
|
### ❓ [How to speed up feature engineering hypothesis tests with ready-to-use external features](https://www.kaggle.com/code/romaupgini/statement-dates-to-use-or-not-to-use/notebook)
|
|
201
201
|
|
|
202
|
-
* **Save time on external data wrangling and feature calculation code** for hypothesis tests. The key challenge
|
|
202
|
+
* **Save time on external data wrangling and feature calculation code** for hypothesis tests. The key challenge is the time‑dependent representation of information in the training dataset, which is uncommon for credit default prediction tasks. As a result, a special data enrichment strategy is used.
|
|
203
203
|
* [Kaggle Competition](https://www.kaggle.com/competitions/amex-default-prediction) is a credit default prediction, evaluation metric is [normalized Gini coefficient](https://www.kaggle.com/competitions/amex-default-prediction/discussion/327464).
|
|
204
204
|
|
|
205
205
|
## 🏁 Quick start
|
|
@@ -228,19 +228,19 @@ docker build -t upgini .</i></br>
|
|
|
228
228
|
<i>
|
|
229
229
|
docker run -p 8888:8888 upgini</br>
|
|
230
230
|
</i></br>
|
|
231
|
-
3. Open http://localhost:8888?token
|
|
231
|
+
3. Open http://localhost:8888?token=<your_token_from_console_output> in your browser
|
|
232
232
|
</details>
|
|
233
233
|
|
|
234
234
|
|
|
235
235
|
### 2. 💡 Use your labeled training dataset for search
|
|
236
236
|
|
|
237
237
|
You can use your labeled training datasets "as is" to initiate the search. Under the hood, we'll search for relevant data using:
|
|
238
|
-
- **[search keys](#-search-key-types-we-support-more-to-come)** from training dataset to match records from potential data sources with
|
|
239
|
-
- **labels** from training dataset to estimate
|
|
240
|
-
- **your features** from training dataset to find external datasets and features
|
|
238
|
+
- **[search keys](#-search-key-types-we-support-more-to-come)** from the training dataset to match records from potential data sources with new features
|
|
239
|
+
- **labels** from the training dataset to estimate the relevance of features or datasets for your ML task and calculate feature importance metrics
|
|
240
|
+
- **your features** from the training dataset to find external datasets and features that improve accuracy of your existing data and estimate accuracy uplift ([optional](#find-features-only-give-accuracy-gain-to-existing-data-in-the-ml-model))
|
|
241
241
|
|
|
242
242
|
|
|
243
|
-
Load training dataset into
|
|
243
|
+
Load the training dataset into a Pandas DataFrame and separate feature columns from the label column in a Scikit-learn way:
|
|
244
244
|
```python
|
|
245
245
|
import pandas as pd
|
|
246
246
|
# labeled training dataset - customer_churn_prediction_train.csv
|
|
@@ -251,7 +251,7 @@ y = train_df["churn_flag"]
|
|
|
251
251
|
<table border=1 cellpadding=10><tr><td>
|
|
252
252
|
⚠️ <b>Requirements for search initialization dataset</b>
|
|
253
253
|
<br>
|
|
254
|
-
We
|
|
254
|
+
We perform dataset verification and cleaning under the hood, but still there are some requirements to follow:
|
|
255
255
|
<br>
|
|
256
256
|
1. <b>pandas.DataFrame</b>, <b>pandas.Series</b> or <b>numpy.ndarray</b> representation;
|
|
257
257
|
<br>
|
|
@@ -259,12 +259,12 @@ We do dataset verification and cleaning under the hood, but still there are some
|
|
|
259
259
|
<br>
|
|
260
260
|
3. at least one column selected as a <a href="#-search-key-types-we-support-more-to-come">search key</a>;
|
|
261
261
|
<br>
|
|
262
|
-
4. min size after deduplication by search
|
|
262
|
+
4. min size after deduplication by search-key columns and removal of NaNs: <i>100 records</i>
|
|
263
263
|
</td></tr></table>
|
|
264
264
|
|
|
265
|
-
### 3. 🔦 Choose one or
|
|
266
|
-
*Search keys* columns will be used to match records from all potential external data sources
|
|
267
|
-
Define one or
|
|
265
|
+
### 3. 🔦 Choose one or more columns as search keys
|
|
266
|
+
*Search keys* columns will be used to match records from all potential external data sources/features.
|
|
267
|
+
Define one or more columns as search keys when initializing the `FeaturesEnricher` class.
|
|
268
268
|
```python
|
|
269
269
|
from upgini.features_enricher import FeaturesEnricher
|
|
270
270
|
from upgini.metadata import SearchKey
|
|
@@ -284,7 +284,7 @@ enricher = FeaturesEnricher(
|
|
|
284
284
|
<tr>
|
|
285
285
|
<th> Search Key<br/>Meaning Type </th>
|
|
286
286
|
<th> Description </th>
|
|
287
|
-
<th> Allowed pandas dtypes (
|
|
287
|
+
<th> Allowed pandas dtypes (Python types) </th>
|
|
288
288
|
<th> Example </th>
|
|
289
289
|
</tr>
|
|
290
290
|
<tr>
|
|
@@ -301,13 +301,13 @@ enricher = FeaturesEnricher(
|
|
|
301
301
|
</tr>
|
|
302
302
|
<tr>
|
|
303
303
|
<td> SearchKey.IP </td>
|
|
304
|
-
<td>
|
|
305
|
-
<td> <tt>object(str, ipaddress.IPv4Address)</tt> <br/> <tt>string</tt> <br/> <tt>int64</tt> </td>
|
|
304
|
+
<td> IPv4 or IPv6 address</td>
|
|
305
|
+
<td> <tt>object(str, ipaddress.IPv4Address, ipaddress.IPv6Address)</tt> <br/> <tt>string</tt> <br/> <tt>int64</tt> </td>
|
|
306
306
|
<td> <tt>192.168.0.1 </tt> </td>
|
|
307
307
|
</tr>
|
|
308
308
|
<tr>
|
|
309
309
|
<td> SearchKey.PHONE </td>
|
|
310
|
-
<td> phone number
|
|
310
|
+
<td> phone number (<a href="https://en.wikipedia.org/wiki/E.164">E.164 standard</a>) </td>
|
|
311
311
|
<td> <tt>object(str)</tt> <br/> <tt>string</tt> <br/> <tt>int64</tt> <br/> <tt>float64</tt> </td>
|
|
312
312
|
<td> <tt>443451925138 </tt> </td>
|
|
313
313
|
</tr>
|
|
@@ -322,7 +322,7 @@ enricher = FeaturesEnricher(
|
|
|
322
322
|
</td>
|
|
323
323
|
<td>
|
|
324
324
|
<tt>2020-02-12 </tt> (<a href="https://en.wikipedia.org/wiki/ISO_8601">ISO-8601 standard</a>)
|
|
325
|
-
<br/> <tt>12.02.2020 </tt> (non
|
|
325
|
+
<br/> <tt>12.02.2020 </tt> (non‑standard notation)
|
|
326
326
|
</td>
|
|
327
327
|
</tr>
|
|
328
328
|
<tr>
|
|
@@ -344,7 +344,7 @@ enricher = FeaturesEnricher(
|
|
|
344
344
|
</tr>
|
|
345
345
|
<tr>
|
|
346
346
|
<td> SearchKey.POSTAL_CODE </td>
|
|
347
|
-
<td> Postal code a.k.a. ZIP code.
|
|
347
|
+
<td> Postal code a.k.a. ZIP code. Can only be used with SearchKey.COUNTRY </td>
|
|
348
348
|
<td> <tt>object(str)</tt> <br/> <tt>string</tt> </td>
|
|
349
349
|
<td> <tt>21174 </tt> <br/> <tt>061107 </tt> <br/> <tt>SE-999-99 </tt> </td>
|
|
350
350
|
</tr>
|
|
@@ -352,7 +352,7 @@ enricher = FeaturesEnricher(
|
|
|
352
352
|
|
|
353
353
|
</details>
|
|
354
354
|
|
|
355
|
-
For the
|
|
355
|
+
For the search key types <tt>SearchKey.DATE</tt>/<tt>SearchKey.DATETIME</tt> with dtypes <tt>object</tt> or <tt>string</tt> you have to specify the date/datetime format by passing <tt>date_format</tt> parameter to `FeaturesEnricher`. For example:
|
|
356
356
|
```python
|
|
357
357
|
from upgini.features_enricher import FeaturesEnricher
|
|
358
358
|
from upgini.metadata import SearchKey
|
|
@@ -370,12 +370,12 @@ enricher = FeaturesEnricher(
|
|
|
370
370
|
)
|
|
371
371
|
```
|
|
372
372
|
|
|
373
|
-
To use
|
|
373
|
+
To use a non-UTC timezone for datetime, you can cast datetime column explicitly to your timezone (example for Warsaw):
|
|
374
374
|
```python
|
|
375
375
|
df["date"] = df.date.astype("datetime64").dt.tz_localize("Europe/Warsaw")
|
|
376
376
|
```
|
|
377
377
|
|
|
378
|
-
|
|
378
|
+
A single country for the whole training dataset can be passed via `country_code` parameter:
|
|
379
379
|
```python
|
|
380
380
|
from upgini.features_enricher import FeaturesEnricher
|
|
381
381
|
from upgini.metadata import SearchKey
|
|
@@ -391,10 +391,10 @@ enricher = FeaturesEnricher(
|
|
|
391
391
|
```
|
|
392
392
|
|
|
393
393
|
### 4. 🔍 Start your first feature search!
|
|
394
|
-
The main abstraction you interact is `FeaturesEnricher`, a Scikit-learn
|
|
395
|
-
Create instance of the `FeaturesEnricher` class and call:
|
|
394
|
+
The main abstraction you interact with is `FeaturesEnricher`, a Scikit-learn-compatible estimator. You can easily add it to your existing ML pipelines.
|
|
395
|
+
Create an instance of the `FeaturesEnricher` class and call:
|
|
396
396
|
- `fit` to search relevant datasets & features
|
|
397
|
-
-
|
|
397
|
+
- then `transform` to enrich your dataset with features from the search result
|
|
398
398
|
|
|
399
399
|
Let's try it out!
|
|
400
400
|
```python
|
|
@@ -407,7 +407,7 @@ train_df = pd.read_csv("customer_churn_prediction_train.csv")
|
|
|
407
407
|
X = train_df.drop(columns="churn_flag")
|
|
408
408
|
y = train_df["churn_flag"]
|
|
409
409
|
|
|
410
|
-
# now we're going to create `FeaturesEnricher` class
|
|
410
|
+
# now we're going to create an instance of the `FeaturesEnricher` class
|
|
411
411
|
enricher = FeaturesEnricher(
|
|
412
412
|
search_keys={
|
|
413
413
|
"subscription_activation_date": SearchKey.DATE,
|
|
@@ -415,15 +415,15 @@ enricher = FeaturesEnricher(
|
|
|
415
415
|
"zip_code": SearchKey.POSTAL_CODE
|
|
416
416
|
})
|
|
417
417
|
|
|
418
|
-
#
|
|
419
|
-
#
|
|
418
|
+
# Everything is ready to fit! For 100k records, fitting should take around 10 minutes
|
|
419
|
+
# We'll send an email notification; just register on profile.upgini.com
|
|
420
420
|
enricher.fit(X, y)
|
|
421
421
|
```
|
|
422
422
|
|
|
423
|
-
That's
|
|
423
|
+
That's it! The `FeaturesEnricher` is now fitted.
|
|
424
424
|
### 5. 📈 Evaluate feature importances (SHAP values) from the search result
|
|
425
425
|
|
|
426
|
-
`FeaturesEnricher` class has two properties for feature importances,
|
|
426
|
+
`FeaturesEnricher` class has two properties for feature importances, which are populated after fit - `feature_names_` and `feature_importances_`:
|
|
427
427
|
- `feature_names_` - feature names from the search result, and if parameter `keep_input=True` was used, initial columns from search dataset as well
|
|
428
428
|
- `feature_importances_` - SHAP values for features from the search result, same order as in `feature_names_`
|
|
429
429
|
|
|
@@ -434,8 +434,8 @@ enricher.get_features_info()
|
|
|
434
434
|
Get more details about `FeaturesEnricher` at runtime using docstrings via `help(FeaturesEnricher)` or `help(FeaturesEnricher.fit)`.
|
|
435
435
|
|
|
436
436
|
### 6. 🏭 Enrich Production ML pipeline with relevant external features
|
|
437
|
-
`FeaturesEnricher` is a Scikit-learn
|
|
438
|
-
Use `transform` method of `FeaturesEnricher
|
|
437
|
+
`FeaturesEnricher` is a Scikit-learn-compatible estimator, so any pandas dataframe can be enriched with external features from a search result (after `fit`).
|
|
438
|
+
Use the `transform` method of `FeaturesEnricher`, and let the magic do the rest 🪄
|
|
439
439
|
```python
|
|
440
440
|
# load dataset for enrichment
|
|
441
441
|
test_x = pd.read_csv("test.csv")
|
|
@@ -444,24 +444,24 @@ enriched_test_features = enricher.transform(test_x)
|
|
|
444
444
|
```
|
|
445
445
|
#### 6.1 Reuse completed search for enrichment without 'fit' run
|
|
446
446
|
|
|
447
|
-
`FeaturesEnricher` can be
|
|
447
|
+
`FeaturesEnricher` can be initialized with `search_id` from a completed search (after a fit call).
|
|
448
448
|
Just use `enricher.get_search_id()` or copy search id string from the `fit()` output.
|
|
449
|
-
Search keys and features in X
|
|
449
|
+
Search keys and features in X must be the same as for `fit()`
|
|
450
450
|
```python
|
|
451
451
|
enricher = FeaturesEnricher(
|
|
452
|
-
#same set of
|
|
452
|
+
# same set of search keys as for the fit step
|
|
453
453
|
search_keys={"date": SearchKey.DATE},
|
|
454
|
-
api_key="<YOUR API_KEY>", # if you
|
|
454
|
+
api_key="<YOUR API_KEY>", # if you fitted the enricher with an api_key, then you should use it here
|
|
455
455
|
search_id = "abcdef00-0000-0000-0000-999999999999"
|
|
456
456
|
)
|
|
457
|
-
enriched_prod_dataframe=enricher.transform(input_dataframe)
|
|
457
|
+
enriched_prod_dataframe = enricher.transform(input_dataframe)
|
|
458
458
|
```
|
|
459
|
-
#### 6.2 Enrichment with
|
|
460
|
-
|
|
461
|
-
`FeaturesEnricher`, when
|
|
462
|
-
And then, for `transform` in a production ML pipeline, you'll get enrichment with relevant features,
|
|
459
|
+
#### 6.2 Enrichment with updated external data sources and features
|
|
460
|
+
In most ML cases, the training step requires a labeled dataset with historical observations. For production, you'll need updated, current data sources and features to generate predictions.
|
|
461
|
+
`FeaturesEnricher`, when initialized with a set of search keys that includes `SearchKey.DATE`, will match records from all potential external data sources **exactly on the specified date/datetime** based on `SearchKey.DATE`, to avoid enrichment with features "from the future" during the `fit` step.
|
|
462
|
+
And then, for `transform` in a production ML pipeline, you'll get enrichment with relevant features, current as of the present date.
|
|
463
463
|
|
|
464
|
-
⚠️
|
|
464
|
+
⚠️ Include `SearchKey.DATE` in the set of search keys to get current features for production and avoid features from the future during training:
|
|
465
465
|
```python
|
|
466
466
|
enricher = FeaturesEnricher(
|
|
467
467
|
search_keys={
|
|
@@ -475,13 +475,13 @@ enricher = FeaturesEnricher(
|
|
|
475
475
|
## 💻 How does it work?
|
|
476
476
|
|
|
477
477
|
### 🧹 Search dataset validation
|
|
478
|
-
We validate and clean search
|
|
478
|
+
We validate and clean the search‑initialization dataset under the hood:
|
|
479
479
|
|
|
480
|
-
-
|
|
480
|
+
- check your **search keys** columns' formats;
|
|
481
481
|
- check zero variance for label column;
|
|
482
|
-
- check dataset for full row duplicates. If we find any, we remove
|
|
483
|
-
- check inconsistent labels - rows with the same features and keys but different labels, we remove them and
|
|
484
|
-
-
|
|
482
|
+
- check dataset for full row duplicates. If we find any, we remove them and report their share;
|
|
483
|
+
- check inconsistent labels - rows with the same features and keys but different labels, we remove them and report their share;
|
|
484
|
+
- remove columns with zero variance - we treat any non **search key** column in the search dataset as a feature, so columns with zero variance will be removed
|
|
485
485
|
|
|
486
486
|
### ❔ Supervised ML tasks detection
|
|
487
487
|
We detect ML task under the hood based on label column values. Currently we support:
|
|
@@ -489,7 +489,7 @@ We detect ML task under the hood based on label column values. Currently we supp
|
|
|
489
489
|
- ModelTaskType.MULTICLASS
|
|
490
490
|
- ModelTaskType.REGRESSION
|
|
491
491
|
|
|
492
|
-
But for certain search datasets you can pass parameter to `FeaturesEnricher` with correct ML
|
|
492
|
+
But for certain search datasets you can pass parameter to `FeaturesEnricher` with correct ML task type:
|
|
493
493
|
```python
|
|
494
494
|
from upgini.features_enricher import FeaturesEnricher
|
|
495
495
|
from upgini.metadata import SearchKey, ModelTaskType
|
|
@@ -499,12 +499,12 @@ enricher = FeaturesEnricher(
|
|
|
499
499
|
model_task_type=ModelTaskType.REGRESSION
|
|
500
500
|
)
|
|
501
501
|
```
|
|
502
|
-
#### ⏰ Time
|
|
503
|
-
*Time
|
|
504
|
-
* [Scikit-learn time
|
|
505
|
-
* [Blocked time
|
|
502
|
+
#### ⏰ Time-series prediction support
|
|
503
|
+
*Time-series prediction* is supported as `ModelTaskType.REGRESSION` or `ModelTaskType.BINARY` tasks with time-series‑specific cross-validation splits:
|
|
504
|
+
* [Scikit-learn time-series cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html#time-series-split) - `CVType.time_series` parameter
|
|
505
|
+
* [Blocked time-series cross-validation](https://goldinlocks.github.io/Time-Series-Cross-Validation/#Blocked-and-Time-Series-Split-Cross-Validation) - `CVType.blocked_time_series` parameter
|
|
506
506
|
|
|
507
|
-
To initiate feature search you can pass cross-validation type parameter to `FeaturesEnricher` with time
|
|
507
|
+
To initiate feature search, you can pass the cross-validation type parameter to `FeaturesEnricher` with a time-series‑specific CV type:
|
|
508
508
|
```python
|
|
509
509
|
from upgini.features_enricher import FeaturesEnricher
|
|
510
510
|
from upgini.metadata import SearchKey, CVType
|
|
@@ -525,12 +525,12 @@ enricher = FeaturesEnricher(
|
|
|
525
525
|
cv=CVType.time_series
|
|
526
526
|
)
|
|
527
527
|
```
|
|
528
|
-
⚠️ **
|
|
528
|
+
⚠️ **Preprocess the dataset** in case of time-series prediction:
|
|
529
529
|
sort rows in dataset according to observation order, in most cases - ascending order by date/datetime.
|
|
530
530
|
|
|
531
531
|
### 🆙 Accuracy and uplift metrics calculations
|
|
532
|
-
`FeaturesEnricher`
|
|
533
|
-
You can use any model estimator with scikit-learn
|
|
532
|
+
`FeaturesEnricher` automatically calculates model metrics and uplift from new relevant features either using `calculate_metrics()` method or `calculate_metrics=True` parameter in `fit` or `fit_transform` methods (example below).
|
|
533
|
+
You can use any model estimator with scikit-learn-compatible interface, some examples are:
|
|
534
534
|
* [All Scikit-Learn supervised models](https://scikit-learn.org/stable/supervised_learning.html)
|
|
535
535
|
* [Xgboost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn)
|
|
536
536
|
* [LightGBM](https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api)
|
|
@@ -538,8 +538,8 @@ You can use any model estimator with scikit-learn compatible interface, some ex
|
|
|
538
538
|
|
|
539
539
|
<details>
|
|
540
540
|
<summary>
|
|
541
|
-
👈 Evaluation metric should be passed to <i>calculate_metrics()</i> by <i>scoring</i>
|
|
542
|
-
out-of-the
|
|
541
|
+
👈 Evaluation metric should be passed to <i>calculate_metrics()</i> by the <i>scoring</i> parameter,<br/>
|
|
542
|
+
out-of-the-box Upgini supports
|
|
543
543
|
</summary>
|
|
544
544
|
<table style="table-layout: fixed;">
|
|
545
545
|
<tr>
|
|
@@ -646,10 +646,10 @@ You can use any model estimator with scikit-learn compatible interface, some ex
|
|
|
646
646
|
</table>
|
|
647
647
|
</details>
|
|
648
648
|
|
|
649
|
-
In addition to that list, you can define custom evaluation metric function using [scikit-learn make_scorer](https://scikit-learn.org/
|
|
649
|
+
In addition to that list, you can define a custom evaluation metric function using [scikit-learn make_scorer](https://scikit-learn.org/1.7/modules/model_evaluation.html#defining-your-scoring-strategy-from-score-functions), for example [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
|
|
650
650
|
|
|
651
|
-
By default, `calculate_metrics()` method calculates evaluation metric with the same cross-validation split as selected for `FeaturesEnricher.fit()` by parameter `cv = CVType.<cross-validation-split>`.
|
|
652
|
-
But you can easily define new split by passing
|
|
651
|
+
By default, the `calculate_metrics()` method calculates the evaluation metric with the same cross-validation split as selected for `FeaturesEnricher.fit()` by the parameter `cv = CVType.<cross-validation-split>`.
|
|
652
|
+
But you can easily define a new split by passing a subclass of `BaseCrossValidator` to the `cv` parameter in `calculate_metrics()`.
|
|
653
653
|
|
|
654
654
|
Example with more tips-and-tricks:
|
|
655
655
|
```python
|
|
@@ -674,7 +674,7 @@ enricher.calculate_metrics(scoring=custom_scoring)
|
|
|
674
674
|
custom_cv = TimeSeriesSplit(n_splits=5)
|
|
675
675
|
enricher.calculate_metrics(cv=custom_cv)
|
|
676
676
|
|
|
677
|
-
# All
|
|
677
|
+
# All of these custom parameters can be combined in both methods: fit, fit_transform and calculate_metrics:
|
|
678
678
|
enricher.fit(X, y, eval_set, calculate_metrics=True, estimator=custom_estimator, scoring=custom_scoring, cv=custom_cv)
|
|
679
679
|
```
|
|
680
680
|
|
|
@@ -684,9 +684,9 @@ enricher.fit(X, y, eval_set, calculate_metrics=True, estimator=custom_estimator,
|
|
|
684
684
|
|
|
685
685
|
### 🤖 Automated feature generation from columns in a search dataset
|
|
686
686
|
|
|
687
|
-
If a training dataset has a text column, you can generate additional embeddings from it using
|
|
687
|
+
If a training dataset has a text column, you can generate additional embeddings from it using instruction-guided embedding generation with LLMs and data augmentation from external sources, just like Upgini does for all records from connected data sources.
|
|
688
688
|
|
|
689
|
-
|
|
689
|
+
In most cases, this gives better results than direct embeddings generation from a text field. Currently, Upgini has two LLMs connected to the search engine - GPT-3.5 from OpenAI and GPT-J.
|
|
690
690
|
|
|
691
691
|
To use this feature, pass the column names as arguments to the `generate_features` parameter. You can use up to 2 columns.
|
|
692
692
|
|
|
@@ -701,17 +701,17 @@ enricher = FeaturesEnricher(
|
|
|
701
701
|
|
|
702
702
|
With this code, Upgini will generate LLM embeddings from text columns and then check them for predictive power for your ML task.
|
|
703
703
|
|
|
704
|
-
Finally, Upgini will return a dataset enriched
|
|
704
|
+
Finally, Upgini will return a dataset enriched with only the relevant components of LLM embeddings.
|
|
705
705
|
|
|
706
|
-
### Find features only
|
|
706
|
+
### Find features that only provide accuracy gains to existing data in the ML model
|
|
707
707
|
|
|
708
|
-
If you already have features or other external data sources, you can specifically search new datasets
|
|
708
|
+
If you already have features or other external data sources, you can specifically search for new datasets and features that only provide accuracy gains "on top" of them.
|
|
709
709
|
|
|
710
|
-
Just leave all these existing features in the labeled training dataset and Upgini library automatically
|
|
710
|
+
Just leave all these existing features in the labeled training dataset and the Upgini library automatically uses them during the feature search process and as a baseline ML model to calculate accuracy metric uplift. Only features that improve accuracy will be returned.
|
|
711
711
|
|
|
712
712
|
### Check robustness of accuracy improvement from external features
|
|
713
713
|
|
|
714
|
-
You can validate external features
|
|
714
|
+
You can validate the robustness of external features on an out-of-time dataset using the `eval_set` parameter:
|
|
715
715
|
```python
|
|
716
716
|
# load train dataset
|
|
717
717
|
train_df = pd.read_csv("train.csv")
|
|
@@ -738,13 +738,13 @@ enricher.fit(
|
|
|
738
738
|
- Same data schema as for search initialization X dataset
|
|
739
739
|
- Pandas dataframe representation
|
|
740
740
|
|
|
741
|
-
There are 3 options to pass out-of-time without labels:
|
|
741
|
+
The out-of-time dataset can be without labels. There are 3 options to pass out-of-time without labels:
|
|
742
742
|
```python
|
|
743
743
|
enricher.fit(
|
|
744
744
|
train_ids_and_features,
|
|
745
745
|
train_label,
|
|
746
746
|
eval_set = [
|
|
747
|
-
(eval_ids_and_features_1,), #
|
|
747
|
+
(eval_ids_and_features_1,), # A tuple with 1 element
|
|
748
748
|
(eval_ids_and_features_2, None), # None as labels
|
|
749
749
|
(eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
|
|
750
750
|
]
|
|
@@ -776,15 +776,15 @@ enriched_df = enricher.fit_transform(
|
|
|
776
776
|
```
|
|
777
777
|
|
|
778
778
|
**Stability parameters:**
|
|
779
|
-
- `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI
|
|
779
|
+
- `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
|
|
780
780
|
- `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
|
|
781
781
|
|
|
782
|
-
**PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
|
|
782
|
+
**PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models. PSI is calculated on the eval_set, which should contain the most recent dates relative to the training dataset.
|
|
783
783
|
|
|
784
784
|
### Use custom loss function in feature selection & metrics calculation
|
|
785
785
|
|
|
786
786
|
`FeaturesEnricher` can be initialized with additional string parameter `loss`.
|
|
787
|
-
Depending on ML
|
|
787
|
+
Depending on the ML task, you can use the following loss functions:
|
|
788
788
|
- `regression`: regression, regression_l1, huber, poisson, quantile, mape, gamma, tweedie;
|
|
789
789
|
- `binary`: binary;
|
|
790
790
|
- `multiclass`: multiclass, multiclassova.
|
|
@@ -803,7 +803,7 @@ enriched_dataframe.fit(X, y)
|
|
|
803
803
|
|
|
804
804
|
### Exclude premium data sources from fit, transform and metrics calculation
|
|
805
805
|
|
|
806
|
-
`fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` can be used with
|
|
806
|
+
`fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` can be used with the `exclude_features_sources` parameter to exclude Trial or Paid features from Premium data sources:
|
|
807
807
|
```python
|
|
808
808
|
enricher = FeaturesEnricher(
|
|
809
809
|
search_keys={"subscription_activation_date": SearchKey.DATE}
|
|
@@ -816,7 +816,7 @@ enricher.transform(X, exclude_features_sources=(trial_features + paid_features))
|
|
|
816
816
|
```
|
|
817
817
|
|
|
818
818
|
### Turn off autodetection for search key columns
|
|
819
|
-
Upgini has autodetection of search keys
|
|
819
|
+
Upgini has autodetection of search keys enabled by default.
|
|
820
820
|
To turn off use `autodetect_search_keys=False`:
|
|
821
821
|
|
|
822
822
|
```python
|
|
@@ -828,8 +828,8 @@ enricher = FeaturesEnricher(
|
|
|
828
828
|
enricher.fit(X, y)
|
|
829
829
|
```
|
|
830
830
|
|
|
831
|
-
### Turn off
|
|
832
|
-
Upgini
|
|
831
|
+
### Turn off removal of target outliers
|
|
832
|
+
Upgini detects rows with target outliers for regression tasks. By default such rows are dropped during metrics calculation. To turn off the removal of target-outlier rows, use the `remove_outliers_calc_metrics=False` parameter in the fit, fit_transform, or calculate_metrics methods:
|
|
833
833
|
|
|
834
834
|
```python
|
|
835
835
|
enricher = FeaturesEnricher(
|
|
@@ -839,8 +839,8 @@ enricher = FeaturesEnricher(
|
|
|
839
839
|
enricher.fit(X, y, remove_outliers_calc_metrics=False)
|
|
840
840
|
```
|
|
841
841
|
|
|
842
|
-
### Turn off
|
|
843
|
-
Upgini
|
|
842
|
+
### Turn off feature generation on search keys
|
|
843
|
+
Upgini attempts to generate features for email, date and datetime search keys. By default this generation is enabled. To disable it use the `generate_search_key_features` parameter of the FeaturesEnricher constructor:
|
|
844
844
|
|
|
845
845
|
```python
|
|
846
846
|
enricher = FeaturesEnricher(
|
|
@@ -851,37 +851,37 @@ enricher = FeaturesEnricher(
|
|
|
851
851
|
|
|
852
852
|
## 🔑 Open up all capabilities of Upgini
|
|
853
853
|
|
|
854
|
-
[Register](https://profile.upgini.com) and get a free API key for exclusive data sources and features:
|
|
854
|
+
[Register](https://profile.upgini.com) and get a free API key for exclusive data sources and features: 600M+ phone numbers, 350M+ emails, 2^32 IP addresses
|
|
855
855
|
|
|
856
856
|
|Benefit|No Sign-up | Registered user |
|
|
857
857
|
|--|--|--|
|
|
858
858
|
|Enrichment with **date/datetime, postal/ZIP code and country keys** | Yes | Yes |
|
|
859
|
-
|Enrichment with **phone number, hashed email/HEM and IP
|
|
859
|
+
|Enrichment with **phone number, hashed email/HEM and IP address keys** | No | Yes |
|
|
860
860
|
|Email notification on **search task completion** | No | Yes |
|
|
861
861
|
|Automated **feature generation with LLMs** from columns in a search dataset| Yes, *till 12/05/23* | Yes |
|
|
862
862
|
|Email notification on **new data source activation** 🔜 | No | Yes |
|
|
863
863
|
|
|
864
|
-
## 👩🏻💻 How to share data/features with
|
|
865
|
-
You may publish ANY data which you consider as royalty
|
|
864
|
+
## 👩🏻💻 How to share data/features with the community?
|
|
865
|
+
You may publish ANY data which you consider as royalty- or license-free ([Open Data](http://opendatahandbook.org/guide/en/what-is-open-data/)) and potentially valuable for ML applications for **community usage**:
|
|
866
866
|
1. Please Sign Up [here](https://profile.upgini.com)
|
|
867
|
-
2. Copy *Upgini API key* from profile and upload your data from Upgini
|
|
867
|
+
2. Copy *Upgini API key* from your profile and upload your data from the Upgini Python library with this key:
|
|
868
868
|
```python
|
|
869
869
|
import pandas as pd
|
|
870
870
|
from upgini.metadata import SearchKey
|
|
871
871
|
from upgini.ads import upload_user_ads
|
|
872
872
|
import os
|
|
873
873
|
os.environ["UPGINI_API_KEY"] = "your_long_string_api_key_goes_here"
|
|
874
|
-
#you can define custom search key
|
|
874
|
+
#you can define a custom search key that might not yet be supported; just use SearchKey.CUSTOM_KEY type
|
|
875
875
|
sample_df = pd.read_csv("path_to_data_sample_file")
|
|
876
876
|
upload_user_ads("test", sample_df, {
|
|
877
877
|
"city": SearchKey.CUSTOM_KEY,
|
|
878
878
|
"stats_date": SearchKey.DATE
|
|
879
879
|
})
|
|
880
880
|
```
|
|
881
|
-
3. After data verification, search results on community data will be available usual way.
|
|
881
|
+
3. After data verification, search results on community data will be available in the usual way.
|
|
882
882
|
|
|
883
883
|
## 🛠 Getting Help & Community
|
|
884
|
-
Please note
|
|
884
|
+
Please note that we are still in beta.
|
|
885
885
|
Requests and support, in preferred order
|
|
886
886
|
[](https://4mlg.short.gy/join-upgini-community)
|
|
887
887
|
[](https://github.com/upgini/upgini/issues)
|
|
@@ -894,22 +894,22 @@ Requests and support, in preferred order
|
|
|
894
894
|
|
|
895
895
|
## 🧩 Contributing
|
|
896
896
|
We are not a large team, so we probably won't be able to:
|
|
897
|
-
- implement smooth integration with most common low-code ML libraries and platforms ([PyCaret](https://www.github.com/pycaret/pycaret), [H2O AutoML](https://github.com//h2oai/h2o-3/blob/master/h2o-docs/src/product/automl.rst), etc.
|
|
897
|
+
- implement smooth integration with the most common low-code ML libraries and platforms ([PyCaret](https://www.github.com/pycaret/pycaret), [H2O AutoML](https://github.com//h2oai/h2o-3/blob/master/h2o-docs/src/product/automl.rst), etc.)
|
|
898
898
|
- implement all possible data verification and normalization capabilities for different types of search keys
|
|
899
899
|
And we need some help from the community!
|
|
900
900
|
|
|
901
|
-
So, we'll be happy about every **pull request** you open and **issue** you
|
|
902
|
-
**For major changes**, please open an issue first to discuss what you would like to change
|
|
901
|
+
So, we'll be happy about every **pull request** you open and every **issue** you report to make this library **even better**. Please note that it might sometimes take us a while to get back to you.
|
|
902
|
+
**For major changes**, please open an issue first to discuss what you would like to change.
|
|
903
903
|
#### Developing
|
|
904
904
|
Some convenient ways to start contributing are:
|
|
905
905
|
⚙️ [**Open in Visual Studio Code**](https://open.vscode.dev/upgini/upgini) You can remotely open this repo in VS Code without cloning or automatically clone and open it inside a docker container.
|
|
906
906
|
⚙️ **Gitpod** [](https://gitpod.io/#https://github.com/upgini/upgini) You can use Gitpod to launch a fully functional development environment right in your browser.
|
|
907
907
|
|
|
908
908
|
## 🔗 Useful links
|
|
909
|
-
- [Simple sales
|
|
909
|
+
- [Simple sales prediction template notebook](#-simple-sales-prediction-for-retail-stores)
|
|
910
910
|
- [Full list of Kaggle Guides & Examples](https://www.kaggle.com/romaupgini/code)
|
|
911
911
|
- [Project on PyPI](https://pypi.org/project/upgini)
|
|
912
912
|
- [More perks for registered users](https://profile.upgini.com)
|
|
913
913
|
|
|
914
|
-
<sup>😔 Found
|
|
914
|
+
<sup>😔 Found a typo or a bug in a code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
|
|
915
915
|
Please report it here</a></sup>
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=C0d4LBtUePjgosyBrp3M9Gs4uNFtaPonkhavEUGrIKY,24
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=Nm2ZmwyQqvTnymYpGUwyJWy7y2ebXlHMyYmGeGcyA_s,31652
|
|
@@ -71,10 +71,10 @@ upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,
|
|
|
71
71
|
upgini/utils/sklearn_ext.py,sha256=Pcy8sWD6f4YcE5Bu0UmXD4j0ICmXtrT8DJlTArM-_a0,49356
|
|
72
72
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
|
73
73
|
upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,10882
|
|
74
|
-
upgini/utils/track_info.py,sha256=
|
|
74
|
+
upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
|
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
77
|
-
upgini-1.2.
|
|
78
|
-
upgini-1.2.
|
|
79
|
-
upgini-1.2.
|
|
80
|
-
upgini-1.2.
|
|
77
|
+
upgini-1.2.128.dist-info/METADATA,sha256=zp7dVLkRwcrMKpsJ1r-a_CSib9n36FpD1WbS0VeBGVM,51142
|
|
78
|
+
upgini-1.2.128.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
79
|
+
upgini-1.2.128.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
+
upgini-1.2.128.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|