upgini 1.1.261a3250.post2__tar.gz → 1.2.31a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (103) hide show
  1. upgini-1.2.31a1/.gitignore +157 -0
  2. {upgini-1.1.261a3250.post2/src/upgini.egg-info → upgini-1.2.31a1}/PKG-INFO +43 -30
  3. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/README.md +24 -11
  4. upgini-1.2.31a1/pyproject.toml +124 -0
  5. upgini-1.2.31a1/src/upgini/__about__.py +1 -0
  6. upgini-1.2.31a1/src/upgini/__init__.py +5 -0
  7. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/ads.py +6 -2
  8. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/ads_management/ads_manager.py +4 -2
  9. upgini-1.2.31a1/src/upgini/autofe/all_operands.py +87 -0
  10. upgini-1.2.31a1/src/upgini/autofe/binary.py +237 -0
  11. upgini-1.2.31a1/src/upgini/autofe/date.py +289 -0
  12. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/autofe/feature.py +104 -21
  13. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/autofe/groupby.py +25 -23
  14. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/autofe/operand.py +15 -9
  15. upgini-1.2.31a1/src/upgini/autofe/unary.py +155 -0
  16. upgini-1.2.31a1/src/upgini/autofe/vector.py +24 -0
  17. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/data_source/data_source_publisher.py +152 -10
  18. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/dataset.py +73 -445
  19. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/errors.py +1 -1
  20. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/features_enricher.py +1034 -546
  21. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/http.py +29 -17
  22. upgini-1.2.31a1/src/upgini/lazy_import.py +35 -0
  23. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/mdc/__init__.py +1 -3
  24. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/mdc/context.py +4 -6
  25. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/metadata.py +87 -59
  26. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/metrics.py +305 -111
  27. upgini-1.2.31a1/src/upgini/normalizer/normalize_utils.py +197 -0
  28. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/resource_bundle/__init__.py +5 -5
  29. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/resource_bundle/strings.properties +68 -48
  30. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/sampler/base.py +1 -4
  31. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/sampler/random_under_sampler.py +2 -5
  32. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/search_task.py +15 -9
  33. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/spinner.py +1 -1
  34. upgini-1.2.31a1/src/upgini/utils/Roboto-Regular.ttf +0 -0
  35. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/__init__.py +3 -2
  36. upgini-1.2.31a1/src/upgini/utils/base_search_key_detector.py +27 -0
  37. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/blocked_time_series.py +4 -2
  38. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/country_utils.py +18 -2
  39. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/custom_loss_utils.py +42 -38
  40. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/cv_utils.py +2 -2
  41. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/datetime_utils.py +146 -36
  42. upgini-1.2.31a1/src/upgini/utils/deduplicate_utils.py +208 -0
  43. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/display_utils.py +46 -15
  44. upgini-1.2.31a1/src/upgini/utils/email_utils.py +132 -0
  45. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/fallback_progress_bar.py +1 -1
  46. upgini-1.2.31a1/src/upgini/utils/feature_info.py +172 -0
  47. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/features_validator.py +35 -20
  48. upgini-1.2.31a1/src/upgini/utils/ip_utils.py +152 -0
  49. upgini-1.1.261a3250.post2/src/upgini/normalizer/phone_normalizer.py → upgini-1.2.31a1/src/upgini/utils/phone_utils.py +41 -27
  50. upgini-1.2.31a1/src/upgini/utils/postal_code_utils.py +45 -0
  51. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/progress_bar.py +1 -1
  52. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/sklearn_ext.py +43 -34
  53. upgini-1.2.31a1/src/upgini/utils/target_utils.py +253 -0
  54. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/track_info.py +27 -15
  55. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/warning_counter.py +1 -0
  56. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/version_validator.py +10 -6
  57. upgini-1.1.261a3250.post2/PKG-INFO +0 -844
  58. upgini-1.1.261a3250.post2/pyproject.toml +0 -10
  59. upgini-1.1.261a3250.post2/setup.cfg +0 -4
  60. upgini-1.1.261a3250.post2/setup.py +0 -104
  61. upgini-1.1.261a3250.post2/src/upgini/__init__.py +0 -21
  62. upgini-1.1.261a3250.post2/src/upgini/autofe/all_operands.py +0 -46
  63. upgini-1.1.261a3250.post2/src/upgini/autofe/binary.py +0 -133
  64. upgini-1.1.261a3250.post2/src/upgini/autofe/date.py +0 -42
  65. upgini-1.1.261a3250.post2/src/upgini/autofe/unary.py +0 -105
  66. upgini-1.1.261a3250.post2/src/upgini/autofe/vector.py +0 -20
  67. upgini-1.1.261a3250.post2/src/upgini/utils/base_search_key_detector.py +0 -25
  68. upgini-1.1.261a3250.post2/src/upgini/utils/deduplicate_utils.py +0 -142
  69. upgini-1.1.261a3250.post2/src/upgini/utils/email_utils.py +0 -94
  70. upgini-1.1.261a3250.post2/src/upgini/utils/ip_utils.py +0 -53
  71. upgini-1.1.261a3250.post2/src/upgini/utils/phone_utils.py +0 -11
  72. upgini-1.1.261a3250.post2/src/upgini/utils/postal_code_utils.py +0 -11
  73. upgini-1.1.261a3250.post2/src/upgini/utils/target_utils.py +0 -74
  74. upgini-1.1.261a3250.post2/src/upgini.egg-info/SOURCES.txt +0 -82
  75. upgini-1.1.261a3250.post2/src/upgini.egg-info/dependency_links.txt +0 -1
  76. upgini-1.1.261a3250.post2/src/upgini.egg-info/requires.txt +0 -13
  77. upgini-1.1.261a3250.post2/src/upgini.egg-info/top_level.txt +0 -1
  78. upgini-1.1.261a3250.post2/tests/test_autofe_operands.py +0 -28
  79. upgini-1.1.261a3250.post2/tests/test_binary_dataset.py +0 -47
  80. upgini-1.1.261a3250.post2/tests/test_blocked_time_series.py +0 -80
  81. upgini-1.1.261a3250.post2/tests/test_categorical_dataset.py +0 -44
  82. upgini-1.1.261a3250.post2/tests/test_continuous_dataset.py +0 -47
  83. upgini-1.1.261a3250.post2/tests/test_country_utils.py +0 -51
  84. upgini-1.1.261a3250.post2/tests/test_custom_loss_utils.py +0 -50
  85. upgini-1.1.261a3250.post2/tests/test_datetime_utils.py +0 -185
  86. upgini-1.1.261a3250.post2/tests/test_email_utils.py +0 -99
  87. upgini-1.1.261a3250.post2/tests/test_etalon_validation.py +0 -765
  88. upgini-1.1.261a3250.post2/tests/test_features_enricher.py +0 -2628
  89. upgini-1.1.261a3250.post2/tests/test_metrics.py +0 -1364
  90. upgini-1.1.261a3250.post2/tests/test_phone_utils.py +0 -31
  91. upgini-1.1.261a3250.post2/tests/test_postal_code_utils.py +0 -31
  92. upgini-1.1.261a3250.post2/tests/test_target_utils.py +0 -74
  93. upgini-1.1.261a3250.post2/tests/test_widget.py +0 -432
  94. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/LICENSE +0 -0
  95. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/ads_management/__init__.py +0 -0
  96. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/autofe/__init__.py +0 -0
  97. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/data_source/__init__.py +0 -0
  98. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/normalizer/__init__.py +0 -0
  99. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  100. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  101. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/sampler/__init__.py +0 -0
  102. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/sampler/utils.py +0 -0
  103. {upgini-1.1.261a3250.post2 → upgini-1.2.31a1}/src/upgini/utils/format.py +0 -0
@@ -0,0 +1,157 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ env8/
109
+ env9/
110
+ env10/
111
+ .env10/
112
+ .env310/
113
+ env11/
114
+ venv/
115
+ ENV/
116
+ env.bak/
117
+ venv.bak/
118
+
119
+ # Spyder project settings
120
+ .spyderproject
121
+ .spyproject
122
+
123
+ # Rope project settings
124
+ .ropeproject
125
+
126
+ # mkdocs documentation
127
+ /site
128
+
129
+ # mypy
130
+ .mypy_cache/
131
+ .dmypy.json
132
+ dmypy.json
133
+
134
+ # Pyre type checker
135
+ .pyre/
136
+
137
+ # IDE
138
+ .vscode/
139
+ .idea/
140
+
141
+ # macOS
142
+ .DS_Store
143
+
144
+ # Other
145
+ .cache/
146
+ activate_venv.sh
147
+ test-results/
148
+ test_notebooks/
149
+ publish.sh
150
+ catboost_info/
151
+ build/
152
+ playgroung.ipynb
153
+ fingerprint.js
154
+ envVars.txt
155
+ .ruff_cache
156
+ .jupyter
157
+ *.excalidraw
@@ -1,14 +1,13 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.261a3250.post2
3
+ Version: 1.2.31a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
- Home-page: https://upgini.com/
6
- Author: Upgini Developers
7
- Author-email: madewithlove@upgini.com
8
- License: BSD 3-Clause License
9
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
+ Project-URL: Homepage, https://upgini.com/
10
7
  Project-URL: Source, https://github.com/upgini/upgini
11
- Keywords: data science,machine learning,data mining,automl,data search
8
+ Author-email: Upgini Developers <madewithlove@upgini.com>
9
+ License-File: LICENSE
10
+ Keywords: automl,data mining,data science,data search,machine learning
12
11
  Classifier: Development Status :: 5 - Production/Stable
13
12
  Classifier: Intended Audience :: Customer Service
14
13
  Classifier: Intended Audience :: Developers
@@ -23,22 +22,23 @@ Classifier: Programming Language :: Python :: 3.9
23
22
  Classifier: Programming Language :: Python :: 3.10
24
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
24
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
26
- Requires-Python: >=3.8,<3.11
27
- Description-Content-Type: text/markdown
28
- License-File: LICENSE
25
+ Requires-Python: <3.12,>=3.8
26
+ Requires-Dist: catboost>=1.0.3
27
+ Requires-Dist: fastparquet>=0.8.1
28
+ Requires-Dist: ipywidgets>=8.1.0
29
+ Requires-Dist: jarowinkler>=2.0.0
30
+ Requires-Dist: levenshtein>=0.25.1
31
+ Requires-Dist: numpy<=1.26.4,>=1.19.0
32
+ Requires-Dist: pandas<3.0.0,>=1.1.0
33
+ Requires-Dist: pydantic<3.0.0,>1.0.0
34
+ Requires-Dist: pyjwt>=2.8.0
35
+ Requires-Dist: python-bidi==0.4.2
29
36
  Requires-Dist: python-dateutil>=2.8.0
37
+ Requires-Dist: python-json-logger>=2.0.2
30
38
  Requires-Dist: requests>=2.8.0
31
- Requires-Dist: pandas<2.0.0,>=1.1.0
32
- Requires-Dist: numpy>=1.19.0
33
39
  Requires-Dist: scikit-learn>=1.3.0
34
- Requires-Dist: pydantic<2.0.0,>=1.8.2
35
- Requires-Dist: fastparquet>=0.8.1
36
- Requires-Dist: python-json-logger>=2.0.2
37
- Requires-Dist: catboost>=1.0.3
38
- Requires-Dist: lightgbm>=3.3.2
39
- Requires-Dist: pyjwt>=2.8.0
40
40
  Requires-Dist: xhtml2pdf==0.2.11
41
- Requires-Dist: ipywidgets>=8.1.0
41
+ Description-Content-Type: text/markdown
42
42
 
43
43
 
44
44
  <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
@@ -132,7 +132,7 @@ Requires-Dist: ipywidgets>=8.1.0
132
132
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
133
133
  |World economic indicators|191 |41|-|Monthly|date, country|No
134
134
  |Markets data|-|17|-|Monthly|date, datetime|No
135
- |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
135
+ |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
136
136
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
137
137
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
138
138
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -145,7 +145,7 @@ Requires-Dist: ipywidgets>=8.1.0
145
145
 
146
146
  ## 💼 Tutorials
147
147
 
148
- ### [Search of relevant external features & Automated feature generation for Salary predicton task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
148
+ ### [Search of relevant external features & Automated feature generation for Salary prediction task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
149
149
 
150
150
  * The goal is to predict salary for a data science job posting based on information about the employer and the job description.
151
151
  * Following this guide, you'll learn how to **search & auto generate new relevant features with Upgini library**
@@ -259,7 +259,9 @@ We do dataset verification and cleaning under the hood, but still there are some
259
259
  *Search keys* columns will be used to match records from all potential external data sources / features.
260
260
  Define one or multiple columns as search keys when initializing the `FeaturesEnricher` class.
261
261
  ```python
262
- from upgini import FeaturesEnricher, SearchKey
262
+ from upgini.features_enricher import FeaturesEnricher
263
+ from upgini.metadata import SearchKey
264
+
263
265
  enricher = FeaturesEnricher(
264
266
  search_keys={
265
267
  "subscription_activation_date": SearchKey.DATE,
@@ -345,7 +347,9 @@ enricher = FeaturesEnricher(
345
347
 
346
348
  For the meaning types <tt>SearchKey.DATE</tt>/<tt>SearchKey.DATETIME</tt> with dtypes <tt>object</tt> or <tt>string</tt> you have to clarify date/datetime format by passing <tt>date_format</tt> parameter to `FeaturesEnricher`. For example:
347
349
  ```python
348
- from upgini import FeaturesEnricher, SearchKey
350
+ from upgini.features_enricher import FeaturesEnricher
351
+ from upgini.metadata import SearchKey
352
+
349
353
  enricher = FeaturesEnricher(
350
354
  search_keys={
351
355
  "subscription_activation_date": SearchKey.DATE,
@@ -366,7 +370,9 @@ df["date"] = df.date.astype("datetime64").dt.tz_localize("Europe/Warsaw")
366
370
 
367
371
  Single country for the whole training dataset can be passed with `country_code` parameter:
368
372
  ```python
369
- from upgini import FeaturesEnricher, SearchKey
373
+ from upgini.features_enricher import FeaturesEnricher
374
+ from upgini.metadata import SearchKey
375
+
370
376
  enricher = FeaturesEnricher(
371
377
  search_keys={
372
378
  "subscription_activation_date": SearchKey.DATE,
@@ -385,7 +391,8 @@ Create instance of the `FeaturesEnricher` class and call:
385
391
  Let's try it out!
386
392
  ```python
387
393
  import pandas as pd
388
- from upgini import FeaturesEnricher, SearchKey
394
+ from upgini.features_enricher import FeaturesEnricher
395
+ from upgini.metadata import SearchKey
389
396
 
390
397
  # load labeled training dataset to initiate search
391
398
  train_df = pd.read_csv("customer_churn_prediction_train.csv")
@@ -476,7 +483,9 @@ We detect ML task under the hood based on label column values. Currently we supp
476
483
 
477
484
  But for certain search datasets you can pass a parameter to `FeaturesEnricher` with the correct ML task type:
478
485
  ```python
479
- from upgini import ModelTaskType
486
+ from upgini.features_enricher import FeaturesEnricher
487
+ from upgini.metadata import SearchKey, ModelTaskType
488
+
480
489
  enricher = FeaturesEnricher(
481
490
  search_keys={"subscription_activation_date": SearchKey.DATE},
482
491
  model_task_type=ModelTaskType.REGRESSION
@@ -489,7 +498,9 @@ enricher = FeaturesEnricher(
489
498
 
490
499
  To initiate feature search you can pass cross-validation type parameter to `FeaturesEnricher` with time series specific CV type:
491
500
  ```python
492
- from upgini.metadata import CVType
501
+ from upgini.features_enricher import FeaturesEnricher
502
+ from upgini.metadata import SearchKey, CVType
503
+
493
504
  enricher = FeaturesEnricher(
494
505
  search_keys={"sales_date": SearchKey.DATE},
495
506
  cv=CVType.time_series
@@ -623,7 +634,9 @@ But you can easily define new split by passing child of BaseCrossValidator to pa
623
634
 
624
635
  Example with more tips-and-tricks:
625
636
  ```python
626
- from upgini import FeaturesEnricher, SearchKey
637
+ from upgini.features_enricher import FeaturesEnricher
638
+ from upgini.metadata import SearchKey
639
+
627
640
  enricher = FeaturesEnricher(search_keys={"registration_date": SearchKey.DATE})
628
641
 
629
642
  # Fit with default setup for metrics calculation
@@ -796,7 +809,7 @@ You may publish ANY data which you consider as royalty / license free ([Open Dat
796
809
  2. Copy *Upgini API key* from profile and upload your data from Upgini python library with this key:
797
810
  ```python
798
811
  import pandas as pd
799
- from upgini import SearchKey
812
+ from upgini.metadata import SearchKey
800
813
  from upgini.ads import upload_user_ads
801
814
  import os
802
815
  os.environ["UPGINI_API_KEY"] = "your_long_string_api_key_goes_here"
@@ -841,4 +854,4 @@ Some convenient ways to start contributing are:
841
854
  - [More perks for registered users](https://profile.upgini.com)
842
855
 
843
856
  <sup>😔 Found a typo or a bug in a code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
844
- Please report it here.</a></sup>
857
+ Please report it here</a></sup>
@@ -90,7 +90,7 @@
90
90
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
91
91
  |World economic indicators|191 |41|-|Monthly|date, country|No
92
92
  |Markets data|-|17|-|Monthly|date, datetime|No
93
- |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
93
+ |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
94
94
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
95
95
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
96
96
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -103,7 +103,7 @@
103
103
 
104
104
  ## 💼 Tutorials
105
105
 
106
- ### [Search of relevant external features & Automated feature generation for Salary predicton task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
106
+ ### [Search of relevant external features & Automated feature generation for Salary prediction task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
107
107
 
108
108
  * The goal is to predict salary for a data science job posting based on information about the employer and the job description.
109
109
  * Following this guide, you'll learn how to **search & auto generate new relevant features with Upgini library**
@@ -217,7 +217,9 @@ We do dataset verification and cleaning under the hood, but still there are some
217
217
  *Search keys* columns will be used to match records from all potential external data sources / features.
218
218
  Define one or multiple columns as search keys when initializing the `FeaturesEnricher` class.
219
219
  ```python
220
- from upgini import FeaturesEnricher, SearchKey
220
+ from upgini.features_enricher import FeaturesEnricher
221
+ from upgini.metadata import SearchKey
222
+
221
223
  enricher = FeaturesEnricher(
222
224
  search_keys={
223
225
  "subscription_activation_date": SearchKey.DATE,
@@ -303,7 +305,9 @@ enricher = FeaturesEnricher(
303
305
 
304
306
  For the meaning types <tt>SearchKey.DATE</tt>/<tt>SearchKey.DATETIME</tt> with dtypes <tt>object</tt> or <tt>string</tt> you have to clarify date/datetime format by passing <tt>date_format</tt> parameter to `FeaturesEnricher`. For example:
305
307
  ```python
306
- from upgini import FeaturesEnricher, SearchKey
308
+ from upgini.features_enricher import FeaturesEnricher
309
+ from upgini.metadata import SearchKey
310
+
307
311
  enricher = FeaturesEnricher(
308
312
  search_keys={
309
313
  "subscription_activation_date": SearchKey.DATE,
@@ -324,7 +328,9 @@ df["date"] = df.date.astype("datetime64").dt.tz_localize("Europe/Warsaw")
324
328
 
325
329
  Single country for the whole training dataset can be passed with `country_code` parameter:
326
330
  ```python
327
- from upgini import FeaturesEnricher, SearchKey
331
+ from upgini.features_enricher import FeaturesEnricher
332
+ from upgini.metadata import SearchKey
333
+
328
334
  enricher = FeaturesEnricher(
329
335
  search_keys={
330
336
  "subscription_activation_date": SearchKey.DATE,
@@ -343,7 +349,8 @@ Create instance of the `FeaturesEnricher` class and call:
343
349
  Let's try it out!
344
350
  ```python
345
351
  import pandas as pd
346
- from upgini import FeaturesEnricher, SearchKey
352
+ from upgini.features_enricher import FeaturesEnricher
353
+ from upgini.metadata import SearchKey
347
354
 
348
355
  # load labeled training dataset to initiate search
349
356
  train_df = pd.read_csv("customer_churn_prediction_train.csv")
@@ -434,7 +441,9 @@ We detect ML task under the hood based on label column values. Currently we supp
434
441
 
435
442
  But for certain search datasets you can pass a parameter to `FeaturesEnricher` with the correct ML task type:
436
443
  ```python
437
- from upgini import ModelTaskType
444
+ from upgini.features_enricher import FeaturesEnricher
445
+ from upgini.metadata import SearchKey, ModelTaskType
446
+
438
447
  enricher = FeaturesEnricher(
439
448
  search_keys={"subscription_activation_date": SearchKey.DATE},
440
449
  model_task_type=ModelTaskType.REGRESSION
@@ -447,7 +456,9 @@ enricher = FeaturesEnricher(
447
456
 
448
457
  To initiate feature search you can pass cross-validation type parameter to `FeaturesEnricher` with time series specific CV type:
449
458
  ```python
450
- from upgini.metadata import CVType
459
+ from upgini.features_enricher import FeaturesEnricher
460
+ from upgini.metadata import SearchKey, CVType
461
+
451
462
  enricher = FeaturesEnricher(
452
463
  search_keys={"sales_date": SearchKey.DATE},
453
464
  cv=CVType.time_series
@@ -581,7 +592,9 @@ But you can easily define new split by passing child of BaseCrossValidator to pa
581
592
 
582
593
  Example with more tips-and-tricks:
583
594
  ```python
584
- from upgini import FeaturesEnricher, SearchKey
595
+ from upgini.features_enricher import FeaturesEnricher
596
+ from upgini.metadata import SearchKey
597
+
585
598
  enricher = FeaturesEnricher(search_keys={"registration_date": SearchKey.DATE})
586
599
 
587
600
  # Fit with default setup for metrics calculation
@@ -754,7 +767,7 @@ You may publish ANY data which you consider as royalty / license free ([Open Dat
754
767
  2. Copy *Upgini API key* from profile and upload your data from Upgini python library with this key:
755
768
  ```python
756
769
  import pandas as pd
757
- from upgini import SearchKey
770
+ from upgini.metadata import SearchKey
758
771
  from upgini.ads import upload_user_ads
759
772
  import os
760
773
  os.environ["UPGINI_API_KEY"] = "your_long_string_api_key_goes_here"
@@ -799,4 +812,4 @@ Some convenient ways to start contributing are:
799
812
  - [More perks for registered users](https://profile.upgini.com)
800
813
 
801
814
  <sup>😔 Found a typo or a bug in a code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
802
- Please report it here.</a></sup>
815
+ Please report it here</a></sup>
@@ -0,0 +1,124 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "upgini"
7
+ dynamic = ["version"]
8
+ description = "Intelligent data search & enrichment for Machine Learning"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8,<3.12"
11
+ authors = [
12
+ { name = "Upgini Developers", email = "madewithlove@upgini.com" },
13
+ ]
14
+ keywords = [
15
+ "automl",
16
+ "data mining",
17
+ "data science",
18
+ "data search",
19
+ "machine learning",
20
+ ]
21
+ classifiers = [
22
+ "Development Status :: 5 - Production/Stable",
23
+ "Intended Audience :: Customer Service",
24
+ "Intended Audience :: Developers",
25
+ "Intended Audience :: Financial and Insurance Industry",
26
+ "Intended Audience :: Information Technology",
27
+ "Intended Audience :: Science/Research",
28
+ "Intended Audience :: Telecommunications Industry",
29
+ "License :: OSI Approved :: BSD License",
30
+ "Operating System :: OS Independent",
31
+ "Programming Language :: Python :: 3.8",
32
+ "Programming Language :: Python :: 3.9",
33
+ "Programming Language :: Python :: 3.10",
34
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
35
+ "Topic :: Scientific/Engineering :: Information Analysis",
36
+ ]
37
+ dependencies = [
38
+ "catboost>=1.0.3",
39
+ "fastparquet>=0.8.1",
40
+ "ipywidgets>=8.1.0",
41
+ "numpy>=1.19.0,<=1.26.4",
42
+ "pandas>=1.1.0,<3.0.0",
43
+ "pydantic>1.0.0,<3.0.0",
44
+ "pyjwt>=2.8.0",
45
+ "python-dateutil>=2.8.0",
46
+ "python-json-logger>=2.0.2",
47
+ "requests>=2.8.0",
48
+ "scikit-learn>=1.3.0",
49
+ "python-bidi==0.4.2",
50
+ "xhtml2pdf==0.2.11",
51
+ "jarowinkler>=2.0.0",
52
+ "levenshtein>=0.25.1",
53
+ ]
54
+
55
+ [project.urls]
56
+ "Bug Reports" = "https://github.com/upgini/upgini/issues"
57
+ Homepage = "https://upgini.com/"
58
+ Source = "https://github.com/upgini/upgini"
59
+
60
+ [tool.hatch.version]
61
+ path = "src/upgini/__about__.py"
62
+
63
+ [tool.hatch.build.targets.sdist]
64
+ include = [
65
+ "src"
66
+ ]
67
+
68
+ [tool.hatch.build.targets.wheel]
69
+ packages = [
70
+ "src/upgini"
71
+ ]
72
+
73
+ [tool.hatch.build]
74
+ include = [
75
+ "/src/utils/Roboto-Regular.ttf",
76
+ ]
77
+
78
+ [tool.hatch.envs.default]
79
+ type = "virtual"
80
+ python = "3.10"
81
+
82
+ [tool.hatch.envs.test.scripts]
83
+ cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=upgini --cov=tests'
84
+ format = "black {args}"
85
+ lint = "ruff check {args}"
86
+ test_all = 'pytest -s -vv tests'
87
+
88
+ [[tool.hatch.envs.test.matrix]]
89
+ python = ["3.8"]
90
+ pandas = ["1.1.0"]
91
+
92
+ [[tool.hatch.envs.test.matrix]]
93
+ python = ["3.8", "3.9", "3.10"]
94
+ pandas = ["1.2.0", "1.3.0", "1.4.0", "1.5.0", "2.0.0"]
95
+
96
+ [[tool.hatch.envs.test.matrix]]
97
+ python = ["3.9", "3.10"]
98
+ pandas = ["2.1.0", "2.2.0"]
99
+
100
+ # from versions: 0.1, 0.2, 0.3.0, 0.4.0, 0.4.1, 0.4.2, 0.4.3, 0.5.0, 0.6.0, 0.6.1, 0.7.0, 0.7.1, 0.7.2, 0.7.3, 0.8.0, 0.8.1, 0.9.0, 0.9.1, 0.10.0, 0.10.1, 0.11.0, 0.12.0, 0.13.0, 0.13.1, 0.14.0, 0.14.1, 0.15.0, 0.15.1, 0.15.2, 0.16.0, 0.16.1, 0.16.2, 0.17.0, 0.17.1, 0.18.0, 0.18.1, 0.19.0, 0.19.1, 0.19.2, 0.20.0, 0.20.1, 0.20.2, 0.20.3, 0.21.0, 0.21.1, 0.22.0, 0.23.0, 0.23.1, 0.23.2, 0.23.3, 0.23.4, 0.24.0, 0.24.1, 0.24.2, 0.25.0, 0.25.1, 0.25.2, 0.25.3, 1.0.0, 1.0.1, 1.0.2, 1.0.3, 1.0.4, 1.0.5, 1.1.0, 1.1.1, 1.1.2, 1.1.3, 1.1.4, 1.1.5, 1.2.0, 1.2.1, 1.2.2, 1.2.3, 1.2.4, 1.2.5, 1.3.0, 1.3.1, 1.3.2, 1.3.3, 1.3.4, 1.3.5, 1.4.0rc0, 1.4.0, 1.4.1, 1.4.2, 1.4.3, 1.4.4, 1.5.0rc0, 1.5.0, 1.5.1, 1.5.2, 1.5.3, 2.0.0rc0, 2.0.0rc1, 2.0.0, 2.0.1, 2.0.2, 2.0.3
101
+
102
+ [tool.hatch.envs.test]
103
+ dependencies = [
104
+ "coverage[toml]",
105
+ "pytest",
106
+ "pytest-cov",
107
+ # "pytest-timeout",
108
+ "requests-mock",
109
+ "pytest-datafiles",
110
+ "pytest-xdist",
111
+ "pandas~={matrix:pandas}",
112
+ ]
113
+
114
+ [tool.black]
115
+ line-length = 120
116
+
117
+ [tool.isort]
118
+ profile = "black"
119
+
120
+ [tool.pytest.ini_options]
121
+ pythonpath = [
122
+ "./src"
123
+ ]
124
+ addopts="-n 4"
@@ -0,0 +1 @@
1
+ __version__ = "1.2.31a1"
@@ -0,0 +1,5 @@
1
+ from upgini.features_enricher import FeaturesEnricher # noqa: F401
2
+ from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
3
+ import warnings
4
+
5
+ warnings.filterwarnings("ignore", category=UserWarning, module="_distutils_hack")
@@ -5,7 +5,7 @@ from typing import Dict, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
8
- from pandas.api.types import is_string_dtype
8
+ from pandas.api.types import is_object_dtype, is_string_dtype
9
9
 
10
10
  from upgini import SearchKey
11
11
  from upgini.http import get_rest_client
@@ -34,7 +34,11 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
34
34
  if df[column_name].notnull().sum() < min_valid_rows_count:
35
35
  raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
36
36
  meaning_type = search_keys[column_name].value
37
- if meaning_type == FileColumnMeaningType.MSISDN and not is_string_dtype(df[column_name]):
37
+ if (
38
+ meaning_type == FileColumnMeaningType.MSISDN
39
+ and not is_string_dtype(df[column_name])
40
+ and not is_object_dtype(df[column_name])
41
+ ):
38
42
  df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
39
43
  else:
40
44
  meaning_type = FileColumnMeaningType.FEATURE
@@ -1,9 +1,11 @@
1
1
  import time
2
- from typing import Dict, Optional
3
2
  import uuid
3
+ from typing import Dict, Optional
4
+
5
+ import pandas as pd
6
+
4
7
  from upgini.http import get_rest_client
5
8
  from upgini.spinner import Spinner
6
- import pandas as pd
7
9
 
8
10
 
9
11
  class AdsManager:
@@ -0,0 +1,87 @@
1
+ from copy import deepcopy
2
+ from typing import Dict
3
+
4
+ from upgini.autofe.binary import (
5
+ Add,
6
+ Combine,
7
+ CombineThenFreq,
8
+ Distance,
9
+ Divide,
10
+ JaroWinklerSim1,
11
+ JaroWinklerSim2,
12
+ LevenshteinSim,
13
+ Max,
14
+ Min,
15
+ Multiply,
16
+ Sim,
17
+ Subtract,
18
+ )
19
+ from upgini.autofe.date import (
20
+ DateDiff,
21
+ DateDiffType2,
22
+ DateListDiff,
23
+ DateListDiffBounded,
24
+ DatePercentile,
25
+ DatePercentileMethod2,
26
+ )
27
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
28
+ from upgini.autofe.operand import Operand
29
+ from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
30
+ from upgini.autofe.vector import Mean, Sum
31
+
32
+ ALL_OPERANDS: Dict[str, Operand] = {
33
+ op.name: op
34
+ for op in [
35
+ Freq(),
36
+ Mean(),
37
+ Sum(),
38
+ Abs(),
39
+ Log(),
40
+ Sqrt(),
41
+ Square(),
42
+ Sigmoid(),
43
+ Floor(),
44
+ Residual(),
45
+ Min(),
46
+ Max(),
47
+ Add(),
48
+ Subtract(),
49
+ Multiply(),
50
+ Divide(),
51
+ GroupByThenAgg(name="GroupByThenMin", agg="min"),
52
+ GroupByThenAgg(name="GroupByThenMax", agg="max"),
53
+ GroupByThenAgg(name="GroupByThenMean", agg="mean"),
54
+ GroupByThenAgg(name="GroupByThenMedian", agg="median"),
55
+ GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
56
+ GroupByThenRank(),
57
+ Combine(),
58
+ CombineThenFreq(),
59
+ GroupByThenNUnique(),
60
+ GroupByThenFreq(),
61
+ Sim(),
62
+ DateDiff(),
63
+ DateDiffType2(),
64
+ DateListDiff(aggregation="min"),
65
+ DateListDiff(aggregation="max"),
66
+ DateListDiff(aggregation="mean"),
67
+ DateListDiff(aggregation="nunique"),
68
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
69
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
70
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
71
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
72
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
73
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
74
+ DatePercentile(),
75
+ DatePercentileMethod2(),
76
+ Norm(),
77
+ JaroWinklerSim1(),
78
+ JaroWinklerSim2(),
79
+ LevenshteinSim(),
80
+ Distance(),
81
+ Embeddings(),
82
+ ]
83
+ }
84
+
85
+
86
+ def find_op(name):
87
+ return deepcopy(ALL_OPERANDS.get(name))