kumoai 2.13.0.dev202511131731__cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kumoai might be problematic. Click here for more details.
- kumoai/__init__.py +294 -0
- kumoai/_logging.py +29 -0
- kumoai/_singleton.py +25 -0
- kumoai/_version.py +1 -0
- kumoai/artifact_export/__init__.py +9 -0
- kumoai/artifact_export/config.py +209 -0
- kumoai/artifact_export/job.py +108 -0
- kumoai/client/__init__.py +5 -0
- kumoai/client/client.py +221 -0
- kumoai/client/connector.py +110 -0
- kumoai/client/endpoints.py +150 -0
- kumoai/client/graph.py +120 -0
- kumoai/client/jobs.py +447 -0
- kumoai/client/online.py +78 -0
- kumoai/client/pquery.py +203 -0
- kumoai/client/rfm.py +112 -0
- kumoai/client/source_table.py +53 -0
- kumoai/client/table.py +101 -0
- kumoai/client/utils.py +130 -0
- kumoai/codegen/__init__.py +19 -0
- kumoai/codegen/cli.py +100 -0
- kumoai/codegen/context.py +16 -0
- kumoai/codegen/edits.py +473 -0
- kumoai/codegen/exceptions.py +10 -0
- kumoai/codegen/generate.py +222 -0
- kumoai/codegen/handlers/__init__.py +4 -0
- kumoai/codegen/handlers/connector.py +118 -0
- kumoai/codegen/handlers/graph.py +71 -0
- kumoai/codegen/handlers/pquery.py +62 -0
- kumoai/codegen/handlers/table.py +109 -0
- kumoai/codegen/handlers/utils.py +42 -0
- kumoai/codegen/identity.py +114 -0
- kumoai/codegen/loader.py +93 -0
- kumoai/codegen/naming.py +94 -0
- kumoai/codegen/registry.py +121 -0
- kumoai/connector/__init__.py +31 -0
- kumoai/connector/base.py +153 -0
- kumoai/connector/bigquery_connector.py +200 -0
- kumoai/connector/databricks_connector.py +213 -0
- kumoai/connector/file_upload_connector.py +189 -0
- kumoai/connector/glue_connector.py +150 -0
- kumoai/connector/s3_connector.py +278 -0
- kumoai/connector/snowflake_connector.py +252 -0
- kumoai/connector/source_table.py +471 -0
- kumoai/connector/utils.py +1775 -0
- kumoai/databricks.py +14 -0
- kumoai/encoder/__init__.py +4 -0
- kumoai/exceptions.py +26 -0
- kumoai/experimental/__init__.py +0 -0
- kumoai/experimental/rfm/__init__.py +67 -0
- kumoai/experimental/rfm/authenticate.py +433 -0
- kumoai/experimental/rfm/infer/__init__.py +11 -0
- kumoai/experimental/rfm/infer/categorical.py +40 -0
- kumoai/experimental/rfm/infer/id.py +46 -0
- kumoai/experimental/rfm/infer/multicategorical.py +48 -0
- kumoai/experimental/rfm/infer/timestamp.py +41 -0
- kumoai/experimental/rfm/local_graph.py +810 -0
- kumoai/experimental/rfm/local_graph_sampler.py +184 -0
- kumoai/experimental/rfm/local_graph_store.py +359 -0
- kumoai/experimental/rfm/local_pquery_driver.py +689 -0
- kumoai/experimental/rfm/local_table.py +545 -0
- kumoai/experimental/rfm/pquery/__init__.py +7 -0
- kumoai/experimental/rfm/pquery/executor.py +102 -0
- kumoai/experimental/rfm/pquery/pandas_executor.py +532 -0
- kumoai/experimental/rfm/rfm.py +1130 -0
- kumoai/experimental/rfm/utils.py +344 -0
- kumoai/formatting.py +30 -0
- kumoai/futures.py +99 -0
- kumoai/graph/__init__.py +12 -0
- kumoai/graph/column.py +106 -0
- kumoai/graph/graph.py +948 -0
- kumoai/graph/table.py +838 -0
- kumoai/jobs.py +80 -0
- kumoai/kumolib.cpython-313-x86_64-linux-gnu.so +0 -0
- kumoai/mixin.py +28 -0
- kumoai/pquery/__init__.py +25 -0
- kumoai/pquery/prediction_table.py +287 -0
- kumoai/pquery/predictive_query.py +637 -0
- kumoai/pquery/training_table.py +424 -0
- kumoai/spcs.py +123 -0
- kumoai/testing/__init__.py +8 -0
- kumoai/testing/decorators.py +57 -0
- kumoai/trainer/__init__.py +42 -0
- kumoai/trainer/baseline_trainer.py +93 -0
- kumoai/trainer/config.py +2 -0
- kumoai/trainer/job.py +1192 -0
- kumoai/trainer/online_serving.py +258 -0
- kumoai/trainer/trainer.py +475 -0
- kumoai/trainer/util.py +103 -0
- kumoai/utils/__init__.py +10 -0
- kumoai/utils/datasets.py +83 -0
- kumoai/utils/forecasting.py +209 -0
- kumoai/utils/progress_logger.py +177 -0
- kumoai-2.13.0.dev202511131731.dist-info/METADATA +60 -0
- kumoai-2.13.0.dev202511131731.dist-info/RECORD +98 -0
- kumoai-2.13.0.dev202511131731.dist-info/WHEEL +6 -0
- kumoai-2.13.0.dev202511131731.dist-info/licenses/LICENSE +9 -0
- kumoai-2.13.0.dev202511131731.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from kumoapi.typing import Dtype, Stype
|
|
5
|
+
|
|
6
|
+
# Maximum number of distinct exploded values a column may have and still be
# reported as multi-categorical by `contains_multicategorical`.
MAX_CAT = 100
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def contains_multicategorical(
    ser: pd.Series,
    column_name: str,
    dtype: Dtype,
) -> bool:
    """Heuristically decide whether a column holds multi-categorical values.

    Only the first 500 rows (with missing values dropped) are inspected.

    Args:
        ser: The column values.
        column_name: The column name. Not used by this heuristic.
        dtype: The data type of the column.

    Returns:
        ``False`` if ``Stype.multicategorical`` does not support ``dtype``.
        ``True`` if ``dtype`` is ``Dtype.stringlist``. For ``Dtype.string``,
        the most frequent of ``;``, ``:``, ``|`` and tab is used to split the
        values; the result is ``True`` only if the number of distinct raw
        values exceeds 1.5 times the number of distinct split values and the
        latter does not exceed ``MAX_CAT``. For other list dtypes, only the
        ``MAX_CAT`` limit on distinct exploded values is checked.
    """
    if not Stype.multicategorical.supports_dtype(dtype):
        return False
    if dtype == Dtype.stringlist:
        return True

    sample = ser.iloc[:500].dropna()
    distinct_raw: int = 0

    if dtype == Dtype.string:
        sample = sample.astype(str)
        separators = {';', ':', '|', '\t'}

        # Tally separator characters over the whole sample. Keys are inserted
        # in order of first appearance, which decides ties in `max` below.
        tally: dict[str, int] = {}
        for ch in '\n'.join(sample):
            if ch in separators:
                tally[ch] = tally.get(ch, 0) + 1

        if not tally:
            return False

        # Must be measured before the values are split into lists.
        distinct_raw = sample.nunique()

        best_sep = max(tally, key=tally.__getitem__)
        sample = sample.str.split(best_sep)

    distinct_tokens = sample.explode().nunique()
    within_limit = distinct_tokens <= MAX_CAT

    if dtype.is_list():
        return within_limit

    return distinct_raw > 1.5 * distinct_tokens and within_limit
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import warnings
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from dateutil.parser import UnknownTimezoneWarning
|
|
6
|
+
from kumoapi.typing import Dtype, Stype
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def contains_timestamp(ser: pd.Series, column_name: str, dtype: Dtype) -> bool:
    """Heuristically decide whether a column holds timestamps.

    Only the first 100 rows are inspected, after dropping missing values and
    empty strings.

    Args:
        ser: The column values.
        column_name: The column name; a date/time-like name adds 0.3 to the
            score.
        dtype: The data type of the column.

    Returns:
        ``False`` if ``Stype.timestamp`` does not support ``dtype`` or if no
        values remain to inspect. ``True`` if ``dtype.is_timestamp()``.
        Otherwise ``True`` when the fraction of values parseable by
        ``pd.to_datetime`` plus the name bonus reaches at least 1.0.
    """
    if not Stype.timestamp.supports_dtype(dtype):
        return False
    if dtype.is_timestamp():
        return True

    name_hit = re.search(
        ('(^|_)(date|datetime|dt|time|timedate|timestamp|ts|'
         'created|updated)(_|$)'),
        column_name.lower(),
        re.IGNORECASE,
    )
    name_bonus = 0.0 if name_hit is None else 0.3

    sample = ser.iloc[:100].dropna()
    sample = sample[sample != '']
    if sample.empty:
        return False

    # Cast to strings so that numbers are not parsed as unix timestamps.
    sample = sample.astype(str)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UnknownTimezoneWarning)
        warnings.filterwarnings('ignore', message='Could not infer format')
        parsed = pd.to_datetime(sample, errors='coerce').notna()

    parsed_fraction = int(parsed.sum()) / len(parsed)
    return name_bonus + parsed_fraction >= 1.0
|