kumoai 2.14.0.dev202601011731__cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kumoai might be problematic.

Files changed (122)
  1. kumoai/__init__.py +300 -0
  2. kumoai/_logging.py +29 -0
  3. kumoai/_singleton.py +25 -0
  4. kumoai/_version.py +1 -0
  5. kumoai/artifact_export/__init__.py +9 -0
  6. kumoai/artifact_export/config.py +209 -0
  7. kumoai/artifact_export/job.py +108 -0
  8. kumoai/client/__init__.py +5 -0
  9. kumoai/client/client.py +223 -0
  10. kumoai/client/connector.py +110 -0
  11. kumoai/client/endpoints.py +150 -0
  12. kumoai/client/graph.py +120 -0
  13. kumoai/client/jobs.py +471 -0
  14. kumoai/client/online.py +78 -0
  15. kumoai/client/pquery.py +207 -0
  16. kumoai/client/rfm.py +112 -0
  17. kumoai/client/source_table.py +53 -0
  18. kumoai/client/table.py +101 -0
  19. kumoai/client/utils.py +130 -0
  20. kumoai/codegen/__init__.py +19 -0
  21. kumoai/codegen/cli.py +100 -0
  22. kumoai/codegen/context.py +16 -0
  23. kumoai/codegen/edits.py +473 -0
  24. kumoai/codegen/exceptions.py +10 -0
  25. kumoai/codegen/generate.py +222 -0
  26. kumoai/codegen/handlers/__init__.py +4 -0
  27. kumoai/codegen/handlers/connector.py +118 -0
  28. kumoai/codegen/handlers/graph.py +71 -0
  29. kumoai/codegen/handlers/pquery.py +62 -0
  30. kumoai/codegen/handlers/table.py +109 -0
  31. kumoai/codegen/handlers/utils.py +42 -0
  32. kumoai/codegen/identity.py +114 -0
  33. kumoai/codegen/loader.py +93 -0
  34. kumoai/codegen/naming.py +94 -0
  35. kumoai/codegen/registry.py +121 -0
  36. kumoai/connector/__init__.py +31 -0
  37. kumoai/connector/base.py +153 -0
  38. kumoai/connector/bigquery_connector.py +200 -0
  39. kumoai/connector/databricks_connector.py +213 -0
  40. kumoai/connector/file_upload_connector.py +189 -0
  41. kumoai/connector/glue_connector.py +150 -0
  42. kumoai/connector/s3_connector.py +278 -0
  43. kumoai/connector/snowflake_connector.py +252 -0
  44. kumoai/connector/source_table.py +471 -0
  45. kumoai/connector/utils.py +1796 -0
  46. kumoai/databricks.py +14 -0
  47. kumoai/encoder/__init__.py +4 -0
  48. kumoai/exceptions.py +26 -0
  49. kumoai/experimental/__init__.py +0 -0
  50. kumoai/experimental/rfm/__init__.py +210 -0
  51. kumoai/experimental/rfm/authenticate.py +432 -0
  52. kumoai/experimental/rfm/backend/__init__.py +0 -0
  53. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  54. kumoai/experimental/rfm/backend/local/graph_store.py +297 -0
  55. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  56. kumoai/experimental/rfm/backend/local/table.py +113 -0
  57. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  58. kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
  59. kumoai/experimental/rfm/backend/snow/table.py +242 -0
  60. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  61. kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
  62. kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
  63. kumoai/experimental/rfm/base/__init__.py +30 -0
  64. kumoai/experimental/rfm/base/column.py +152 -0
  65. kumoai/experimental/rfm/base/expression.py +44 -0
  66. kumoai/experimental/rfm/base/sampler.py +761 -0
  67. kumoai/experimental/rfm/base/source.py +19 -0
  68. kumoai/experimental/rfm/base/sql_sampler.py +143 -0
  69. kumoai/experimental/rfm/base/table.py +736 -0
  70. kumoai/experimental/rfm/graph.py +1237 -0
  71. kumoai/experimental/rfm/infer/__init__.py +19 -0
  72. kumoai/experimental/rfm/infer/categorical.py +40 -0
  73. kumoai/experimental/rfm/infer/dtype.py +82 -0
  74. kumoai/experimental/rfm/infer/id.py +46 -0
  75. kumoai/experimental/rfm/infer/multicategorical.py +48 -0
  76. kumoai/experimental/rfm/infer/pkey.py +128 -0
  77. kumoai/experimental/rfm/infer/stype.py +35 -0
  78. kumoai/experimental/rfm/infer/time_col.py +61 -0
  79. kumoai/experimental/rfm/infer/timestamp.py +41 -0
  80. kumoai/experimental/rfm/pquery/__init__.py +7 -0
  81. kumoai/experimental/rfm/pquery/executor.py +102 -0
  82. kumoai/experimental/rfm/pquery/pandas_executor.py +530 -0
  83. kumoai/experimental/rfm/relbench.py +76 -0
  84. kumoai/experimental/rfm/rfm.py +1184 -0
  85. kumoai/experimental/rfm/sagemaker.py +138 -0
  86. kumoai/experimental/rfm/task_table.py +231 -0
  87. kumoai/formatting.py +30 -0
  88. kumoai/futures.py +99 -0
  89. kumoai/graph/__init__.py +12 -0
  90. kumoai/graph/column.py +106 -0
  91. kumoai/graph/graph.py +948 -0
  92. kumoai/graph/table.py +838 -0
  93. kumoai/jobs.py +80 -0
  94. kumoai/kumolib.cpython-310-x86_64-linux-gnu.so +0 -0
  95. kumoai/mixin.py +28 -0
  96. kumoai/pquery/__init__.py +25 -0
  97. kumoai/pquery/prediction_table.py +287 -0
  98. kumoai/pquery/predictive_query.py +641 -0
  99. kumoai/pquery/training_table.py +424 -0
  100. kumoai/spcs.py +121 -0
  101. kumoai/testing/__init__.py +8 -0
  102. kumoai/testing/decorators.py +57 -0
  103. kumoai/testing/snow.py +50 -0
  104. kumoai/trainer/__init__.py +42 -0
  105. kumoai/trainer/baseline_trainer.py +93 -0
  106. kumoai/trainer/config.py +2 -0
  107. kumoai/trainer/distilled_trainer.py +175 -0
  108. kumoai/trainer/job.py +1192 -0
  109. kumoai/trainer/online_serving.py +258 -0
  110. kumoai/trainer/trainer.py +475 -0
  111. kumoai/trainer/util.py +103 -0
  112. kumoai/utils/__init__.py +11 -0
  113. kumoai/utils/datasets.py +83 -0
  114. kumoai/utils/display.py +51 -0
  115. kumoai/utils/forecasting.py +209 -0
  116. kumoai/utils/progress_logger.py +343 -0
  117. kumoai/utils/sql.py +3 -0
  118. kumoai-2.14.0.dev202601011731.dist-info/METADATA +71 -0
  119. kumoai-2.14.0.dev202601011731.dist-info/RECORD +122 -0
  120. kumoai-2.14.0.dev202601011731.dist-info/WHEEL +6 -0
  121. kumoai-2.14.0.dev202601011731.dist-info/licenses/LICENSE +9 -0
  122. kumoai-2.14.0.dev202601011731.dist-info/top_level.txt +1 -0
kumoai/experimental/rfm/infer/__init__.py
@@ -0,0 +1,19 @@
+ from .dtype import infer_dtype
+ from .id import contains_id
+ from .timestamp import contains_timestamp
+ from .categorical import contains_categorical
+ from .multicategorical import contains_multicategorical
+ from .stype import infer_stype
+ from .pkey import infer_primary_key
+ from .time_col import infer_time_column
+
+ __all__ = [
+     'infer_dtype',
+     'contains_id',
+     'contains_timestamp',
+     'contains_categorical',
+     'contains_multicategorical',
+     'infer_stype',
+     'infer_primary_key',
+     'infer_time_column',
+ ]
kumoai/experimental/rfm/infer/categorical.py
@@ -0,0 +1,40 @@
+ import re
+
+ import pandas as pd
+ from kumoapi.typing import Dtype, Stype
+
+
+ def contains_categorical(
+     ser: pd.Series,
+     column_name: str,
+     dtype: Dtype,
+ ) -> bool:
+
+     if not Stype.categorical.supports_dtype(dtype):
+         return False
+
+     if dtype == Dtype.bool:
+         return True
+
+     if dtype.is_numerical():
+         match = re.search(
+             (r'(^|_)(price|sales|amount|quantity|total|cost|score|rating|'
+              'avg|average|recency|age|num|pos|number|position)(_|$)'),
+             column_name,
+             re.IGNORECASE,
+         )
+         if match is not None:
+             return False
+
+     ser = ser.iloc[:1000]
+     ser = ser.dropna()
+
+     num_unique = ser.nunique()
+
+     if num_unique < 20:
+         return True
+
+     if dtype.is_string():
+         return num_unique / len(ser) <= 0.5
+
+     return num_unique / len(ser) <= 0.05
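A minimal usage sketch of the heuristic above (illustrative only; it assumes kumoai and kumoapi are installed and that Stype.categorical supports the dtypes shown; the sample data is made up):

    import pandas as pd
    from kumoapi.typing import Dtype
    from kumoai.experimental.rfm.infer import contains_categorical

    # Low-cardinality strings read as categorical (< 20 unique values):
    colors = pd.Series(['red', 'green', 'blue', 'red'] * 10)
    print(contains_categorical(colors, 'color', Dtype.string))  # expected: True

    # Numerical columns whose name hits the deny-list regex do not:
    prices = pd.Series([9.99, 19.99, 5.49, 12.00])
    print(contains_categorical(prices, 'unit_price', Dtype.float))  # expected: False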
kumoai/experimental/rfm/infer/dtype.py
@@ -0,0 +1,82 @@
+ import numpy as np
+ import pandas as pd
+ import pyarrow as pa
+ from kumoapi.typing import Dtype
+
+ PANDAS_TO_DTYPE: dict[str, Dtype] = {
+     'bool': Dtype.bool,
+     'boolean': Dtype.bool,
+     'int8': Dtype.int,
+     'int16': Dtype.int,
+     'int32': Dtype.int,
+     'int64': Dtype.int,
+     'float': Dtype.float,
+     'double': Dtype.float,
+     'float16': Dtype.float,
+     'float32': Dtype.float,
+     'float64': Dtype.float,
+     'object': Dtype.string,
+     'string': Dtype.string,
+     'string[python]': Dtype.string,
+     'string[pyarrow]': Dtype.string,
+     'binary': Dtype.binary,
+     'binary[python]': Dtype.binary,
+     'binary[pyarrow]': Dtype.binary,
+ }
+
+
+ def infer_dtype(ser: pd.Series) -> Dtype:
+     """Extracts the :class:`Dtype` from a :class:`pandas.Series`.
+
+     Args:
+         ser: A :class:`pandas.Series` to analyze.
+
+     Returns:
+         The data type.
+     """
+     if pd.api.types.is_datetime64_any_dtype(ser.dtype):
+         return Dtype.date
+     if pd.api.types.is_timedelta64_dtype(ser.dtype):
+         return Dtype.timedelta
+     if isinstance(ser.dtype, pd.CategoricalDtype):
+         return Dtype.string
+
+     if (pd.api.types.is_object_dtype(ser.dtype)
+             and not isinstance(ser.dtype, pd.ArrowDtype)):
+         index = ser.iloc[:1000].first_valid_index()
+         if index is not None and pd.api.types.is_list_like(ser[index]):
+             pos = ser.index.get_loc(index)
+             assert isinstance(pos, int)
+             ser = ser.iloc[pos:pos + 1000].dropna()
+             arr = pa.array(ser.tolist())
+             ser = pd.Series(arr, dtype=pd.ArrowDtype(arr.type))
+
+     if isinstance(ser.dtype, pd.ArrowDtype):
+         if (pa.types.is_list(ser.dtype.pyarrow_dtype)
+                 or pa.types.is_fixed_size_list(ser.dtype.pyarrow_dtype)):
+             elem_dtype = ser.dtype.pyarrow_dtype.value_type
+             if pa.types.is_integer(elem_dtype):
+                 return Dtype.intlist
+             if pa.types.is_floating(elem_dtype):
+                 return Dtype.floatlist
+             if pa.types.is_decimal(elem_dtype):
+                 return Dtype.floatlist
+             if pa.types.is_string(elem_dtype):
+                 return Dtype.stringlist
+             if pa.types.is_null(elem_dtype):
+                 return Dtype.floatlist
+
+     if isinstance(ser.dtype, np.dtype):
+         dtype_str = str(ser.dtype).lower()
+     elif isinstance(ser.dtype, pd.api.extensions.ExtensionDtype):
+         dtype_str = ser.dtype.name.lower()
+         dtype_str = dtype_str.split('[')[0]  # Remove backend metadata.
+     elif isinstance(ser.dtype, pa.DataType):
+         dtype_str = str(ser.dtype).lower()
+     else:
+         dtype_str = 'object'
+
+     if dtype_str not in PANDAS_TO_DTYPE:
+         raise ValueError(f"Unsupported data type '{ser.dtype}'")
+
+     return PANDAS_TO_DTYPE[dtype_str]
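For reference, a short sketch of how infer_dtype maps common pandas dtypes (illustrative; the expected results follow from the table and branches above):

    import pandas as pd
    from kumoai.experimental.rfm.infer import infer_dtype

    print(infer_dtype(pd.Series([1, 2, 3])))      # expected: Dtype.int
    print(infer_dtype(pd.Series(['a', 'b'])))     # expected: Dtype.string
    print(infer_dtype(pd.Series([[1, 2], [3]])))  # expected: Dtype.intlist (object -> pyarrow list)
    print(infer_dtype(pd.to_datetime(pd.Series(['2024-01-01']))))  # expected: Dtype.date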
kumoai/experimental/rfm/infer/id.py
@@ -0,0 +1,46 @@
+ import re
+
+ import pandas as pd
+ from kumoapi.typing import Dtype, Stype
+
+ # Column name suffixes that end in "id" but should not be given the ID stype.
+ _IGNORED_ID_SUFFIXES = [
+     'bid',
+     'acid',
+     'grid',
+     'maid',
+     'paid',
+     'raid',
+     'void',
+     'avoid',
+     'braid',
+     'covid',
+     'fluid',
+     'rabid',
+     'solid',
+     'hybrid',
+     'inlaid',
+     'liquid',
+ ]
+
+
+ def contains_id(ser: pd.Series, column_name: str, dtype: Dtype) -> bool:
+     if not Stype.ID.supports_dtype(dtype):
+         return False
+
+     column_name = column_name.lower()
+
+     match = re.search(
+         r'(^|_)(id|hash|key|code|uuid)(_|$)',
+         column_name,
+         re.IGNORECASE,
+     )
+     if match is not None:
+         return True
+
+     if not column_name.endswith('id'):
+         return False
+     for suffix in _IGNORED_ID_SUFFIXES:
+         if column_name.endswith(suffix):
+             return False
+     return True
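A usage sketch of the rules above (illustrative; it assumes Stype.ID supports Dtype.int):

    import pandas as pd
    from kumoapi.typing import Dtype
    from kumoai.experimental.rfm.infer import contains_id

    ser = pd.Series([101, 102, 103])
    print(contains_id(ser, 'user_id', Dtype.int))  # expected: True ('_id' token)
    print(contains_id(ser, 'paid', Dtype.int))     # expected: False (ignored suffix)
    print(contains_id(ser, 'age', Dtype.int))      # expected: False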
kumoai/experimental/rfm/infer/multicategorical.py
@@ -0,0 +1,48 @@
+ from collections import defaultdict
+
+ import pandas as pd
+ from kumoapi.typing import Dtype, Stype
+
+ MAX_CAT = 100
+
+
+ def contains_multicategorical(
+     ser: pd.Series,
+     column_name: str,
+     dtype: Dtype,
+ ) -> bool:
+
+     if not Stype.multicategorical.supports_dtype(dtype):
+         return False
+
+     if dtype == Dtype.stringlist:
+         return True
+
+     ser = ser.iloc[:500]
+     ser = ser.dropna()
+
+     num_unique: int = 0
+     if dtype == Dtype.string:
+         ser = ser.astype(str)
+         text = '\n'.join(ser)
+
+         white_list = {';', ':', '|', '\t'}
+         candidates: dict[str, int] = defaultdict(int)
+         for char in text:
+             if char in white_list:
+                 candidates[char] += 1
+
+         if len(candidates) == 0:
+             return False
+
+         num_unique = ser.nunique()
+
+         sep = max(candidates, key=candidates.get)  # type: ignore
+         ser = ser.str.split(sep)
+
+     num_unique_multi = ser.astype('object').explode().nunique()
+
+     if dtype.is_list():
+         return num_unique_multi <= MAX_CAT
+
+     return num_unique > 1.5 * num_unique_multi and num_unique_multi <= MAX_CAT
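A usage sketch of the separator heuristic above (illustrative; made-up data with '|'-separated tags, where unique raw strings outnumber unique tokens by more than 1.5x):

    import pandas as pd
    from kumoapi.typing import Dtype
    from kumoai.experimental.rfm.infer import contains_multicategorical

    genres = pd.Series(['rock|pop', 'pop|jazz', 'rock|jazz',
                        'rock|pop|jazz', 'jazz|pop', 'rock'])
    # 6 unique raw strings vs. 3 unique tokens -> multicategorical:
    print(contains_multicategorical(genres, 'genres', Dtype.string))  # expected: True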
kumoai/experimental/rfm/infer/pkey.py
@@ -0,0 +1,128 @@
+ import re
+ import warnings
+
+ import pandas as pd
+
+
+ def infer_primary_key(
+     table_name: str,
+     df: pd.DataFrame,
+     candidates: list[str],
+ ) -> str | None:
+     r"""Auto-detect potential primary key column.
+
+     Args:
+         table_name: The table name.
+         df: The pandas DataFrame to analyze.
+         candidates: A list of potential candidates.
+
+     Returns:
+         The name of the detected primary key, or ``None`` if not found.
+     """
+     if len(candidates) == 0:
+         return None
+
+     # A list of (potentially modified) table names that are eligible to match
+     # with a primary key, i.e.:
+     # - UserInfo -> User
+     # - snakecase <-> camelcase
+     # - camelcase <-> snakecase
+     # - plural <-> singular (users -> user, eligibilities -> eligibility)
+     # - verb -> noun (qualifying -> qualify)
+     _table_names = {table_name}
+     if table_name.lower().endswith('_info'):
+         _table_names.add(table_name[:-5])
+     elif table_name.lower().endswith('info'):
+         _table_names.add(table_name[:-4])
+
+     table_names = set()
+     for _table_name in _table_names:
+         table_names.add(_table_name.lower())
+         snakecase = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', _table_name)
+         snakecase = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', snakecase)
+         table_names.add(snakecase.lower())
+         camelcase = _table_name.replace('_', '')
+         table_names.add(camelcase.lower())
+         if _table_name.lower().endswith('s'):
+             table_names.add(_table_name.lower()[:-1])
+             table_names.add(snakecase.lower()[:-1])
+             table_names.add(camelcase.lower()[:-1])
+         else:
+             table_names.add(_table_name.lower() + 's')
+             table_names.add(snakecase.lower() + 's')
+             table_names.add(camelcase.lower() + 's')
+         if _table_name.lower().endswith('ies'):
+             table_names.add(_table_name.lower()[:-3] + 'y')
+             table_names.add(snakecase.lower()[:-3] + 'y')
+             table_names.add(camelcase.lower()[:-3] + 'y')
+         elif _table_name.lower().endswith('y'):
+             table_names.add(_table_name.lower()[:-1] + 'ies')
+             table_names.add(snakecase.lower()[:-1] + 'ies')
+             table_names.add(camelcase.lower()[:-1] + 'ies')
+         if _table_name.lower().endswith('ing'):
+             table_names.add(_table_name.lower()[:-3])
+             table_names.add(snakecase.lower()[:-3])
+             table_names.add(camelcase.lower()[:-3])
+
+     scores: list[tuple[str, float]] = []
+     for col_name in candidates:
+         col_name_lower = col_name.lower()
+
+         score = 0.0
+
+         if col_name_lower == 'id':
+             score += 4
+
+         for table_name_lower in table_names:
+
+             if col_name_lower == table_name_lower:
+                 score += 4  # USER -> USER
+                 break
+
+             for suffix in ['id', 'hash', 'key', 'code', 'uuid']:
+                 if not col_name_lower.endswith(suffix):
+                     continue
+
+                 if col_name_lower == f'{table_name_lower}_{suffix}':
+                     score += 5  # USER -> USER_ID
+                     break
+
+                 if col_name_lower == f'{table_name_lower}{suffix}':
+                     score += 5  # User -> UserId
+                     break
+
+                 if col_name_lower.endswith(f'{table_name_lower}_{suffix}'):
+                     score += 2
+
+                 if col_name_lower.endswith(f'{table_name_lower}{suffix}'):
+                     score += 2
+
+         # `rel-bench` hard-coding :(
+         if table_name == 'studies' and col_name == 'nct_id':
+             score += 1
+
+         ser = df[col_name].iloc[:1_000_000]
+         score += 3 * (ser.nunique() / len(ser))
+
+         scores.append((col_name, score))
+
+     scores = [x for x in scores if x[-1] >= 4]
+     scores.sort(key=lambda x: x[-1], reverse=True)
+
+     if len(scores) == 0:
+         return None
+
+     if len(scores) == 1:
+         return scores[0][0]
+
+     # In case of multiple candidates, only return one if its score is unique:
+     if scores[0][1] != scores[1][1]:
+         return scores[0][0]
+
+     max_score = scores[0][1]
+     candidates = [col_name for col_name, score in scores if score == max_score]
+     warnings.warn(f"Found multiple potential primary keys in table "
+                   f"'{table_name}': {candidates}. Please specify the primary "
+                   f"key for this table manually.")
+
+     return None
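A usage sketch of the scoring above (illustrative; made-up table):

    import pandas as pd
    from kumoai.experimental.rfm.infer import infer_primary_key

    df = pd.DataFrame({
        'user_id': [1, 2, 3, 4],   # exact '<singular table>_id' match, fully unique
        'group_id': [1, 1, 2, 2],  # no name match, low uniqueness
    })
    print(infer_primary_key('users', df, ['user_id', 'group_id']))
    # expected: 'user_id' (scores ~8.0 vs. ~1.5; the latter falls below the 4.0 cut-off)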
kumoai/experimental/rfm/infer/stype.py
@@ -0,0 +1,35 @@
+ import pandas as pd
+ from kumoapi.typing import Dtype, Stype
+
+ from kumoai.experimental.rfm.infer import (
+     contains_categorical,
+     contains_id,
+     contains_multicategorical,
+     contains_timestamp,
+ )
+
+
+ def infer_stype(ser: pd.Series, column_name: str, dtype: Dtype) -> Stype:
+     """Infers the :class:`Stype` from a :class:`pandas.Series`.
+
+     Args:
+         ser: A :class:`pandas.Series` to analyze.
+         column_name: The column name.
+         dtype: The data type.
+
+     Returns:
+         The semantic type.
+     """
+     if contains_id(ser, column_name, dtype):
+         return Stype.ID
+
+     if contains_timestamp(ser, column_name, dtype):
+         return Stype.timestamp
+
+     if contains_multicategorical(ser, column_name, dtype):
+         return Stype.multicategorical
+
+     if contains_categorical(ser, column_name, dtype):
+         return Stype.categorical
+
+     return dtype.default_stype
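A usage sketch showing the precedence of the checks above (illustrative; assumes Stype.timestamp supports Dtype.string):

    import pandas as pd
    from kumoapi.typing import Dtype
    from kumoai.experimental.rfm.infer import infer_stype

    ser = pd.Series(['2024-01-01', '2024-02-15', '2024-03-30'])
    # The name matches 'created' and all values parse as dates, so the
    # timestamp check wins before the (multi)categorical checks run:
    print(infer_stype(ser, 'created_at', Dtype.string))  # expected: Stype.timestamp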
kumoai/experimental/rfm/infer/time_col.py
@@ -0,0 +1,61 @@
+ import re
+ import warnings
+
+ import pandas as pd
+
+
+ def infer_time_column(
+     df: pd.DataFrame,
+     candidates: list[str],
+ ) -> str | None:
+     r"""Auto-detect potential time column.
+
+     Args:
+         df: The pandas DataFrame to analyze.
+         candidates: A list of potential candidates.
+
+     Returns:
+         The name of the detected time column, or ``None`` if not found.
+     """
+     candidates = [  # Exclude all candidates with `*last*` in column names:
+         col_name for col_name in candidates
+         if not re.search(r'(^|_)last(_|$)', col_name, re.IGNORECASE)
+     ]
+
+     if len(candidates) == 0:
+         return None
+
+     if len(candidates) == 1:
+         return candidates[0]
+
+     # If there exists a dedicated `create*` column, use it as time column:
+     create_candidates = [
+         candidate for candidate in candidates
+         if candidate.lower().startswith('create')
+     ]
+     if len(create_candidates) == 1:
+         return create_candidates[0]
+     if len(create_candidates) > 1:
+         candidates = create_candidates
+
+     # Find the best time column. Usually, it is the one pointing to the
+     # oldest timestamps:
+     with warnings.catch_warnings():
+         warnings.filterwarnings('ignore', message='Could not infer format')
+         min_timestamp_dict = {
+             key: pd.to_datetime(df[key].iloc[:10_000], errors='coerce')
+             for key in candidates
+         }
+         min_timestamp_dict = {
+             key: value.min().tz_localize(None)
+             for key, value in min_timestamp_dict.items()
+         }
+         min_timestamp_dict = {
+             key: value
+             for key, value in min_timestamp_dict.items() if not pd.isna(value)
+         }
+
+     if len(min_timestamp_dict) == 0:
+         return None
+
+     return min(min_timestamp_dict, key=min_timestamp_dict.get)  # type: ignore
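A usage sketch of the selection order above (illustrative; made-up table):

    import pandas as pd
    from kumoai.experimental.rfm.infer import infer_time_column

    df = pd.DataFrame({
        'created_at': ['2020-01-01', '2020-06-01'],
        'updated_at': ['2021-01-01', '2021-06-01'],
        'last_login': ['2022-01-01', '2022-06-01'],
    })
    print(infer_time_column(df, list(df.columns)))
    # expected: 'created_at' ('last_login' is filtered out; a single
    # 'create*' candidate short-circuits the oldest-timestamp comparison)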
kumoai/experimental/rfm/infer/timestamp.py
@@ -0,0 +1,41 @@
+ import re
+ import warnings
+
+ import pandas as pd
+ from dateutil.parser import UnknownTimezoneWarning
+ from kumoapi.typing import Dtype, Stype
+
+
+ def contains_timestamp(ser: pd.Series, column_name: str, dtype: Dtype) -> bool:
+     if not Stype.timestamp.supports_dtype(dtype):
+         return False
+
+     if dtype.is_timestamp():
+         return True
+
+     column_name = column_name.lower()
+
+     match = re.search(
+         ('(^|_)(date|datetime|dt|time|timedate|timestamp|ts|'
+          'created|updated)(_|$)'),
+         column_name,
+         re.IGNORECASE,
+     )
+     score = 0.3 if match is not None else 0.0
+
+     ser = ser.iloc[:100]
+     ser = ser.dropna()
+     ser = ser[ser != '']
+
+     if len(ser) == 0:
+         return False
+
+     ser = ser.astype(str)  # Avoid parsing numbers as unix timestamps.
+
+     with warnings.catch_warnings():
+         warnings.simplefilter('ignore', UnknownTimezoneWarning)
+         warnings.filterwarnings('ignore', message='Could not infer format')
+         mask = pd.to_datetime(ser, errors='coerce').notna()
+         score += int(mask.sum()) / len(mask)
+
+     return score >= 1.0
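A usage sketch of the combined name/parse scoring above (illustrative; assumes Stype.timestamp supports Dtype.string):

    import pandas as pd
    from kumoapi.typing import Dtype
    from kumoai.experimental.rfm.infer import contains_timestamp

    ts = pd.Series(['2024-01-01 10:00', '2024-01-02 11:30'])
    print(contains_timestamp(ts, 'event_time', Dtype.string))  # expected: True (0.3 + 1.0)

    notes = pd.Series(['abc', 'def'])
    print(contains_timestamp(notes, 'note', Dtype.string))     # expected: False (0.0)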
kumoai/experimental/rfm/pquery/__init__.py
@@ -0,0 +1,7 @@
+ from .executor import PQueryExecutor
+ from .pandas_executor import PQueryPandasExecutor
+
+ __all__ = [
+     'PQueryExecutor',
+     'PQueryPandasExecutor',
+ ]
kumoai/experimental/rfm/pquery/executor.py
@@ -0,0 +1,102 @@
+ from abc import ABC, abstractmethod
+ from typing import Generic, TypeVar
+
+ from kumoapi.pquery import ValidatedPredictiveQuery
+ from kumoapi.pquery.AST import (
+     Aggregation,
+     Column,
+     Condition,
+     Filter,
+     Join,
+     LogicalOperation,
+ )
+
+ TableData = TypeVar('TableData')
+ ColumnData = TypeVar('ColumnData')
+ IndexData = TypeVar('IndexData')
+
+
+ class PQueryExecutor(Generic[TableData, ColumnData, IndexData], ABC):
+     @abstractmethod
+     def execute_column(
+         self,
+         column: Column,
+         feat_dict: dict[str, TableData],
+         filter_na: bool = True,
+     ) -> tuple[ColumnData, IndexData]:
+         pass
+
+     @abstractmethod
+     def execute_aggregation(
+         self,
+         aggr: Aggregation,
+         feat_dict: dict[str, TableData],
+         time_dict: dict[str, ColumnData],
+         batch_dict: dict[str, IndexData],
+         anchor_time: ColumnData,
+         filter_na: bool = True,
+         num_forecasts: int = 1,
+     ) -> tuple[ColumnData, IndexData]:
+         pass
+
+     @abstractmethod
+     def execute_condition(
+         self,
+         condition: Condition,
+         feat_dict: dict[str, TableData],
+         time_dict: dict[str, ColumnData],
+         batch_dict: dict[str, IndexData],
+         anchor_time: ColumnData,
+         filter_na: bool = True,
+         num_forecasts: int = 1,
+     ) -> tuple[ColumnData, IndexData]:
+         pass
+
+     @abstractmethod
+     def execute_logical_operation(
+         self,
+         logical_operation: LogicalOperation,
+         feat_dict: dict[str, TableData],
+         time_dict: dict[str, ColumnData],
+         batch_dict: dict[str, IndexData],
+         anchor_time: ColumnData,
+         filter_na: bool = True,
+         num_forecasts: int = 1,
+     ) -> tuple[ColumnData, IndexData]:
+         pass
+
+     @abstractmethod
+     def execute_join(
+         self,
+         join: Join,
+         feat_dict: dict[str, TableData],
+         time_dict: dict[str, ColumnData],
+         batch_dict: dict[str, IndexData],
+         anchor_time: ColumnData,
+         filter_na: bool = True,
+         num_forecasts: int = 1,
+     ) -> tuple[ColumnData, IndexData]:
+         pass
+
+     @abstractmethod
+     def execute_filter(
+         self,
+         filter: Filter,
+         feat_dict: dict[str, TableData],
+         time_dict: dict[str, ColumnData],
+         batch_dict: dict[str, IndexData],
+         anchor_time: ColumnData,
+     ) -> tuple[ColumnData, IndexData]:
+         pass
+
+     @abstractmethod
+     def execute(
+         self,
+         query: ValidatedPredictiveQuery,
+         feat_dict: dict[str, TableData],
+         time_dict: dict[str, ColumnData],
+         batch_dict: dict[str, IndexData],
+         anchor_time: ColumnData,
+         num_forecasts: int = 1,
+     ) -> tuple[ColumnData, IndexData]:
+         pass
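PQueryExecutor is generic over three container types: tables (TableData), columns (ColumnData), and row indices (IndexData). A concrete backend binds all three; a hedged sketch of the shape the pandas binding plausibly takes (the real PQueryPandasExecutor lives in pandas_executor.py and is not reproduced here; the class below is hypothetical and elides method bodies):

    import pandas as pd
    from kumoai.experimental.rfm.pquery import PQueryExecutor

    class SketchPandasExecutor(PQueryExecutor[pd.DataFrame, pd.Series, pd.Index]):
        # Every @abstractmethod of PQueryExecutor must be overridden before
        # the class can be instantiated; only one signature is sketched here.
        def execute_column(self, column, feat_dict, filter_na=True):
            ...  # look up the column's Series in feat_dict; drop NAs if filter_na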