kumoai 2.14.0.dev202601011731__cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kumoai might be problematic. Click here for more details.
- kumoai/__init__.py +300 -0
- kumoai/_logging.py +29 -0
- kumoai/_singleton.py +25 -0
- kumoai/_version.py +1 -0
- kumoai/artifact_export/__init__.py +9 -0
- kumoai/artifact_export/config.py +209 -0
- kumoai/artifact_export/job.py +108 -0
- kumoai/client/__init__.py +5 -0
- kumoai/client/client.py +223 -0
- kumoai/client/connector.py +110 -0
- kumoai/client/endpoints.py +150 -0
- kumoai/client/graph.py +120 -0
- kumoai/client/jobs.py +471 -0
- kumoai/client/online.py +78 -0
- kumoai/client/pquery.py +207 -0
- kumoai/client/rfm.py +112 -0
- kumoai/client/source_table.py +53 -0
- kumoai/client/table.py +101 -0
- kumoai/client/utils.py +130 -0
- kumoai/codegen/__init__.py +19 -0
- kumoai/codegen/cli.py +100 -0
- kumoai/codegen/context.py +16 -0
- kumoai/codegen/edits.py +473 -0
- kumoai/codegen/exceptions.py +10 -0
- kumoai/codegen/generate.py +222 -0
- kumoai/codegen/handlers/__init__.py +4 -0
- kumoai/codegen/handlers/connector.py +118 -0
- kumoai/codegen/handlers/graph.py +71 -0
- kumoai/codegen/handlers/pquery.py +62 -0
- kumoai/codegen/handlers/table.py +109 -0
- kumoai/codegen/handlers/utils.py +42 -0
- kumoai/codegen/identity.py +114 -0
- kumoai/codegen/loader.py +93 -0
- kumoai/codegen/naming.py +94 -0
- kumoai/codegen/registry.py +121 -0
- kumoai/connector/__init__.py +31 -0
- kumoai/connector/base.py +153 -0
- kumoai/connector/bigquery_connector.py +200 -0
- kumoai/connector/databricks_connector.py +213 -0
- kumoai/connector/file_upload_connector.py +189 -0
- kumoai/connector/glue_connector.py +150 -0
- kumoai/connector/s3_connector.py +278 -0
- kumoai/connector/snowflake_connector.py +252 -0
- kumoai/connector/source_table.py +471 -0
- kumoai/connector/utils.py +1796 -0
- kumoai/databricks.py +14 -0
- kumoai/encoder/__init__.py +4 -0
- kumoai/exceptions.py +26 -0
- kumoai/experimental/__init__.py +0 -0
- kumoai/experimental/rfm/__init__.py +210 -0
- kumoai/experimental/rfm/authenticate.py +432 -0
- kumoai/experimental/rfm/backend/__init__.py +0 -0
- kumoai/experimental/rfm/backend/local/__init__.py +42 -0
- kumoai/experimental/rfm/backend/local/graph_store.py +297 -0
- kumoai/experimental/rfm/backend/local/sampler.py +312 -0
- kumoai/experimental/rfm/backend/local/table.py +113 -0
- kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
- kumoai/experimental/rfm/backend/snow/table.py +242 -0
- kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
- kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
- kumoai/experimental/rfm/base/__init__.py +30 -0
- kumoai/experimental/rfm/base/column.py +152 -0
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/sampler.py +761 -0
- kumoai/experimental/rfm/base/source.py +19 -0
- kumoai/experimental/rfm/base/sql_sampler.py +143 -0
- kumoai/experimental/rfm/base/table.py +736 -0
- kumoai/experimental/rfm/graph.py +1237 -0
- kumoai/experimental/rfm/infer/__init__.py +19 -0
- kumoai/experimental/rfm/infer/categorical.py +40 -0
- kumoai/experimental/rfm/infer/dtype.py +82 -0
- kumoai/experimental/rfm/infer/id.py +46 -0
- kumoai/experimental/rfm/infer/multicategorical.py +48 -0
- kumoai/experimental/rfm/infer/pkey.py +128 -0
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +61 -0
- kumoai/experimental/rfm/infer/timestamp.py +41 -0
- kumoai/experimental/rfm/pquery/__init__.py +7 -0
- kumoai/experimental/rfm/pquery/executor.py +102 -0
- kumoai/experimental/rfm/pquery/pandas_executor.py +530 -0
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +1184 -0
- kumoai/experimental/rfm/sagemaker.py +138 -0
- kumoai/experimental/rfm/task_table.py +231 -0
- kumoai/formatting.py +30 -0
- kumoai/futures.py +99 -0
- kumoai/graph/__init__.py +12 -0
- kumoai/graph/column.py +106 -0
- kumoai/graph/graph.py +948 -0
- kumoai/graph/table.py +838 -0
- kumoai/jobs.py +80 -0
- kumoai/kumolib.cpython-310-x86_64-linux-gnu.so +0 -0
- kumoai/mixin.py +28 -0
- kumoai/pquery/__init__.py +25 -0
- kumoai/pquery/prediction_table.py +287 -0
- kumoai/pquery/predictive_query.py +641 -0
- kumoai/pquery/training_table.py +424 -0
- kumoai/spcs.py +121 -0
- kumoai/testing/__init__.py +8 -0
- kumoai/testing/decorators.py +57 -0
- kumoai/testing/snow.py +50 -0
- kumoai/trainer/__init__.py +42 -0
- kumoai/trainer/baseline_trainer.py +93 -0
- kumoai/trainer/config.py +2 -0
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/trainer/job.py +1192 -0
- kumoai/trainer/online_serving.py +258 -0
- kumoai/trainer/trainer.py +475 -0
- kumoai/trainer/util.py +103 -0
- kumoai/utils/__init__.py +11 -0
- kumoai/utils/datasets.py +83 -0
- kumoai/utils/display.py +51 -0
- kumoai/utils/forecasting.py +209 -0
- kumoai/utils/progress_logger.py +343 -0
- kumoai/utils/sql.py +3 -0
- kumoai-2.14.0.dev202601011731.dist-info/METADATA +71 -0
- kumoai-2.14.0.dev202601011731.dist-info/RECORD +122 -0
- kumoai-2.14.0.dev202601011731.dist-info/WHEEL +6 -0
- kumoai-2.14.0.dev202601011731.dist-info/licenses/LICENSE +9 -0
- kumoai-2.14.0.dev202601011731.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Configuration-based identity system for codegen deduplication."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from kumoai.connector import (
|
|
10
|
+
BigQueryConnector,
|
|
11
|
+
DatabricksConnector,
|
|
12
|
+
FileUploadConnector,
|
|
13
|
+
S3Connector,
|
|
14
|
+
SnowflakeConnector,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_config_id(obj: Any) -> str:
    """Compute the identity key used to deduplicate objects during codegen.

    Two objects that yield the same key are treated as the same entity, so
    the generated code can reuse a single variable for both.

    Only objects that are always safe to share when their configuration
    matches get a configuration-based key. Connectors are such objects: for
    example, two ``S3Connector`` instances with the same ``root_dir`` and
    name can always be shared, no matter which other object refers to them.

    Every other object (for example a Graph, where a caller may legitimately
    want two independent copies with identical tables and edges) is keyed by
    its memory address instead, so it is never merged with a look-alike.

    Args:
        obj: Object to compute an identity key for.

    Returns:
        The result of ``_get_generic_config_id(obj)`` for the connector
        types checked below, otherwise the string ``"id_<id(obj)>"``.

    Note:
        - For deduplication only, not cycle detection (use id() for cycles)
        - Only applied to connector types for now; other objects use memory ID
        - NOTE(review): ``GlueConnector`` is not part of the checked types;
          confirm whether that is intentional.
    """
    if not isinstance(obj, (S3Connector, BigQueryConnector,
                            SnowflakeConnector, DatabricksConnector,
                            FileUploadConnector)):
        return f"id_{id(obj)}"
    return _get_generic_config_id(obj)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _get_generic_config_id(obj: Any) -> str:
    """Build a config ID from the object's type name and editable attributes.

    The attribute names reported by ``get_editable_attributes(obj)`` are
    read from the object, serialized with ``_serialize_value`` and hashed
    with SHA256; the first 16 hex characters of the digest are used.

    Args:
        obj: Object to derive the ID from.

    Returns:
        ``"<TypeName>_<16 hex chars>"`` on success. If anything goes wrong
        while hashing, ``"<TypeName>_<id(obj)>"`` is returned instead.
    """
    type_name = type(obj).__name__
    try:
        # Import here to avoid circular imports
        from kumoai.codegen.edits import get_editable_attributes

        attributes: dict[str, Any] = {}
        # Sorted so the resulting hash does not depend on attribute order.
        for attr_name in sorted(get_editable_attributes(obj)):
            try:
                attributes[attr_name] = _serialize_value(
                    getattr(obj, attr_name))
            except (AttributeError, RuntimeError, TypeError):
                # Attributes that cannot be read are left out of the hash.
                continue

        payload = json.dumps({'type': type_name, 'attributes': attributes},
                             sort_keys=True)
        digest = hashlib.sha256(payload.encode()).hexdigest()[:16]
        return f"{type_name}_{digest}"
    except Exception:
        # Best effort: fall back to the memory address if hashing fails.
        return f"{type_name}_{id(obj)}"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _serialize_value(value: Any) -> Any:
    """Convert a value to a JSON-serializable form for consistent hashing.

    Args:
        value: Attribute value to convert.

    Returns:
        - ``None`` and ``str``/``int``/``float``/``bool`` values unchanged.
        - A list for lists and tuples, with every item converted.
        - A deterministically ordered list for sets and frozensets.
        - A dict with converted values for dicts (keys are kept as-is).
        - ``get_config_id(value)`` for any other object with a ``__dict__``.
        - ``str(value)`` for everything else.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, (list, tuple)):
        return [_serialize_value(item) for item in value]
    if isinstance(value, (set, frozenset)):
        # Sets iterate in hash order, which differs between interpreter runs
        # for strings. Sorting on repr() gives a stable order even when the
        # converted items are of mixed types.
        return sorted((_serialize_value(item) for item in value), key=repr)
    if isinstance(value, dict):
        return {k: _serialize_value(v) for k, v in value.items()}
    if hasattr(value, '__dict__'):
        # For objects with __dict__, recurse into their config_id
        return get_config_id(value)
    # For other types, convert to string
    return str(value)
|
kumoai/codegen/loader.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import kumoai as kumo
|
|
4
|
+
from kumoai.exceptions import HTTPException
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _get_supported_entities() -> dict[str, type]:
    """Return a ``class name -> class`` mapping for every registered handler.

    The registry is imported inside the function rather than at module
    level.
    """
    from kumoai.codegen.registry import REG

    supported: dict[str, type] = {}
    for entity_cls in REG:
        supported[entity_cls.__name__] = entity_cls
    return supported
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# This map is for converting string names to Python types.
# The keys should be the string a user would provide (the class name, as
# produced by `_get_supported_entities`).
# Auto-generated from the handlers registry at import time, so it also serves
# as the filter of supported entities.
ENTITY_TYPE_MAP = _get_supported_entities()

# This map contains ID prefixes used to infer an entity's type from its ID
# when no explicit class name is given.
ENTITY_PREFIX_MAP = {
    # Job Types
    "gen-traintable-job": kumo.TrainingTable,
    "trainingjob": kumo.TrainingJob,
    "bp-job": kumo.BatchPredictionJob,
    # Query Types
    "pquery": kumo.PredictiveQuery,
    # Note: Table IDs don't have prefixes, so they require explicit
    # --entity-class
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _load_with_class(entity_id: str, entity_class: type) -> object:
    """Load an entity whose class is already known.

    The first available strategy is used, in this order:
    ``entity_class.get_by_name(entity_id)``, ``entity_class.load(entity_id)``,
    then the constructor ``entity_class(entity_id)``.

    Args:
        entity_id: Identifier (or name) of the entity to load.
        entity_class: Class to load the entity with.

    Returns:
        Whatever the selected loading strategy returns.

    Raises:
        NotImplementedError: If the class has neither ``get_by_name`` nor
            ``load`` and calling the constructor with the ID raises
            ``TypeError`` or ``AttributeError``. The original error is
            attached as ``__cause__``.
    """
    if hasattr(entity_class, "get_by_name"):
        return entity_class.get_by_name(entity_id)
    if hasattr(entity_class, "load"):
        return entity_class.load(entity_id)

    try:
        # For jobs like TrainingJob, BatchPredictionJob,
        # FileUploadConnector
        return entity_class(entity_id)
    except (TypeError, AttributeError) as exc:
        # Chain the constructor failure so the real reason is not lost.
        raise NotImplementedError(
            f"Don't know how to load object of type {entity_class.__name__}"
        ) from exc
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def load_from_id(
    entity_id: str,
    entity_class_str: str | None = None,
) -> object:
    """Load a Kumo object as an SDK object.

    - If entity_class_str is provided, it's used to find the type in
      ``ENTITY_TYPE_MAP``.
    - If not, the type is inferred from the ID prefix using
      ``ENTITY_PREFIX_MAP``. A prefix matches when the lower-cased ID is
      equal to it or starts with ``"<prefix>-"``; the longest matching
      prefix wins.

    Args:
        entity_id: ID (or name) of the entity to load.
        entity_class_str: Optional class name of the entity.

    Returns:
        The object returned by ``_load_with_class``.

    Raises:
        ValueError: If the class name is unknown, the type cannot be
            inferred from the ID, or loading fails for any reason. The
            underlying error is attached as ``__cause__``.
    """
    try:
        # Scenario A: Explicit class string provided
        if entity_class_str:
            if entity_class_str not in ENTITY_TYPE_MAP:
                raise ValueError(f"Unknown entity_class '{entity_class_str}'. "
                                 f"Supported types are: "
                                 f"{', '.join(ENTITY_TYPE_MAP.keys())}")
            entity_class = ENTITY_TYPE_MAP[entity_class_str]
            return _load_with_class(entity_id, entity_class)

        # Scenario B: No class string provided, so infer from ID prefix.
        # Some prefixes contain hyphens themselves (e.g. "bp-job"), so the
        # ID cannot simply be split on its first "-".
        lowered_id = entity_id.lower()
        matching_prefixes = [
            prefix for prefix in ENTITY_PREFIX_MAP
            if lowered_id == prefix or lowered_id.startswith(f"{prefix}-")
        ]
        if matching_prefixes:
            inferred_class = ENTITY_PREFIX_MAP[max(matching_prefixes, key=len)]
            return _load_with_class(entity_id, inferred_class)

        raise ValueError(
            f"Could not infer entity type from ID '{entity_id}'. "
            "For an entity with a non-prefixed ID "
            "(like a Connector, Graph, or Table), "
            "please provide the 'entity_class' parameter. "
            "Supported prefixes are: " +
            ", ".join(ENTITY_PREFIX_MAP.keys()) +
            "\n and supported classes are: " +
            ", ".join(ENTITY_TYPE_MAP.keys()))
    except (HTTPException, ValueError) as e:
        class_name = (entity_class_str
                      if entity_class_str else "inferred type")
        raise ValueError(
            f"Failed to load entity '{entity_id}' of type {class_name}") from e
    except Exception as e:
        raise ValueError(f"An unexpected error occurred while "
                         f"loading entity '{entity_id}'") from e
|
kumoai/codegen/naming.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import keyword
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from kumoai.codegen.identity import get_config_id
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _sanitize_identifier(name: str) -> str:
    """Sanitize a name to be a valid Python identifier.

    The name is lower-cased, every non-alphanumeric character becomes an
    underscore, and runs of underscores (including leading and trailing
    ones) are collapsed. A leading digit is prefixed with ``_``. Names that
    are Python keywords or shadow a builtin get a trailing ``_``.

    Args:
        name: Arbitrary text to turn into an identifier.

    Returns:
        The sanitized identifier, or ``"obj"`` if nothing usable remains.
    """
    # `__builtins__` is a plain dict in imported modules (CPython
    # implementation detail), so its dir() lists dict methods rather than
    # builtin names. Query the `builtins` module directly instead.
    import builtins

    if not name:
        return "obj"

    sanitized = "".join(char if char.isalnum() else "_"
                        for char in name.lower())
    sanitized = "_".join(filter(None, sanitized.split("_")))

    if not sanitized:
        return "obj"

    if sanitized[0].isdigit():
        sanitized = f"_{sanitized}"

    if keyword.iskeyword(sanitized) or hasattr(builtins, sanitized):
        sanitized = f"{sanitized}_"

    return sanitized
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class NameManager:
    """Hands out unique variable names for generated code.

    Names have the form ``<base>_<n>`` where ``n`` is a per-base counter
    starting at 1. Entity variables are remembered by config ID, so asking
    for the same entity twice yields the same name.
    """

    def __init__(self) -> None:
        self._counts: defaultdict[str, int] = defaultdict(int)
        self._names: dict[str, str] = {}  # config_id -> variable_name

    def assign_entity_variable(self, obj: Any) -> str:
        """Return the variable name for an entity, creating it if needed.

        The base name is built from ``obj.name`` (or ``obj.source_name`` if
        the name is missing or empty) and the lower-cased class name. When
        neither attribute is usable, the class name alone is the base.

        Args:
            obj: Entity to name.

        Returns:
            The existing name if ``get_config_id(obj)`` was seen before,
            otherwise a newly assigned unique name.
        """
        config_id = get_config_id(obj)
        if config_id in self._names:
            return self._names[config_id]

        entity_name = ""
        if hasattr(obj, "name") and obj.name:
            entity_name = str(obj.name)
        elif hasattr(obj, "source_name") and obj.source_name:
            entity_name = str(obj.source_name)

        type_name = obj.__class__.__name__.lower()

        if entity_name:
            sanitized_name = _sanitize_identifier(entity_name)
            # Avoid names like "graph_graph" when the entity is simply
            # named after its own type.
            base_name = (sanitized_name if sanitized_name.replace("_", "")
                         == type_name else f"{sanitized_name}_{type_name}")
        else:
            base_name = type_name

        self._counts[base_name] += 1
        name = f"{base_name}_{self._counts[base_name]}"
        self._names[config_id] = name
        return name

    def assign_temp_variable(self, path: str, value: Any) -> str:
        """Return a fresh, unique name for a temporary value.

        Args:
            path: Attribute path of the value (e.g. ``"graph.tables[0]"``);
                may be empty.
            value: The value itself, used for naming when the path gives
                no usable name.

        Returns:
            ``<base>_<n>``; unlike entity variables, a new name is produced
            on every call.
        """
        base_name = self._get_base_name_for_temp(path, value)
        self._counts[base_name] += 1
        return f"{base_name}_{self._counts[base_name]}"

    def _get_base_name_for_temp(self, path: str, value: Any) -> str:
        """Pick a base name from the path, falling back to the value's type.

        The last dotted path segment that has a name before any ``[...]``
        index is used. Paths without such a segment (for example ``"[0]"``)
        fall back to the type-based name rather than an empty base name.
        """
        if path:
            for part in reversed(path.split(".")):
                head = part.split("[")[0]
                if head:
                    return head

        primitives = (type(None), str, int, float, bool, list, dict, set,
                      tuple)
        if not isinstance(value, primitives):
            import re

            # CamelCase class name -> snake_case base name.
            class_name = type(value).__name__
            return re.sub(r"(?<!^)(?=[A-Z])", "_", class_name).lower()

        if isinstance(value, list):
            return "list"
        if isinstance(value, dict):
            return "dict"
        if isinstance(value, set):
            return "set"
        if isinstance(value, tuple):
            return "tuple"

        return "temp_obj"
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import pkgutil
|
|
5
|
+
from typing import (
|
|
6
|
+
Any,
|
|
7
|
+
Callable,
|
|
8
|
+
Dict,
|
|
9
|
+
List,
|
|
10
|
+
NamedTuple,
|
|
11
|
+
Optional,
|
|
12
|
+
Sequence,
|
|
13
|
+
Type,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
from kumoai.codegen.context import CodegenContext
|
|
17
|
+
from kumoai.codegen.edits import UniversalReplacementEdit
|
|
18
|
+
from kumoai.codegen.identity import get_config_id
|
|
19
|
+
from kumoai.codegen.naming import NameManager
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Handler(NamedTuple):
    """Bundle of callbacks registered for one entity class in ``REG``.

    Only the call signatures are defined here; what each callback returns is
    decided by the handler modules, which are not visible from this file.
    """

    # Called with (obj, codegen_ctx); returns a list of objects.
    # NOTE(review): presumably the objects `obj` depends on - confirm
    # against the handler modules.
    parents: Callable[[object, CodegenContext],
                      List[object]]  # Added codegen_ctx parameter
    # Called with (obj); returns a list of strings.
    required_imports: Callable[[object], List[str]]
    # Called with (obj, str, dict, codegen_ctx); returns a list of strings.
    emit_lines: Callable[[object, str, dict, CodegenContext],
                         List[str]]  # Added codegen_ctx parameter
    # Optional; called with (object, object, NameManager) and returns a
    # sequence of UniversalReplacementEdit. Has no default, so it must be
    # passed explicitly (use None when the handler has no edit detection).
    detect_edits: Optional[Callable[[object, object, NameManager],
                                    Sequence[UniversalReplacementEdit]]]
    # Optional; called with (obj) and returns a nested dict that
    # `register_shared_parents` copies into `ctx.shared_parents`.
    get_parent_map: Optional[Callable[[object], dict[str, dict[str,
                                                               Any]]]] = None


# Registry mapping an entity class to its Handler. It is filled by
# `_discover_and_register_handlers()`, which runs when this module is
# imported.
REG: dict[Type, Handler] = {}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def register_shared_parents(ctx: CodegenContext, obj: object,
                            handler: Handler) -> None:
    """Publish the parents a handler wants to share with other handlers.

    Does nothing when the handler has no ``get_parent_map`` callback.
    Otherwise every entry of the returned map is written into
    ``ctx.shared_parents``, replacing any existing entry with the same key.

    Args:
        ctx: Codegen context whose ``shared_parents`` mapping is updated.
        obj: Object passed to the handler's ``get_parent_map`` callback.
        handler: Handler providing the optional ``get_parent_map`` callback.
    """
    build_parent_map = handler.get_parent_map
    if not build_parent_map:
        return

    # parent_map format: {object_id: {key: parent_obj}}
    for obj_id, shared_data in build_parent_map(obj).items():
        ctx.shared_parents[obj_id] = shared_data
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def lookup_shared_parent(ctx: CodegenContext, obj: object, key: str) -> Any:
    """Fetch a shared parent that was stored for an object under ``key``.

    The object is identified by ``get_config_id(obj)``.

    Returns:
        The stored parent, or ``None`` when nothing is stored for the
        object or for the key.
    """
    shared_for_obj = ctx.shared_parents.get(get_config_id(obj), {})
    return shared_for_obj.get(key)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def store_shared_parent(ctx: CodegenContext, obj: object, key: str,
                        parent_obj: object) -> None:
    """Record ``parent_obj`` as the shared parent of ``obj`` under ``key``.

    The entry is stored in ``ctx.shared_parents`` under
    ``get_config_id(obj)``; the per-object dict is created on first use.
    """
    ctx.shared_parents.setdefault(get_config_id(obj), {})[key] = parent_obj
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def store_object_var(ctx: CodegenContext, obj: object, var_name: str) -> None:
    """Remember ``var_name`` as the variable that holds ``obj``.

    The name is stored in ``ctx.object_to_var`` keyed by
    ``get_config_id(obj)``.
    """
    ctx.object_to_var[get_config_id(obj)] = var_name
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_object_var(ctx: CodegenContext, obj: object) -> str:
    """Return the variable name previously stored for ``obj``.

    The lookup key is ``get_config_id(obj)``.

    Raises:
        ValueError: If no (non-empty) variable name is stored for the
            object.
    """
    config_id = get_config_id(obj)
    var_name = ctx.object_to_var.get(config_id)
    if var_name:
        return var_name

    raise ValueError(
        f"No variable name found for object {type(obj).__name__} "
        f"with config_id {config_id}")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def init_execution_env(ctx: CodegenContext) -> None:
    """Reset the context's execution environment.

    After the call, ``ctx.execution_env`` is a new dict whose only entry is
    ``"kumo"``, bound to the ``kumoai`` package.
    """
    import kumoai

    ctx.execution_env = {"kumo": kumoai}
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def execute_in_env(ctx: CodegenContext, lines: list[str],
                   imports: Optional[list[str]] = None) -> None:
    """Run import statements and code lines in ``ctx.execution_env``.

    Import lines run first, then each entry of ``lines`` is executed on its
    own. Entries that are blank or start with ``#`` (after stripping
    whitespace) are skipped.

    Args:
        ctx: Context providing the ``execution_env`` namespace dict.
        lines: Statements to execute, one complete statement per entry.
        imports: Optional import statements to execute beforehand.
    """
    # NOTE(security): exec() runs arbitrary code; every string passed here
    # must come from the code generator, never from untrusted input.
    for import_line in imports or ():
        exec(import_line, ctx.execution_env)

    for line in lines:
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        exec(line, ctx.execution_env)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def get_from_env(ctx: CodegenContext, var_name: str) -> Any:
    """Look up ``var_name`` in the context's execution environment.

    Returns:
        The bound object, or ``None`` if the name is not defined.
    """
    env = ctx.execution_env
    return env.get(var_name)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _discover_and_register_handlers() -> None:
    """Import every module in the ``handlers`` package and register handlers.

    Each module that defines a ``get_handlers`` function has it called, and
    the returned ``class -> Handler`` mapping is copied into ``REG``. A
    class registered by a later module replaces an earlier registration.
    """
    from . import handlers

    module_prefix = f"{handlers.__name__}."
    for module_info in pkgutil.iter_modules(handlers.__path__, module_prefix):
        module = importlib.import_module(module_info.name)
        if not hasattr(module, "get_handlers"):
            continue

        discovered: Dict[Type, Handler] = module.get_handlers()
        for cls, handler in discovered.items():
            REG[cls] = handler


# Populate the registry as soon as this module is imported.
_discover_and_register_handlers()
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from .base import Connector
|
|
2
|
+
from .s3_connector import S3Connector
|
|
3
|
+
from .snowflake_connector import SnowflakeConnector
|
|
4
|
+
from .databricks_connector import DatabricksConnector
|
|
5
|
+
from .bigquery_connector import BigQueryConnector
|
|
6
|
+
from .file_upload_connector import FileUploadConnector
|
|
7
|
+
from .glue_connector import GlueConnector
|
|
8
|
+
from .source_table import (
|
|
9
|
+
SourceTable,
|
|
10
|
+
SourceTableFuture,
|
|
11
|
+
LLMSourceTableFuture,
|
|
12
|
+
SourceColumn,
|
|
13
|
+
)
|
|
14
|
+
from .utils import upload_table, delete_uploaded_table, replace_table
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
'Connector',
|
|
18
|
+
'S3Connector',
|
|
19
|
+
'SnowflakeConnector',
|
|
20
|
+
'DatabricksConnector',
|
|
21
|
+
'BigQueryConnector',
|
|
22
|
+
'FileUploadConnector',
|
|
23
|
+
'GlueConnector',
|
|
24
|
+
'SourceTable',
|
|
25
|
+
'SourceTableFuture',
|
|
26
|
+
'LLMSourceTableFuture',
|
|
27
|
+
'SourceColumn',
|
|
28
|
+
'upload_table',
|
|
29
|
+
'delete_uploaded_table',
|
|
30
|
+
'replace_table',
|
|
31
|
+
]
|
kumoai/connector/base.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import List, Set, Union
|
|
3
|
+
|
|
4
|
+
from kumoapi.data_source import DataSourceType
|
|
5
|
+
from kumoapi.source_table import (
|
|
6
|
+
BigQuerySourceTableRequest,
|
|
7
|
+
DatabricksSourceTableRequest,
|
|
8
|
+
S3SourceTableRequest,
|
|
9
|
+
SnowflakeSourceTableRequest,
|
|
10
|
+
SourceTableConfigRequest,
|
|
11
|
+
SourceTableConfigResponse,
|
|
12
|
+
SourceTableDataRequest,
|
|
13
|
+
SourceTableDataResponse,
|
|
14
|
+
SourceTableListRequest,
|
|
15
|
+
SourceTableListResponse,
|
|
16
|
+
SourceTableValidateRequest,
|
|
17
|
+
SourceTableValidateResponse,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
from kumoai import global_state
|
|
21
|
+
from kumoai.connector.source_table import SourceTable
|
|
22
|
+
from kumoai.exceptions import HTTPException
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Connector(ABC):
    r"""A connector to a backing data source, that can be used to create Kumo
    tables.
    """

    # Class-level default, kept so the attribute always exists. It is shared
    # by all connectors, so `_validate_table` never adds to it; each
    # instance lazily gets its own cache there instead.
    _validated_tables: Set[str] = set()

    # Metadata ################################################################

    @property
    @abstractmethod
    def source_type(self) -> DataSourceType:
        r"""Returns the data source type accessible by this connector."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def name(self) -> str:
        r"""Returns the name of the connector.

        .. note::
            If the connector does not support naming, the name refers to an
            internal specifier.
        """
        raise NotImplementedError()

    @abstractmethod
    def _source_table_request(
        self,
        table_names: List[str],
    ) -> Union[S3SourceTableRequest, BigQuerySourceTableRequest,
               DatabricksSourceTableRequest, SnowflakeSourceTableRequest]:
        r"""Builds the source table request for :obj:`table_names`.

        Used by :meth:`_get_table_data`; must be implemented by subclasses.
        """
        raise NotImplementedError()

    # Tables ##################################################################

    def table_names(self) -> List[str]:
        r"""Returns a list of table names accessible through this connector."""
        return self._list_tables().table_names

    def has_table(self, name: str) -> bool:
        r"""Returns :obj:`True` if the table exists in this connector,
        :obj:`False` otherwise.

        Args:
            name: The table name.
        """
        try:
            resp = self._validate_table(name)
            return resp.is_valid
        except HTTPException:
            # In case of HTTPException, the Kumo backend doesn't have the
            # API implemented, so the check is skipped by returning True.
            return True

    def table(self, name: str) -> SourceTable:
        r"""Returns a :class:`~kumoai.connector.SourceTable` object
        corresponding to a source table behind this connector. A source table
        is a view into the raw data of table :obj:`name`. To use a source
        table in Kumo, you will need to construct a
        :class:`~kumoai.graph.Table` from the source table.

        Args:
            name: The table name.

        Raises:
            :class:`ValueError`: if :obj:`name` does not exist in the backing
                connector.
        """
        if not self.has_table(name):
            raise ValueError(f"The table '{name}' does not exist in {self}. "
                             f"Please check the existence of the source data.")

        return SourceTable(name=name, connector=self)

    def _validate_table(self, table_name: str) -> SourceTableValidateResponse:
        r"""Validates :obj:`table_name` against the backend.

        Tables that were validated successfully are cached per connector
        instance and reported as valid without another request.
        """
        # Per-instance cache: a table validated on one connector says
        # nothing about a same-named table on a different connector.
        validated = self.__dict__.setdefault('_validated_tables', set())
        if table_name in validated:
            return SourceTableValidateResponse(is_valid=True, msg='')

        req = SourceTableValidateRequest(
            connector_id=self.name,
            table_name=table_name,
            source_type=self.source_type,
        )
        ret = global_state.client.source_table_api.validate_table(req)

        # Cache the result for the whole session.
        if ret.is_valid:
            validated.add(table_name)
        return ret

    def _list_tables(self) -> SourceTableListResponse:
        r"""Requests the list of tables behind this connector."""
        req = SourceTableListRequest(connector_id=self.name,
                                     source_type=self.source_type)
        return global_state.client.source_table_api.list_tables(req)

    def _get_table_data(
        self,
        table_names: List[str],
        sample_rows: int,
    ) -> SourceTableDataResponse:
        r"""Requests table data for :obj:`table_names`, passing
        :obj:`sample_rows` through to the backend request.
        """
        req = SourceTableDataRequest(
            source_table_request=self._source_table_request(table_names),
            sample_rows=sample_rows,
        )
        return global_state.client.source_table_api.get_table_data(req)

    def _get_table_config(self, table_name: str) -> SourceTableConfigResponse:
        r"""Requests the configuration of :obj:`table_name`."""
        # TODO(manan): rest backend for this is a bit broken, it never returns
        # directories...
        req = SourceTableConfigRequest(connector_id=self.name,
                                       table_name=table_name,
                                       source_type=self.source_type)
        return global_state.client.source_table_api.get_table_config(req)

    # Class properties ########################################################

    def __hash__(self) -> int:
        # `hash(self.__dict__)` always raises TypeError (dicts are
        # unhashable). Hash on the concrete class and the connector name.
        # This stays consistent with the default identity-based `__eq__`.
        return hash((type(self), self.name))

    def __contains__(self, name: str) -> bool:
        return self.has_table(name)

    def __getitem__(self, name: str) -> SourceTable:
        return self.table(name)

    def __repr__(self) -> str:
        # TODO(manan): class-overrideable metadata?
        return f'{self.__class__.__name__}()'
|