kumoai 2.14.0.dev202601011731__cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kumoai might be problematic. Click here for more details.

Files changed (122) hide show
  1. kumoai/__init__.py +300 -0
  2. kumoai/_logging.py +29 -0
  3. kumoai/_singleton.py +25 -0
  4. kumoai/_version.py +1 -0
  5. kumoai/artifact_export/__init__.py +9 -0
  6. kumoai/artifact_export/config.py +209 -0
  7. kumoai/artifact_export/job.py +108 -0
  8. kumoai/client/__init__.py +5 -0
  9. kumoai/client/client.py +223 -0
  10. kumoai/client/connector.py +110 -0
  11. kumoai/client/endpoints.py +150 -0
  12. kumoai/client/graph.py +120 -0
  13. kumoai/client/jobs.py +471 -0
  14. kumoai/client/online.py +78 -0
  15. kumoai/client/pquery.py +207 -0
  16. kumoai/client/rfm.py +112 -0
  17. kumoai/client/source_table.py +53 -0
  18. kumoai/client/table.py +101 -0
  19. kumoai/client/utils.py +130 -0
  20. kumoai/codegen/__init__.py +19 -0
  21. kumoai/codegen/cli.py +100 -0
  22. kumoai/codegen/context.py +16 -0
  23. kumoai/codegen/edits.py +473 -0
  24. kumoai/codegen/exceptions.py +10 -0
  25. kumoai/codegen/generate.py +222 -0
  26. kumoai/codegen/handlers/__init__.py +4 -0
  27. kumoai/codegen/handlers/connector.py +118 -0
  28. kumoai/codegen/handlers/graph.py +71 -0
  29. kumoai/codegen/handlers/pquery.py +62 -0
  30. kumoai/codegen/handlers/table.py +109 -0
  31. kumoai/codegen/handlers/utils.py +42 -0
  32. kumoai/codegen/identity.py +114 -0
  33. kumoai/codegen/loader.py +93 -0
  34. kumoai/codegen/naming.py +94 -0
  35. kumoai/codegen/registry.py +121 -0
  36. kumoai/connector/__init__.py +31 -0
  37. kumoai/connector/base.py +153 -0
  38. kumoai/connector/bigquery_connector.py +200 -0
  39. kumoai/connector/databricks_connector.py +213 -0
  40. kumoai/connector/file_upload_connector.py +189 -0
  41. kumoai/connector/glue_connector.py +150 -0
  42. kumoai/connector/s3_connector.py +278 -0
  43. kumoai/connector/snowflake_connector.py +252 -0
  44. kumoai/connector/source_table.py +471 -0
  45. kumoai/connector/utils.py +1796 -0
  46. kumoai/databricks.py +14 -0
  47. kumoai/encoder/__init__.py +4 -0
  48. kumoai/exceptions.py +26 -0
  49. kumoai/experimental/__init__.py +0 -0
  50. kumoai/experimental/rfm/__init__.py +210 -0
  51. kumoai/experimental/rfm/authenticate.py +432 -0
  52. kumoai/experimental/rfm/backend/__init__.py +0 -0
  53. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  54. kumoai/experimental/rfm/backend/local/graph_store.py +297 -0
  55. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  56. kumoai/experimental/rfm/backend/local/table.py +113 -0
  57. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  58. kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
  59. kumoai/experimental/rfm/backend/snow/table.py +242 -0
  60. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  61. kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
  62. kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
  63. kumoai/experimental/rfm/base/__init__.py +30 -0
  64. kumoai/experimental/rfm/base/column.py +152 -0
  65. kumoai/experimental/rfm/base/expression.py +44 -0
  66. kumoai/experimental/rfm/base/sampler.py +761 -0
  67. kumoai/experimental/rfm/base/source.py +19 -0
  68. kumoai/experimental/rfm/base/sql_sampler.py +143 -0
  69. kumoai/experimental/rfm/base/table.py +736 -0
  70. kumoai/experimental/rfm/graph.py +1237 -0
  71. kumoai/experimental/rfm/infer/__init__.py +19 -0
  72. kumoai/experimental/rfm/infer/categorical.py +40 -0
  73. kumoai/experimental/rfm/infer/dtype.py +82 -0
  74. kumoai/experimental/rfm/infer/id.py +46 -0
  75. kumoai/experimental/rfm/infer/multicategorical.py +48 -0
  76. kumoai/experimental/rfm/infer/pkey.py +128 -0
  77. kumoai/experimental/rfm/infer/stype.py +35 -0
  78. kumoai/experimental/rfm/infer/time_col.py +61 -0
  79. kumoai/experimental/rfm/infer/timestamp.py +41 -0
  80. kumoai/experimental/rfm/pquery/__init__.py +7 -0
  81. kumoai/experimental/rfm/pquery/executor.py +102 -0
  82. kumoai/experimental/rfm/pquery/pandas_executor.py +530 -0
  83. kumoai/experimental/rfm/relbench.py +76 -0
  84. kumoai/experimental/rfm/rfm.py +1184 -0
  85. kumoai/experimental/rfm/sagemaker.py +138 -0
  86. kumoai/experimental/rfm/task_table.py +231 -0
  87. kumoai/formatting.py +30 -0
  88. kumoai/futures.py +99 -0
  89. kumoai/graph/__init__.py +12 -0
  90. kumoai/graph/column.py +106 -0
  91. kumoai/graph/graph.py +948 -0
  92. kumoai/graph/table.py +838 -0
  93. kumoai/jobs.py +80 -0
  94. kumoai/kumolib.cpython-310-x86_64-linux-gnu.so +0 -0
  95. kumoai/mixin.py +28 -0
  96. kumoai/pquery/__init__.py +25 -0
  97. kumoai/pquery/prediction_table.py +287 -0
  98. kumoai/pquery/predictive_query.py +641 -0
  99. kumoai/pquery/training_table.py +424 -0
  100. kumoai/spcs.py +121 -0
  101. kumoai/testing/__init__.py +8 -0
  102. kumoai/testing/decorators.py +57 -0
  103. kumoai/testing/snow.py +50 -0
  104. kumoai/trainer/__init__.py +42 -0
  105. kumoai/trainer/baseline_trainer.py +93 -0
  106. kumoai/trainer/config.py +2 -0
  107. kumoai/trainer/distilled_trainer.py +175 -0
  108. kumoai/trainer/job.py +1192 -0
  109. kumoai/trainer/online_serving.py +258 -0
  110. kumoai/trainer/trainer.py +475 -0
  111. kumoai/trainer/util.py +103 -0
  112. kumoai/utils/__init__.py +11 -0
  113. kumoai/utils/datasets.py +83 -0
  114. kumoai/utils/display.py +51 -0
  115. kumoai/utils/forecasting.py +209 -0
  116. kumoai/utils/progress_logger.py +343 -0
  117. kumoai/utils/sql.py +3 -0
  118. kumoai-2.14.0.dev202601011731.dist-info/METADATA +71 -0
  119. kumoai-2.14.0.dev202601011731.dist-info/RECORD +122 -0
  120. kumoai-2.14.0.dev202601011731.dist-info/WHEEL +6 -0
  121. kumoai-2.14.0.dev202601011731.dist-info/licenses/LICENSE +9 -0
  122. kumoai-2.14.0.dev202601011731.dist-info/top_level.txt +1 -0
@@ -0,0 +1,114 @@
1
+ """Configuration-based identity system for codegen deduplication."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ from typing import Any
8
+
9
+ from kumoai.connector import (
10
+ BigQueryConnector,
11
+ DatabricksConnector,
12
+ FileUploadConnector,
13
+ S3Connector,
14
+ SnowflakeConnector,
15
+ )
16
+
17
+
18
def get_config_id(obj: Any) -> str:
    """Return the identity string used to deduplicate objects during codegen.

    Two kinds of identity are produced:

    * **Configuration-based** -- for the connector types listed below, the
      identity is derived from the object's type and editable attributes
      (see ``_get_generic_config_id``). Two such connectors with identical
      configuration therefore map to the same ID, so generated code can
      reuse a single variable for both.
    * **Memory-based** -- every other object (for example a graph, where a
      caller may legitimately want two independent copies with the same
      configuration) is identified by ``id(obj)`` and is never merged with
      another instance.

    Args:
        obj: Object to get an identity for.

    Returns:
        Either the configuration-based ID of a supported connector, or
        ``"id_<id(obj)>"`` for anything else.

    Note:
        - This is for deduplication only, not for cycle detection (use
          ``id()`` for cycles).
        - Only applied to connector types for now; other objects use the
          memory ID.
    """
    # NOTE(review): ``GlueConnector`` is exported by ``kumoai.connector`` but
    # is not part of this tuple, so it falls back to the memory-based ID --
    # confirm whether that is intentional.
    shareable_types = (
        S3Connector,
        BigQueryConnector,
        SnowflakeConnector,
        DatabricksConnector,
        FileUploadConnector,
    )
    if not isinstance(obj, shareable_types):
        return f"id_{id(obj)}"
    return _get_generic_config_id(obj)
55
+
56
+
57
+ def _get_generic_config_id(obj: Any) -> str:
58
+ """Generate config ID by hashing object type and editable attributes.
59
+
60
+ Uses get_editable_attributes() to capture all configurable properties,
61
+ then creates a SHA256 hash for consistent identity.
62
+ """
63
+ try:
64
+ # Import here to avoid circular imports
65
+ from kumoai.codegen.edits import get_editable_attributes
66
+
67
+ # Get object type name
68
+ obj_type = type(obj).__name__
69
+
70
+ # Get all editable attributes
71
+ editable_attrs = get_editable_attributes(obj)
72
+
73
+ # Build configuration dict
74
+ config: dict[str, Any] = {'type': obj_type, 'attributes': {}}
75
+
76
+ # Extract values for all editable attributes
77
+ for attr_name in sorted(editable_attrs): # Sort for consistent hashing
78
+ try:
79
+ attr_value = getattr(obj, attr_name)
80
+ config['attributes'][attr_name] = _serialize_value(attr_value)
81
+ except (AttributeError, RuntimeError, TypeError):
82
+ # Skip attributes that can't be accessed
83
+ continue
84
+
85
+ # Create hash from configuration
86
+ config_str = json.dumps(config, sort_keys=True)
87
+ config_hash = hashlib.sha256(config_str.encode()).hexdigest()[:16]
88
+
89
+ return f"{obj_type}_{config_hash}"
90
+
91
+ except Exception:
92
+ # Fallback to memory address if hashing fails
93
+ return f"{type(obj).__name__}_{id(obj)}"
94
+
95
+
96
+ def _serialize_value(value: Any) -> Any:
97
+ """Convert value to JSON-serializable format for consistent hashing.
98
+
99
+ Handles nested objects by recursively applying config-based identity.
100
+ """
101
+ if value is None:
102
+ return None
103
+ elif isinstance(value, (str, int, float, bool)):
104
+ return value
105
+ elif isinstance(value, (list, tuple)):
106
+ return [_serialize_value(item) for item in value]
107
+ elif isinstance(value, dict):
108
+ return {k: _serialize_value(v) for k, v in value.items()}
109
+ elif hasattr(value, '__dict__'):
110
+ # For objects with __dict__, recurse into their config_id
111
+ return get_config_id(value)
112
+ else:
113
+ # For other types, convert to string
114
+ return str(value)
@@ -0,0 +1,93 @@
1
+ from __future__ import annotations
2
+
3
+ import kumoai as kumo
4
+ from kumoai.exceptions import HTTPException
5
+
6
+
7
def _get_supported_entities() -> dict[str, type]:
    """Return a ``class name -> class`` map of every registered handler type.

    The classes are the keys of the codegen handler registry ``REG``.
    """
    # Imported inside the function, as in the original implementation.
    from kumoai.codegen.registry import REG

    supported: dict[str, type] = {}
    for entity_cls in REG:
        supported[entity_cls.__name__] = entity_cls
    return supported
12
+
13
+
14
# Converts the class name a user provides (e.g. via ``--entity-class``) into
# the corresponding Python type. It is generated from the handler registry,
# so it doubles as the filter of supported entities.
ENTITY_TYPE_MAP = _get_supported_entities()

# Maps an ID prefix to the entity type it identifies, so that the type can be
# inferred from the ID alone.
ENTITY_PREFIX_MAP = {
    # Job types
    "gen-traintable-job": kumo.TrainingTable,
    "trainingjob": kumo.TrainingJob,
    "bp-job": kumo.BatchPredictionJob,
    # Query types
    "pquery": kumo.PredictiveQuery,
    # Note: table IDs don't have prefixes, so they require an explicit
    # --entity-class.
}
31
+
32
+
33
+ def _load_with_class(entity_id: str, entity_class: type) -> object:
34
+ """Helper to load an entity when the class is known."""
35
+ # Order of attempts: get_by_name, load, constructor
36
+ if hasattr(entity_class, "get_by_name"):
37
+ return entity_class.get_by_name(entity_id)
38
+ elif hasattr(entity_class, "load"):
39
+ return entity_class.load(entity_id)
40
+
41
+ try:
42
+ # For jobs like TrainingJob, BatchPredictionJob,
43
+ # FileUploadConnector
44
+ return entity_class(entity_id)
45
+ except (TypeError, AttributeError):
46
+ pass # Fall through to the error
47
+
48
+ raise NotImplementedError(
49
+ f"Don't know how to load object of type {entity_class.__name__}")
50
+
51
+
52
def load_from_id(
    entity_id: str,
    entity_class_str: str | None = None,
) -> object:
    """Load a Kumo object as an SDK object.

    - If ``entity_class_str`` is provided, it is used to find the type.
    - If not, the type is inferred from the ID prefix.

    Args:
        entity_id: Identifier (or name) of the entity to load.
        entity_class_str: Optional class name; must be a key of
            ``ENTITY_TYPE_MAP``.

    Returns:
        The loaded SDK object.

    Raises:
        ValueError: If the class name is unknown, the type cannot be
            inferred from the ID, or loading fails for any reason. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        # Scenario A: explicit class string provided.
        if entity_class_str:
            if entity_class_str not in ENTITY_TYPE_MAP:
                raise ValueError(f"Unknown entity_class '{entity_class_str}'. "
                                 f"Supported types are: "
                                 f"{', '.join(ENTITY_TYPE_MAP.keys())}")
            entity_class = ENTITY_TYPE_MAP[entity_class_str]
            return _load_with_class(entity_id, entity_class)

        # Scenario B: no class string provided, so infer from the ID prefix.
        # Several known prefixes contain hyphens themselves
        # ("gen-traintable-job", "bp-job"), so splitting the ID on its first
        # hyphen can never match them. Compare against each known prefix
        # instead, longest first so the most specific prefix wins.
        lowered_id = entity_id.lower()
        for prefix in sorted(ENTITY_PREFIX_MAP, key=len, reverse=True):
            if lowered_id == prefix or lowered_id.startswith(f"{prefix}-"):
                inferred_class = ENTITY_PREFIX_MAP[prefix]
                return _load_with_class(entity_id, inferred_class)

        raise ValueError(
            f"Could not infer entity type from ID '{entity_id}'. "
            "For an entity with a non-prefixed ID "
            "(like a Connector, Graph, or Table), "
            "please provide the 'entity_class' parameter. "
            "Supported prefixes are: " +
            ", ".join(ENTITY_PREFIX_MAP.keys()) +
            "\n and supported classes are: " +
            ", ".join(ENTITY_TYPE_MAP.keys()))
    except (HTTPException, ValueError) as e:
        # Also wraps the descriptive ValueErrors raised above; the detailed
        # message stays available through the exception chain.
        class_name = (entity_class_str
                      if entity_class_str else "inferred type")
        raise ValueError(
            f"Failed to load entity '{entity_id}' of type {class_name}") from e
    except Exception as e:
        raise ValueError(f"An unexpected error occurred while "
                         f"loading entity '{entity_id}'") from e
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ import keyword
4
+ from collections import defaultdict
5
+ from typing import Any
6
+
7
+ from kumoai.codegen.identity import get_config_id
8
+
9
+
10
+ def _sanitize_identifier(name: str) -> str:
11
+ """Sanitize a name to be a valid Python identifier."""
12
+ if not name:
13
+ return "obj"
14
+
15
+ sanitized = "".join(char if char.isalnum() else "_"
16
+ for char in name.lower())
17
+ sanitized = "_".join(filter(None, sanitized.split("_")))
18
+
19
+ if not sanitized:
20
+ return "obj"
21
+
22
+ if sanitized[0].isdigit():
23
+ sanitized = f"_{sanitized}"
24
+
25
+ if keyword.iskeyword(sanitized) or sanitized in dir(__builtins__):
26
+ sanitized = f"{sanitized}_"
27
+
28
+ return sanitized
29
+
30
+
31
class NameManager:
    """Hands out variable names for generated code.

    Names have the form ``<base>_<n>``, where ``n`` counts how many times the
    same base name has been used by this manager. Entity variables are
    additionally remembered per config ID, so asking for the same entity
    twice yields the same name.
    """

    def __init__(self) -> None:
        # base name -> number of times it has been handed out
        self._counts: defaultdict[str, int] = defaultdict(int)
        # config_id -> variable_name
        self._names: dict[str, str] = {}

    def assign_entity_variable(self, obj: Any) -> str:
        """Return the variable name for ``obj``, creating it on first use.

        The base name is built from the object's ``name`` (or, failing that,
        ``source_name``) attribute and its lower-cased class name. When the
        object has neither, the class name alone is used.

        Args:
            obj: Entity to name.

        Returns:
            The (possibly previously assigned) variable name.
        """
        config_id = get_config_id(obj)
        if config_id in self._names:
            return self._names[config_id]

        raw_name = (getattr(obj, "name", None)
                    or getattr(obj, "source_name", None))
        entity_name = str(raw_name) if raw_name else ""

        type_name = obj.__class__.__name__.lower()

        if entity_name:
            sanitized_name = _sanitize_identifier(entity_name)
            # Avoid names like "s3_connector_s3connector" when the entity
            # is already named after its own type.
            if sanitized_name.replace("_", "") == type_name:
                base_name = sanitized_name
            else:
                base_name = f"{sanitized_name}_{type_name}"
        else:
            base_name = type_name

        self._counts[base_name] += 1
        name = f"{base_name}_{self._counts[base_name]}"
        self._names[config_id] = name
        return name

    def assign_temp_variable(self, path: str, value: Any) -> str:
        """Return a fresh temporary variable name for ``value`` at ``path``.

        Unlike entity variables, temporaries are never reused: every call
        returns a new name.
        """
        base_name = self._get_base_name_for_temp(path, value)
        self._counts[base_name] += 1
        return f"{base_name}_{self._counts[base_name]}"

    def _get_base_name_for_temp(self, path: str, value: Any) -> str:
        """Derive a base name from an attribute path, else from the value.

        For a dotted path the last component that is not a bare index
        (``"[0]"``) is used, with any trailing index removed; e.g.
        ``"graph.tables[0]"`` gives ``"tables"``. If the path yields no name,
        the name is derived from the value: the snake-cased class name for
        non-primitive values, ``"list"``/``"dict"``/``"set"``/``"tuple"`` for
        containers, and ``"temp_obj"`` otherwise.
        """
        if path:
            if "." in path:
                for part in reversed(path.split(".")):
                    if part and not part.startswith("["):
                        return part.split("[")[0]
            # A path that starts with "[" (e.g. "[0]") has no name in front
            # of the index. Splitting it on "[" would give an empty base
            # name and a variable called "_1", so name it by value below.
            if not path.startswith("["):
                return path.split("[")[0]

        primitives = (type(None), str, int, float, bool, list, dict, set,
                      tuple)
        if not isinstance(value, primitives):
            import re
            class_name = type(value).__name__
            # CamelCase -> snake_case
            return re.sub(r"(?<!^)(?=[A-Z])", "_", class_name).lower()

        for container_type, label in ((list, "list"), (dict, "dict"),
                                      (set, "set"), (tuple, "tuple")):
            if isinstance(value, container_type):
                return label

        return "temp_obj"
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import pkgutil
5
+ from typing import (
6
+ Any,
7
+ Callable,
8
+ Dict,
9
+ List,
10
+ NamedTuple,
11
+ Optional,
12
+ Sequence,
13
+ Type,
14
+ )
15
+
16
+ from kumoai.codegen.context import CodegenContext
17
+ from kumoai.codegen.edits import UniversalReplacementEdit
18
+ from kumoai.codegen.identity import get_config_id
19
+ from kumoai.codegen.naming import NameManager
20
+
21
+
22
class Handler(NamedTuple):
    """Bundle of callables that implement codegen for one entity type.

    The fields are documented by their signatures only; the precise contract
    of each callable is defined by the handler modules that build them.
    """

    # (obj, codegen_ctx) -> list of parent objects
    parents: Callable[[object, CodegenContext], List[object]]
    # (obj) -> list of import strings
    required_imports: Callable[[object], List[str]]
    # (obj, str, dict, codegen_ctx) -> list of code lines
    emit_lines: Callable[[object, str, dict, CodegenContext], List[str]]
    # (obj, obj, name_manager) -> sequence of edits; may be None
    detect_edits: Optional[Callable[[object, object, NameManager],
                                    Sequence[UniversalReplacementEdit]]]
    # (obj) -> {object_id: {key: parent_obj}}; optional, defaults to None
    get_parent_map: Optional[Callable[[object],
                                      dict[str, dict[str, Any]]]] = None
32
+
33
+
34
# Registry of codegen handlers, keyed by the entity class each one handles.
# It starts empty and is filled when handler modules are discovered.
REG: dict[Type, Handler] = {}
35
+
36
+
37
def register_shared_parents(ctx: CodegenContext, obj: object,
                            handler: Handler) -> None:
    """Register the parents that ``handler`` wants to share with other
    handlers.

    Does nothing when the handler has no ``get_parent_map`` callable.
    Otherwise every entry of the returned map is written into
    ``ctx.shared_parents``, replacing any existing entry for the same ID.
    """
    build_parent_map = handler.get_parent_map
    if not build_parent_map:
        return

    # The map has the shape {object_id: {key: parent_obj}}.
    for owner_id, shared_data in build_parent_map(obj).items():
        ctx.shared_parents[owner_id] = shared_data
47
+
48
+
49
def lookup_shared_parent(ctx: CodegenContext, obj: object, key: str) -> Any:
    """Look up a shared parent of ``obj`` by ``key``, using its config ID.

    Returns ``None`` when nothing is stored for the object or for the key.
    """
    parents_of_obj = ctx.shared_parents.get(get_config_id(obj), {})
    return parents_of_obj.get(key)
53
+
54
+
55
def store_shared_parent(ctx: CodegenContext, obj: object, key: str,
                        parent_obj: object) -> None:
    """Store ``parent_obj`` as the shared parent ``key`` of ``obj``.

    Entries are grouped per object under the object's config ID; the group
    is created on first use.
    """
    parents_of_obj = ctx.shared_parents.setdefault(get_config_id(obj), {})
    parents_of_obj[key] = parent_obj
62
+
63
+
64
def store_object_var(ctx: CodegenContext, obj: object, var_name: str) -> None:
    """Remember ``var_name`` as the variable that holds ``obj``.

    The mapping in ``ctx.object_to_var`` is keyed by the object's config ID.
    """
    ctx.object_to_var[get_config_id(obj)] = var_name
68
+
69
+
70
def get_object_var(ctx: CodegenContext, obj: object) -> str:
    """Return the variable name stored for ``obj`` (looked up by config ID).

    Raises:
        ValueError: If no (non-empty) variable name is stored for the object.
    """
    config_id = get_config_id(obj)
    var_name = ctx.object_to_var.get(config_id)
    if var_name:
        return var_name

    raise ValueError(
        f"No variable name found for object {type(obj).__name__} "
        f"with config_id {config_id}")
79
+
80
+
81
def init_execution_env(ctx: CodegenContext) -> None:
    """Reset ``ctx.execution_env`` to a fresh namespace.

    The new namespace contains a single entry: the ``kumoai`` package, bound
    to the name ``kumo``.
    """
    import kumoai

    ctx.execution_env = {"kumo": kumoai}
85
+
86
+
87
def execute_in_env(ctx: CodegenContext, lines: list[str],
                   imports: Optional[list[str]] = None) -> None:
    """Execute code lines in the context's execution environment.

    Import lines, if given, are executed first. Afterwards every entry of
    ``lines`` is executed on its own, skipping blank lines and lines that
    are only a comment. All statements share ``ctx.execution_env`` as their
    global namespace, so names bound by one line are visible to later ones.

    Args:
        ctx: Codegen context that owns the ``execution_env`` namespace.
        lines: Statements to execute, one complete statement per entry.
        imports: Optional import statements to execute before ``lines``.
    """
    # NOTE(review): ``exec`` runs whatever text it is given. Flagging for
    # review: confirm that ``lines`` and ``imports`` only ever contain
    # trusted, generated code.
    env = ctx.execution_env

    for import_stmt in imports or ():
        exec(import_stmt, env)

    for line in lines:
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        # The original (unstripped) line is what gets executed.
        exec(line, env)
97
+
98
+
99
def get_from_env(ctx: CodegenContext, var_name: str) -> Any:
    """Return the object bound to ``var_name`` in the execution environment.

    Returns ``None`` when the name is not defined there.
    """
    env = ctx.execution_env
    return env.get(var_name)
102
+
103
+
104
def _discover_and_register_handlers() -> None:
    """Import every module of the ``handlers`` package and register the
    handlers it exposes.

    A module takes part by defining a ``get_handlers`` function that returns
    a ``{class: Handler}`` mapping; modules without it are imported but
    otherwise ignored. Entries are stored in ``REG``; a class registered by
    a later module replaces an earlier registration.
    """
    from . import handlers

    module_prefix = f"{handlers.__name__}."

    for _, module_name, _ in pkgutil.iter_modules(handlers.__path__,
                                                  module_prefix):
        module = importlib.import_module(module_name)
        if not hasattr(module, "get_handlers"):
            continue
        discovered: Dict[Type, Handler] = module.get_handlers()
        REG.update(discovered)
119
+
120
+
121
+ _discover_and_register_handlers()
@@ -0,0 +1,31 @@
1
+ from .base import Connector
2
+ from .s3_connector import S3Connector
3
+ from .snowflake_connector import SnowflakeConnector
4
+ from .databricks_connector import DatabricksConnector
5
+ from .bigquery_connector import BigQueryConnector
6
+ from .file_upload_connector import FileUploadConnector
7
+ from .glue_connector import GlueConnector
8
+ from .source_table import (
9
+ SourceTable,
10
+ SourceTableFuture,
11
+ LLMSourceTableFuture,
12
+ SourceColumn,
13
+ )
14
+ from .utils import upload_table, delete_uploaded_table, replace_table
15
+
16
+ __all__ = [
17
+ 'Connector',
18
+ 'S3Connector',
19
+ 'SnowflakeConnector',
20
+ 'DatabricksConnector',
21
+ 'BigQueryConnector',
22
+ 'FileUploadConnector',
23
+ 'GlueConnector',
24
+ 'SourceTable',
25
+ 'SourceTableFuture',
26
+ 'LLMSourceTableFuture',
27
+ 'SourceColumn',
28
+ 'upload_table',
29
+ 'delete_uploaded_table',
30
+ 'replace_table',
31
+ ]
@@ -0,0 +1,153 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Set, Union
3
+
4
+ from kumoapi.data_source import DataSourceType
5
+ from kumoapi.source_table import (
6
+ BigQuerySourceTableRequest,
7
+ DatabricksSourceTableRequest,
8
+ S3SourceTableRequest,
9
+ SnowflakeSourceTableRequest,
10
+ SourceTableConfigRequest,
11
+ SourceTableConfigResponse,
12
+ SourceTableDataRequest,
13
+ SourceTableDataResponse,
14
+ SourceTableListRequest,
15
+ SourceTableListResponse,
16
+ SourceTableValidateRequest,
17
+ SourceTableValidateResponse,
18
+ )
19
+
20
+ from kumoai import global_state
21
+ from kumoai.connector.source_table import SourceTable
22
+ from kumoai.exceptions import HTTPException
23
+
24
+
25
class Connector(ABC):
    r"""A connector to a backing data source, that can be used to create Kumo
    tables.
    """

    # Names of tables that were already validated successfully.
    # NOTE(review): this set lives on the class, so unless a subclass or
    # instance rebinds it, it is shared by every connector and keyed by table
    # name only -- a table validated through one connector is then treated
    # as valid for all others. Left unchanged because other modules may rely
    # on the attribute; confirm and consider a per-connector cache.
    _validated_tables: Set[str] = set()

    # Metadata ################################################################

    @property
    @abstractmethod
    def source_type(self) -> DataSourceType:
        r"""Returns the data source type accessible by this connector."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def name(self) -> str:
        r"""Returns the name of the connector.

        .. note::
            If the connector does not support naming, the name refers to an
            internal specifier.
        """
        raise NotImplementedError()

    @abstractmethod
    def _source_table_request(
        self,
        table_names: List[str],
    ) -> Union[S3SourceTableRequest, BigQuerySourceTableRequest,
               DatabricksSourceTableRequest, SnowflakeSourceTableRequest]:
        r"""Builds the source-specific request object for
        :obj:`table_names`.
        """
        raise NotImplementedError()

    # Tables ##################################################################

    def table_names(self) -> List[str]:
        r"""Returns a list of table names accessible through this connector."""
        return self._list_tables().table_names

    def has_table(self, name: str) -> bool:
        r"""Returns :obj:`True` if the table exists in this connector,
        :obj:`False` otherwise.

        Args:
            name: The table name.
        """
        try:
            resp = self._validate_table(name)
            return resp.is_valid
        except HTTPException:
            # In case of HTTPException, the Kumo backend doesn't have the API
            # implemented, so the check is skipped by returning True.
            return True

    def table(self, name: str) -> SourceTable:
        r"""Returns a :class:`~kumoai.connector.SourceTable` object
        corresponding to a source table behind this connector. A source table
        is a view into the raw data of table :obj:`name`. To use a source
        table in Kumo, you will need to construct a
        :class:`~kumoai.graph.Table` from the source table.

        Args:
            name: The table name.

        Raises:
            :class:`ValueError`: if :obj:`name` does not exist in the backing
                connector.
        """
        if not self.has_table(name):
            raise ValueError(f"The table '{name}' does not exist in {self}. "
                             f"Please check the existence of the source data.")

        return SourceTable(name=name, connector=self)

    def _validate_table(self, table_name: str) -> SourceTableValidateResponse:
        r"""Validates :obj:`table_name` against the backend, answering from
        the cache of previously validated tables when possible.
        """
        if table_name in self._validated_tables:
            return SourceTableValidateResponse(is_valid=True, msg='')

        req = SourceTableValidateRequest(
            connector_id=self.name,
            table_name=table_name,
            source_type=self.source_type,
        )
        ret = global_state.client.source_table_api.validate_table(req)

        # Cache the result for the whole session. Only successful
        # validations are cached, so an invalid table is re-checked.
        if ret.is_valid:
            self._validated_tables.add(table_name)
        return ret

    def _list_tables(self) -> SourceTableListResponse:
        r"""Requests the list of tables of this connector from the backend."""
        req = SourceTableListRequest(connector_id=self.name,
                                     source_type=self.source_type)
        return global_state.client.source_table_api.list_tables(req)

    def _get_table_data(
        self,
        table_names: List[str],
        sample_rows: int,
    ) -> SourceTableDataResponse:
        r"""Requests data for :obj:`table_names` from the backend, passing
        :obj:`sample_rows` through to the request.
        """
        req = SourceTableDataRequest(
            source_table_request=self._source_table_request(table_names),
            sample_rows=sample_rows,
        )
        return global_state.client.source_table_api.get_table_data(req)

    def _get_table_config(self, table_name: str) -> SourceTableConfigResponse:
        r"""Requests the configuration of :obj:`table_name` from the
        backend.
        """
        # TODO(manan): rest backend for this is a bit broken, it never returns
        # directories...
        req = SourceTableConfigRequest(connector_id=self.name,
                                       table_name=table_name,
                                       source_type=self.source_type)
        return global_state.client.source_table_api.get_table_config(req)

    # Class properties ########################################################

    def __hash__(self) -> int:
        # ``hash(self.__dict__)`` always raised ``TypeError`` because dicts
        # are unhashable. Hash an order-independent snapshot of the instance
        # state instead; ``repr`` keeps unhashable attribute values usable.
        state = tuple(
            sorted((key, repr(value))
                   for key, value in self.__dict__.items()))
        return hash((type(self).__name__, state))

    def __contains__(self, name: str) -> bool:
        return self.has_table(name)

    def __getitem__(self, name: str) -> SourceTable:
        return self.table(name)

    def __repr__(self) -> str:
        # TODO(manan): class-overrideable metadata?
        return f'{self.__class__.__name__}()'