databao-context-engine 0.1.1__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +32 -7
- databao_context_engine/build_sources/__init__.py +4 -0
- databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +31 -27
- databao_context_engine/build_sources/build_service.py +53 -0
- databao_context_engine/build_sources/build_wiring.py +82 -0
- databao_context_engine/build_sources/export_results.py +41 -0
- databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +11 -18
- databao_context_engine/cli/add_datasource_config.py +49 -44
- databao_context_engine/cli/commands.py +40 -55
- databao_context_engine/cli/info.py +3 -2
- databao_context_engine/databao_context_engine.py +127 -0
- databao_context_engine/databao_context_project_manager.py +147 -30
- databao_context_engine/{datasource_config → datasources}/check_config.py +31 -23
- databao_context_engine/datasources/datasource_context.py +90 -0
- databao_context_engine/datasources/datasource_discovery.py +143 -0
- databao_context_engine/datasources/types.py +194 -0
- databao_context_engine/generate_configs_schemas.py +4 -5
- databao_context_engine/init_project.py +25 -3
- databao_context_engine/introspection/property_extract.py +76 -57
- databao_context_engine/llm/__init__.py +10 -0
- databao_context_engine/llm/api.py +57 -0
- databao_context_engine/llm/descriptions/ollama.py +1 -3
- databao_context_engine/llm/errors.py +2 -8
- databao_context_engine/llm/factory.py +5 -2
- databao_context_engine/llm/install.py +26 -30
- databao_context_engine/llm/runtime.py +3 -5
- databao_context_engine/llm/service.py +1 -3
- databao_context_engine/mcp/mcp_runner.py +4 -2
- databao_context_engine/mcp/mcp_server.py +9 -11
- databao_context_engine/plugin_loader.py +110 -0
- databao_context_engine/pluginlib/build_plugin.py +12 -29
- databao_context_engine/pluginlib/config.py +16 -2
- databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/athena/athena_introspector.py +161 -0
- databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +6 -5
- databao_context_engine/plugins/databases/base_introspector.py +11 -12
- databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +24 -16
- databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +7 -12
- databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
- databao_context_engine/plugins/databases/introspection_scope.py +11 -9
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
- databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +29 -21
- databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +26 -15
- databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
- databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +11 -18
- databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
- databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +49 -17
- databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
- databao_context_engine/plugins/duckdb_tools.py +18 -0
- databao_context_engine/plugins/files/__init__.py +0 -0
- databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
- databao_context_engine/plugins/plugin_loader.py +58 -52
- databao_context_engine/plugins/resources/parquet_introspector.py +8 -20
- databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
- databao_context_engine/project/info.py +34 -2
- databao_context_engine/project/init_project.py +16 -7
- databao_context_engine/project/layout.py +14 -15
- databao_context_engine/retrieve_embeddings/__init__.py +3 -0
- databao_context_engine/retrieve_embeddings/retrieve_runner.py +17 -0
- databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +12 -19
- databao_context_engine/retrieve_embeddings/retrieve_wiring.py +46 -0
- databao_context_engine/serialization/__init__.py +0 -0
- databao_context_engine/{serialisation → serialization}/yaml.py +6 -6
- databao_context_engine/services/chunk_embedding_service.py +23 -11
- databao_context_engine/services/factories.py +1 -46
- databao_context_engine/services/persistence_service.py +11 -11
- databao_context_engine/storage/connection.py +11 -7
- databao_context_engine/storage/exceptions/exceptions.py +2 -2
- databao_context_engine/storage/migrate.py +3 -5
- databao_context_engine/storage/migrations/V01__init.sql +6 -31
- databao_context_engine/storage/models.py +2 -23
- databao_context_engine/storage/repositories/chunk_repository.py +16 -12
- databao_context_engine/storage/repositories/factories.py +1 -12
- databao_context_engine/storage/repositories/vector_search_repository.py +23 -16
- databao_context_engine/system/properties.py +4 -2
- databao_context_engine-0.1.5.dist-info/METADATA +228 -0
- databao_context_engine-0.1.5.dist-info/RECORD +135 -0
- {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/WHEEL +1 -1
- databao_context_engine/build_sources/internal/build_service.py +0 -77
- databao_context_engine/build_sources/internal/build_wiring.py +0 -52
- databao_context_engine/build_sources/internal/export_results.py +0 -43
- databao_context_engine/build_sources/public/api.py +0 -4
- databao_context_engine/databao_engine.py +0 -85
- databao_context_engine/datasource_config/add_config.py +0 -50
- databao_context_engine/datasource_config/datasource_context.py +0 -60
- databao_context_engine/mcp/all_results_tool.py +0 -5
- databao_context_engine/mcp/retrieve_tool.py +0 -22
- databao_context_engine/plugins/databases/athena_introspector.py +0 -101
- databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
- databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
- databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
- databao_context_engine/project/datasource_discovery.py +0 -141
- databao_context_engine/project/runs.py +0 -39
- databao_context_engine/project/types.py +0 -134
- databao_context_engine/retrieve_embeddings/internal/export_results.py +0 -12
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +0 -34
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
- databao_context_engine/retrieve_embeddings/public/api.py +0 -3
- databao_context_engine/services/run_name_policy.py +0 -8
- databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
- databao_context_engine/storage/repositories/run_repository.py +0 -157
- databao_context_engine-0.1.1.dist-info/METADATA +0 -186
- databao_context_engine-0.1.1.dist-info/RECORD +0 -135
- /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
- /databao_context_engine/{build_sources/public → plugins/databases/athena}/__init__.py +0 -0
- /databao_context_engine/{datasource_config → plugins/databases/clickhouse}/__init__.py +0 -0
- /databao_context_engine/{retrieve_embeddings/internal → plugins/databases/duckdb}/__init__.py +0 -0
- /databao_context_engine/{retrieve_embeddings/public → plugins/databases/mssql}/__init__.py +0 -0
- /databao_context_engine/{serialisation → plugins/databases/mysql}/__init__.py +0 -0
- {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from databao_context_engine.introspection.property_extract import get_property_list_from_type
|
|
2
|
+
from databao_context_engine.pluginlib.build_plugin import (
|
|
3
|
+
BuildDatasourcePlugin,
|
|
4
|
+
BuildFilePlugin,
|
|
5
|
+
BuildPlugin,
|
|
6
|
+
DatasourceType,
|
|
7
|
+
)
|
|
8
|
+
from databao_context_engine.pluginlib.config import ConfigPropertyDefinition, CustomiseConfigProperties
|
|
9
|
+
from databao_context_engine.plugins.plugin_loader import (
|
|
10
|
+
load_plugins,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DatabaoContextPluginLoader:
|
|
15
|
+
"""Loader for plugins installed in the current environment."""
|
|
16
|
+
|
|
17
|
+
def __init__(self, plugins_by_type: dict[DatasourceType, BuildPlugin] | None = None):
|
|
18
|
+
"""Initialize the DatabaoContextEngine.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
plugins_by_type: Override the list of plugins loaded from the environment.
|
|
22
|
+
Typical usage should not provide this argument and leave it as None.
|
|
23
|
+
"""
|
|
24
|
+
self._all_plugins_by_type = load_plugins() if plugins_by_type is None else plugins_by_type
|
|
25
|
+
|
|
26
|
+
def get_all_supported_datasource_types(self, exclude_file_plugins: bool = False) -> set[DatasourceType]:
|
|
27
|
+
"""Return the list of all supported datasource types.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
exclude_file_plugins: If True, do not return datasource types from plugins that deal with raw files.
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
A set of all DatasourceType supported in the current installation environment.
|
|
34
|
+
"""
|
|
35
|
+
if exclude_file_plugins:
|
|
36
|
+
return {
|
|
37
|
+
datasource_type
|
|
38
|
+
for (datasource_type, plugin) in self._all_plugins_by_type.items()
|
|
39
|
+
if not isinstance(plugin, BuildFilePlugin)
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
return set(self._all_plugins_by_type.keys())
|
|
43
|
+
|
|
44
|
+
def get_plugin_for_datasource_type(self, datasource_type: DatasourceType) -> BuildPlugin:
|
|
45
|
+
"""Return the plugin able to build a context for the given datasource type.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
datasource_type: The type of datasource for which to retrieve the plugin.
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
The plugin able to build a context for the given datasource type.
|
|
52
|
+
|
|
53
|
+
Raises:
|
|
54
|
+
ValueError: If no plugin is found for the given datasource type.
|
|
55
|
+
"""
|
|
56
|
+
if datasource_type not in self._all_plugins_by_type:
|
|
57
|
+
raise ValueError(f"No plugin found for type '{datasource_type.full_type}'")
|
|
58
|
+
|
|
59
|
+
return self._all_plugins_by_type[datasource_type]
|
|
60
|
+
|
|
61
|
+
def get_config_file_type_for_datasource_type(self, datasource_type: DatasourceType) -> type:
|
|
62
|
+
"""Return the type of the config file for the given datasource type.
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
datasource_type: The type of datasource for which to retrieve the config file type.
|
|
66
|
+
|
|
67
|
+
Returns:
|
|
68
|
+
The type of the config file for the given datasource type.
|
|
69
|
+
|
|
70
|
+
Raises:
|
|
71
|
+
ValueError: If no plugin is found for the given datasource type.
|
|
72
|
+
ValueError: If the plugin does not support config files.
|
|
73
|
+
"""
|
|
74
|
+
plugin = self.get_plugin_for_datasource_type(datasource_type)
|
|
75
|
+
|
|
76
|
+
if isinstance(plugin, BuildDatasourcePlugin):
|
|
77
|
+
return plugin.config_file_type
|
|
78
|
+
|
|
79
|
+
raise ValueError(
|
|
80
|
+
f'Impossible to get a config file type for datasource type "{datasource_type.full_type}". The corresponding plugin is a {type(plugin).__name__} but should be a BuildDatasourcePlugin'
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
def get_config_file_structure_for_datasource_type(
|
|
84
|
+
self, datasource_type: DatasourceType
|
|
85
|
+
) -> list[ConfigPropertyDefinition]:
|
|
86
|
+
"""Return the property structure of the config file for the given datasource type.
|
|
87
|
+
|
|
88
|
+
This can be used to generate a form for the user to fill in the config file.
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
datasource_type: The type of datasource for which to retrieve the config file structure.
|
|
92
|
+
|
|
93
|
+
Returns:
|
|
94
|
+
The structure of the config file for the given datasource type.
|
|
95
|
+
This structure is a list of ConfigPropertyDefinition objects.
|
|
96
|
+
Each object in the list describes a property of the config file and its potential nested properties.
|
|
97
|
+
|
|
98
|
+
Raises:
|
|
99
|
+
ValueError: If no plugin is found for the given datasource type.
|
|
100
|
+
ValueError: If the plugin does not support config files.
|
|
101
|
+
"""
|
|
102
|
+
plugin = self.get_plugin_for_datasource_type(datasource_type)
|
|
103
|
+
|
|
104
|
+
if isinstance(plugin, CustomiseConfigProperties):
|
|
105
|
+
return plugin.get_config_file_properties()
|
|
106
|
+
if isinstance(plugin, BuildDatasourcePlugin):
|
|
107
|
+
return get_property_list_from_type(plugin.config_file_type)
|
|
108
|
+
raise ValueError(
|
|
109
|
+
f'Impossible to create a config for datasource type "{datasource_type.full_type}". The corresponding plugin is a {type(plugin).__name__} but should be a BuildDatasourcePlugin or CustomiseConfigProperties'
|
|
110
|
+
)
|
|
@@ -5,9 +5,7 @@ from typing import Any, Protocol, runtime_checkable
|
|
|
5
5
|
|
|
6
6
|
@dataclass
|
|
7
7
|
class EmbeddableChunk:
|
|
8
|
-
"""
|
|
9
|
-
A chunk that will be embedded as a vector and used when searching context from a given AI prompt
|
|
10
|
-
"""
|
|
8
|
+
"""A chunk that will be embedded as a vector and used when searching context from a given AI prompt."""
|
|
11
9
|
|
|
12
10
|
embeddable_text: str
|
|
13
11
|
"""
|
|
@@ -48,19 +46,19 @@ class BuildDatasourcePlugin[T](BaseBuildPlugin, Protocol):
|
|
|
48
46
|
"""
|
|
49
47
|
|
|
50
48
|
def check_connection(self, full_type: str, datasource_name: str, file_config: T) -> None:
|
|
51
|
-
"""
|
|
52
|
-
Checks whether the configuration to the datasource is working.
|
|
49
|
+
"""Check whether the configuration to the datasource is working.
|
|
53
50
|
|
|
54
51
|
The function is expected to succeed without a result if the connection is working.
|
|
55
52
|
If something is wrong with the connection, the function should raise an Exception
|
|
53
|
+
|
|
54
|
+
Raises:
|
|
55
|
+
NotSupportedError: If the plugin doesn't support this method.
|
|
56
56
|
"""
|
|
57
57
|
raise NotSupportedError("This method is not implemented for this plugin")
|
|
58
58
|
|
|
59
59
|
|
|
60
60
|
class DefaultBuildDatasourcePlugin(BuildDatasourcePlugin[dict[str, Any]], Protocol):
|
|
61
|
-
"""
|
|
62
|
-
Use this as a base class for plugins that don't need a specific config file type.
|
|
63
|
-
"""
|
|
61
|
+
"""Use this as a base class for plugins that don't need a specific config file type."""
|
|
64
62
|
|
|
65
63
|
config_file_type: type[dict[str, Any]] = dict[str, Any]
|
|
66
64
|
|
|
@@ -75,7 +73,7 @@ class BuildFilePlugin(BaseBuildPlugin, Protocol):
|
|
|
75
73
|
|
|
76
74
|
|
|
77
75
|
class NotSupportedError(RuntimeError):
|
|
78
|
-
"""Exception raised by methods not supported by a plugin"""
|
|
76
|
+
"""Exception raised by methods not supported by a plugin."""
|
|
79
77
|
|
|
80
78
|
|
|
81
79
|
BuildPlugin = BuildDatasourcePlugin | BuildFilePlugin
|
|
@@ -83,25 +81,10 @@ BuildPlugin = BuildDatasourcePlugin | BuildFilePlugin
|
|
|
83
81
|
|
|
84
82
|
@dataclass(kw_only=True, frozen=True)
|
|
85
83
|
class DatasourceType:
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
def __post_init__(self):
|
|
89
|
-
type_segments = self.full_type.split("/")
|
|
90
|
-
if len(type_segments) != 2:
|
|
91
|
-
raise ValueError(f"Invalid DatasourceType: {self.full_type}")
|
|
92
|
-
|
|
93
|
-
@property
|
|
94
|
-
def main_type(self) -> str:
|
|
95
|
-
return self.full_type.split("/")[0]
|
|
84
|
+
"""The type of a Datasource.
|
|
96
85
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
@property
|
|
102
|
-
def subtype(self) -> str:
|
|
103
|
-
return self.full_type.split("/")[1]
|
|
86
|
+
Attributes:
|
|
87
|
+
full_type: The full type of the datasource, in the format `<main_type>/<subtype>`.
|
|
88
|
+
"""
|
|
104
89
|
|
|
105
|
-
|
|
106
|
-
def from_main_and_subtypes(main_type: str, subtype: str) -> "DatasourceType":
|
|
107
|
-
return DatasourceType(full_type=f"{main_type}/{subtype}")
|
|
90
|
+
full_type: str
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from dataclasses import dataclass
|
|
2
4
|
from typing import Any, Protocol, runtime_checkable
|
|
3
5
|
|
|
@@ -17,12 +19,23 @@ class DuckDBSecret(BaseModel):
|
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
@dataclass(kw_only=True)
|
|
20
|
-
class
|
|
22
|
+
class ConfigUnionPropertyDefinition:
|
|
23
|
+
property_key: str
|
|
24
|
+
types: tuple[type, ...]
|
|
25
|
+
type_properties: dict[type, list[ConfigPropertyDefinition]]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(kw_only=True)
|
|
29
|
+
class ConfigSinglePropertyDefinition:
|
|
21
30
|
property_key: str
|
|
22
31
|
required: bool
|
|
23
32
|
property_type: type | None = str
|
|
24
33
|
default_value: str | None = None
|
|
25
|
-
nested_properties: list[
|
|
34
|
+
nested_properties: list[ConfigPropertyDefinition] | None = None
|
|
35
|
+
secret: bool = False
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
ConfigPropertyDefinition = ConfigSinglePropertyDefinition | ConfigUnionPropertyDefinition
|
|
26
39
|
|
|
27
40
|
|
|
28
41
|
@dataclass(kw_only=True)
|
|
@@ -30,6 +43,7 @@ class ConfigPropertyAnnotation:
|
|
|
30
43
|
required: bool = False
|
|
31
44
|
default_value: str | None = None
|
|
32
45
|
ignored_for_config_wizard: bool = False
|
|
46
|
+
secret: bool = False
|
|
33
47
|
|
|
34
48
|
|
|
35
49
|
@runtime_checkable
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
from databao_context_engine.plugins.
|
|
2
|
-
from databao_context_engine.plugins.databases.
|
|
1
|
+
from databao_context_engine.plugins.databases.athena.athena_introspector import AthenaConfigFile, AthenaIntrospector
|
|
2
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabasePlugin
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
class AthenaDbPlugin(BaseDatabasePlugin[AthenaConfigFile]):
|
|
6
6
|
id = "jetbrains/athena"
|
|
7
7
|
name = "Athena DB Plugin"
|
|
8
|
-
supported = {"
|
|
8
|
+
supported = {"athena"}
|
|
9
9
|
config_file_type = AthenaConfigFile
|
|
10
10
|
|
|
11
11
|
def __init__(self):
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Annotated, Any
|
|
4
|
+
|
|
5
|
+
from pyathena import connect
|
|
6
|
+
from pyathena.cursor import DictCursor
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
|
|
10
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
|
|
11
|
+
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
|
|
12
|
+
from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
|
|
13
|
+
from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class AwsProfileAuth(BaseModel):
|
|
17
|
+
profile_name: str
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AwsIamAuth(BaseModel):
|
|
21
|
+
aws_access_key_id: Annotated[str, ConfigPropertyAnnotation(secret=True)]
|
|
22
|
+
aws_secret_access_key: Annotated[str, ConfigPropertyAnnotation(secret=True)]
|
|
23
|
+
session_token: str | None = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AwsAssumeRoleAuth(BaseModel):
|
|
27
|
+
role_arn: str | None = None
|
|
28
|
+
role_session_name: str | None = None
|
|
29
|
+
source_profile: str | None = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class AwsDefaultAuth(BaseModel):
|
|
33
|
+
# Uses environment variables, instance profile, ECS task role
|
|
34
|
+
pass
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class AthenaConnectionProperties(BaseModel):
|
|
38
|
+
region_name: str
|
|
39
|
+
schema_name: str = "default"
|
|
40
|
+
catalog: str | None = "awsdatacatalog"
|
|
41
|
+
work_group: str | None = None
|
|
42
|
+
s3_staging_dir: str | None = None
|
|
43
|
+
auth: AwsIamAuth | AwsProfileAuth | AwsDefaultAuth | AwsAssumeRoleAuth
|
|
44
|
+
additional_properties: dict[str, Any] = {}
|
|
45
|
+
|
|
46
|
+
def to_athena_kwargs(self) -> dict[str, Any]:
|
|
47
|
+
kwargs = self.model_dump(
|
|
48
|
+
exclude={
|
|
49
|
+
"additional_properties": True,
|
|
50
|
+
},
|
|
51
|
+
exclude_none=True,
|
|
52
|
+
)
|
|
53
|
+
auth_fields = kwargs.pop("auth", {})
|
|
54
|
+
kwargs.update(auth_fields)
|
|
55
|
+
kwargs.update(self.additional_properties)
|
|
56
|
+
return kwargs
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class AthenaConfigFile(BaseDatabaseConfigFile):
|
|
60
|
+
type: str = Field(default="athena")
|
|
61
|
+
connection: AthenaConnectionProperties
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class AthenaIntrospector(BaseIntrospector[AthenaConfigFile]):
|
|
65
|
+
_IGNORED_SCHEMAS = {
|
|
66
|
+
"information_schema",
|
|
67
|
+
}
|
|
68
|
+
supports_catalogs = True
|
|
69
|
+
|
|
70
|
+
def _connect(self, file_config: AthenaConfigFile, *, catalog: str | None = None) -> Any:
|
|
71
|
+
return connect(**file_config.connection.to_athena_kwargs(), cursor_class=DictCursor)
|
|
72
|
+
|
|
73
|
+
def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
|
|
74
|
+
with connection.cursor() as cur:
|
|
75
|
+
cur.execute(sql, params or {})
|
|
76
|
+
return cur.fetchall()
|
|
77
|
+
|
|
78
|
+
def _get_catalogs(self, connection, file_config: AthenaConfigFile) -> list[str]:
|
|
79
|
+
catalog = file_config.connection.catalog or self._resolve_pseudo_catalog_name(file_config)
|
|
80
|
+
return [catalog]
|
|
81
|
+
|
|
82
|
+
def _sql_list_schemas(self, catalogs: list[str] | None) -> SQLQuery:
|
|
83
|
+
if not catalogs:
|
|
84
|
+
return SQLQuery("SELECT schema_name, catalog_name FROM information_schema.schemata", None)
|
|
85
|
+
catalog = catalogs[0]
|
|
86
|
+
sql = f"SELECT schema_name, catalog_name FROM {catalog}.information_schema.schemata"
|
|
87
|
+
return SQLQuery(sql, None)
|
|
88
|
+
|
|
89
|
+
def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
|
|
90
|
+
if not schemas:
|
|
91
|
+
return []
|
|
92
|
+
|
|
93
|
+
comps = self._component_queries(catalog, schemas)
|
|
94
|
+
results: dict[str, list[dict]] = {}
|
|
95
|
+
|
|
96
|
+
for name, q in comps.items():
|
|
97
|
+
results[name] = self._fetchall_dicts(connection, q, None)
|
|
98
|
+
|
|
99
|
+
return IntrospectionModelBuilder.build_schemas_from_components(
|
|
100
|
+
schemas=schemas,
|
|
101
|
+
rels=results.get("relations", []),
|
|
102
|
+
cols=results.get("columns", []),
|
|
103
|
+
pk_cols=[],
|
|
104
|
+
uq_cols=[],
|
|
105
|
+
checks=[],
|
|
106
|
+
fk_cols=[],
|
|
107
|
+
idx_cols=[],
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
def _component_queries(self, catalog: str, schemas: list[str]) -> dict[str, str]:
|
|
111
|
+
schemas_in = ", ".join(self._quote_literal(s) for s in schemas)
|
|
112
|
+
return {
|
|
113
|
+
"relations": self._sql_relations(catalog, schemas_in),
|
|
114
|
+
"columns": self._sql_columns(catalog, schemas_in),
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
def _sql_relations(self, catalog: str, schemas_in: str) -> str:
|
|
118
|
+
return f"""
|
|
119
|
+
SELECT
|
|
120
|
+
table_schema AS schema_name,
|
|
121
|
+
table_name,
|
|
122
|
+
CASE table_type
|
|
123
|
+
WHEN 'BASE TABLE' THEN 'table'
|
|
124
|
+
WHEN 'VIEW' THEN 'view'
|
|
125
|
+
ELSE LOWER(table_type)
|
|
126
|
+
END AS kind,
|
|
127
|
+
NULL AS description
|
|
128
|
+
FROM
|
|
129
|
+
{catalog}.information_schema.tables
|
|
130
|
+
WHERE
|
|
131
|
+
table_schema IN ({schemas_in})
|
|
132
|
+
"""
|
|
133
|
+
|
|
134
|
+
def _sql_columns(self, catalog: str, schemas_in: str) -> str:
|
|
135
|
+
return f"""
|
|
136
|
+
SELECT
|
|
137
|
+
table_schema AS schema_name,
|
|
138
|
+
table_name,
|
|
139
|
+
column_name,
|
|
140
|
+
ordinal_position,
|
|
141
|
+
data_type,
|
|
142
|
+
is_nullable
|
|
143
|
+
FROM
|
|
144
|
+
{catalog}.information_schema.columns
|
|
145
|
+
WHERE
|
|
146
|
+
table_schema IN ({schemas_in})
|
|
147
|
+
ORDER BY
|
|
148
|
+
table_schema,
|
|
149
|
+
table_name,
|
|
150
|
+
ordinal_position
|
|
151
|
+
"""
|
|
152
|
+
|
|
153
|
+
def _resolve_pseudo_catalog_name(self, file_config: AthenaConfigFile) -> str:
|
|
154
|
+
return "awsdatacatalog"
|
|
155
|
+
|
|
156
|
+
def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
|
|
157
|
+
sql = f'SELECT * FROM "{schema}"."{table}" LIMIT %(limit)s'
|
|
158
|
+
return SQLQuery(sql, {"limit": limit})
|
|
159
|
+
|
|
160
|
+
def _quote_literal(self, value: str) -> str:
|
|
161
|
+
return "'" + str(value).replace("'", "''") + "'"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any, TypeVar
|
|
3
|
+
from typing import Annotated, Any, TypeVar
|
|
4
4
|
|
|
5
5
|
from pydantic import BaseModel, ConfigDict, Field
|
|
6
6
|
|
|
@@ -8,6 +8,7 @@ from databao_context_engine.pluginlib.build_plugin import (
|
|
|
8
8
|
BuildDatasourcePlugin,
|
|
9
9
|
EmbeddableChunk,
|
|
10
10
|
)
|
|
11
|
+
from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
|
|
11
12
|
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector
|
|
12
13
|
from databao_context_engine.plugins.databases.database_chunker import build_database_chunks
|
|
13
14
|
from databao_context_engine.plugins.databases.introspection_scope import IntrospectionScope
|
|
@@ -17,7 +18,9 @@ class BaseDatabaseConfigFile(BaseModel):
|
|
|
17
18
|
model_config = ConfigDict(populate_by_name=True)
|
|
18
19
|
name: str | None = Field(default=None)
|
|
19
20
|
type: str
|
|
20
|
-
introspection_scope:
|
|
21
|
+
introspection_scope: Annotated[
|
|
22
|
+
IntrospectionScope | None, ConfigPropertyAnnotation(ignored_for_config_wizard=True)
|
|
23
|
+
] = Field(default=None, alias="introspection-scope")
|
|
21
24
|
|
|
22
25
|
|
|
23
26
|
T = TypeVar("T", bound=BaseDatabaseConfigFile)
|
|
@@ -34,9 +37,7 @@ class BaseDatabasePlugin(BuildDatasourcePlugin[T]):
|
|
|
34
37
|
return self.supported
|
|
35
38
|
|
|
36
39
|
def build_context(self, full_type: str, datasource_name: str, file_config: T) -> Any:
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
return introspection_result
|
|
40
|
+
return self._introspector.introspect_database(file_config)
|
|
40
41
|
|
|
41
42
|
def check_connection(self, full_type: str, datasource_name: str, file_config: T) -> None:
|
|
42
43
|
self._introspector.check_connection(file_config)
|
|
@@ -40,7 +40,7 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
|
|
|
40
40
|
|
|
41
41
|
discovered_schemas_per_catalog: dict[str, list[str]] = {}
|
|
42
42
|
for catalog in catalogs:
|
|
43
|
-
with self.
|
|
43
|
+
with self._connect(file_config, catalog=catalog) as conn:
|
|
44
44
|
discovered_schemas_per_catalog[catalog] = self._list_schemas_for_catalog(conn, catalog)
|
|
45
45
|
scope = scope_matcher.filter_scopes(catalogs, discovered_schemas_per_catalog)
|
|
46
46
|
|
|
@@ -50,7 +50,7 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
|
|
|
50
50
|
if not schemas_to_introspect:
|
|
51
51
|
continue
|
|
52
52
|
|
|
53
|
-
with self.
|
|
53
|
+
with self._connect(file_config, catalog=catalog) as catalog_connection:
|
|
54
54
|
introspected_schemas = self.collect_catalog_model(catalog_connection, catalog, schemas_to_introspect)
|
|
55
55
|
|
|
56
56
|
if not introspected_schemas:
|
|
@@ -74,9 +74,9 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
|
|
|
74
74
|
if self.supports_catalogs:
|
|
75
75
|
sql = "SELECT catalog_name, schema_name FROM information_schema.schemata WHERE catalog_name = ANY(%s)"
|
|
76
76
|
return SQLQuery(sql, (catalogs,))
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
77
|
+
|
|
78
|
+
sql = "SELECT schema_name FROM information_schema.schemata"
|
|
79
|
+
return SQLQuery(sql, None)
|
|
80
80
|
|
|
81
81
|
def _list_schemas_for_catalog(self, connection: Any, catalog: str) -> list[str]:
|
|
82
82
|
sql_query = self._sql_list_schemas([catalog] if self.supports_catalogs else None)
|
|
@@ -108,7 +108,12 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
|
|
|
108
108
|
return samples
|
|
109
109
|
|
|
110
110
|
@abstractmethod
|
|
111
|
-
def _connect(self, file_config: T):
|
|
111
|
+
def _connect(self, file_config: T, *, catalog: str | None = None) -> Any:
|
|
112
|
+
"""Connect to the database.
|
|
113
|
+
|
|
114
|
+
If the `catalog` argument is provided, the connection is "scoped" to that catalog. For engines that don’t need a new connection,
|
|
115
|
+
return a connection with the session set/USE’d to that catalog.
|
|
116
|
+
"""
|
|
112
117
|
raise NotImplementedError
|
|
113
118
|
|
|
114
119
|
@abstractmethod
|
|
@@ -119,12 +124,6 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
|
|
|
119
124
|
def _get_catalogs(self, connection, file_config: T) -> list[str]:
|
|
120
125
|
raise NotImplementedError
|
|
121
126
|
|
|
122
|
-
@abstractmethod
|
|
123
|
-
def _connect_to_catalog(self, file_config: T, catalog: str):
|
|
124
|
-
"""Return a connection scoped to `catalog`. For engines that
|
|
125
|
-
don’t need a new connection, return a connection with the
|
|
126
|
-
session set/USE’d to that catalog."""
|
|
127
|
-
|
|
128
127
|
def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
|
|
129
128
|
raise NotImplementedError
|
|
130
129
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
|
|
2
|
-
from databao_context_engine.plugins.databases.clickhouse_introspector import (
|
|
1
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabasePlugin
|
|
2
|
+
from databao_context_engine.plugins.databases.clickhouse.clickhouse_introspector import (
|
|
3
3
|
ClickhouseConfigFile,
|
|
4
4
|
ClickhouseIntrospector,
|
|
5
5
|
)
|
|
@@ -8,7 +8,7 @@ from databao_context_engine.plugins.databases.clickhouse_introspector import (
|
|
|
8
8
|
class ClickhouseDbPlugin(BaseDatabasePlugin[ClickhouseConfigFile]):
|
|
9
9
|
id = "jetbrains/clickhouse"
|
|
10
10
|
name = "Clickhouse DB Plugin"
|
|
11
|
-
supported = {"
|
|
11
|
+
supported = {"clickhouse"}
|
|
12
12
|
config_file_type = ClickhouseConfigFile
|
|
13
13
|
|
|
14
14
|
def __init__(self):
|
|
@@ -1,21 +1,34 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Annotated, Any
|
|
4
4
|
|
|
5
5
|
import clickhouse_connect
|
|
6
|
-
from pydantic import Field
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
7
|
|
|
8
|
-
from databao_context_engine.
|
|
8
|
+
from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
|
|
9
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
|
|
9
10
|
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
|
|
10
11
|
from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
|
|
11
12
|
from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
|
|
12
13
|
|
|
13
14
|
|
|
15
|
+
class ClickhouseConnectionProperties(BaseModel):
    # Typed ClickHouse connection settings.
    # Documented with `#` comments on purpose: a class docstring on a pydantic
    # model is emitted as the "description" of its generated JSON schema.

    # No Python-level default, so pydantic treats `host` as required; the
    # "localhost" value lives only in the ConfigPropertyAnnotation metadata.
    host: Annotated[str, ConfigPropertyAnnotation(default_value="localhost", required=True)]
    port: int | None = None
    database: str | None = None
    username: str | None = None
    # Required (no default) and flagged as a secret in the annotation metadata.
    password: Annotated[str, ConfigPropertyAnnotation(secret=True)]
    # Free-form extra client arguments; merged over the typed fields below.
    additional_properties: dict[str, Any] = {}

    def to_clickhouse_kwargs(self) -> dict[str, Any]:
        """Flatten these settings into keyword arguments for the ClickHouse client.

        Typed fields whose value is ``None`` are left out. Entries from
        ``additional_properties`` are merged last, so on a key clash they
        replace the typed value.
        """
        typed_fields = self.model_dump(exclude={"additional_properties"}, exclude_none=True)
        return {**typed_fields, **self.additional_properties}
|
|
27
|
+
|
|
28
|
+
|
|
14
29
|
class ClickhouseConfigFile(BaseDatabaseConfigFile):
    # Datasource config file model for ClickHouse.
    # Kept as `#` comments rather than a docstring: a class docstring on a
    # pydantic model is emitted as the "description" of its JSON schema.

    # Datasource type string; defaults to "clickhouse".
    type: str = Field(default="clickhouse")
    # Required, typed connection settings (ClickhouseConnectionProperties).
    connection: ClickhouseConnectionProperties
|
|
19
32
|
|
|
20
33
|
|
|
21
34
|
class ClickhouseIntrospector(BaseIntrospector[ClickhouseConfigFile]):
|
|
@@ -23,15 +36,10 @@ class ClickhouseIntrospector(BaseIntrospector[ClickhouseConfigFile]):
|
|
|
23
36
|
|
|
24
37
|
supports_catalogs = True
|
|
25
38
|
|
|
26
|
-
def _connect(self, file_config: ClickhouseConfigFile):
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
return clickhouse_connect.get_client(**connection)
|
|
32
|
-
|
|
33
|
-
def _connect_to_catalog(self, file_config: ClickhouseConfigFile, catalog: str):
|
|
34
|
-
return self._connect(file_config)
|
|
39
|
+
def _connect(self, file_config: ClickhouseConfigFile, *, catalog: str | None = None):
    """Open a ClickHouse client from the datasource's connection settings.

    ``catalog`` is accepted but not used here: the client is built solely
    from ``file_config.connection``.
    """
    client_kwargs = file_config.connection.to_clickhouse_kwargs()
    return clickhouse_connect.get_client(**client_kwargs)
|
|
35
43
|
|
|
36
44
|
def _get_catalogs(self, connection, file_config: ClickhouseConfigFile) -> list[str]:
|
|
37
45
|
return ["clickhouse"]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabasePlugin
|
|
2
|
+
from databao_context_engine.plugins.databases.duckdb.duckdb_introspector import DuckDBConfigFile, DuckDBIntrospector
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DuckDbPlugin(BaseDatabasePlugin[DuckDBConfigFile]):
    """Database plugin for DuckDB, backed by ``DuckDBIntrospector``.

    NOTE(review): ``supported`` presumably lists the datasource ``type``
    values this plugin handles ("duckdb" matches the default ``type`` of
    ``DuckDBConfigFile``) - confirm against the plugin loader.
    """

    id = "jetbrains/duckdb"
    name = "DuckDB Plugin"
    supported = {"duckdb"}
    config_file_type = DuckDBConfigFile

    def __init__(self):
        # The base plugin receives the introspector that does the actual work.
        introspector = DuckDBIntrospector()
        super().__init__(introspector)
|
databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py}
RENAMED
|
@@ -3,19 +3,20 @@ from __future__ import annotations
|
|
|
3
3
|
import duckdb
|
|
4
4
|
from pydantic import BaseModel, Field
|
|
5
5
|
|
|
6
|
-
from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
|
|
6
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
|
|
7
7
|
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
|
|
8
8
|
from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
|
|
9
9
|
from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
|
|
10
|
+
from databao_context_engine.plugins.duckdb_tools import fetchall_dicts
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
class DuckDBConfigFile(BaseDatabaseConfigFile):
    # Datasource config file model for DuckDB.
    # Kept as `#` comments rather than a docstring: a class docstring on a
    # pydantic model is emitted as the "description" of its JSON schema.

    # Datasource type string; defaults to "duckdb".
    type: str = Field(default="duckdb")
    # Required connection settings; DuckDBConnectionConfig is declared later
    # in the module, which the file's postponed annotations allow.
    connection: DuckDBConnectionConfig
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
class DuckDBConnectionConfig(BaseModel):
    # Connection settings for a DuckDB datasource.
    # Kept as `#` comments rather than a docstring: a class docstring on a
    # pydantic model is emitted as the "description" of its JSON schema.

    # Required; the introspector passes this value to duckdb.connect(database=...).
    database_path: str = Field(description="Path to the DuckDB database file")
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class DuckDBIntrospector(BaseIntrospector[DuckDBConfigFile]):
|
|
@@ -23,13 +24,10 @@ class DuckDBIntrospector(BaseIntrospector[DuckDBConfigFile]):
|
|
|
23
24
|
_IGNORED_SCHEMAS = {"information_schema", "pg_catalog"}
|
|
24
25
|
supports_catalogs = True
|
|
25
26
|
|
|
26
|
-
def _connect(self, file_config: DuckDBConfigFile):
|
|
27
|
-
database_path = str(file_config.connection.
|
|
27
|
+
def _connect(self, file_config: DuckDBConfigFile, *, catalog: str | None = None):
    """Open a DuckDB connection on the configured database file.

    ``catalog`` is accepted but not used here; the connection is opened on
    ``file_config.connection.database_path`` regardless of its value.
    """
    return duckdb.connect(database=str(file_config.connection.database_path))
|
|
29
30
|
|
|
30
|
-
def _connect_to_catalog(self, file_config: DuckDBConfigFile, catalog: str):
|
|
31
|
-
return self._connect(file_config)
|
|
32
|
-
|
|
33
31
|
def _get_catalogs(self, connection, file_config: DuckDBConfigFile) -> list[str]:
|
|
34
32
|
rows = self._fetchall_dicts(connection, "SELECT database_name FROM duckdb_databases();", None)
|
|
35
33
|
catalogs = [r["database_name"] for r in rows if r.get("database_name")]
|
|
@@ -319,7 +317,4 @@ class DuckDBIntrospector(BaseIntrospector[DuckDBConfigFile]):
|
|
|
319
317
|
|
|
320
318
|
def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
    """Delegate a query to the shared ``fetchall_dicts`` helper on a new cursor.

    A fresh cursor is taken from ``connection`` on every call and handed to
    the helper together with ``sql`` and ``params``.

    NOTE(review): the helper presumably executes the statement and maps each
    row to a dict keyed by column name - confirm in ``duckdb_tools``.
    """
    return fetchall_dicts(connection.cursor(), sql, params)
|
|
@@ -5,13 +5,13 @@ from databao_context_engine.plugins.databases.databases_types import (
|
|
|
5
5
|
CheckConstraint,
|
|
6
6
|
DatabaseColumn,
|
|
7
7
|
DatabasePartitionInfo,
|
|
8
|
+
DatabaseSchema,
|
|
8
9
|
DatabaseTable,
|
|
9
10
|
DatasetKind,
|
|
10
11
|
ForeignKey,
|
|
11
12
|
ForeignKeyColumnMap,
|
|
12
13
|
Index,
|
|
13
14
|
KeyConstraint,
|
|
14
|
-
DatabaseSchema,
|
|
15
15
|
)
|
|
16
16
|
|
|
17
17
|
|