databao-context-engine 0.1.1__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. databao_context_engine/__init__.py +32 -7
  2. databao_context_engine/build_sources/__init__.py +4 -0
  3. databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +31 -27
  4. databao_context_engine/build_sources/build_service.py +53 -0
  5. databao_context_engine/build_sources/build_wiring.py +82 -0
  6. databao_context_engine/build_sources/export_results.py +41 -0
  7. databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +11 -18
  8. databao_context_engine/cli/add_datasource_config.py +49 -44
  9. databao_context_engine/cli/commands.py +40 -55
  10. databao_context_engine/cli/info.py +3 -2
  11. databao_context_engine/databao_context_engine.py +127 -0
  12. databao_context_engine/databao_context_project_manager.py +147 -30
  13. databao_context_engine/{datasource_config → datasources}/check_config.py +31 -23
  14. databao_context_engine/datasources/datasource_context.py +90 -0
  15. databao_context_engine/datasources/datasource_discovery.py +143 -0
  16. databao_context_engine/datasources/types.py +194 -0
  17. databao_context_engine/generate_configs_schemas.py +4 -5
  18. databao_context_engine/init_project.py +25 -3
  19. databao_context_engine/introspection/property_extract.py +76 -57
  20. databao_context_engine/llm/__init__.py +10 -0
  21. databao_context_engine/llm/api.py +57 -0
  22. databao_context_engine/llm/descriptions/ollama.py +1 -3
  23. databao_context_engine/llm/errors.py +2 -8
  24. databao_context_engine/llm/factory.py +5 -2
  25. databao_context_engine/llm/install.py +26 -30
  26. databao_context_engine/llm/runtime.py +3 -5
  27. databao_context_engine/llm/service.py +1 -3
  28. databao_context_engine/mcp/mcp_runner.py +4 -2
  29. databao_context_engine/mcp/mcp_server.py +9 -11
  30. databao_context_engine/plugin_loader.py +110 -0
  31. databao_context_engine/pluginlib/build_plugin.py +12 -29
  32. databao_context_engine/pluginlib/config.py +16 -2
  33. databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
  34. databao_context_engine/plugins/databases/athena/athena_introspector.py +161 -0
  35. databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +6 -5
  36. databao_context_engine/plugins/databases/base_introspector.py +11 -12
  37. databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
  38. databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +24 -16
  39. databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
  40. databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +7 -12
  41. databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
  42. databao_context_engine/plugins/databases/introspection_scope.py +11 -9
  43. databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
  44. databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
  45. databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +29 -21
  46. databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
  47. databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +26 -15
  48. databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
  49. databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
  50. databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +11 -18
  51. databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
  52. databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
  53. databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +49 -17
  54. databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
  55. databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
  56. databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
  57. databao_context_engine/plugins/duckdb_tools.py +18 -0
  58. databao_context_engine/plugins/files/__init__.py +0 -0
  59. databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
  60. databao_context_engine/plugins/plugin_loader.py +58 -52
  61. databao_context_engine/plugins/resources/parquet_introspector.py +8 -20
  62. databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
  63. databao_context_engine/project/info.py +34 -2
  64. databao_context_engine/project/init_project.py +16 -7
  65. databao_context_engine/project/layout.py +14 -15
  66. databao_context_engine/retrieve_embeddings/__init__.py +3 -0
  67. databao_context_engine/retrieve_embeddings/retrieve_runner.py +17 -0
  68. databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +12 -19
  69. databao_context_engine/retrieve_embeddings/retrieve_wiring.py +46 -0
  70. databao_context_engine/serialization/__init__.py +0 -0
  71. databao_context_engine/{serialisation → serialization}/yaml.py +6 -6
  72. databao_context_engine/services/chunk_embedding_service.py +23 -11
  73. databao_context_engine/services/factories.py +1 -46
  74. databao_context_engine/services/persistence_service.py +11 -11
  75. databao_context_engine/storage/connection.py +11 -7
  76. databao_context_engine/storage/exceptions/exceptions.py +2 -2
  77. databao_context_engine/storage/migrate.py +3 -5
  78. databao_context_engine/storage/migrations/V01__init.sql +6 -31
  79. databao_context_engine/storage/models.py +2 -23
  80. databao_context_engine/storage/repositories/chunk_repository.py +16 -12
  81. databao_context_engine/storage/repositories/factories.py +1 -12
  82. databao_context_engine/storage/repositories/vector_search_repository.py +23 -16
  83. databao_context_engine/system/properties.py +4 -2
  84. databao_context_engine-0.1.5.dist-info/METADATA +228 -0
  85. databao_context_engine-0.1.5.dist-info/RECORD +135 -0
  86. {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/WHEEL +1 -1
  87. databao_context_engine/build_sources/internal/build_service.py +0 -77
  88. databao_context_engine/build_sources/internal/build_wiring.py +0 -52
  89. databao_context_engine/build_sources/internal/export_results.py +0 -43
  90. databao_context_engine/build_sources/public/api.py +0 -4
  91. databao_context_engine/databao_engine.py +0 -85
  92. databao_context_engine/datasource_config/add_config.py +0 -50
  93. databao_context_engine/datasource_config/datasource_context.py +0 -60
  94. databao_context_engine/mcp/all_results_tool.py +0 -5
  95. databao_context_engine/mcp/retrieve_tool.py +0 -22
  96. databao_context_engine/plugins/databases/athena_introspector.py +0 -101
  97. databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
  98. databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
  99. databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
  100. databao_context_engine/project/datasource_discovery.py +0 -141
  101. databao_context_engine/project/runs.py +0 -39
  102. databao_context_engine/project/types.py +0 -134
  103. databao_context_engine/retrieve_embeddings/internal/export_results.py +0 -12
  104. databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +0 -34
  105. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
  106. databao_context_engine/retrieve_embeddings/public/api.py +0 -3
  107. databao_context_engine/services/run_name_policy.py +0 -8
  108. databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
  109. databao_context_engine/storage/repositories/run_repository.py +0 -157
  110. databao_context_engine-0.1.1.dist-info/METADATA +0 -186
  111. databao_context_engine-0.1.1.dist-info/RECORD +0 -135
  112. /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
  113. /databao_context_engine/{build_sources/public → plugins/databases/athena}/__init__.py +0 -0
  114. /databao_context_engine/{datasource_config → plugins/databases/clickhouse}/__init__.py +0 -0
  115. /databao_context_engine/{retrieve_embeddings/internal → plugins/databases/duckdb}/__init__.py +0 -0
  116. /databao_context_engine/{retrieve_embeddings/public → plugins/databases/mssql}/__init__.py +0 -0
  117. /databao_context_engine/{serialisation → plugins/databases/mysql}/__init__.py +0 -0
  118. {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,110 @@
+ from databao_context_engine.introspection.property_extract import get_property_list_from_type
+ from databao_context_engine.pluginlib.build_plugin import (
+     BuildDatasourcePlugin,
+     BuildFilePlugin,
+     BuildPlugin,
+     DatasourceType,
+ )
+ from databao_context_engine.pluginlib.config import ConfigPropertyDefinition, CustomiseConfigProperties
+ from databao_context_engine.plugins.plugin_loader import (
+     load_plugins,
+ )
+
+
+ class DatabaoContextPluginLoader:
+     """Loader for plugins installed in the current environment."""
+
+     def __init__(self, plugins_by_type: dict[DatasourceType, BuildPlugin] | None = None):
+         """Initialize the DatabaoContextEngine.
+
+         Args:
+             plugins_by_type: Override the list of plugins loaded from the environment.
+                 Typical usage should not provide this argument and leave it as None.
+         """
+         self._all_plugins_by_type = load_plugins() if plugins_by_type is None else plugins_by_type
+
+     def get_all_supported_datasource_types(self, exclude_file_plugins: bool = False) -> set[DatasourceType]:
+         """Return the list of all supported datasource types.
+
+         Args:
+             exclude_file_plugins: If True, do not return datasource types from plugins that deal with raw files.
+
+         Returns:
+             A set of all DatasourceType supported in the current installation environment.
+         """
+         if exclude_file_plugins:
+             return {
+                 datasource_type
+                 for (datasource_type, plugin) in self._all_plugins_by_type.items()
+                 if not isinstance(plugin, BuildFilePlugin)
+             }
+
+         return set(self._all_plugins_by_type.keys())
+
+     def get_plugin_for_datasource_type(self, datasource_type: DatasourceType) -> BuildPlugin:
+         """Return the plugin able to build a context for the given datasource type.
+
+         Args:
+             datasource_type: The type of datasource for which to retrieve the plugin.
+
+         Returns:
+             The plugin able to build a context for the given datasource type.
+
+         Raises:
+             ValueError: If no plugin is found for the given datasource type.
+         """
+         if datasource_type not in self._all_plugins_by_type:
+             raise ValueError(f"No plugin found for type '{datasource_type.full_type}'")
+
+         return self._all_plugins_by_type[datasource_type]
+
+     def get_config_file_type_for_datasource_type(self, datasource_type: DatasourceType) -> type:
+         """Return the type of the config file for the given datasource type.
+
+         Args:
+             datasource_type: The type of datasource for which to retrieve the config file type.
+
+         Returns:
+             The type of the config file for the given datasource type.
+
+         Raises:
+             ValueError: If no plugin is found for the given datasource type.
+             ValueError: If the plugin does not support config files.
+         """
+         plugin = self.get_plugin_for_datasource_type(datasource_type)
+
+         if isinstance(plugin, BuildDatasourcePlugin):
+             return plugin.config_file_type
+
+         raise ValueError(
+             f'Impossible to get a config file type for datasource type "{datasource_type.full_type}". The corresponding plugin is a {type(plugin).__name__} but should be a BuildDatasourcePlugin'
+         )
+
+     def get_config_file_structure_for_datasource_type(
+         self, datasource_type: DatasourceType
+     ) -> list[ConfigPropertyDefinition]:
+         """Return the property structure of the config file for the given datasource type.
+
+         This can be used to generate a form for the user to fill in the config file.
+
+         Args:
+             datasource_type: The type of datasource for which to retrieve the config file structure.
+
+         Returns:
+             The structure of the config file for the given datasource type.
+             This structure is a list of ConfigPropertyDefinition objects.
+             Each object in the list describes a property of the config file and its potential nested properties.
+
+         Raises:
+             ValueError: If no plugin is found for the given datasource type.
+             ValueError: If the plugin does not support config files.
+         """
+         plugin = self.get_plugin_for_datasource_type(datasource_type)
+
+         if isinstance(plugin, CustomiseConfigProperties):
+             return plugin.get_config_file_properties()
+         if isinstance(plugin, BuildDatasourcePlugin):
+             return get_property_list_from_type(plugin.config_file_type)
+         raise ValueError(
+             f'Impossible to create a config for datasource type "{datasource_type.full_type}". The corresponding plugin is a {type(plugin).__name__} but should be a BuildDatasourcePlugin or CustomiseConfigProperties'
+         )
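A minimal usage sketch of the loader added above (assuming, per the file list, that this class lives in databao_context_engine/plugin_loader.py; the sorting and printing are illustrative):

    from databao_context_engine.plugin_loader import DatabaoContextPluginLoader

    loader = DatabaoContextPluginLoader()  # discovers plugins installed in the current environment
    for dt in sorted(loader.get_all_supported_datasource_types(exclude_file_plugins=True), key=lambda t: t.full_type):
        plugin = loader.get_plugin_for_datasource_type(dt)
        print(dt.full_type, type(plugin).__name__)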
@@ -5,9 +5,7 @@ from typing import Any, Protocol, runtime_checkable
  
  @dataclass
  class EmbeddableChunk:
-     """
-     A chunk that will be embedded as a vector and used when searching context from a given AI prompt
-     """
+     """A chunk that will be embedded as a vector and used when searching context from a given AI prompt."""
  
      embeddable_text: str
      """
@@ -48,19 +46,19 @@ class BuildDatasourcePlugin[T](BaseBuildPlugin, Protocol):
      """
  
      def check_connection(self, full_type: str, datasource_name: str, file_config: T) -> None:
-         """
-         Checks whether the configuration to the datasource is working.
+         """Check whether the configuration to the datasource is working.
  
          The function is expected to succeed without a result if the connection is working.
          If something is wrong with the connection, the function should raise an Exception
+
+         Raises:
+             NotSupportedError: If the plugin doesn't support this method.
          """
          raise NotSupportedError("This method is not implemented for this plugin")
  
  
  class DefaultBuildDatasourcePlugin(BuildDatasourcePlugin[dict[str, Any]], Protocol):
-     """
-     Use this as a base class for plugins that don't need a specific config file type.
-     """
+     """Use this as a base class for plugins that don't need a specific config file type."""
  
      config_file_type: type[dict[str, Any]] = dict[str, Any]
  
@@ -75,7 +73,7 @@ class BuildFilePlugin(BaseBuildPlugin, Protocol):
  
  
  class NotSupportedError(RuntimeError):
-     """Exception raised by methods not supported by a plugin"""
+     """Exception raised by methods not supported by a plugin."""
  
  
  BuildPlugin = BuildDatasourcePlugin | BuildFilePlugin
@@ -83,25 +81,10 @@ BuildPlugin = BuildDatasourcePlugin | BuildFilePlugin
  
  @dataclass(kw_only=True, frozen=True)
  class DatasourceType:
-     full_type: str
-
-     def __post_init__(self):
-         type_segments = self.full_type.split("/")
-         if len(type_segments) != 2:
-             raise ValueError(f"Invalid DatasourceType: {self.full_type}")
-
-     @property
-     def main_type(self) -> str:
-         return self.full_type.split("/")[0]
+     """The type of a Datasource.
  
-     @property
-     def config_folder(self) -> str:
-         return self.main_type
-
-     @property
-     def subtype(self) -> str:
-         return self.full_type.split("/")[1]
+     Attributes:
+         full_type: The full type of the datasource, in the format `<main_type>/<subtype>`.
+     """
  
-     @staticmethod
-     def from_main_and_subtypes(main_type: str, subtype: str) -> "DatasourceType":
-         return DatasourceType(full_type=f"{main_type}/{subtype}")
+     full_type: str
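With the helper properties removed, DatasourceType is now a plain frozen value object; callers that previously used main_type/subtype parse full_type themselves. A tiny illustrative sketch (the type string is an example, not taken from the package):

    dt = DatasourceType(full_type="databases/duckdb")
    main_type, subtype = dt.full_type.split("/")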
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  from dataclasses import dataclass
  from typing import Any, Protocol, runtime_checkable
  
@@ -17,12 +19,23 @@ class DuckDBSecret(BaseModel):
  
  
  @dataclass(kw_only=True)
- class ConfigPropertyDefinition:
+ class ConfigUnionPropertyDefinition:
+     property_key: str
+     types: tuple[type, ...]
+     type_properties: dict[type, list[ConfigPropertyDefinition]]
+
+
+ @dataclass(kw_only=True)
+ class ConfigSinglePropertyDefinition:
      property_key: str
      required: bool
      property_type: type | None = str
      default_value: str | None = None
-     nested_properties: list["ConfigPropertyDefinition"] | None = None
+     nested_properties: list[ConfigPropertyDefinition] | None = None
+     secret: bool = False
+
+
+ ConfigPropertyDefinition = ConfigSinglePropertyDefinition | ConfigUnionPropertyDefinition
  
  
  @dataclass(kw_only=True)
@@ -30,6 +43,7 @@ class ConfigPropertyAnnotation:
      required: bool = False
      default_value: str | None = None
      ignored_for_config_wizard: bool = False
+     secret: bool = False
  
  
  @runtime_checkable
@@ -1,11 +1,11 @@
- from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
- from databao_context_engine.plugins.databases.athena_introspector import AthenaConfigFile, AthenaIntrospector
+ from databao_context_engine.plugins.databases.athena.athena_introspector import AthenaConfigFile, AthenaIntrospector
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabasePlugin
  
  
  class AthenaDbPlugin(BaseDatabasePlugin[AthenaConfigFile]):
      id = "jetbrains/athena"
      name = "Athena DB Plugin"
-     supported = {"databases/athena"}
+     supported = {"athena"}
      config_file_type = AthenaConfigFile
  
      def __init__(self):
@@ -0,0 +1,161 @@
+ from __future__ import annotations
+
+ from typing import Annotated, Any
+
+ from pyathena import connect
+ from pyathena.cursor import DictCursor
+ from pydantic import BaseModel, Field
+
+ from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
+ from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
+ from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
+ from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
+
+
+ class AwsProfileAuth(BaseModel):
+     profile_name: str
+
+
+ class AwsIamAuth(BaseModel):
+     aws_access_key_id: Annotated[str, ConfigPropertyAnnotation(secret=True)]
+     aws_secret_access_key: Annotated[str, ConfigPropertyAnnotation(secret=True)]
+     session_token: str | None = None
+
+
+ class AwsAssumeRoleAuth(BaseModel):
+     role_arn: str | None = None
+     role_session_name: str | None = None
+     source_profile: str | None = None
+
+
+ class AwsDefaultAuth(BaseModel):
+     # Uses environment variables, instance profile, ECS task role
+     pass
+
+
+ class AthenaConnectionProperties(BaseModel):
+     region_name: str
+     schema_name: str = "default"
+     catalog: str | None = "awsdatacatalog"
+     work_group: str | None = None
+     s3_staging_dir: str | None = None
+     auth: AwsIamAuth | AwsProfileAuth | AwsDefaultAuth | AwsAssumeRoleAuth
+     additional_properties: dict[str, Any] = {}
+
+     def to_athena_kwargs(self) -> dict[str, Any]:
+         kwargs = self.model_dump(
+             exclude={
+                 "additional_properties": True,
+             },
+             exclude_none=True,
+         )
+         auth_fields = kwargs.pop("auth", {})
+         kwargs.update(auth_fields)
+         kwargs.update(self.additional_properties)
+         return kwargs
+
+
+ class AthenaConfigFile(BaseDatabaseConfigFile):
+     type: str = Field(default="athena")
+     connection: AthenaConnectionProperties
+
+
+ class AthenaIntrospector(BaseIntrospector[AthenaConfigFile]):
+     _IGNORED_SCHEMAS = {
+         "information_schema",
+     }
+     supports_catalogs = True
+
+     def _connect(self, file_config: AthenaConfigFile, *, catalog: str | None = None) -> Any:
+         return connect(**file_config.connection.to_athena_kwargs(), cursor_class=DictCursor)
+
+     def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
+         with connection.cursor() as cur:
+             cur.execute(sql, params or {})
+             return cur.fetchall()
+
+     def _get_catalogs(self, connection, file_config: AthenaConfigFile) -> list[str]:
+         catalog = file_config.connection.catalog or self._resolve_pseudo_catalog_name(file_config)
+         return [catalog]
+
+     def _sql_list_schemas(self, catalogs: list[str] | None) -> SQLQuery:
+         if not catalogs:
+             return SQLQuery("SELECT schema_name, catalog_name FROM information_schema.schemata", None)
+         catalog = catalogs[0]
+         sql = f"SELECT schema_name, catalog_name FROM {catalog}.information_schema.schemata"
+         return SQLQuery(sql, None)
+
+     def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
+         if not schemas:
+             return []
+
+         comps = self._component_queries(catalog, schemas)
+         results: dict[str, list[dict]] = {}
+
+         for name, q in comps.items():
+             results[name] = self._fetchall_dicts(connection, q, None)
+
+         return IntrospectionModelBuilder.build_schemas_from_components(
+             schemas=schemas,
+             rels=results.get("relations", []),
+             cols=results.get("columns", []),
+             pk_cols=[],
+             uq_cols=[],
+             checks=[],
+             fk_cols=[],
+             idx_cols=[],
+         )
+
+     def _component_queries(self, catalog: str, schemas: list[str]) -> dict[str, str]:
+         schemas_in = ", ".join(self._quote_literal(s) for s in schemas)
+         return {
+             "relations": self._sql_relations(catalog, schemas_in),
+             "columns": self._sql_columns(catalog, schemas_in),
+         }
+
+     def _sql_relations(self, catalog: str, schemas_in: str) -> str:
+         return f"""
+             SELECT
+                 table_schema AS schema_name,
+                 table_name,
+                 CASE table_type
+                     WHEN 'BASE TABLE' THEN 'table'
+                     WHEN 'VIEW' THEN 'view'
+                     ELSE LOWER(table_type)
+                 END AS kind,
+                 NULL AS description
+             FROM
+                 {catalog}.information_schema.tables
+             WHERE
+                 table_schema IN ({schemas_in})
+         """
+
+     def _sql_columns(self, catalog: str, schemas_in: str) -> str:
+         return f"""
+             SELECT
+                 table_schema AS schema_name,
+                 table_name,
+                 column_name,
+                 ordinal_position,
+                 data_type,
+                 is_nullable
+             FROM
+                 {catalog}.information_schema.columns
+             WHERE
+                 table_schema IN ({schemas_in})
+             ORDER BY
+                 table_schema,
+                 table_name,
+                 ordinal_position
+         """
+
+     def _resolve_pseudo_catalog_name(self, file_config: AthenaConfigFile) -> str:
+         return "awsdatacatalog"
+
+     def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
+         sql = f'SELECT * FROM "{schema}"."{table}" LIMIT %(limit)s'
+         return SQLQuery(sql, {"limit": limit})
+
+     def _quote_literal(self, value: str) -> str:
+         return "'" + str(value).replace("'", "''") + "'"
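A sketch of how the connection model above flattens into pyathena connect() keyword arguments (the region, bucket, and profile values are placeholders):

    from databao_context_engine.plugins.databases.athena.athena_introspector import (
        AthenaConnectionProperties,
        AwsProfileAuth,
    )

    props = AthenaConnectionProperties(
        region_name="eu-west-1",
        s3_staging_dir="s3://example-bucket/athena-results/",
        auth=AwsProfileAuth(profile_name="analytics"),
    )
    # to_athena_kwargs() drops additional_properties and None fields, then merges the
    # auth sub-model into the top level, e.g.:
    # {"region_name": "eu-west-1", "schema_name": "default", "catalog": "awsdatacatalog",
    #  "s3_staging_dir": "s3://example-bucket/athena-results/", "profile_name": "analytics"}
    print(props.to_athena_kwargs())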
@@ -1,6 +1,6 @@
  from __future__ import annotations
  
- from typing import Any, TypeVar
+ from typing import Annotated, Any, TypeVar
  
  from pydantic import BaseModel, ConfigDict, Field
  
@@ -8,6 +8,7 @@ from databao_context_engine.pluginlib.build_plugin import (
      BuildDatasourcePlugin,
      EmbeddableChunk,
  )
+ from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
  from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector
  from databao_context_engine.plugins.databases.database_chunker import build_database_chunks
  from databao_context_engine.plugins.databases.introspection_scope import IntrospectionScope
@@ -17,7 +18,9 @@ class BaseDatabaseConfigFile(BaseModel):
      model_config = ConfigDict(populate_by_name=True)
      name: str | None = Field(default=None)
      type: str
-     introspection_scope: IntrospectionScope | None = Field(default=None, alias="introspection-scope")
+     introspection_scope: Annotated[
+         IntrospectionScope | None, ConfigPropertyAnnotation(ignored_for_config_wizard=True)
+     ] = Field(default=None, alias="introspection-scope")
  
  
  T = TypeVar("T", bound=BaseDatabaseConfigFile)
@@ -34,9 +37,7 @@ class BaseDatabasePlugin(BuildDatasourcePlugin[T]):
          return self.supported
  
      def build_context(self, full_type: str, datasource_name: str, file_config: T) -> Any:
-         introspection_result = self._introspector.introspect_database(file_config)
-
-         return introspection_result
+         return self._introspector.introspect_database(file_config)
  
      def check_connection(self, full_type: str, datasource_name: str, file_config: T) -> None:
          self._introspector.check_connection(file_config)
@@ -40,7 +40,7 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
  
          discovered_schemas_per_catalog: dict[str, list[str]] = {}
          for catalog in catalogs:
-             with self._connect_to_catalog(file_config, catalog) as conn:
+             with self._connect(file_config, catalog=catalog) as conn:
                  discovered_schemas_per_catalog[catalog] = self._list_schemas_for_catalog(conn, catalog)
          scope = scope_matcher.filter_scopes(catalogs, discovered_schemas_per_catalog)
  
@@ -50,7 +50,7 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
              if not schemas_to_introspect:
                  continue
  
-             with self._connect_to_catalog(file_config, catalog) as catalog_connection:
+             with self._connect(file_config, catalog=catalog) as catalog_connection:
                  introspected_schemas = self.collect_catalog_model(catalog_connection, catalog, schemas_to_introspect)
  
              if not introspected_schemas:
@@ -74,9 +74,9 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
          if self.supports_catalogs:
              sql = "SELECT catalog_name, schema_name FROM information_schema.schemata WHERE catalog_name = ANY(%s)"
              return SQLQuery(sql, (catalogs,))
-         else:
-             sql = "SELECT schema_name FROM information_schema.schemata"
-             return SQLQuery(sql, None)
+
+         sql = "SELECT schema_name FROM information_schema.schemata"
+         return SQLQuery(sql, None)
  
      def _list_schemas_for_catalog(self, connection: Any, catalog: str) -> list[str]:
          sql_query = self._sql_list_schemas([catalog] if self.supports_catalogs else None)
@@ -108,7 +108,12 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
          return samples
  
      @abstractmethod
-     def _connect(self, file_config: T):
+     def _connect(self, file_config: T, *, catalog: str | None = None) -> Any:
+         """Connect to the database.
+
+         If the `catalog` argument is provided, the connection is "scoped" to that catalog. For engines that don’t need a new connection,
+         return a connection with the session set/USE’d to that catalog.
+         """
          raise NotImplementedError
  
      @abstractmethod
@@ -119,12 +124,6 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
  
      def _get_catalogs(self, connection, file_config: T) -> list[str]:
          raise NotImplementedError
-     @abstractmethod
-     def _connect_to_catalog(self, file_config: T, catalog: str):
-         """Return a connection scoped to `catalog`. For engines that
-         don’t need a new connection, return a connection with the
-         session set/USE’d to that catalog."""
-
      def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
          raise NotImplementedError
  
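A hypothetical implementation of the consolidated `_connect(..., *, catalog=None)` hook that replaces `_connect_to_catalog` above (ExampleConfigFile and example_driver are illustrative names, not part of the package):

    class ExampleIntrospector(BaseIntrospector[ExampleConfigFile]):
        supports_catalogs = True

        def _connect(self, file_config: ExampleConfigFile, *, catalog: str | None = None):
            conn = example_driver.connect(**file_config.connection.model_dump())
            if catalog is not None:
                # Scope the existing session to the requested catalog instead of
                # opening a dedicated per-catalog connection.
                conn.execute(f"USE {catalog}")
            return conn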
@@ -1,5 +1,5 @@
- from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
- from databao_context_engine.plugins.databases.clickhouse_introspector import (
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabasePlugin
+ from databao_context_engine.plugins.databases.clickhouse.clickhouse_introspector import (
      ClickhouseConfigFile,
      ClickhouseIntrospector,
  )
@@ -8,7 +8,7 @@ from databao_context_engine.plugins.databases.clickhouse_introspector import (
  class ClickhouseDbPlugin(BaseDatabasePlugin[ClickhouseConfigFile]):
      id = "jetbrains/clickhouse"
      name = "Clickhouse DB Plugin"
-     supported = {"databases/clickhouse"}
+     supported = {"clickhouse"}
      config_file_type = ClickhouseConfigFile
  
      def __init__(self):
@@ -1,21 +1,34 @@
  from __future__ import annotations
  
- from typing import Any, Mapping
+ from typing import Annotated, Any
  
  import clickhouse_connect
- from pydantic import Field
+ from pydantic import BaseModel, Field
  
- from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
+ from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
  from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
  from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
  from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
  
  
+ class ClickhouseConnectionProperties(BaseModel):
+     host: Annotated[str, ConfigPropertyAnnotation(default_value="localhost", required=True)]
+     port: int | None = None
+     database: str | None = None
+     username: str | None = None
+     password: Annotated[str, ConfigPropertyAnnotation(secret=True)]
+     additional_properties: dict[str, Any] = {}
+
+     def to_clickhouse_kwargs(self) -> dict[str, Any]:
+         kwargs = self.model_dump(exclude={"additional_properties"}, exclude_none=True)
+         kwargs.update(self.additional_properties)
+         return kwargs
+
+
  class ClickhouseConfigFile(BaseDatabaseConfigFile):
-     type: str = Field(default="databases/clickhouse")
-     connection: dict[str, Any] = Field(
-         description="Connection parameters for the Clickhouse database. It can contain any of the keys supported by the Clickhouse connection library (see https://clickhouse.com/docs/integrations/language-clients/python/driver-api#connection-arguments)"
-     )
+     type: str = Field(default="clickhouse")
+     connection: ClickhouseConnectionProperties
  
  
  class ClickhouseIntrospector(BaseIntrospector[ClickhouseConfigFile]):
@@ -23,15 +36,10 @@ class ClickhouseIntrospector(BaseIntrospector[ClickhouseConfigFile]):
  
      supports_catalogs = True
  
-     def _connect(self, file_config: ClickhouseConfigFile):
-         connection = file_config.connection
-         if not isinstance(connection, Mapping):
-             raise ValueError("Invalid YAML config: 'connection' must be a mapping of connection parameters")
-
-         return clickhouse_connect.get_client(**connection)
-
-     def _connect_to_catalog(self, file_config: ClickhouseConfigFile, catalog: str):
-         return self._connect(file_config)
+     def _connect(self, file_config: ClickhouseConfigFile, *, catalog: str | None = None):
+         return clickhouse_connect.get_client(
+             **file_config.connection.to_clickhouse_kwargs(),
+         )
  
      def _get_catalogs(self, connection, file_config: ClickhouseConfigFile) -> list[str]:
          return ["clickhouse"]
@@ -0,0 +1,12 @@
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabasePlugin
+ from databao_context_engine.plugins.databases.duckdb.duckdb_introspector import DuckDBConfigFile, DuckDBIntrospector
+
+
+ class DuckDbPlugin(BaseDatabasePlugin[DuckDBConfigFile]):
+     id = "jetbrains/duckdb"
+     name = "DuckDB Plugin"
+     supported = {"duckdb"}
+     config_file_type = DuckDBConfigFile
+
+     def __init__(self):
+         super().__init__(DuckDBIntrospector())
@@ -3,19 +3,20 @@ from __future__ import annotations
  import duckdb
  from pydantic import BaseModel, Field
  
- from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
  from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
  from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
  from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
+ from databao_context_engine.plugins.duckdb_tools import fetchall_dicts
  
  
  class DuckDBConfigFile(BaseDatabaseConfigFile):
-     type: str = Field(default="databases/duckdb")
+     type: str = Field(default="duckdb")
      connection: DuckDBConnectionConfig
  
  
  class DuckDBConnectionConfig(BaseModel):
-     database: str = Field(description="Path to the DuckDB database file")
+     database_path: str = Field(description="Path to the DuckDB database file")
  
  
  class DuckDBIntrospector(BaseIntrospector[DuckDBConfigFile]):
@@ -23,13 +24,10 @@ class DuckDBIntrospector(BaseIntrospector[DuckDBConfigFile]):
      _IGNORED_SCHEMAS = {"information_schema", "pg_catalog"}
      supports_catalogs = True
  
-     def _connect(self, file_config: DuckDBConfigFile):
-         database_path = str(file_config.connection.database)
+     def _connect(self, file_config: DuckDBConfigFile, *, catalog: str | None = None):
+         database_path = str(file_config.connection.database_path)
          return duckdb.connect(database=database_path)
  
-     def _connect_to_catalog(self, file_config: DuckDBConfigFile, catalog: str):
-         return self._connect(file_config)
-
      def _get_catalogs(self, connection, file_config: DuckDBConfigFile) -> list[str]:
          rows = self._fetchall_dicts(connection, "SELECT database_name FROM duckdb_databases();", None)
          catalogs = [r["database_name"] for r in rows if r.get("database_name")]
@@ -319,7 +317,4 @@ class DuckDBIntrospector(BaseIntrospector[DuckDBConfigFile]):
  
      def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
          cur = connection.cursor()
-         cur.execute(sql, params or [])
-         columns = [desc[0].lower() for desc in cur.description] if cur.description else []
-         rows = cur.fetchall()
-         return [dict(zip(columns, row)) for row in rows]
+         return fetchall_dicts(cur, sql, params)
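The inline row-to-dict logic removed above now lives in databao_context_engine/plugins/duckdb_tools.py (item 57 in the file list, not shown in this section); judging from the deleted code, the shared helper presumably looks roughly like this sketch:

    def fetchall_dicts(cur, sql: str, params) -> list[dict]:
        # Execute and map each row to {lower-cased column name: value}.
        cur.execute(sql, params or [])
        columns = [desc[0].lower() for desc in cur.description] if cur.description else []
        return [dict(zip(columns, row)) for row in cur.fetchall()]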
@@ -5,13 +5,13 @@ from databao_context_engine.plugins.databases.databases_types import (
      CheckConstraint,
      DatabaseColumn,
      DatabasePartitionInfo,
+     DatabaseSchema,
      DatabaseTable,
      DatasetKind,
      ForeignKey,
      ForeignKeyColumnMap,
      Index,
      KeyConstraint,
-     DatabaseSchema,
  )
  