databao-context-engine 0.1.2__py3-none-any.whl → 0.1.4.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. databao_context_engine/__init__.py +18 -6
  2. databao_context_engine/build_sources/__init__.py +4 -0
  3. databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +27 -23
  4. databao_context_engine/build_sources/build_service.py +53 -0
  5. databao_context_engine/build_sources/build_wiring.py +84 -0
  6. databao_context_engine/build_sources/export_results.py +41 -0
  7. databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +3 -7
  8. databao_context_engine/cli/add_datasource_config.py +41 -15
  9. databao_context_engine/cli/commands.py +12 -43
  10. databao_context_engine/cli/info.py +3 -2
  11. databao_context_engine/databao_context_engine.py +137 -0
  12. databao_context_engine/databao_context_project_manager.py +96 -6
  13. databao_context_engine/datasources/add_config.py +34 -0
  14. databao_context_engine/{datasource_config → datasources}/check_config.py +18 -7
  15. databao_context_engine/datasources/datasource_context.py +93 -0
  16. databao_context_engine/{project → datasources}/datasource_discovery.py +17 -16
  17. databao_context_engine/{project → datasources}/types.py +64 -15
  18. databao_context_engine/init_project.py +25 -3
  19. databao_context_engine/introspection/property_extract.py +67 -53
  20. databao_context_engine/llm/errors.py +2 -8
  21. databao_context_engine/llm/install.py +13 -20
  22. databao_context_engine/llm/service.py +1 -3
  23. databao_context_engine/mcp/mcp_runner.py +4 -2
  24. databao_context_engine/mcp/mcp_server.py +10 -10
  25. databao_context_engine/plugin_loader.py +111 -0
  26. databao_context_engine/pluginlib/build_plugin.py +25 -9
  27. databao_context_engine/pluginlib/config.py +16 -2
  28. databao_context_engine/plugins/base_db_plugin.py +5 -2
  29. databao_context_engine/plugins/databases/athena_introspector.py +85 -22
  30. databao_context_engine/plugins/databases/base_introspector.py +5 -3
  31. databao_context_engine/plugins/databases/clickhouse_introspector.py +22 -11
  32. databao_context_engine/plugins/databases/duckdb_introspector.py +3 -5
  33. databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
  34. databao_context_engine/plugins/databases/introspection_scope.py +11 -9
  35. databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
  36. databao_context_engine/plugins/databases/mssql_introspector.py +26 -17
  37. databao_context_engine/plugins/databases/mysql_introspector.py +23 -12
  38. databao_context_engine/plugins/databases/postgresql_introspector.py +2 -2
  39. databao_context_engine/plugins/databases/snowflake_introspector.py +43 -10
  40. databao_context_engine/plugins/duckdb_tools.py +18 -0
  41. databao_context_engine/plugins/plugin_loader.py +43 -42
  42. databao_context_engine/plugins/resources/parquet_introspector.py +7 -19
  43. databao_context_engine/project/info.py +34 -2
  44. databao_context_engine/project/init_project.py +16 -7
  45. databao_context_engine/project/layout.py +3 -3
  46. databao_context_engine/retrieve_embeddings/__init__.py +3 -0
  47. databao_context_engine/retrieve_embeddings/{internal/export_results.py → export_results.py} +2 -2
  48. databao_context_engine/retrieve_embeddings/{internal/retrieve_runner.py → retrieve_runner.py} +5 -9
  49. databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +3 -17
  50. databao_context_engine/retrieve_embeddings/retrieve_wiring.py +49 -0
  51. databao_context_engine/{serialisation → serialization}/yaml.py +1 -1
  52. databao_context_engine/services/chunk_embedding_service.py +23 -11
  53. databao_context_engine/services/factories.py +1 -46
  54. databao_context_engine/services/persistence_service.py +11 -11
  55. databao_context_engine/storage/connection.py +11 -7
  56. databao_context_engine/storage/exceptions/exceptions.py +2 -2
  57. databao_context_engine/storage/migrate.py +2 -4
  58. databao_context_engine/storage/migrations/V01__init.sql +6 -31
  59. databao_context_engine/storage/models.py +2 -23
  60. databao_context_engine/storage/repositories/chunk_repository.py +16 -12
  61. databao_context_engine/storage/repositories/factories.py +1 -12
  62. databao_context_engine/storage/repositories/vector_search_repository.py +8 -13
  63. databao_context_engine/system/properties.py +4 -2
  64. databao_context_engine-0.1.4.dev1.dist-info/METADATA +75 -0
  65. databao_context_engine-0.1.4.dev1.dist-info/RECORD +125 -0
  66. {databao_context_engine-0.1.2.dist-info → databao_context_engine-0.1.4.dev1.dist-info}/WHEEL +1 -1
  67. databao_context_engine/build_sources/internal/build_service.py +0 -77
  68. databao_context_engine/build_sources/internal/build_wiring.py +0 -52
  69. databao_context_engine/build_sources/internal/export_results.py +0 -43
  70. databao_context_engine/build_sources/public/api.py +0 -4
  71. databao_context_engine/databao_engine.py +0 -85
  72. databao_context_engine/datasource_config/__init__.py +0 -0
  73. databao_context_engine/datasource_config/add_config.py +0 -50
  74. databao_context_engine/datasource_config/datasource_context.py +0 -60
  75. databao_context_engine/mcp/all_results_tool.py +0 -5
  76. databao_context_engine/mcp/retrieve_tool.py +0 -22
  77. databao_context_engine/project/runs.py +0 -39
  78. databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
  79. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
  80. databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
  81. databao_context_engine/retrieve_embeddings/public/api.py +0 -3
  82. databao_context_engine/serialisation/__init__.py +0 -0
  83. databao_context_engine/services/run_name_policy.py +0 -8
  84. databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
  85. databao_context_engine/storage/repositories/run_repository.py +0 -157
  86. databao_context_engine-0.1.2.dist-info/METADATA +0 -187
  87. databao_context_engine-0.1.2.dist-info/RECORD +0 -135
  88. /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
  89. /databao_context_engine/{build_sources/public → serialization}/__init__.py +0 -0
  90. {databao_context_engine-0.1.2.dist-info → databao_context_engine-0.1.4.dev1.dist-info}/entry_points.txt +0 -0
@@ -5,9 +5,7 @@ from typing import Any, Protocol, runtime_checkable
 
 @dataclass
 class EmbeddableChunk:
-    """
-    A chunk that will be embedded as a vector and used when searching context from a given AI prompt
-    """
+    """A chunk that will be embedded as a vector and used when searching context from a given AI prompt."""
 
     embeddable_text: str
     """
@@ -48,19 +46,19 @@ class BuildDatasourcePlugin[T](BaseBuildPlugin, Protocol):
     """
 
     def check_connection(self, full_type: str, datasource_name: str, file_config: T) -> None:
-        """
-        Checks whether the configuration to the datasource is working.
+        """Check whether the configuration to the datasource is working.
 
         The function is expected to succeed without a result if the connection is working.
         If something is wrong with the connection, the function should raise an Exception
+
+        Raises:
+            NotSupportedError: If the plugin doesn't support this method.
         """
         raise NotSupportedError("This method is not implemented for this plugin")
 
 
 class DefaultBuildDatasourcePlugin(BuildDatasourcePlugin[dict[str, Any]], Protocol):
-    """
-    Use this as a base class for plugins that don't need a specific config file type.
-    """
+    """Use this as a base class for plugins that don't need a specific config file type."""
 
     config_file_type: type[dict[str, Any]] = dict[str, Any]
 
@@ -75,7 +73,7 @@ class BuildFilePlugin(BaseBuildPlugin, Protocol):
 
 
 class NotSupportedError(RuntimeError):
-    """Exception raised by methods not supported by a plugin"""
+    """Exception raised by methods not supported by a plugin."""
 
 
 BuildPlugin = BuildDatasourcePlugin | BuildFilePlugin
@@ -83,6 +81,12 @@ BuildPlugin = BuildDatasourcePlugin | BuildFilePlugin
 
 @dataclass(kw_only=True, frozen=True)
 class DatasourceType:
+    """The type of a Datasource.
+
+    Attributes:
+        full_type: The full type of the datasource, in the format `<main_type>/<subtype>`.
+    """
+
     full_type: str
 
     def __post_init__(self):
@@ -92,16 +96,28 @@ class DatasourceType:
 
     @property
     def main_type(self) -> str:
+        """The main type of the datasource, aka the folder in which the config or raw file is located."""
         return self.full_type.split("/")[0]
 
     @property
     def config_folder(self) -> str:
+        """The folder in which the config or raw file is located. This is equivalent to `main_type`."""
         return self.main_type
 
     @property
     def subtype(self) -> str:
+        """The subtype of the datasource. This is the actual type declared in the config file or the raw file's extension."""
         return self.full_type.split("/")[1]
 
     @staticmethod
     def from_main_and_subtypes(main_type: str, subtype: str) -> "DatasourceType":
+        """Create a DatasourceType from its main type and subtype.
+
+        Args:
+            main_type: The main type (aka config folder) of the datasource.
+            subtype: The subtype of the datasource.
+
+        Returns:
+            A DatasourceType instance with the specified main type and subtype.
+        """
         return DatasourceType(full_type=f"{main_type}/{subtype}")
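For reference, a minimal sketch of how the new DatasourceType helpers compose (class and method names are taken from the hunk above; the import path is assumed from the file list):

from databao_context_engine.pluginlib.build_plugin import DatasourceType

# from_main_and_subtypes joins the two parts into the `<main_type>/<subtype>` form,
# and the properties split it back apart
dt = DatasourceType.from_main_and_subtypes("databases", "postgres")
assert dt.full_type == "databases/postgres"
assert dt.main_type == dt.config_folder == "databases"
assert dt.subtype == "postgres"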
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from dataclasses import dataclass
 from typing import Any, Protocol, runtime_checkable
 
@@ -17,12 +19,23 @@ class DuckDBSecret(BaseModel):
 
 
 @dataclass(kw_only=True)
-class ConfigPropertyDefinition:
+class ConfigUnionPropertyDefinition:
+    property_key: str
+    types: tuple[type, ...]
+    type_properties: dict[type, list[ConfigPropertyDefinition]]
+
+
+@dataclass(kw_only=True)
+class ConfigSinglePropertyDefinition:
     property_key: str
     required: bool
     property_type: type | None = str
     default_value: str | None = None
-    nested_properties: list["ConfigPropertyDefinition"] | None = None
+    nested_properties: list[ConfigPropertyDefinition] | None = None
+    secret: bool = False
+
+
+ConfigPropertyDefinition = ConfigSinglePropertyDefinition | ConfigUnionPropertyDefinition
 
 
 @dataclass(kw_only=True)
@@ -30,6 +43,7 @@ class ConfigPropertyAnnotation:
     required: bool = False
     default_value: str | None = None
     ignored_for_config_wizard: bool = False
+    secret: bool = False
 
 
 @runtime_checkable
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any, TypeVar
+from typing import Annotated, Any, TypeVar
 
 from pydantic import BaseModel, ConfigDict, Field
 
@@ -8,6 +8,7 @@ from databao_context_engine.pluginlib.build_plugin import (
     BuildDatasourcePlugin,
     EmbeddableChunk,
 )
+from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
 from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector
 from databao_context_engine.plugins.databases.database_chunker import build_database_chunks
 from databao_context_engine.plugins.databases.introspection_scope import IntrospectionScope
@@ -17,7 +18,9 @@ class BaseDatabaseConfigFile(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
     name: str | None = Field(default=None)
     type: str
-    introspection_scope: IntrospectionScope | None = Field(default=None, alias="introspection-scope")
+    introspection_scope: Annotated[
+        IntrospectionScope | None, ConfigPropertyAnnotation(ignored_for_config_wizard=True)
+    ] = Field(default=None, alias="introspection-scope")
 
 
 T = TypeVar("T", bound=BaseDatabaseConfigFile)
@@ -1,22 +1,64 @@
 from __future__ import annotations
 
-from typing import Any, Mapping
+from typing import Annotated, Any
 
 from pyathena import connect
 from pyathena.cursor import DictCursor
-from pydantic import Field
+from pydantic import BaseModel, Field
 
+from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
 from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
 from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
 from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
 from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
 
 
+class AwsProfileAuth(BaseModel):
+    profile_name: str
+
+
+class AwsIamAuth(BaseModel):
+    aws_access_key_id: Annotated[str, ConfigPropertyAnnotation(secret=True)]
+    aws_secret_access_key: Annotated[str, ConfigPropertyAnnotation(secret=True)]
+    session_token: str | None = None
+
+
+class AwsAssumeRoleAuth(BaseModel):
+    role_arn: str | None = None
+    role_session_name: str | None = None
+    source_profile: str | None = None
+
+
+class AwsDefaultAuth(BaseModel):
+    # Uses environment variables, instance profile, ECS task role
+    pass
+
+
+class AthenaConnectionProperties(BaseModel):
+    region_name: str
+    schema_name: str = "default"
+    catalog: str | None = "awsdatacatalog"
+    work_group: str | None = None
+    s3_staging_dir: str | None = None
+    auth: AwsIamAuth | AwsProfileAuth | AwsDefaultAuth | AwsAssumeRoleAuth
+    additional_properties: dict[str, Any] = {}
+
+    def to_athena_kwargs(self) -> dict[str, Any]:
+        kwargs = self.model_dump(
+            exclude={
+                "additional_properties": True,
+            },
+            exclude_none=True,
+        )
+        auth_fields = kwargs.pop("auth", {})
+        kwargs.update(auth_fields)
+        kwargs.update(self.additional_properties)
+        return kwargs
+
+
 class AthenaConfigFile(BaseDatabaseConfigFile):
-    type: str = Field(default="databases/athena")
-    connection: dict[str, Any] = Field(
-        description="Connection parameters for the Athena database. It can contain any of the keys supported by the Athena connection library"
-    )
+    type: str = Field(default="athena")
+    connection: AthenaConnectionProperties
 
 
 class AthenaIntrospector(BaseIntrospector[AthenaConfigFile]):
@@ -26,11 +68,7 @@ class AthenaIntrospector(BaseIntrospector[AthenaConfigFile]):
     supports_catalogs = True
 
     def _connect(self, file_config: AthenaConfigFile):
-        connection = file_config.connection
-        if not isinstance(connection, Mapping):
-            raise ValueError("Invalid YAML config: 'connection' must be a mapping of connection parameters")
-
-        return connect(**connection, cursor_class=DictCursor)
+        return connect(**file_config.connection.to_athena_kwargs(), cursor_class=DictCursor)
 
     def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
         with connection.cursor() as cur:
@@ -38,29 +76,28 @@ class AthenaIntrospector(BaseIntrospector[AthenaConfigFile]):
             return cur.fetchall()
 
     def _get_catalogs(self, connection, file_config: AthenaConfigFile) -> list[str]:
-        catalog = file_config.connection.get("catalog", self._resolve_pseudo_catalog_name(file_config))
+        catalog = file_config.connection.catalog or self._resolve_pseudo_catalog_name(file_config)
         return [catalog]
 
     def _connect_to_catalog(self, file_config: AthenaConfigFile, catalog: str):
-        self._connect(file_config)
+        return self._connect(file_config)
 
     def _sql_list_schemas(self, catalogs: list[str] | None) -> SQLQuery:
         if not catalogs:
             return SQLQuery("SELECT schema_name, catalog_name FROM information_schema.schemata", None)
         catalog = catalogs[0]
-        sql = "SELECT schema_name, catalog_name FROM information_schema.schemata WHERE catalog_name = %(catalog)s"
-        return SQLQuery(sql, {"catalog": catalog})
+        sql = f"SELECT schema_name, catalog_name FROM {catalog}.information_schema.schemata"
+        return SQLQuery(sql, None)
 
-    # TODO: Incomplete plugin. Awaiting permission access to AWS to properly develop
     def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
         if not schemas:
             return []
 
-        comps = {"columns": self._sql_columns(catalog, schemas)}
+        comps = self._component_queries(catalog, schemas)
         results: dict[str, list[dict]] = {}
 
         for name, q in comps.items():
-            results[name] = self._fetchall_dicts(connection, q.sql, q.params)
+            results[name] = self._fetchall_dicts(connection, q, None)
 
         return IntrospectionModelBuilder.build_schemas_from_components(
             schemas=schemas,
@@ -73,8 +110,32 @@ class AthenaIntrospector(BaseIntrospector[AthenaConfigFile]):
             idx_cols=[],
         )
 
-    def _sql_columns(self, catalog: str, schemas: list[str]) -> SQLQuery:
-        sql = f"""
+    def _component_queries(self, catalog: str, schemas: list[str]) -> dict[str, str]:
+        schemas_in = ", ".join(self._quote_literal(s) for s in schemas)
+        return {
+            "relations": self._sql_relations(catalog, schemas_in),
+            "columns": self._sql_columns(catalog, schemas_in),
+        }
+
+    def _sql_relations(self, catalog: str, schemas_in: str) -> str:
+        return f"""
+            SELECT
+                table_schema AS schema_name,
+                table_name,
+                CASE table_type
+                    WHEN 'BASE TABLE' THEN 'table'
+                    WHEN 'VIEW' THEN 'view'
+                    ELSE LOWER(table_type)
+                END AS kind,
+                NULL AS description
+            FROM
+                {catalog}.information_schema.tables
+            WHERE
+                table_schema IN ({schemas_in})
+        """
+
+    def _sql_columns(self, catalog: str, schemas_in: str) -> str:
+        return f"""
             SELECT
                 table_schema AS schema_name,
                 table_name,
@@ -85,13 +146,12 @@ class AthenaIntrospector(BaseIntrospector[AthenaConfigFile]):
             FROM
                 {catalog}.information_schema.columns
             WHERE
-                table_schema IN ({schemas})
+                table_schema IN ({schemas_in})
             ORDER BY
                 table_schema,
                 table_name,
                 ordinal_position
         """
-        return SQLQuery(sql, {"schema": schemas})
 
     def _resolve_pseudo_catalog_name(self, file_config: AthenaConfigFile) -> str:
         return "awsdatacatalog"
@@ -99,3 +159,6 @@ class AthenaIntrospector(BaseIntrospector[AthenaConfigFile]):
     def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
         sql = f'SELECT * FROM "{schema}"."{table}" LIMIT %(limit)s'
        return SQLQuery(sql, {"limit": limit})
+
+    def _quote_literal(self, value: str) -> str:
+        return "'" + str(value).replace("'", "''") + "'"
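For reference, a minimal sketch of what the new AthenaConnectionProperties.to_athena_kwargs() produces (the model and method come from the hunk above; the field values are illustrative):

props = AthenaConnectionProperties(
    region_name="eu-west-1",
    s3_staging_dir="s3://example-bucket/athena-results/",
    auth=AwsProfileAuth(profile_name="analytics"),
)
# model_dump drops additional_properties and None values, then the auth
# sub-model is flattened into the top-level kwargs passed to pyathena connect()
props.to_athena_kwargs()
# {'region_name': 'eu-west-1', 'schema_name': 'default', 'catalog': 'awsdatacatalog',
#  's3_staging_dir': 's3://example-bucket/athena-results/', 'profile_name': 'analytics'}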
@@ -121,9 +121,11 @@ class BaseIntrospector[T: SupportsIntrospectionScope](ABC):
 
     @abstractmethod
     def _connect_to_catalog(self, file_config: T, catalog: str):
-        """Return a connection scoped to `catalog`. For engines that
-        don't need a new connection, return a connection with the
-        session set/USEd to that catalog."""
+        """Return a connection scoped to `catalog`.
+
+        For engines that don't need a new connection, return a connection with the
+        session set/USE'd to that catalog.
+        """
 
     def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
         raise NotImplementedError
@@ -1,21 +1,34 @@
 from __future__ import annotations
 
-from typing import Any, Mapping
+from typing import Annotated, Any
 
 import clickhouse_connect
-from pydantic import Field
+from pydantic import BaseModel, Field
 
+from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
 from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
 from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
 from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
 from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
 
 
+class ClickhouseConnectionProperties(BaseModel):
+    host: Annotated[str, ConfigPropertyAnnotation(default_value="localhost", required=True)]
+    port: int | None = None
+    database: str | None = None
+    username: str | None = None
+    password: Annotated[str, ConfigPropertyAnnotation(secret=True)]
+    additional_properties: dict[str, Any] = {}
+
+    def to_clickhouse_kwargs(self) -> dict[str, Any]:
+        kwargs = self.model_dump(exclude={"additional_properties"}, exclude_none=True)
+        kwargs.update(self.additional_properties)
+        return kwargs
+
+
 class ClickhouseConfigFile(BaseDatabaseConfigFile):
-    type: str = Field(default="databases/clickhouse")
-    connection: dict[str, Any] = Field(
-        description="Connection parameters for the Clickhouse database. It can contain any of the keys supported by the Clickhouse connection library (see https://clickhouse.com/docs/integrations/language-clients/python/driver-api#connection-arguments)"
-    )
+    type: str = Field(default="clickhouse")
+    connection: ClickhouseConnectionProperties
 
 
 class ClickhouseIntrospector(BaseIntrospector[ClickhouseConfigFile]):
@@ -24,11 +37,9 @@ class ClickhouseIntrospector(BaseIntrospector[ClickhouseConfigFile]):
     supports_catalogs = True
 
     def _connect(self, file_config: ClickhouseConfigFile):
-        connection = file_config.connection
-        if not isinstance(connection, Mapping):
-            raise ValueError("Invalid YAML config: 'connection' must be a mapping of connection parameters")
-
-        return clickhouse_connect.get_client(**connection)
+        return clickhouse_connect.get_client(
+            **file_config.connection.to_clickhouse_kwargs(),
+        )
 
     def _connect_to_catalog(self, file_config: ClickhouseConfigFile, catalog: str):
         return self._connect(file_config)
@@ -7,10 +7,11 @@ from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
 from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
 from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
 from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
+from databao_context_engine.plugins.duckdb_tools import fetchall_dicts
 
 
 class DuckDBConfigFile(BaseDatabaseConfigFile):
-    type: str = Field(default="databases/duckdb")
+    type: str = Field(default="duckdb")
     connection: DuckDBConnectionConfig
 
 
@@ -319,7 +320,4 @@ class DuckDBIntrospector(BaseIntrospector[DuckDBConfigFile]):
 
     def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
         cur = connection.cursor()
-        cur.execute(sql, params or [])
-        columns = [desc[0].lower() for desc in cur.description] if cur.description else []
-        rows = cur.fetchall()
-        return [dict(zip(columns, row)) for row in rows]
+        return fetchall_dicts(cur, sql, params)
@@ -5,13 +5,13 @@ from databao_context_engine.plugins.databases.databases_types import (
     CheckConstraint,
     DatabaseColumn,
     DatabasePartitionInfo,
+    DatabaseSchema,
     DatabaseTable,
     DatasetKind,
     ForeignKey,
     ForeignKeyColumnMap,
     Index,
     KeyConstraint,
-    DatabaseSchema,
 )
 
 
@@ -6,10 +6,11 @@ from pydantic import BaseModel, ConfigDict, field_validator, model_validator
 
 
 class ScopeIncludeRule(BaseModel):
-    """
-    Allowlist selector.
-    - catalog: optional glob pattern
-    - schemas: optional list of glob patterns (string also accepted and normalized to a list)
+    """Allowlist selector.
+
+    Attributes:
+        catalog: optional glob pattern
+        schemas: optional list of glob patterns (string also accepted and normalized to a list)
 
     A rule must specify at least one of: catalog, schemas.
     """
@@ -36,11 +37,12 @@ class ScopeIncludeRule(BaseModel):
 
 
 class ScopeExcludeRule(BaseModel):
-    """
-    Denylist selector.
-    - catalog: optional glob pattern
-    - schemas: optional list of glob patterns (string also accepted)
-    - except_schemas: optional list of glob patterns (string also accepted)
+    """Denylist selector.
+
+    Attributes:
+        catalog: optional glob pattern
+        schemas: optional list of glob patterns (string also accepted)
+        except_schemas: optional list of glob patterns (string also accepted)
 
     If a target matches the rule but also matches except_schemas, it is NOT excluded by this rule.
     """
@@ -10,17 +10,14 @@ from databao_context_engine.plugins.databases.introspection_scope import (
 
 @dataclass(frozen=True)
 class ScopeSelection:
-    """
-    The final catalog+schema scope to introspect.
-    """
+    """The final catalog+schema scope to introspect."""
 
     catalogs: list[str]
     schemas_per_catalog: dict[str, list[str]]
 
 
 class IntrospectionScopeMatcher:
-    """
-    Applies include/exclude rules (glob matching, case-insensitive) to a discovered set of catalogs/schemas.
+    """Applies include/exclude rules (glob matching, case-insensitive) to a discovered set of catalogs/schemas.
 
     Semantics:
     - If include is empty => start from "everything"
@@ -1,21 +1,36 @@
 from __future__ import annotations
 
-from typing import Any, Mapping
+from typing import Annotated, Any, Mapping
 
 from mssql_python import connect  # type: ignore[import-untyped]
-from pydantic import Field
+from pydantic import BaseModel, Field
 
+from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
 from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
 from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
 from databao_context_engine.plugins.databases.databases_types import DatabaseSchema, DatabaseTable
 from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
 
 
+class MSSQLConnectionProperties(BaseModel):
+    host: Annotated[str, ConfigPropertyAnnotation(default_value="localhost", required=True)]
+    port: int | None = None
+    instance_name: str | None = None
+    database: str | None = None
+    user: str | None = None
+    password: Annotated[str, ConfigPropertyAnnotation(secret=True)]
+    encrypt: str | None = None
+    additional_properties: dict[str, Any] = {}
+
+    def to_mssql_kwargs(self) -> dict[str, Any]:
+        kwargs = self.model_dump(exclude={"additional_properties"}, exclude_none=True)
+        kwargs.update(self.additional_properties)
+        return kwargs
+
+
 class MSSQLConfigFile(BaseDatabaseConfigFile):
-    type: str = Field(default="databases/mssql")
-    connection: dict[str, Any] = Field(
-        description="Connection parameters for the Microsoft Server SQL database. It can contain any of the keys supported by the Microsoft Server connection library"
-    )
+    type: str = Field(default="mssql")
+    connection: MSSQLConnectionProperties
 
 
 class MSSQLIntrospector(BaseIntrospector[MSSQLConfigFile]):
@@ -42,22 +57,16 @@ class MSSQLIntrospector(BaseIntrospector[MSSQLConfigFile]):
 
     def _connect(self, file_config: MSSQLConfigFile):
         connection = file_config.connection
-        if not isinstance(connection, Mapping):
-            raise ValueError("Invalid YAML config: 'connection' must be a mapping of connection parameters")
-
-        connection_string = self._create_connection_string_for_config(connection)
+        connection_string = self._create_connection_string_for_config(connection.to_mssql_kwargs())
         return connect(connection_string)
 
     def _connect_to_catalog(self, file_config: MSSQLConfigFile, catalog: str):
-        base_cfg = file_config.connection or {}
-        cfg_for_db: dict[str, Any] = dict(base_cfg)
-        cfg_for_db["database"] = catalog
-
-        connection_string = self._create_connection_string_for_config(cfg_for_db)
-        return connect(connection_string)
+        cfg = file_config.model_copy(deep=True)
+        cfg.connection.database = catalog
+        return self._connect(cfg)
 
     def _get_catalogs(self, connection, file_config: MSSQLConfigFile) -> list[str]:
-        database = file_config.connection.get("database")
+        database = file_config.connection.database
         if isinstance(database, str) and database:
             return [database]
 
@@ -1,18 +1,33 @@
-from typing import Any, Mapping
+from typing import Annotated, Any
 
 import pymysql
-from pydantic import Field
+from pydantic import BaseModel, Field
 from pymysql.constants import CLIENT
 
+from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
 from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
 from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
 from databao_context_engine.plugins.databases.databases_types import DatabaseSchema, DatabaseTable
 from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
 
 
+class MySQLConnectionProperties(BaseModel):
+    host: Annotated[str, ConfigPropertyAnnotation(default_value="localhost", required=True)]
+    port: int | None = None
+    database: str | None = None
+    user: str | None = None
+    password: Annotated[str, ConfigPropertyAnnotation(secret=True)]
+    additional_properties: dict[str, Any] = {}
+
+    def to_pymysql_kwargs(self) -> dict[str, Any]:
+        kwargs = self.model_dump(exclude={"additional_properties"}, exclude_none=True)
+        kwargs.update(self.additional_properties)
+        return kwargs
+
+
 class MySQLConfigFile(BaseDatabaseConfigFile):
-    connection: dict[str, Any]
-    type: str = Field(default="databases/mysql")
+    connection: MySQLConnectionProperties
+    type: str = Field(default="mysql")
 
 
 class MySQLIntrospector(BaseIntrospector[MySQLConfigFile]):
@@ -21,20 +36,16 @@ class MySQLIntrospector(BaseIntrospector[MySQLConfigFile]):
     supports_catalogs = True
 
     def _connect(self, file_config: MySQLConfigFile):
-        connection = file_config.connection
-        if not isinstance(connection, Mapping):
-            raise ValueError("Invalid YAML config: 'connection' must be a mapping of connection parameters")
-
         return pymysql.connect(
-            **connection,
+            **file_config.connection.to_pymysql_kwargs(),
             cursorclass=pymysql.cursors.DictCursor,
             client_flag=CLIENT.MULTI_STATEMENTS | CLIENT.MULTI_RESULTS,
         )
 
     def _connect_to_catalog(self, file_config: MySQLConfigFile, catalog: str):
-        cfg = dict(file_config.connection or {})
-        cfg["database"] = catalog
-        return self._connect(MySQLConfigFile(connection=cfg))
+        cfg = file_config.model_copy(deep=True)
+        cfg.connection.database = catalog
+        return self._connect(cfg)
 
     def _get_catalogs(self, connection, file_config: MySQLConfigFile) -> list[str]:
         with connection.cursor() as cur:
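For reference, a minimal sketch of the new MySQLConnectionProperties mapping onto pymysql connect kwargs (the model and method come from the hunk above; the field values are illustrative):

props = MySQLConnectionProperties(
    host="db.internal",
    user="reader",
    password="s3cret",
    additional_properties={"charset": "utf8mb4"},
)
# exclude_none drops the unset port/database; additional_properties is merged flat
props.to_pymysql_kwargs()
# {'host': 'db.internal', 'user': 'reader', 'password': 's3cret', 'charset': 'utf8mb4'}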
@@ -16,12 +16,12 @@ class PostgresConnectionProperties(BaseModel):
     port: int | None = None
     database: str | None = None
     user: str | None = None
-    password: str | None = None
+    password: Annotated[str, ConfigPropertyAnnotation(secret=True)]
     additional_properties: dict[str, Any] = {}
 
 
 class PostgresConfigFile(BaseDatabaseConfigFile):
-    type: str = Field(default="databases/postgres")
+    type: str = Field(default="postgres")
     connection: PostgresConnectionProperties
 