databao-context-engine 0.1.1__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +32 -7
- databao_context_engine/build_sources/__init__.py +4 -0
- databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +31 -27
- databao_context_engine/build_sources/build_service.py +53 -0
- databao_context_engine/build_sources/build_wiring.py +82 -0
- databao_context_engine/build_sources/export_results.py +41 -0
- databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +11 -18
- databao_context_engine/cli/add_datasource_config.py +49 -44
- databao_context_engine/cli/commands.py +40 -55
- databao_context_engine/cli/info.py +3 -2
- databao_context_engine/databao_context_engine.py +127 -0
- databao_context_engine/databao_context_project_manager.py +147 -30
- databao_context_engine/{datasource_config → datasources}/check_config.py +31 -23
- databao_context_engine/datasources/datasource_context.py +90 -0
- databao_context_engine/datasources/datasource_discovery.py +143 -0
- databao_context_engine/datasources/types.py +194 -0
- databao_context_engine/generate_configs_schemas.py +4 -5
- databao_context_engine/init_project.py +25 -3
- databao_context_engine/introspection/property_extract.py +76 -57
- databao_context_engine/llm/__init__.py +10 -0
- databao_context_engine/llm/api.py +57 -0
- databao_context_engine/llm/descriptions/ollama.py +1 -3
- databao_context_engine/llm/errors.py +2 -8
- databao_context_engine/llm/factory.py +5 -2
- databao_context_engine/llm/install.py +26 -30
- databao_context_engine/llm/runtime.py +3 -5
- databao_context_engine/llm/service.py +1 -3
- databao_context_engine/mcp/mcp_runner.py +4 -2
- databao_context_engine/mcp/mcp_server.py +9 -11
- databao_context_engine/plugin_loader.py +110 -0
- databao_context_engine/pluginlib/build_plugin.py +12 -29
- databao_context_engine/pluginlib/config.py +16 -2
- databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/athena/athena_introspector.py +161 -0
- databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +6 -5
- databao_context_engine/plugins/databases/base_introspector.py +11 -12
- databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +24 -16
- databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +7 -12
- databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
- databao_context_engine/plugins/databases/introspection_scope.py +11 -9
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
- databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +29 -21
- databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +26 -15
- databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
- databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +11 -18
- databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
- databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +49 -17
- databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
- databao_context_engine/plugins/duckdb_tools.py +18 -0
- databao_context_engine/plugins/files/__init__.py +0 -0
- databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
- databao_context_engine/plugins/plugin_loader.py +58 -52
- databao_context_engine/plugins/resources/parquet_introspector.py +8 -20
- databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
- databao_context_engine/project/info.py +34 -2
- databao_context_engine/project/init_project.py +16 -7
- databao_context_engine/project/layout.py +14 -15
- databao_context_engine/retrieve_embeddings/__init__.py +3 -0
- databao_context_engine/retrieve_embeddings/retrieve_runner.py +17 -0
- databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +12 -19
- databao_context_engine/retrieve_embeddings/retrieve_wiring.py +46 -0
- databao_context_engine/serialization/__init__.py +0 -0
- databao_context_engine/{serialisation → serialization}/yaml.py +6 -6
- databao_context_engine/services/chunk_embedding_service.py +23 -11
- databao_context_engine/services/factories.py +1 -46
- databao_context_engine/services/persistence_service.py +11 -11
- databao_context_engine/storage/connection.py +11 -7
- databao_context_engine/storage/exceptions/exceptions.py +2 -2
- databao_context_engine/storage/migrate.py +3 -5
- databao_context_engine/storage/migrations/V01__init.sql +6 -31
- databao_context_engine/storage/models.py +2 -23
- databao_context_engine/storage/repositories/chunk_repository.py +16 -12
- databao_context_engine/storage/repositories/factories.py +1 -12
- databao_context_engine/storage/repositories/vector_search_repository.py +23 -16
- databao_context_engine/system/properties.py +4 -2
- databao_context_engine-0.1.5.dist-info/METADATA +228 -0
- databao_context_engine-0.1.5.dist-info/RECORD +135 -0
- {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/WHEEL +1 -1
- databao_context_engine/build_sources/internal/build_service.py +0 -77
- databao_context_engine/build_sources/internal/build_wiring.py +0 -52
- databao_context_engine/build_sources/internal/export_results.py +0 -43
- databao_context_engine/build_sources/public/api.py +0 -4
- databao_context_engine/databao_engine.py +0 -85
- databao_context_engine/datasource_config/add_config.py +0 -50
- databao_context_engine/datasource_config/datasource_context.py +0 -60
- databao_context_engine/mcp/all_results_tool.py +0 -5
- databao_context_engine/mcp/retrieve_tool.py +0 -22
- databao_context_engine/plugins/databases/athena_introspector.py +0 -101
- databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
- databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
- databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
- databao_context_engine/project/datasource_discovery.py +0 -141
- databao_context_engine/project/runs.py +0 -39
- databao_context_engine/project/types.py +0 -134
- databao_context_engine/retrieve_embeddings/internal/export_results.py +0 -12
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +0 -34
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
- databao_context_engine/retrieve_embeddings/public/api.py +0 -3
- databao_context_engine/services/run_name_policy.py +0 -8
- databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
- databao_context_engine/storage/repositories/run_repository.py +0 -157
- databao_context_engine-0.1.1.dist-info/METADATA +0 -186
- databao_context_engine-0.1.1.dist-info/RECORD +0 -135
- /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
- /databao_context_engine/{build_sources/public → plugins/databases/athena}/__init__.py +0 -0
- /databao_context_engine/{datasource_config → plugins/databases/clickhouse}/__init__.py +0 -0
- /databao_context_engine/{retrieve_embeddings/internal → plugins/databases/duckdb}/__init__.py +0 -0
- /databao_context_engine/{retrieve_embeddings/public → plugins/databases/mssql}/__init__.py +0 -0
- /databao_context_engine/{serialisation → plugins/databases/mysql}/__init__.py +0 -0
- {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/entry_points.txt +0 -0
--- a/databao_context_engine/databao_engine.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import os
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any
-
-from databao_context_engine.datasource_config.datasource_context import (
-    DatasourceContext,
-    get_all_contexts,
-    get_context_header_for_datasource,
-    get_datasource_context,
-)
-from databao_context_engine.pluginlib.build_plugin import DatasourceType
-from databao_context_engine.project.datasource_discovery import get_datasource_list
-from databao_context_engine.project.layout import ensure_project_dir
-from databao_context_engine.project.types import Datasource, DatasourceId
-from databao_context_engine.retrieve_embeddings.public.api import retrieve_embeddings
-
-
-@dataclass
-class ContextSearchResult:
-    datasource_id: DatasourceId
-    datasource_type: DatasourceType
-    distance: float
-    context_result: str
-
-
-class DatabaoContextEngine:
-    project_dir: Path
-
-    def __init__(self, project_dir: Path) -> None:
-        self.project_layout = ensure_project_dir(project_dir=project_dir)
-        self.project_dir = project_dir
-
-    def get_datasource_list(self) -> list[Datasource]:
-        # TODO: Should this return the list of built datasources rather than the list of datasources within the src folder?
-        return get_datasource_list(self.project_dir)
-
-    def get_datasource_context(self, datasource_id: DatasourceId, run_name: str | None = None) -> DatasourceContext:
-        return get_datasource_context(
-            project_layout=self.project_layout, datasource_id=datasource_id, run_name=run_name
-        )
-
-    def get_all_contexts(self, run_name: str | None = None) -> list[DatasourceContext]:
-        return get_all_contexts(project_layout=self.project_layout, run_name=run_name)
-
-    def get_all_contexts_formatted(self, run_name: str | None = None) -> str:
-        all_contexts = self.get_all_contexts(run_name=run_name)
-
-        all_results = os.linesep.join(
-            [f"{get_context_header_for_datasource(context.datasource_id)}{context.context}" for context in all_contexts]
-        )
-
-        return all_results
-
-    def search_context(
-        self,
-        retrieve_text: str,
-        run_name: str | None,
-        limit: int | None,
-        export_to_file: bool,
-        datasource_ids: list[DatasourceId] | None = None,
-    ) -> list[ContextSearchResult]:
-        # TODO: Filter with datasource_ids
-        # TODO: Remove the need for a run_name
-
-        results = retrieve_embeddings(
-            project_layout=self.project_layout,
-            retrieve_text=retrieve_text,
-            run_name=run_name,
-            limit=limit,
-            export_to_file=export_to_file,
-        )
-
-        return [
-            ContextSearchResult(
-                datasource_id=result.datasource_id,
-                datasource_type=result.datasource_type,
-                distance=result.cosine_distance,
-                context_result=result.display_text,
-            )
-            for result in results
-        ]
-
-    def run_sql(self, datasource_id: DatasourceId, sql: str, params: list[str]) -> dict[str, Any]:
-        raise NotImplementedError("Running SQL is not supported yet")
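For orientation, a minimal sketch of how the removed 0.1.1 engine class was driven, based only on the signatures visible above; the project path is hypothetical, and the top-level import is the one the removed mcp/retrieve_tool.py (further down) uses:

```python
from pathlib import Path

from databao_context_engine import DatabaoContextEngine  # 0.1.1 top-level export

# Hypothetical project directory; __init__ validates it via ensure_project_dir().
engine = DatabaoContextEngine(project_dir=Path("./my_project"))

# Enumerate the datasources discovered under the project's src/ directory.
for datasource in engine.get_datasource_list():
    print(datasource.id, datasource.type)

# Semantic search over built contexts; run_name=None resolves to the latest run.
results = engine.search_context(
    retrieve_text="customer orders",
    run_name=None,
    limit=5,
    export_to_file=False,
)
for result in results:
    print(result.distance, result.context_result)
```

Per the file list above, this module appears to be superseded in 0.1.5 by databao_context_engine/databao_context_engine.py (+127).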
--- a/databao_context_engine/datasource_config/add_config.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from pathlib import Path
-from typing import Any
-
-from databao_context_engine.introspection.property_extract import get_property_list_from_type
-from databao_context_engine.pluginlib.build_plugin import BuildDatasourcePlugin, DatasourceType
-from databao_context_engine.pluginlib.config import ConfigPropertyDefinition, CustomiseConfigProperties
-from databao_context_engine.plugins.plugin_loader import get_plugin_for_type
-from databao_context_engine.project.layout import (
-    create_datasource_config_file as create_datasource_config_file_internal,
-)
-from databao_context_engine.project.types import DatasourceId
-from databao_context_engine.serialisation.yaml import to_yaml_string
-
-
-def get_config_file_structure_for_datasource_type(datasource_type: DatasourceType) -> list[ConfigPropertyDefinition]:
-    plugin = get_plugin_for_type(datasource_type)
-
-    if isinstance(plugin, CustomiseConfigProperties):
-        return plugin.get_config_file_properties()
-    elif isinstance(plugin, BuildDatasourcePlugin):
-        return get_property_list_from_type(plugin.config_file_type)
-    else:
-        raise ValueError(
-            f"Impossible to create a config for type {datasource_type.full_type}. The plugin for this type is not a BuildDatasourcePlugin or CustomiseConfigProperties"
-        )
-
-
-def create_datasource_config_file(
-    project_dir: Path,
-    datasource_type: DatasourceType,
-    datasource_name: str,
-    config_content: dict[str, Any],
-    overwrite_existing: bool,
-) -> Path:
-    basic_config = {"type": datasource_type.subtype, "name": datasource_name}
-
-    return create_datasource_config_file_internal(
-        project_dir,
-        get_datasource_id_for_config_file(datasource_type, datasource_name),
-        to_yaml_string(basic_config | config_content),
-        overwrite_existing=overwrite_existing,
-    )
-
-
-def get_datasource_id_for_config_file(datasource_type: DatasourceType, datasource_name: str) -> DatasourceId:
-    return DatasourceId(
-        datasource_config_folder=datasource_type.config_folder,
-        datasource_name=datasource_name,
-        config_file_suffix=".yaml",
-    )
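A minimal sketch of the removed config-file helper in use; all values are hypothetical, and `DatasourceType.from_main_and_subtypes` is the constructor used by the removed discovery module further down:

```python
from pathlib import Path

from databao_context_engine.datasource_config.add_config import create_datasource_config_file
from databao_context_engine.pluginlib.build_plugin import DatasourceType

# Hypothetical datasource: a PostgreSQL config under the "databases" main type.
datasource_type = DatasourceType.from_main_and_subtypes(main_type="databases", subtype="postgres")

config_path = create_datasource_config_file(
    project_dir=Path("./my_project"),
    datasource_type=datasource_type,
    datasource_name="analytics",
    config_content={"connection": {"host": "localhost", "port": 5432}},
    overwrite_existing=False,
)

# The written file is the YAML serialisation of basic_config | config_content, roughly:
#   type: postgres
#   name: analytics
#   connection:
#     host: localhost
#     port: 5432
```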
--- a/databao_context_engine/datasource_config/datasource_context.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import os
-from dataclasses import dataclass
-from pathlib import Path
-
-from databao_context_engine.project.layout import ProjectLayout
-from databao_context_engine.project.runs import get_run_dir, resolve_run_name
-from databao_context_engine.project.types import DatasourceId
-
-
-@dataclass(eq=True, frozen=True)
-class DatasourceContext:
-    datasource_id: DatasourceId
-    # TODO: Read the context as a BuildExecutionResult instead of a Yaml string?
-    context: str
-
-
-def get_datasource_context(
-    project_layout: ProjectLayout, datasource_id: DatasourceId, run_name: str | None = None
-) -> DatasourceContext:
-    run_dir = _resolve_run_dir(project_layout, run_name)
-
-    context_path = run_dir.joinpath(datasource_id.relative_path_to_context_file())
-    if not context_path.is_file():
-        raise ValueError(f"Context file not found for datasource {str(datasource_id)} in run {run_dir.name}")
-
-    context = context_path.read_text()
-    return DatasourceContext(datasource_id=datasource_id, context=context)
-
-
-def get_all_contexts(project_layout: ProjectLayout, run_name: str | None = None) -> list[DatasourceContext]:
-    run_dir = _resolve_run_dir(project_layout, run_name)
-
-    result = []
-    for main_type_dir in sorted((p for p in run_dir.iterdir() if p.is_dir()), key=lambda p: p.name.lower()):
-        for context_path in sorted(
-            (p for p in main_type_dir.iterdir() if p.suffix in [".yaml", ".yml"]), key=lambda p: p.name.lower()
-        ):
-            result.append(
-                DatasourceContext(
-                    # FIXME: The extension will always be yaml here even if the datasource is a file with a different extension
-                    datasource_id=DatasourceId.from_datasource_config_file_path(context_path),
-                    context=context_path.read_text(),
-                )
-            )
-
-    return result
-
-
-def get_context_header_for_datasource(datasource_id: DatasourceId) -> str:
-    return f"# ===== {str(datasource_id)} ====={os.linesep}"
-
-
-def _resolve_run_dir(project_layout: ProjectLayout, run_name: str | None) -> Path:
-    resolved_run_name = resolve_run_name(project_layout=project_layout, run_name=run_name)
-
-    run_dir = get_run_dir(project_dir=project_layout.project_dir, run_name=resolved_run_name)
-    if not run_dir.is_dir():
-        raise ValueError(f"Run {resolved_run_name} does not exist at {run_dir.resolve()}")
-
-    return run_dir
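A sketch of how these context helpers composed, assuming a hypothetical project path and an already-built datasource id; `ensure_project_dir` is the `project.layout` helper the removed engine class imports:

```python
from pathlib import Path

from databao_context_engine.project.layout import ensure_project_dir

layout = ensure_project_dir(project_dir=Path("./my_project"))  # hypothetical path

ctx = get_datasource_context(
    project_layout=layout,
    datasource_id=some_datasource_id,  # hypothetical DatasourceId of a built datasource
    run_name=None,  # None resolves to the latest run via project.runs
)

# Reproduce the banner + context format used by get_all_contexts_formatted():
print(get_context_header_for_datasource(ctx.datasource_id) + ctx.context)
```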
--- a/databao_context_engine/mcp/retrieve_tool.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import datetime
-
-from databao_context_engine import DatabaoContextEngine
-
-
-def run_retrieve_tool(
-    *, databao_context_engine: DatabaoContextEngine, run_name: str | None, text: str, limit: int | None = None
-) -> str:
-    """
-    Execute the retrieve flow for MCP and return the matching display texts
-    Adds the current date to the end
-    """
-
-    retrieve_results = databao_context_engine.search_context(
-        retrieve_text=text, run_name=run_name, limit=limit, export_to_file=False
-    )
-
-    display_results = [context_search_result.context_result for context_search_result in retrieve_results]
-
-    display_results.append(f"\nToday's date is {datetime.date.today()}")
-
-    return "\n".join(display_results)
--- a/databao_context_engine/plugins/databases/athena_introspector.py
+++ /dev/null
@@ -1,101 +0,0 @@
-from __future__ import annotations
-
-from typing import Any, Mapping
-
-from pyathena import connect
-from pyathena.cursor import DictCursor
-from pydantic import Field
-
-from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
-from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
-from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
-from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
-
-
-class AthenaConfigFile(BaseDatabaseConfigFile):
-    type: str = Field(default="databases/athena")
-    connection: dict[str, Any] = Field(
-        description="Connection parameters for the Athena database. It can contain any of the keys supported by the Athena connection library"
-    )
-
-
-class AthenaIntrospector(BaseIntrospector[AthenaConfigFile]):
-    _IGNORED_SCHEMAS = {
-        "information_schema",
-    }
-    supports_catalogs = True
-
-    def _connect(self, file_config: AthenaConfigFile):
-        connection = file_config.connection
-        if not isinstance(connection, Mapping):
-            raise ValueError("Invalid YAML config: 'connection' must be a mapping of connection parameters")
-
-        return connect(**connection, cursor_class=DictCursor)
-
-    def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
-        with connection.cursor() as cur:
-            cur.execute(sql, params or {})
-            return cur.fetchall()
-
-    def _get_catalogs(self, connection, file_config: AthenaConfigFile) -> list[str]:
-        catalog = file_config.connection.get("catalog", self._resolve_pseudo_catalog_name(file_config))
-        return [catalog]
-
-    def _connect_to_catalog(self, file_config: AthenaConfigFile, catalog: str):
-        self._connect(file_config)
-
-    def _sql_list_schemas(self, catalogs: list[str] | None) -> SQLQuery:
-        if not catalogs:
-            return SQLQuery("SELECT schema_name, catalog_name FROM information_schema.schemata", None)
-        catalog = catalogs[0]
-        sql = "SELECT schema_name, catalog_name FROM information_schema.schemata WHERE catalog_name = %(catalog)s"
-        return SQLQuery(sql, {"catalog": catalog})
-
-    # TODO: Incomplete plugin. Awaiting permission access to AWS to properly develop
-    def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
-        if not schemas:
-            return []
-
-        comps = {"columns": self._sql_columns(catalog, schemas)}
-        results: dict[str, list[dict]] = {}
-
-        for name, q in comps.items():
-            results[name] = self._fetchall_dicts(connection, q.sql, q.params)
-
-        return IntrospectionModelBuilder.build_schemas_from_components(
-            schemas=schemas,
-            rels=results.get("relations", []),
-            cols=results.get("columns", []),
-            pk_cols=[],
-            uq_cols=[],
-            checks=[],
-            fk_cols=[],
-            idx_cols=[],
-        )
-
-    def _sql_columns(self, catalog: str, schemas: list[str]) -> SQLQuery:
-        sql = f"""
-            SELECT
-                table_schema AS schema_name,
-                table_name,
-                column_name,
-                ordinal_position,
-                data_type,
-                is_nullable
-            FROM
-                {catalog}.information_schema.columns
-            WHERE
-                table_schema IN ({schemas})
-            ORDER BY
-                table_schema,
-                table_name,
-                ordinal_position
-        """
-        return SQLQuery(sql, {"schema": schemas})
-
-    def _resolve_pseudo_catalog_name(self, file_config: AthenaConfigFile) -> str:
-        return "awsdatacatalog"
-
-    def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
-        sql = f'SELECT * FROM "{schema}"."{table}" LIMIT %(limit)s'
-        return SQLQuery(sql, {"limit": limit})
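For reference, a sketch of how the SQLQuery pairs built above were executed through the class's own helpers. The connection settings are hypothetical (`pyathena.connect` accepts keys such as `region_name` and `s3_staging_dir`, forwarded verbatim here), and any other fields required by `BaseDatabaseConfigFile` are omitted:

```python
from databao_context_engine.plugins.databases.athena_introspector import (
    AthenaConfigFile,
    AthenaIntrospector,
)

# Hypothetical connection mapping, forwarded as-is to pyathena.connect().
config = AthenaConfigFile(
    connection={"region_name": "eu-west-1", "s3_staging_dir": "s3://my-bucket/athena-results/"}
)

introspector = AthenaIntrospector()
conn = introspector._connect(config)

# _sql_list_schemas returns an SQLQuery(sql, params) pair;
# _fetchall_dicts executes it and returns rows as dicts via DictCursor.
query = introspector._sql_list_schemas(["awsdatacatalog"])
for row in introspector._fetchall_dicts(conn, query.sql, query.params):
    print(row["schema_name"], row["catalog_name"])
```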
--- a/databao_context_engine/plugins/duckdb_db_plugin.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
-from databao_context_engine.plugins.databases.duckdb_introspector import DuckDBConfigFile, DuckDBIntrospector
-
-
-class DuckDbPlugin(BaseDatabasePlugin[DuckDBConfigFile]):
-    id = "jetbrains/duckdb"
-    name = "DuckDB Plugin"
-    supported = {"databases/duckdb"}
-    config_file_type = DuckDBConfigFile
-
-    def __init__(self):
-        super().__init__(DuckDBIntrospector())
--- a/databao_context_engine/plugins/postgresql_db_plugin.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
-from databao_context_engine.plugins.databases.postgresql_introspector import PostgresConfigFile, PostgresqlIntrospector
-
-
-class PostgresqlDbPlugin(BaseDatabasePlugin[PostgresConfigFile]):
-    id = "jetbrains/postgres"
-    name = "PostgreSQL DB Plugin"
-    supported = {"databases/postgres"}
-    config_file_type = PostgresConfigFile
-
-    def __init__(self):
-        super().__init__(PostgresqlIntrospector())
--- a/databao_context_engine/plugins/snowflake_db_plugin.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
-from databao_context_engine.plugins.databases.snowflake_introspector import SnowflakeConfigFile, SnowflakeIntrospector
-
-
-class SnowflakeDbPlugin(BaseDatabasePlugin[SnowflakeConfigFile]):
-    id = "jetbrains/snowflake"
-    name = "Snowflake DB Plugin"
-    supported = {"databases/snowflake"}
-    config_file_type = SnowflakeConfigFile
-
-    def __init__(self):
-        super().__init__(SnowflakeIntrospector())
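The three removed plugin shells above share one 12-line shape, which 0.1.5 moves into per-database packages (see the file list). The recurring pattern, with hypothetical stand-in names:

```python
from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin

# Hypothetical stand-ins for a concrete config model and introspector.
class SomeConfigFile: ...
class SomeIntrospector: ...

class SomeDbPlugin(BaseDatabasePlugin[SomeConfigFile]):
    id = "jetbrains/somedb"            # stable plugin identifier
    name = "SomeDB Plugin"             # human-readable name
    supported = {"databases/somedb"}   # datasource types this plugin handles
    config_file_type = SomeConfigFile  # model used to parse the YAML config

    def __init__(self):
        super().__init__(SomeIntrospector())  # wire in the matching introspector
```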
--- a/databao_context_engine/project/datasource_discovery.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import logging
-from pathlib import Path
-from typing import Any
-
-import yaml
-
-from databao_context_engine.pluginlib.build_plugin import DatasourceType
-from databao_context_engine.project.layout import get_source_dir
-from databao_context_engine.project.types import (
-    DatasourceDescriptor,
-    DatasourceKind,
-    PreparedConfig,
-    PreparedDatasource,
-    PreparedFile,
-    DatasourceId,
-    Datasource,
-)
-from databao_context_engine.templating.renderer import render_template
-
-logger = logging.getLogger(__name__)
-
-
-def get_datasource_list(project_dir: Path) -> list[Datasource]:
-    result = []
-    for discovered_datasource in discover_datasources(project_dir=project_dir):
-        try:
-            prepared_source = prepare_source(discovered_datasource)
-        except Exception as e:
-            logger.debug(str(e), exc_info=True, stack_info=True)
-            logger.info(f"Invalid source at ({discovered_datasource.path}): {str(e)}")
-            continue
-
-        result.append(
-            Datasource(
-                id=DatasourceId.from_datasource_config_file_path(discovered_datasource.path),
-                type=prepared_source.datasource_type,
-            )
-        )
-
-    return result
-
-
-def discover_datasources(project_dir: Path) -> list[DatasourceDescriptor]:
-    """
-    Scan the project's src/ directory and return all discovered sources.
-
-    Rules:
-    - Each first-level directory under src/ is treated as a main_type
-    - Unsupported or unreadable entries are skipped.
-    - The returned list is sorted by directory and then filename
-    """
-    src = get_source_dir(project_dir)
-    if not src.exists() or not src.is_dir():
-        raise ValueError(f"src directory does not exist in {project_dir}")
-
-    datasources: list[DatasourceDescriptor] = []
-    for main_dir in sorted((p for p in src.iterdir() if p.is_dir()), key=lambda p: p.name.lower()):
-        for path in sorted((p for p in main_dir.iterdir() if _is_datasource_file(p)), key=lambda p: p.name.lower()):
-            datasource = load_datasource_descriptor(path)
-            if datasource is not None:
-                datasources.append(datasource)
-
-    return datasources
-
-
-def _is_datasource_file(p: Path) -> bool:
-    # ignore backup files
-    return p.is_file() and not p.suffix.endswith("~")
-
-
-def get_datasource_descriptors(project_dir: Path, datasource_ids: list[DatasourceId]):
-    src = get_source_dir(project_dir)
-    if not src.exists() or not src.is_dir():
-        raise ValueError(f"src directory does not exist in {project_dir}")
-
-    datasources: list[DatasourceDescriptor] = []
-    for datasource_id in datasource_ids:
-        config_file_path = src.joinpath(datasource_id.relative_path_to_config_file())
-        if not config_file_path.is_file():
-            raise ValueError(f"Datasource config file not found: {config_file_path}")
-
-        datasource = load_datasource_descriptor(config_file_path)
-        if datasource is not None:
-            datasources.append(datasource)
-
-    return datasources
-
-
-def load_datasource_descriptor(path: Path) -> DatasourceDescriptor | None:
-    """
-    Load a single file with src/<parent_name>/ into a DatasourceDescriptor
-    """
-    if not path.is_file():
-        return None
-
-    parent_name = path.parent.name
-    extension = path.suffix.lower().lstrip(".")
-
-    if parent_name == "files":
-        return DatasourceDescriptor(path=path.resolve(), main_type=parent_name, kind=DatasourceKind.FILE)
-
-    if extension in {"yaml", "yml"}:
-        return DatasourceDescriptor(path=path.resolve(), main_type=parent_name, kind=DatasourceKind.CONFIG)
-
-    if extension:
-        return DatasourceDescriptor(path=path.resolve(), main_type=parent_name, kind=DatasourceKind.FILE)
-
-    logger.debug("Skipping file without extension: %s", path)
-    return None
-
-
-def prepare_source(datasource: DatasourceDescriptor) -> PreparedDatasource:
-    """
-    Convert a discovered datasource into a prepared datasource ready for plugin execution
-    """
-    if datasource.kind is DatasourceKind.FILE:
-        file_subtype = datasource.path.suffix.lower().lstrip(".")
-        return PreparedFile(
-            datasource_type=DatasourceType.from_main_and_subtypes(main_type=datasource.main_type, subtype=file_subtype),
-            path=datasource.path,
-        )
-
-    else:
-        config = _parse_config_file(datasource.path)
-
-        subtype = config.get("type")
-        if not subtype or not isinstance(subtype, str):
-            raise ValueError("Config missing 'type' at %s - skipping", datasource.path)
-
-        return PreparedConfig(
-            datasource_type=DatasourceType.from_main_and_subtypes(main_type=datasource.main_type, subtype=subtype),
-            path=datasource.path,
-            config=config,
-            datasource_name=datasource.path.stem,
-        )
-
-
-def _parse_config_file(file_path: Path) -> dict[Any, Any]:
-    rendered_file = render_template(file_path.read_text())
-
-    return yaml.safe_load(rendered_file) or {}
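A sketch of the discovery rules encoded in `load_datasource_descriptor` above, with hypothetical paths (the function also requires each path to actually exist as a file, otherwise it returns None):

```python
from pathlib import Path

from databao_context_engine.project.datasource_discovery import load_datasource_descriptor

# Everything under src/files/ is treated as a FILE datasource:
load_datasource_descriptor(Path("src/files/report.pdf"))       # kind=FILE,   main_type="files"

# Under any other main_type directory, YAML means a CONFIG datasource...
load_datasource_descriptor(Path("src/databases/my_pg.yaml"))   # kind=CONFIG, main_type="databases"

# ...any other extension means a FILE datasource...
load_datasource_descriptor(Path("src/databases/data.duckdb"))  # kind=FILE,   main_type="databases"

# ...and a file with no extension is skipped:
load_datasource_descriptor(Path("src/databases/README"))       # None
```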
--- a/databao_context_engine/project/runs.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from pathlib import Path
-
-from databao_context_engine.project.layout import ProjectLayout, get_output_dir
-from databao_context_engine.storage.connection import open_duckdb_connection
-from databao_context_engine.storage.repositories.factories import create_run_repository
-from databao_context_engine.storage.repositories.run_repository import RunRepository
-from databao_context_engine.system.properties import get_db_path
-
-
-def resolve_run_name(*, project_layout: ProjectLayout, run_name: str | None) -> str:
-    project_id = str(project_layout.read_config_file().project_id)
-
-    with open_duckdb_connection(get_db_path()) as conn:
-        run_repository = create_run_repository(conn)
-
-        return resolve_run_name_from_repo(run_repository=run_repository, project_id=project_id, run_name=run_name)
-
-
-def resolve_run_name_from_repo(*, run_repository: RunRepository, project_id: str, run_name: str | None) -> str:
-    if run_name is None:
-        latest = run_repository.get_latest_run_for_project(project_id=project_id)
-        if latest is None:
-            raise LookupError(f"No runs found for project '{project_id}'. Run a build first.")
-        return latest.run_name
-    else:
-        run = run_repository.get_by_run_name(project_id=project_id, run_name=run_name)
-        if run is None:
-            raise LookupError(f"Run '{run_name}' not found for project '{project_id}'.")
-        return run.run_name
-
-
-def get_run_dir(project_dir: Path, run_name: str) -> Path:
-    run_dir = get_output_dir(project_dir).joinpath(run_name)
-    if not run_dir.is_dir():
-        raise ValueError(
-            f"The run with name {run_name} doesn't exist in the project. [project_dir: {project_dir.resolve()}]"
-        )
-
-    return run_dir
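Finally, a sketch of the run-name fallback implemented above; the repository instance and identifiers are hypothetical:

```python
from pathlib import Path

from databao_context_engine.project.runs import get_run_dir, resolve_run_name_from_repo

# run_name=None falls back to the project's latest run (LookupError if none exist);
# an explicit run_name must exist in the repository or LookupError is raised.
name = resolve_run_name_from_repo(
    run_repository=run_repository,  # hypothetical RunRepository instance
    project_id="my-project-id",
    run_name=None,
)

# Map the resolved name onto the output directory, failing if it is missing on disk.
run_dir = get_run_dir(project_dir=Path("./my_project"), run_name=name)
```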