databao-context-engine 0.1.4.dev1__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +14 -1
- databao_context_engine/build_sources/build_runner.py +7 -7
- databao_context_engine/build_sources/build_wiring.py +8 -10
- databao_context_engine/build_sources/plugin_execution.py +9 -12
- databao_context_engine/cli/add_datasource_config.py +9 -30
- databao_context_engine/cli/commands.py +29 -13
- databao_context_engine/databao_context_engine.py +3 -13
- databao_context_engine/databao_context_project_manager.py +56 -29
- databao_context_engine/datasources/check_config.py +13 -16
- databao_context_engine/datasources/datasource_context.py +21 -24
- databao_context_engine/datasources/datasource_discovery.py +45 -44
- databao_context_engine/datasources/types.py +53 -42
- databao_context_engine/generate_configs_schemas.py +4 -5
- databao_context_engine/introspection/property_extract.py +52 -47
- databao_context_engine/llm/__init__.py +10 -0
- databao_context_engine/llm/api.py +57 -0
- databao_context_engine/llm/descriptions/ollama.py +1 -3
- databao_context_engine/llm/factory.py +5 -2
- databao_context_engine/llm/install.py +13 -10
- databao_context_engine/llm/runtime.py +3 -5
- databao_context_engine/mcp/mcp_server.py +1 -3
- databao_context_engine/plugin_loader.py +6 -7
- databao_context_engine/pluginlib/build_plugin.py +0 -33
- databao_context_engine/plugins/databases/athena/__init__.py +0 -0
- databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{athena_introspector.py → athena/athena_introspector.py} +2 -5
- databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +1 -3
- databao_context_engine/plugins/databases/base_introspector.py +11 -14
- databao_context_engine/plugins/databases/clickhouse/__init__.py +0 -0
- databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +2 -5
- databao_context_engine/plugins/databases/duckdb/__init__.py +0 -0
- databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +4 -7
- databao_context_engine/plugins/databases/mssql/__init__.py +0 -0
- databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +9 -10
- databao_context_engine/plugins/databases/mysql/__init__.py +0 -0
- databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +8 -8
- databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
- databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +9 -16
- databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
- databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +8 -9
- databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
- databao_context_engine/plugins/dbt/__init__.py +0 -0
- databao_context_engine/plugins/dbt/dbt_chunker.py +47 -0
- databao_context_engine/plugins/dbt/dbt_context_extractor.py +106 -0
- databao_context_engine/plugins/dbt/dbt_plugin.py +25 -0
- databao_context_engine/plugins/dbt/types.py +44 -0
- databao_context_engine/plugins/dbt/types_artifacts.py +58 -0
- databao_context_engine/plugins/files/__init__.py +0 -0
- databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
- databao_context_engine/plugins/plugin_loader.py +13 -15
- databao_context_engine/plugins/resources/parquet_introspector.py +1 -1
- databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
- databao_context_engine/project/layout.py +12 -13
- databao_context_engine/retrieve_embeddings/retrieve_runner.py +3 -16
- databao_context_engine/retrieve_embeddings/retrieve_service.py +13 -6
- databao_context_engine/retrieve_embeddings/retrieve_wiring.py +4 -7
- databao_context_engine/serialization/yaml.py +5 -5
- databao_context_engine/storage/migrate.py +1 -1
- databao_context_engine/storage/repositories/vector_search_repository.py +18 -6
- databao_context_engine-0.1.6.dist-info/METADATA +228 -0
- {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/RECORD +71 -55
- databao_context_engine/datasources/add_config.py +0 -34
- databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
- databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
- databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
- databao_context_engine/retrieve_embeddings/export_results.py +0 -12
- databao_context_engine-0.1.4.dev1.dist-info/METADATA +0 -75
- {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/WHEEL +0 -0
- {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/entry_points.txt +0 -0

databao_context_engine/plugins/plugin_loader.py:

```diff
@@ -18,9 +18,8 @@ def load_plugins(exclude_file_plugins: bool = False) -> dict[DatasourceType, BuildPlugin]:
     """Load both builtin and external plugins and merges them into one list."""
     builtin_plugins = _load_builtin_plugins(exclude_file_plugins)
     external_plugins = _load_external_plugins(exclude_file_plugins)
-    plugins = _merge_plugins(builtin_plugins, external_plugins)
 
-    return
+    return _merge_plugins(builtin_plugins, external_plugins)
 
 
 def _load_builtin_plugins(exclude_file_plugins: bool = False) -> list[BuildPlugin]:
@@ -35,7 +34,7 @@ def _load_builtin_plugins(exclude_file_plugins: bool = False) -> list[BuildPlugin]:
 
 
 def _load_builtin_file_plugins() -> list[BuildFilePlugin]:
-    from databao_context_engine.plugins.unstructured_files_plugin import InternalUnstructuredFilesPlugin
+    from databao_context_engine.plugins.files.unstructured_files_plugin import InternalUnstructuredFilesPlugin
 
     return [
         InternalUnstructuredFilesPlugin(),
@@ -44,57 +43,56 @@ def _load_builtin_file_plugins() -> list[BuildFilePlugin]:
 
 def _load_builtin_datasource_plugins() -> list[BuildDatasourcePlugin]:
     """Statically register built-in plugins."""
-    from databao_context_engine.plugins.duckdb_db_plugin import DuckDbPlugin
-    from databao_context_engine.plugins.
+    from databao_context_engine.plugins.databases.duckdb.duckdb_db_plugin import DuckDbPlugin
+    from databao_context_engine.plugins.databases.sqlite.sqlite_db_plugin import SQLiteDbPlugin
+    from databao_context_engine.plugins.dbt.dbt_plugin import DbtPlugin
+    from databao_context_engine.plugins.resources.parquet_plugin import ParquetPlugin
 
     # optional plugins are added to the python environment via extras
     optional_plugins: list[BuildDatasourcePlugin] = []
     try:
-        from databao_context_engine.plugins.mssql_db_plugin import MSSQLDbPlugin
+        from databao_context_engine.plugins.databases.mssql.mssql_db_plugin import MSSQLDbPlugin
 
         optional_plugins = [MSSQLDbPlugin()]
     except ImportError:
         pass
 
     try:
-        from databao_context_engine.plugins.clickhouse_db_plugin import ClickhouseDbPlugin
+        from databao_context_engine.plugins.databases.clickhouse.clickhouse_db_plugin import ClickhouseDbPlugin
 
         optional_plugins.append(ClickhouseDbPlugin())
     except ImportError:
         pass
 
     try:
-        from databao_context_engine.plugins.athena_db_plugin import AthenaDbPlugin
+        from databao_context_engine.plugins.databases.athena.athena_db_plugin import AthenaDbPlugin
 
         optional_plugins.append(AthenaDbPlugin())
     except ImportError:
         pass
 
     try:
-        from databao_context_engine.plugins.snowflake_db_plugin import SnowflakeDbPlugin
+        from databao_context_engine.plugins.databases.snowflake.snowflake_db_plugin import SnowflakeDbPlugin
 
         optional_plugins.append(SnowflakeDbPlugin())
     except ImportError:
         pass
 
     try:
-        from databao_context_engine.plugins.mysql_db_plugin import MySQLDbPlugin
+        from databao_context_engine.plugins.databases.mysql.mysql_db_plugin import MySQLDbPlugin
 
         optional_plugins.append(MySQLDbPlugin())
     except ImportError:
         pass
 
     try:
-        from databao_context_engine.plugins.postgresql_db_plugin import PostgresqlDbPlugin
+        from databao_context_engine.plugins.databases.postgresql.postgresql_db_plugin import PostgresqlDbPlugin
 
         optional_plugins.append(PostgresqlDbPlugin())
     except ImportError:
         pass
 
-    required_plugins: list[BuildDatasourcePlugin] = [
-        DuckDbPlugin(),
-        ParquetPlugin(),
-    ]
+    required_plugins: list[BuildDatasourcePlugin] = [DuckDbPlugin(), ParquetPlugin(), SQLiteDbPlugin(), DbtPlugin()]
     return required_plugins + optional_plugins
 
 
```
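
A hedged sketch of consuming the loader; the import path and the return type come from the hunk above, while the iteration details are illustrative assumptions:

```python
from databao_context_engine.plugins.plugin_loader import load_plugins

# Merged mapping of DatasourceType -> BuildPlugin; optional database plugins
# appear only when the matching extra (e.g. [postgresql]) is installed.
plugins = load_plugins(exclude_file_plugins=True)
for datasource_type, plugin in plugins.items():
    print(datasource_type, type(plugin).__name__)
```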

databao_context_engine/plugins/resources/parquet_introspector.py:

```diff
@@ -12,7 +12,7 @@ from pydantic import BaseModel, Field
 from databao_context_engine.pluginlib.config import DuckDBSecret
 from databao_context_engine.plugins.duckdb_tools import fetchall_dicts, generate_create_secret_sql
 
-parquet_type = "
+parquet_type = "parquet"
 
 logger = logging.getLogger(__name__)
 
```

databao_context_engine/plugins/resources/parquet_plugin.py:

```diff
@@ -24,9 +24,7 @@ class ParquetPlugin(BuildDatasourcePlugin[ParquetConfigFile]):
         return build_parquet_chunks(context)
 
     def build_context(self, full_type: str, datasource_name: str, file_config: ParquetConfigFile) -> Any:
-
-
-        return introspection_result
+        return self._introspector.introspect(file_config)
 
     def check_connection(self, full_type: str, datasource_name: str, file_config: ParquetConfigFile) -> None:
         self._introspector.check_connection(file_config)
```

databao_context_engine/project/layout.py:

```diff
@@ -2,7 +2,6 @@ import logging
 from dataclasses import dataclass
 from pathlib import Path
 
-from databao_context_engine.datasources.types import DatasourceId
 from databao_context_engine.project.project_config import ProjectConfig
 
 SOURCE_FOLDER_NAME = "src"
@@ -25,6 +24,14 @@ class ProjectLayout:
     def read_config_file(self) -> ProjectConfig:
         return ProjectConfig.from_file(self.config_file)
 
+    @property
+    def src_dir(self) -> Path:
+        return get_source_dir(self.project_dir)
+
+    @property
+    def output_dir(self) -> Path:
+        return get_output_dir(self.project_dir)
+
 
 def ensure_project_dir(project_dir: Path) -> ProjectLayout:
     return _ProjectValidator(project_dir).ensure_project_dir_valid()
@@ -62,22 +69,14 @@ def get_logs_dir(project_dir: Path) -> Path:
     return project_dir.joinpath(LOGS_FOLDER_NAME)
 
 
-def ensure_datasource_config_file_doesnt_exist(project_dir: Path, datasource_id: DatasourceId) -> Path:
-    config_file = get_source_dir(project_dir).joinpath(datasource_id.relative_path_to_config_file())
-
-    if config_file.is_file():
-        raise ValueError(f"A config file already exists for {str(datasource_id)}")
-
-    return config_file
-
-
 def create_datasource_config_file(
-
+    project_layout: ProjectLayout, datasource_relative_name: str, config_content: str, overwrite_existing: bool
 ) -> Path:
+    config_file = project_layout.src_dir / datasource_relative_name
     if not overwrite_existing:
-
+        if config_file.is_file():
+            raise ValueError(f"A config file already exists {config_file}")
 
-    config_file = get_source_dir(project_dir).joinpath(datasource_id.relative_path_to_config_file())
     config_file.parent.mkdir(parents=True, exist_ok=True)
 
     config_file.write_text(config_content)
```
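
A minimal usage sketch of the reworked helper, assuming a hypothetical relative name and YAML payload; only the signature, `ensure_project_dir`, and the `src_dir` property are taken from the hunks above:

```python
from pathlib import Path

from databao_context_engine.project.layout import create_datasource_config_file, ensure_project_dir

layout = ensure_project_dir(Path("dce-project"))
config_path = create_datasource_config_file(
    project_layout=layout,
    datasource_relative_name="databases/postgresql/my_db.yaml",  # hypothetical name
    config_content="type: postgresql\n",  # hypothetical content
    overwrite_existing=False,  # raises ValueError if the file already exists
)
print(config_path)
```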

databao_context_engine/retrieve_embeddings/retrieve_runner.py:

```diff
@@ -1,8 +1,6 @@
 import logging
-from pathlib import Path
 
-from databao_context_engine.
-from databao_context_engine.retrieve_embeddings.export_results import export_retrieve_results
+from databao_context_engine.datasources.types import DatasourceId
 from databao_context_engine.retrieve_embeddings.retrieve_service import RetrieveService
 from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchResult
 
@@ -10,21 +8,10 @@ logger = logging.getLogger(__name__)
 
 
 def retrieve(
-    project_dir: Path,
     *,
     retrieve_service: RetrieveService,
-    project_id: str,
     text: str,
     limit: int | None,
-
+    datasource_ids: list[DatasourceId] | None = None,
 ) -> list[VectorSearchResult]:
-
-
-    if export_to_file:
-        export_directory = get_output_dir(project_dir)
-
-        display_texts = [result.display_text for result in retrieve_results]
-        export_file = export_retrieve_results(export_directory, display_texts)
-        logger.info(f"Exported results to {export_file}")
-
-    return retrieve_results
+    return retrieve_service.retrieve(text=text, limit=limit, datasource_ids=datasource_ids)
```

databao_context_engine/retrieve_embeddings/retrieve_service.py:

```diff
@@ -1,6 +1,7 @@
 import logging
 from collections.abc import Sequence
 
+from databao_context_engine.datasources.types import DatasourceId
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
 from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
 from databao_context_engine.storage.repositories.vector_search_repository import (
@@ -23,7 +24,9 @@ class RetrieveService:
         self._provider = provider
         self._vector_search_repo = vector_search_repo
 
-    def retrieve(
+    def retrieve(
+        self, *, text: str, limit: int | None = None, datasource_ids: list[DatasourceId] | None = None
+    ) -> list[VectorSearchResult]:
         if limit is None:
             limit = 10
 
@@ -40,15 +43,19 @@
             retrieve_vec=retrieve_vec,
             dimension=dimension,
             limit=limit,
+            datasource_ids=datasource_ids,
         )
 
         logger.debug(f"Retrieved {len(search_results)} display texts in table {table_name}")
 
         if logger.isEnabledFor(logging.DEBUG):
-
-
-
-
-
+            if search_results:
+                closest_result = min(search_results, key=lambda result: result.cosine_distance)
+                logger.debug(f"Best result: ({closest_result.cosine_distance}, {closest_result.embeddable_text})")
+
+                farthest_result = max(search_results, key=lambda result: result.cosine_distance)
+                logger.debug(f"Worst result: ({farthest_result.cosine_distance}, {farthest_result.embeddable_text})")
+            else:
+                logger.debug("No results found")
 
         return search_results
```

databao_context_engine/retrieve_embeddings/retrieve_wiring.py:

```diff
@@ -1,8 +1,9 @@
 from duckdb import DuckDBPyConnection
 
+from databao_context_engine.datasources.types import DatasourceId
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
 from databao_context_engine.llm.factory import create_ollama_embedding_provider, create_ollama_service
-from databao_context_engine.project.layout import ProjectLayout
+from databao_context_engine.project.layout import ProjectLayout
 from databao_context_engine.retrieve_embeddings.retrieve_runner import retrieve
 from databao_context_engine.retrieve_embeddings.retrieve_service import RetrieveService
 from databao_context_engine.services.factories import create_shard_resolver
@@ -16,21 +17,17 @@ def retrieve_embeddings(
     project_layout: ProjectLayout,
     retrieve_text: str,
     limit: int | None,
-
+    datasource_ids: list[DatasourceId] | None,
 ) -> list[VectorSearchResult]:
-    ensure_project_dir(project_layout.project_dir)
-
     with open_duckdb_connection(get_db_path(project_layout.project_dir)) as conn:
         ollama_service = create_ollama_service()
         embedding_provider = create_ollama_embedding_provider(ollama_service)
         retrieve_service = _create_retrieve_service(conn, embedding_provider=embedding_provider)
         return retrieve(
-            project_dir=project_layout.project_dir,
             retrieve_service=retrieve_service,
-            project_id=str(project_layout.read_config_file().project_id),
             text=retrieve_text,
             limit=limit,
-
+            datasource_ids=datasource_ids,
         )
 
 
```
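
A hedged end-to-end sketch of the new retrieval entry point, assuming an existing project directory and a running Ollama; the parameter names come from the hunk above, while the project path and query text are hypothetical:

```python
from pathlib import Path

from databao_context_engine.project.layout import ensure_project_dir
from databao_context_engine.retrieve_embeddings.retrieve_wiring import retrieve_embeddings

layout = ensure_project_dir(Path("dce-project"))  # hypothetical project dir
results = retrieve_embeddings(
    project_layout=layout,
    retrieve_text="monthly revenue by region",  # hypothetical query text
    limit=5,
    datasource_ids=None,  # or a list[DatasourceId] to restrict the search
)
for result in results:
    # Fields follow the repository's SELECT list shown further below.
    print(result.cosine_distance, result.display_text)
```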

databao_context_engine/serialization/yaml.py:

```diff
@@ -7,17 +7,17 @@ from yaml import Node, SafeDumper
 def default_representer(dumper: SafeDumper, data: object) -> Node:
     if isinstance(data, Mapping):
         return dumper.represent_dict(data)
-
+    if hasattr(data, "__dict__"):
         # Doesn't serialize "private" attributes (that starts with an _)
         data_public_attributes = {key: value for key, value in data.__dict__.items() if not key.startswith("_")}
         if data_public_attributes:
             return dumper.represent_dict(data_public_attributes)
-
-
-        return dumper.represent_str(str(data))
-    else:
+
+        # If there is no public attributes, we default to the string representation
         return dumper.represent_str(str(data))
 
+    return dumper.represent_str(str(data))
+
 
 # Registers our default representer only once, when that file is imported
 yaml.add_multi_representer(object, default_representer, Dumper=SafeDumper)
```
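
To illustrate the effect, a small sketch; the `Example` class is hypothetical, and importing the module is what registers the representer, per the last line of the hunk:

```python
import yaml

import databao_context_engine.serialization.yaml  # noqa: F401  (registers the representer)


class Example:
    def __init__(self) -> None:
        self.name = "orders"
        self._secret = "hidden"  # private attribute, skipped by the representer


print(yaml.safe_dump(Example()))  # -> "name: orders"
```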

databao_context_engine/storage/migrate.py:

```diff
@@ -69,7 +69,7 @@ class _Migration:
 def _create_migration(file: Path) -> _Migration:
     query_bytes = file.read_bytes()
     query = query_bytes.decode("utf-8")
-    checksum = hashlib.md5(query_bytes).hexdigest()
+    checksum = hashlib.md5(query_bytes, usedforsecurity=False).hexdigest()
     version = _extract_version_from_name(file.name)
     return _Migration(name=file.name, version=version, checksum=checksum, query=query)
 
```
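
For context: `usedforsecurity=False`, available since Python 3.9, marks the digest as non-cryptographic so the call keeps working on FIPS-restricted builds; the resulting hex digest is unchanged. A standalone illustration:

```python
import hashlib

# Same call shape as the migration checksum above; the flag only declares intent.
print(hashlib.md5(b"SELECT 1;", usedforsecurity=False).hexdigest())
```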

databao_context_engine/storage/repositories/vector_search_repository.py:

```diff
@@ -1,5 +1,6 @@
 from collections.abc import Sequence
 from dataclasses import dataclass
+from typing import Any
 
 import duckdb
 
@@ -23,27 +24,38 @@ class VectorSearchRepository:
         self._conn = conn
 
     def get_display_texts_by_similarity(
-        self,
+        self,
+        *,
+        table_name: str,
+        retrieve_vec: Sequence[float],
+        dimension: int,
+        limit: int,
+        datasource_ids: list[DatasourceId] | None = None,
     ) -> list[VectorSearchResult]:
         """Read only similarity search on a specific embedding shard table."""
+        params: list[Any] = [list(retrieve_vec), self._DEFAULT_DISTANCE_THRESHOLD, list(retrieve_vec), limit]
+        if datasource_ids:
+            params.append([str(datasource_id) for datasource_id in datasource_ids])
+
         rows = self._conn.execute(
             f"""
             SELECT
                 COALESCE(c.display_text, c.embeddable_text) AS display_text,
                 c.embeddable_text,
-                array_cosine_distance(e.vec, CAST(
+                array_cosine_distance(e.vec, CAST($1 AS FLOAT[{dimension}])) AS cosine_distance,
                 c.full_type,
                 c.datasource_id,
             FROM
                 {table_name} e
                 JOIN chunk c ON e.chunk_id = c.chunk_id
             WHERE
-                cosine_distance <
+                cosine_distance < $2
+                {"AND c.datasource_id IN $5" if datasource_ids else ""}
             ORDER BY
-                array_cosine_distance(e.vec, CAST(
-            LIMIT
+                array_cosine_distance(e.vec, CAST($3 AS FLOAT[{dimension}])) ASC
+            LIMIT $4
             """,
-
+            params,
         ).fetchall()
 
         return [
```
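
The query now binds values through DuckDB positional parameters (`$1`…`$5`) supplied as a Python list, instead of interpolating them into the SQL text. A minimal standalone sketch of that binding pattern (not this repository's code):

```python
import duckdb

conn = duckdb.connect()  # in-memory database
# $1/$2 are bound from the list, mirroring how `params` is passed above.
rows = conn.execute("SELECT $1 + $2 AS answer", [40, 2]).fetchall()
print(rows)  # [(42,)]
```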

databao_context_engine-0.1.6.dist-info/METADATA (new file, +228 lines):

````
Metadata-Version: 2.3
Name: databao-context-engine
Version: 0.1.6
Summary: Semantic context for your LLMs — generated automatically
Requires-Dist: click>=8.3.0
Requires-Dist: duckdb>=1.4.3
Requires-Dist: pyyaml>=6.0.3
Requires-Dist: requests>=2.32.5
Requires-Dist: mcp>=1.23.3
Requires-Dist: pydantic>=2.12.4
Requires-Dist: jinja2>=3.1.6
Requires-Dist: pyathena>=3.25.0 ; extra == 'athena'
Requires-Dist: clickhouse-connect>=0.10.0 ; extra == 'clickhouse'
Requires-Dist: mssql-python>=1.0.0 ; extra == 'mssql'
Requires-Dist: pymysql>=1.1.2 ; extra == 'mysql'
Requires-Dist: asyncpg>=0.31.0 ; extra == 'postgresql'
Requires-Dist: snowflake-connector-python>=4.2.0 ; extra == 'snowflake'
Requires-Python: >=3.12
Provides-Extra: athena
Provides-Extra: clickhouse
Provides-Extra: mssql
Provides-Extra: mysql
Provides-Extra: postgresql
Provides-Extra: snowflake
Description-Content-Type: text/markdown

[](https://github.com/JetBrains#jetbrains-on-github)
[](https://pypi.org/project/databao-context-engine)
[](https://github.com/JetBrains/databao-context-engine?tab=License-1-ov-file)

[//]: # ([](https://pypi.org/project/databao-context-engine/))


<h1 align="center">Databao Context Engine</h1>
<p align="center">
  <b>Semantic context for your LLMs — generated automatically.</b><br/>
  No more copying schemas. No manual documentation. Just accurate answers.
</p>
<p align="center">
  <a href="https://databao.app">Website</a> •
  <a href="#quickstart">Quickstart</a> •
  <a href="#supported-data-sources">Data Sources</a> •
  <a href="#contributing">Contributing</a>
</p>

---

## What is Databao Context Engine?

Databao Context Engine is a CLI tool that **automatically generates governed semantic context** from your databases, BI tools, documents, and spreadsheets.

Integrate it with any LLM to deliver **accurate, context-aware answers** — without copying schemas or writing documentation by hand.

```
Your data sources → Context Engine → Unified semantic graph → Any LLM
```

## Why choose Databao Context Engine?

| Feature                    | What it means for you                                          |
|----------------------------|----------------------------------------------------------------|
| **Auto-generated context** | Extracts schemas, relationships, and semantics automatically   |
| **Runs locally**           | Your data never leaves your environment                        |
| **MCP integration**        | Works with Claude Desktop, Cursor, and any MCP-compatible tool |
| **Multiple sources**       | Databases, dbt projects, spreadsheets, documents               |
| **Built-in benchmarks**    | Measure and improve context quality over time                  |
| **LLM agnostic**           | OpenAI, Anthropic, Ollama, Gemini — use any model              |
| **Governed & versioned**   | Track, version, and share context across your team             |
| **Dynamic or static**      | Serve context via MCP server or export as artifact             |

## Installation

Databao Context Engine is [available on PyPI](https://pypi.org/project/databao-context-engine/) and can be installed with uv, pip, or another package manager.

### Using uv

1. Install Databao Context Engine:

   ```bash
   uv tool install databao-context-engine
   ```

1. Add it to your PATH:

   ```bash
   uv tool update-shell
   ```

1. Verify the installation:

   ```bash
   dce --help
   ```

### Using pip

1. Install Databao Context Engine:

   ```bash
   pip install databao-context-engine
   ```

1. Verify the installation:

   ```bash
   dce --help
   ```

## Supported data sources

* <img src="https://cdn.simpleicons.org/postgresql/316192" width="16" height="16" alt=""> PostgreSQL
* <img src="https://cdn.simpleicons.org/mysql/4479A1" width="16" height="16" alt=""> MySQL
* <img src="https://cdn.simpleicons.org/sqlite/003B57" width="16" height="16" alt=""> SQLite
* <img src="https://cdn.simpleicons.org/duckdb/FFF000" width="16" height="16" alt=""> DuckDB
* <img src="https://cdn.simpleicons.org/dbt/FF694B" width="16" height="16" alt=""> dbt projects
* 📄 Documents & spreadsheets *(coming soon)*

## Supported LLMs

| Provider      | Configuration                                |
|---------------|----------------------------------------------|
| **Ollama**    | `languageModel: OLLAMA`: runs locally, free  |
| **OpenAI**    | `languageModel: OPENAI`: requires an API key |
| **Anthropic** | `languageModel: CLAUDE`: requires an API key |
| **Google**    | `languageModel: GEMINI`: requires an API key |

## Quickstart

### 1. Create a project

1. Create a new directory for your project and navigate to it:

   ```bash
   mkdir dce-project && cd dce-project
   ```

1. Initialize a new project:

   ```bash
   dce init
   ```

### 2. Configure data sources

1. When prompted, agree to create a new datasource.
   You can also use the `dce datasource add` command.

1. Provide the data source type and its name.

1. Open the config file that was created for you in your editor and fill in the connection details.

1. Repeat these steps for all data sources you want to include in your project.

1. If you have data in Markdown or text files,
   you can add them to the `dce/src/files` directory.

### 3. Build context

1. To build the context, run the following command:

   ```bash
   dce build
   ```

### 4. Use Context with Your LLM

**Option A: Dynamic via MCP Server**

Databao Context Engine exposes the context through a local MCP Server, so your agent can access the latest context at runtime.

1. In **Claude Desktop**, **Cursor**, or another MCP-compatible agent, add the following configuration.
   Replace `dce-project/` with the path to your project directory:

   ```json
   # claude_desktop_config.json, mcp.json, or similar

   {
     "mcpServers": {
       "dce": {
         "command": "dce mcp",
         "args": ["--project-dir", "dce-project/"]
       }
     }
   }
   ```

1. Save the file and restart your agent.

1. Open a new chat, select the `dce` server in the chat window, and ask questions related to your project context.

**Option B: Static artifact**

Even if you don’t have Claude or Cursor installed on your local machine,
you can still use the context built by Databao Context Engine by pasting it directly into your chat with an AI assistant.

1. Navigate to `dce-project/output/` and open the directory with the latest run.

1. Attach the `all_results.yaml` file to your chat with the AI assistant or copy and paste its contents into your chat.

## Contributing

We’d love your help! Here’s how to get involved:

- ⭐ **Star this repo** — it helps others find us!
- 🐛 **Found a bug?** [Open an issue](https://github.com/JetBrains/databao-context-engine/issues)
- 💡 **Have an idea?** We’re all ears — create a feature request
- 👍 **Upvote issues** you care about — helps us prioritize
- 🔧 **Submit a PR**
- 📝 **Improve docs** — typos, examples, tutorials — everything helps!

New to open source? No worries! We're friendly and happy to help you get started. 🌱

For more details, see [CONTRIBUTING](CONTRIBUTING.md).

## 📄 License

Apache 2.0 — use it however you want. See the [LICENSE](LICENSE.md) file for details.

---

<p align="center">
  <b>Like Databao Context Engine?</b> Give us a ⭐ — it means a lot!
</p>

<p align="center">
  <a href="https://databao.app">Website</a> •
  <a href="https://discord.gg/databao">Discord</a>
</p>
````