databao-context-engine 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. databao_context_engine/__init__.py +35 -0
  2. databao_context_engine/build_sources/__init__.py +0 -0
  3. databao_context_engine/build_sources/internal/__init__.py +0 -0
  4. databao_context_engine/build_sources/internal/build_runner.py +111 -0
  5. databao_context_engine/build_sources/internal/build_service.py +77 -0
  6. databao_context_engine/build_sources/internal/build_wiring.py +52 -0
  7. databao_context_engine/build_sources/internal/export_results.py +43 -0
  8. databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
  9. databao_context_engine/build_sources/public/__init__.py +0 -0
  10. databao_context_engine/build_sources/public/api.py +4 -0
  11. databao_context_engine/cli/__init__.py +0 -0
  12. databao_context_engine/cli/add_datasource_config.py +130 -0
  13. databao_context_engine/cli/commands.py +256 -0
  14. databao_context_engine/cli/datasources.py +64 -0
  15. databao_context_engine/cli/info.py +32 -0
  16. databao_context_engine/config/__init__.py +0 -0
  17. databao_context_engine/config/log_config.yaml +16 -0
  18. databao_context_engine/config/logging.py +43 -0
  19. databao_context_engine/databao_context_project_manager.py +92 -0
  20. databao_context_engine/databao_engine.py +85 -0
  21. databao_context_engine/datasource_config/__init__.py +0 -0
  22. databao_context_engine/datasource_config/add_config.py +50 -0
  23. databao_context_engine/datasource_config/check_config.py +131 -0
  24. databao_context_engine/datasource_config/datasource_context.py +60 -0
  25. databao_context_engine/event_journal/__init__.py +0 -0
  26. databao_context_engine/event_journal/writer.py +29 -0
  27. databao_context_engine/generate_configs_schemas.py +92 -0
  28. databao_context_engine/init_project.py +18 -0
  29. databao_context_engine/introspection/__init__.py +0 -0
  30. databao_context_engine/introspection/property_extract.py +202 -0
  31. databao_context_engine/llm/__init__.py +0 -0
  32. databao_context_engine/llm/config.py +20 -0
  33. databao_context_engine/llm/descriptions/__init__.py +0 -0
  34. databao_context_engine/llm/descriptions/ollama.py +21 -0
  35. databao_context_engine/llm/descriptions/provider.py +10 -0
  36. databao_context_engine/llm/embeddings/__init__.py +0 -0
  37. databao_context_engine/llm/embeddings/ollama.py +37 -0
  38. databao_context_engine/llm/embeddings/provider.py +13 -0
  39. databao_context_engine/llm/errors.py +16 -0
  40. databao_context_engine/llm/factory.py +61 -0
  41. databao_context_engine/llm/install.py +227 -0
  42. databao_context_engine/llm/runtime.py +73 -0
  43. databao_context_engine/llm/service.py +159 -0
  44. databao_context_engine/main.py +19 -0
  45. databao_context_engine/mcp/__init__.py +0 -0
  46. databao_context_engine/mcp/all_results_tool.py +5 -0
  47. databao_context_engine/mcp/mcp_runner.py +16 -0
  48. databao_context_engine/mcp/mcp_server.py +63 -0
  49. databao_context_engine/mcp/retrieve_tool.py +22 -0
  50. databao_context_engine/pluginlib/__init__.py +0 -0
  51. databao_context_engine/pluginlib/build_plugin.py +107 -0
  52. databao_context_engine/pluginlib/config.py +37 -0
  53. databao_context_engine/pluginlib/plugin_utils.py +68 -0
  54. databao_context_engine/plugins/__init__.py +0 -0
  55. databao_context_engine/plugins/athena_db_plugin.py +12 -0
  56. databao_context_engine/plugins/base_db_plugin.py +45 -0
  57. databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
  58. databao_context_engine/plugins/databases/__init__.py +0 -0
  59. databao_context_engine/plugins/databases/athena_introspector.py +101 -0
  60. databao_context_engine/plugins/databases/base_introspector.py +144 -0
  61. databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
  62. databao_context_engine/plugins/databases/database_chunker.py +69 -0
  63. databao_context_engine/plugins/databases/databases_types.py +114 -0
  64. databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
  65. databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
  66. databao_context_engine/plugins/databases/introspection_scope.py +74 -0
  67. databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
  68. databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
  69. databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
  70. databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
  71. databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
  72. databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
  73. databao_context_engine/plugins/mssql_db_plugin.py +12 -0
  74. databao_context_engine/plugins/mysql_db_plugin.py +12 -0
  75. databao_context_engine/plugins/parquet_plugin.py +32 -0
  76. databao_context_engine/plugins/plugin_loader.py +110 -0
  77. databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
  78. databao_context_engine/plugins/resources/__init__.py +0 -0
  79. databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
  80. databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
  81. databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
  82. databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
  83. databao_context_engine/project/__init__.py +0 -0
  84. databao_context_engine/project/datasource_discovery.py +141 -0
  85. databao_context_engine/project/info.py +44 -0
  86. databao_context_engine/project/init_project.py +102 -0
  87. databao_context_engine/project/layout.py +127 -0
  88. databao_context_engine/project/project_config.py +32 -0
  89. databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
  90. databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
  91. databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
  92. databao_context_engine/project/runs.py +39 -0
  93. databao_context_engine/project/types.py +134 -0
  94. databao_context_engine/retrieve_embeddings/__init__.py +0 -0
  95. databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
  96. databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
  97. databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
  98. databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
  99. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
  100. databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
  101. databao_context_engine/retrieve_embeddings/public/api.py +3 -0
  102. databao_context_engine/serialisation/__init__.py +0 -0
  103. databao_context_engine/serialisation/yaml.py +35 -0
  104. databao_context_engine/services/__init__.py +0 -0
  105. databao_context_engine/services/chunk_embedding_service.py +104 -0
  106. databao_context_engine/services/embedding_shard_resolver.py +64 -0
  107. databao_context_engine/services/factories.py +88 -0
  108. databao_context_engine/services/models.py +12 -0
  109. databao_context_engine/services/persistence_service.py +61 -0
  110. databao_context_engine/services/run_name_policy.py +8 -0
  111. databao_context_engine/services/table_name_policy.py +15 -0
  112. databao_context_engine/storage/__init__.py +0 -0
  113. databao_context_engine/storage/connection.py +32 -0
  114. databao_context_engine/storage/exceptions/__init__.py +0 -0
  115. databao_context_engine/storage/exceptions/exceptions.py +6 -0
  116. databao_context_engine/storage/migrate.py +127 -0
  117. databao_context_engine/storage/migrations/V01__init.sql +63 -0
  118. databao_context_engine/storage/models.py +51 -0
  119. databao_context_engine/storage/repositories/__init__.py +0 -0
  120. databao_context_engine/storage/repositories/chunk_repository.py +130 -0
  121. databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
  122. databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
  123. databao_context_engine/storage/repositories/embedding_repository.py +113 -0
  124. databao_context_engine/storage/repositories/factories.py +35 -0
  125. databao_context_engine/storage/repositories/run_repository.py +157 -0
  126. databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
  127. databao_context_engine/storage/transaction.py +14 -0
  128. databao_context_engine/system/__init__.py +0 -0
  129. databao_context_engine/system/properties.py +13 -0
  130. databao_context_engine/templating/__init__.py +0 -0
  131. databao_context_engine/templating/renderer.py +29 -0
  132. databao_context_engine-0.1.1.dist-info/METADATA +186 -0
  133. databao_context_engine-0.1.1.dist-info/RECORD +135 -0
  134. databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
  135. databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,113 @@
1
+ from typing import Optional, Sequence, Tuple
2
+
3
+ import duckdb
4
+ from _duckdb import ConstraintException
5
+
6
+ from databao_context_engine.services.table_name_policy import TableNamePolicy
7
+ from databao_context_engine.storage.exceptions.exceptions import IntegrityError
8
+ from databao_context_engine.storage.models import EmbeddingDTO
9
+
10
+
11
class EmbeddingRepository:
    """CRUD access to embedding shard tables over a DuckDB connection.

    The shard table is selected per call through ``table_name``. The name is
    interpolated into the SQL text, so every method first passes it to
    ``TableNamePolicy.validate_table_name``; all values are bound through
    ``?`` placeholders.
    """

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        self._conn = conn

    def create(
        self,
        *,
        table_name: str,
        chunk_id: int,
        vec: Sequence[float],
    ) -> EmbeddingDTO:
        """Insert the embedding for ``chunk_id`` and return the stored row.

        Raises:
            IntegrityError: if DuckDB reports a constraint violation.
            RuntimeError: if the INSERT unexpectedly returns no row.
        """
        TableNamePolicy.validate_table_name(table_name=table_name)
        try:
            row = self._conn.execute(
                f"""
                INSERT INTO
                    {table_name} (chunk_id, vec)
                VALUES
                    (?, ?)
                RETURNING
                    *
                """,
                # Materialise the sequence exactly as update() does, so both
                # write paths bind the vector as a plain list.
                [chunk_id, list(vec)],
            ).fetchone()
        except ConstraintException as e:
            raise IntegrityError from e
        if row is None:
            raise RuntimeError("Embedding creation returned no object")
        return self._row_to_dto(row)

    def get(self, *, table_name: str, chunk_id: int) -> Optional[EmbeddingDTO]:
        """Return the embedding stored for ``chunk_id``, or ``None`` if there is none."""
        TableNamePolicy.validate_table_name(table_name=table_name)
        row = self._conn.execute(
            f"""
            SELECT
                *
            FROM
                {table_name}
            WHERE
                chunk_id = ?
            """,
            [chunk_id],
        ).fetchone()
        return self._row_to_dto(row) if row else None

    def update(
        self,
        *,
        table_name: str,
        chunk_id: int,
        vec: Sequence[float],
    ) -> Optional[EmbeddingDTO]:
        """Overwrite the vector for ``chunk_id`` and return the row as re-read afterwards.

        Returns ``None`` when the follow-up lookup finds no row for ``chunk_id``.
        """
        TableNamePolicy.validate_table_name(table_name=table_name)
        self._conn.execute(
            f"""
            UPDATE
                {table_name}
            SET
                vec = ?
            WHERE
                chunk_id = ?
            """,
            [list(vec), chunk_id],
        )
        return self.get(table_name=table_name, chunk_id=chunk_id)

    def delete(self, *, table_name: str, chunk_id: int) -> int:
        """Delete the embedding for ``chunk_id``; return 1 if a row was removed, else 0."""
        TableNamePolicy.validate_table_name(table_name=table_name)
        row = self._conn.execute(
            f"""
            DELETE FROM
                {table_name}
            WHERE
                chunk_id = ?
            RETURNING
                chunk_id
            """,
            [chunk_id],
        ).fetchone()
        return 1 if row else 0

    def list(self, table_name: str) -> list[EmbeddingDTO]:
        """Return every embedding in ``table_name``, highest ``chunk_id`` first."""
        TableNamePolicy.validate_table_name(table_name=table_name)
        rows = self._conn.execute(
            f"""
            SELECT
                *
            FROM
                {table_name}
            ORDER BY
                chunk_id DESC
            """
        ).fetchall()
        return [self._row_to_dto(r) for r in rows]

    @staticmethod
    def _row_to_dto(row: Tuple) -> EmbeddingDTO:
        """Convert a three-column result row into an ``EmbeddingDTO``.

        The row is unpacked positionally as (chunk_id, vec, created_at), so it
        relies on the column order produced by ``SELECT *`` / ``RETURNING *``.
        """
        chunk_id, vec, created_at = row
        return EmbeddingDTO(
            chunk_id=int(chunk_id),
            # Only copy when the driver did not already hand back a list.
            vec=list(vec) if not isinstance(vec, list) else vec,
            created_at=created_at,
        )
@@ -0,0 +1,35 @@
1
+ from _duckdb import DuckDBPyConnection
2
+
3
+ from databao_context_engine.services.run_name_policy import RunNamePolicy
4
+ from databao_context_engine.storage.repositories.chunk_repository import ChunkRepository
5
+ from databao_context_engine.storage.repositories.datasource_run_repository import DatasourceRunRepository
6
+ from databao_context_engine.storage.repositories.embedding_model_registry_repository import (
7
+ EmbeddingModelRegistryRepository,
8
+ )
9
+ from databao_context_engine.storage.repositories.embedding_repository import EmbeddingRepository
10
+ from databao_context_engine.storage.repositories.run_repository import RunRepository
11
+ from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchRepository
12
+
13
+
14
def create_run_repository(conn: DuckDBPyConnection) -> RunRepository:
    """Build a ``RunRepository`` on ``conn`` with a freshly created ``RunNamePolicy``."""
    name_policy = RunNamePolicy()
    return RunRepository(conn, run_name_policy=name_policy)
16
+
17
+
18
def create_datasource_run_repository(conn: DuckDBPyConnection) -> DatasourceRunRepository:
    """Build a ``DatasourceRunRepository`` bound to ``conn``."""
    repository = DatasourceRunRepository(conn)
    return repository
20
+
21
+
22
def create_chunk_repository(conn: DuckDBPyConnection) -> ChunkRepository:
    """Build a ``ChunkRepository`` bound to ``conn``."""
    repository = ChunkRepository(conn)
    return repository
24
+
25
+
26
def create_embedding_repository(conn: DuckDBPyConnection) -> EmbeddingRepository:
    """Build an ``EmbeddingRepository`` bound to ``conn``."""
    repository = EmbeddingRepository(conn)
    return repository
28
+
29
+
30
def create_registry_repository(conn: DuckDBPyConnection) -> EmbeddingModelRegistryRepository:
    """Build an ``EmbeddingModelRegistryRepository`` bound to ``conn``."""
    repository = EmbeddingModelRegistryRepository(conn)
    return repository
32
+
33
+
34
def create_vector_search_repository(conn: DuckDBPyConnection) -> VectorSearchRepository:
    """Build a ``VectorSearchRepository`` bound to ``conn``."""
    repository = VectorSearchRepository(conn)
    return repository
@@ -0,0 +1,157 @@
1
+ from datetime import datetime
2
+ from typing import Any, Optional
3
+
4
+ import duckdb
5
+
6
+ from databao_context_engine.services.run_name_policy import RunNamePolicy
7
+ from databao_context_engine.storage.models import RunDTO
8
+
9
+
10
class RunRepository:
    """Persistence layer for rows of the ``run`` table.

    Rows come back from ``SELECT *`` / ``RETURNING *`` and are decoded
    positionally by ``_row_to_dto``.
    """

    def __init__(self, conn: duckdb.DuckDBPyConnection, run_name_policy: RunNamePolicy):
        self._run_name_policy = run_name_policy
        self._conn = conn

    def create(
        self, *, project_id: str, dce_version: Optional[str] = None, started_at: datetime | None = None
    ) -> RunDTO:
        """Insert a new run and return it.

        When ``started_at`` is omitted, ``datetime.now()`` is used. The run name
        is built from that timestamp by the configured ``RunNamePolicy``.

        Raises:
            RuntimeError: if the INSERT returns no row.
        """
        effective_start = datetime.now() if started_at is None else started_at
        generated_name = self._run_name_policy.build(run_started_at=effective_start)

        insert_sql = """
            INSERT INTO
                run (project_id, nemory_version, started_at, run_name)
            VALUES
                (?, ?, ?, ?)
            RETURNING
                *
            """
        inserted = self._conn.execute(
            insert_sql,
            [project_id, dce_version, effective_start, generated_name],
        ).fetchone()
        if inserted is not None:
            return self._row_to_dto(inserted)
        raise RuntimeError("Run creation returned no object")

    def get(self, run_id: int) -> Optional[RunDTO]:
        """Return the run with the given ``run_id``, or ``None`` if it does not exist."""
        found = self._conn.execute(
            """
            SELECT
                *
            FROM
                run
            WHERE
                run_id = ?
            """,
            [run_id],
        ).fetchone()
        if not found:
            return None
        return self._row_to_dto(found)

    def get_by_run_name(self, *, project_id: str, run_name: str) -> RunDTO | None:
        """Return the run of ``project_id`` named ``run_name``, or ``None`` if absent."""
        found = self._conn.execute(
            """
            SELECT
                *
            FROM
                run
            WHERE
                run.project_id = ? AND run_name = ?
            """,
            [project_id, run_name],
        ).fetchone()
        if not found:
            return None
        return self._row_to_dto(found)

    def get_latest_run_for_project(self, project_id: str) -> RunDTO | None:
        """Return the run of ``project_id`` with the most recent ``started_at``, if any."""
        newest = self._conn.execute(
            """
            SELECT
                *
            FROM
                run
            WHERE
                run.project_id = ?
            ORDER BY run.started_at DESC
            LIMIT 1
            """,
            [project_id],
        ).fetchone()
        if not newest:
            return None
        return self._row_to_dto(newest)

    def update(
        self,
        run_id: int,
        *,
        project_id: Optional[str] = None,
        ended_at: Optional[datetime] = None,
        dce_version: Optional[str] = None,
    ) -> Optional[RunDTO]:
        """Update the supplied fields of a run and return the row as re-read afterwards.

        Arguments left as ``None`` are not touched; with nothing to change, no
        UPDATE is issued and the current row is returned.
        """
        # (column, value) pairs in the order they appear in the SET clause.
        candidates = (
            ("project_id", project_id),
            ("ended_at", ended_at),
            ("nemory_version", dce_version),
        )
        changes = [(column, value) for column, value in candidates if value is not None]

        if changes:
            set_clause = ", ".join(f"{column} = ?" for column, _ in changes)
            bound_values = [value for _, value in changes]
            self._conn.execute(
                f"""
                UPDATE
                    run
                SET
                    {set_clause}
                WHERE
                    run_id = ?
                """,
                [*bound_values, run_id],
            )

        return self.get(run_id)

    def delete(self, run_id: int) -> int:
        """Delete the run with ``run_id``; return 1 if a row was removed, else 0."""
        removed = self._conn.execute(
            """
            DELETE FROM
                run
            WHERE
                run_id = ?
            RETURNING
                run_id
            """,
            [run_id],
        ).fetchone()
        if removed:
            return 1
        return 0

    def list(self) -> list[RunDTO]:
        """Return all runs, highest ``run_id`` first."""
        records = self._conn.execute(
            """
            SELECT
                *
            FROM
                run
            ORDER BY
                run_id DESC
            """
        ).fetchall()
        return [self._row_to_dto(record) for record in records]

    @staticmethod
    def _row_to_dto(row: tuple) -> RunDTO:
        """Decode a six-column ``run`` row into a ``RunDTO``.

        The row is unpacked positionally as (run_id, project_id, started_at,
        ended_at, version, run_name).
        """
        raw_id, raw_project, started, ended, version, name = row
        return RunDTO(
            run_id=int(raw_id),
            run_name=name,
            project_id=str(raw_project),
            started_at=started,
            ended_at=ended,
            nemory_version=version,
        )
@@ -0,0 +1,63 @@
1
+ from collections.abc import Sequence
2
+ from dataclasses import dataclass
3
+
4
+ import duckdb
5
+
6
+ from databao_context_engine.pluginlib.build_plugin import DatasourceType
7
+ from databao_context_engine.project.types import DatasourceId
8
+
9
+
10
@dataclass(kw_only=True, frozen=True)
class VectorSearchResult:
    """One match returned by a vector similarity search (immutable, keyword-only)."""

    # Text to show for the chunk.
    display_text: str
    # Text of the chunk that was embedded.
    embeddable_text: str
    # Cosine distance between the stored vector and the query vector.
    cosine_distance: float
    # Type and identifier of the datasource the chunk came from.
    datasource_type: DatasourceType
    datasource_id: DatasourceId
17
+
18
+
19
class VectorSearchRepository:
    """Read-only cosine-distance search over an embedding shard table."""

    # Matches whose cosine distance is not strictly below this value are dropped.
    _DEFAULT_DISTANCE_THRESHOLD = 0.75

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        self._conn = conn

    def get_display_texts_by_similarity(
        self, *, table_name: str, run_id: int, retrieve_vec: Sequence[float], dimension: int, limit: int
    ) -> list[VectorSearchResult]:
        """
        Read only similarity search on a specific embedding shard table.
        Returns the display text for the closest matches in a given run

        Args:
            table_name: Embedding shard table to search; validated before use
                because it is interpolated into the SQL text.
            run_id: Only chunks whose datasource run belongs to this run are considered.
            retrieve_vec: Query vector to compare the stored vectors against.
            dimension: Length of the vectors, used for the ``FLOAT[n]`` cast.
            limit: Maximum number of results to return.

        Returns:
            Matches with a cosine distance below ``_DEFAULT_DISTANCE_THRESHOLD``,
            closest first.

        Raises:
            ValueError / TypeError: if ``dimension`` cannot be converted to ``int``.
        """
        # Imported locally so this module's top-level imports stay untouched.
        from databao_context_engine.services.table_name_policy import TableNamePolicy

        # Both values end up inside the SQL text rather than in bound parameters,
        # so make sure neither can carry arbitrary SQL.
        TableNamePolicy.validate_table_name(table_name=table_name)
        vec_dimension = int(dimension)

        query_vec = list(retrieve_vec)
        rows = self._conn.execute(
            f"""
            SELECT
                COALESCE(c.display_text, c.embeddable_text) AS display_text,
                c.embeddable_text AS embeddable_text,
                array_cosine_distance(e.vec, CAST(? AS FLOAT[{vec_dimension}])) AS cosine_distance,
                dr.full_type,
                dr.source_id,
            FROM
                {table_name} e
                JOIN chunk c ON e.chunk_id = c.chunk_id
                JOIN datasource_run dr ON c.datasource_run_id = dr.datasource_run_id
            WHERE
                dr.run_id = ?
                AND cosine_distance < ?
            ORDER BY
                array_cosine_distance(e.vec, CAST(? AS FLOAT[{vec_dimension}])) ASC
            LIMIT ?
            """,
            # The query vector is bound twice: once for the SELECT expression and
            # once for the ORDER BY expression.
            [query_vec, run_id, self._DEFAULT_DISTANCE_THRESHOLD, query_vec, limit],
        ).fetchall()

        return [
            VectorSearchResult(
                display_text=row[0],
                embeddable_text=row[1],
                cosine_distance=row[2],
                datasource_type=DatasourceType(full_type=row[3]),
                datasource_id=DatasourceId.from_string_repr(row[4]),
            )
            for row in rows
        ]
@@ -0,0 +1,14 @@
1
+ from contextlib import contextmanager
2
+
3
+ import duckdb
4
+
5
+
6
@contextmanager
def transaction(conn: "duckdb.DuckDBPyConnection"):
    """Run the ``with`` body inside an explicit transaction on ``conn``.

    ``BEGIN`` is executed on entry and ``COMMIT`` once the body finishes
    normally. If the body or the commit raises, ``ROLLBACK`` is executed and
    the exception is re-raised.
    """
    conn.execute("BEGIN")
    try:
        yield
        conn.execute("COMMIT")
    except BaseException:
        # BaseException, not Exception: KeyboardInterrupt, SystemExit and
        # asyncio.CancelledError must also roll back, otherwise the connection
        # is left with an open transaction.
        conn.execute("ROLLBACK")
        raise
File without changes
@@ -0,0 +1,13 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ # It's private, so it doesn't get imported directly. This value is mocked in tests.
5
+ _dce_path = Path(os.getenv("DATABAO_CONTEXT_ENGINE_PATH") or "~/.dce").expanduser().resolve()
6
+
7
+
8
def get_dce_path() -> Path:
    """Return the module-level DCE directory path.

    The module global is read on every call rather than cached here.
    """
    return _dce_path
10
+
11
+
12
def get_db_path() -> Path:
    """Return the path of the ``dce.duckdb`` file inside the DCE directory."""
    dce_dir = get_dce_path()
    return dce_dir.joinpath("dce.duckdb")
File without changes
@@ -0,0 +1,29 @@
1
+ import os
2
+
3
+ from jinja2.sandbox import SandboxedEnvironment
4
+
5
+
6
class DceTemplateError(Exception):
    """Base class for errors raised by this templating module."""
8
+
9
+
10
class UnknownEnvVarTemplateError(DceTemplateError):
    """A template referenced an environment variable that is not set and gave no default."""
12
+
13
+
14
def render_template(source: str) -> str:
    """Render ``source`` as a Jinja2 template inside a sandboxed environment.

    ``source`` is converted with ``str()`` first. The template can call
    ``env_var(...)``, which is bound to ``resolve_env_var``.
    """
    sandbox = SandboxedEnvironment()
    template = sandbox.from_string(source=str(source))
    return template.render(env_var=resolve_env_var)
18
+
19
+
20
+ def resolve_env_var(env_var: str, default: str | None = None) -> str:
21
+ if env_var in os.environ:
22
+ return os.environ[env_var]
23
+
24
+ if default is not None:
25
+ return default
26
+
27
+ raise UnknownEnvVarTemplateError(
28
+ f"Error in template. The environment variable {env_var} is missing and no default was provided"
29
+ )
@@ -0,0 +1,186 @@
1
+ Metadata-Version: 2.3
2
+ Name: databao-context-engine
3
+ Version: 0.1.1
4
+ Summary: Add your description here
5
+ Requires-Dist: click>=8.3.0
6
+ Requires-Dist: duckdb>=1.4.3
7
+ Requires-Dist: pyyaml>=6.0.3
8
+ Requires-Dist: requests>=2.32.5
9
+ Requires-Dist: pymysql>=1.1.2
10
+ Requires-Dist: clickhouse-connect>=0.10.0
11
+ Requires-Dist: mcp>=1.23.3
12
+ Requires-Dist: pyathena>=3.22.0
13
+ Requires-Dist: snowflake-connector-python>=4.1.0
14
+ Requires-Dist: mssql-python>=1.0.0
15
+ Requires-Dist: pydantic>=2.12.4
16
+ Requires-Dist: jinja2>=3.1.6
17
+ Requires-Dist: asyncpg>=0.31.0
18
+ Requires-Dist: asyncio>=4.0.0
19
+ Requires-Dist: asyncpg-stubs>=0.31.1
20
+ Requires-Python: >=3.12
21
+ Description-Content-Type: text/markdown
22
+
23
+ [![official project](https://jb.gg/badges/official.svg)](https://confluence.jetbrains.com/display/ALL/JetBrains+on+GitHub)
24
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/JetBrains/databao-context-engine/blob/main/LICENSE)
25
+
26
+ [//]: # ([![PyPI version]&#40;https://img.shields.io/pypi/v/databao-context-engine.svg&#41;]&#40;https://pypi.org/project/databao-context-engine&#41;)
27
+
28
+ [//]: # ([![Python versions]&#40;https://img.shields.io/pypi/pyversions/databao-context-engine.svg&#41;]&#40;https://pypi.org/project/databao-context-engine/&#41;)
29
+
30
+
31
+ <h1 align="center">Databao Context Engine</h1>
32
+ <p align="center">
33
+ <b>Semantic context for your LLMs — generated automatically.</b><br/>
34
+ No more copying schemas. No manual documentation. Just accurate answers.
35
+ </p>
36
+ <p align="center">
37
+ <a href="https://databao.app">Website</a>
38
+
39
+ [//]: # (•)
40
+
41
+ [//]: # ( <a href="#quickstart">Quickstart</a> •)
42
+
43
+ [//]: # ( <a href="#supported-data-sources">Data Sources</a> •)
44
+
45
+ [//]: # ( <a href="#contributing">Contributing</a>)
46
+ </p>
47
+
48
+ ---
49
+
50
+ ## What is Databao Context Engine?
51
+
52
+ Databao Context Engine **automatically generates governed semantic context** from your databases, BI tools, documents, and spreadsheets.
53
+
54
+ Integrate it with any LLM to deliver **accurate, context-aware answers** — without copying schemas or writing documentation by hand.
55
+
56
+ ```
57
+ Your data sources → Context Engine → Unified semantic graph → Any LLM
58
+ ```
59
+
60
+ ## Why choose Databao Context Engine?
61
+
62
+ | Feature | What it means for you |
63
+ |----------------------------|----------------------------------------------------------------|
64
+ | **Auto-generated context** | Extracts schemas, relationships, and semantics automatically |
65
+ | **Runs locally** | Your data never leaves your environment |
66
+ | **MCP integration** | Works with Claude Desktop, Cursor, and any MCP-compatible tool |
67
+ | **Multiple sources** | Databases, dbt projects, spreadsheets, documents |
68
+ | **Built-in benchmarks** | Measure and improve context quality over time |
69
+ | **LLM agnostic** | OpenAI, Anthropic, Ollama, Gemini — use any model |
70
+ | **Governed & versioned** | Track, version, and share context across your team |
71
+ | **Dynamic or static** | Serve context via MCP server or export as artifact |
72
+
73
+ # Prerequisites
74
+
75
+ This README assumes you will use `uv` as your package manager.
76
+
77
+ You can install it following the instructions [here](https://docs.astral.sh/uv/getting-started/installation/)
78
+
79
+ If you are going to push to the repository, please make sure to install git pre-commit hooks by running
80
+
81
+ ```bash
82
+ uv run pre-commit install
83
+ ```
84
+
85
+ # How to run?
86
+
87
+ You can run it with:
88
+
89
+ ```bash
90
+ uv run dce info
91
+ ```
92
+
93
+ Not providing the `info` subcommand or using the `--help` flag will show the help screen for the command.
94
+
95
+ ## Using the dce command directly
96
+
97
+ To be able to use the `dce` command directly (without using `uv run` or `python`) there are two options.
98
+
99
+ ### Installing dce locally
100
+
101
+ For that one needs to:
102
+
103
+ 1. Build the project by running
104
+
105
+ ```bash
106
+ uv build
107
+ ```
108
+
109
+ 2. Install the project on your machine by running:
110
+
111
+ ```bash
112
+ uv tool install -e .
113
+ ```
114
+
115
+ This second step will install the `dce` script on your machine and add it into your path.
116
+
117
+ ### Create dce alias using nix
118
+
119
+ This method simply creates a new shell environment with a `dce` alias. To use it, you need to install the `nix` package
120
+ manager (https://nixos.org/download/). After that, you can simply run the following in the project root:
121
+
122
+ ```bash
123
+ $ nix-shell
124
+ ```
125
+
126
+ which is a short version of `$ nix-shell shell.nix`.
127
+
128
+ Alternatively, one could specify the path to the project repository
129
+
130
+ ```bash
131
+ $ nix-shell {path_to_dce_repository}
132
+ ```
133
+
134
+ After that, you can use `dce` directly:
135
+
136
+ ```bash
137
+ dce --help
138
+ ```
139
+
140
+ Note: when we actually release our built Python package, users that don't use `uv` will still be able to install the CLI
141
+ by using `pipx install` instead.
142
+
143
+ # Running Mypy
144
+
145
+ [mypy](https://mypy.readthedocs.io/en/stable/getting_started.html) has been added to the project for type checking.
146
+
147
+ You can run it with the following:
148
+
149
+ ```bash
150
+ uv run mypy src --exclude "test_*" --exclude dist
151
+ ```
152
+
153
+ NB: the above runs type checking on all files within the `src` directory, excluding all test files.
154
+
155
+ # Running tests
156
+
157
+ You can run the tests with:
158
+
159
+ ```bash
160
+ uv run pytest
161
+ ```
162
+
163
+ (there is currently one test succeeding and one test failing in the project)
164
+
165
+ # Generating JSON Schemas for our plugins' config files
166
+
167
+ To be able to build a datasource, each plugin requires a yaml config file that describes how to connect to the
168
+ datasource,
169
+ as well as other information needed to customise the plugin.
170
+
171
+ To document what each config file should look like, we can generate a JSON schema describing the fields allowed in that
172
+ file.
173
+
174
+ You can generate all JSON schemas for all plugins by running:
175
+
176
+ ```bash
177
+ uv run generate_configs_schemas
178
+ ```
179
+
180
+ Some options can be provided to the command to choose which plugins to include or exclude from the generation.
181
+ To see the options available, you can refer to the help:
182
+
183
+ ```bash
184
+ uv run generate_configs_schemas --help
185
+ ```
186
+