databao-context-engine 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,47 @@
1
+ from dataclasses import dataclass
2
+
3
+ from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
4
+ from databao_context_engine.plugins.dbt.types import DbtColumn, DbtContext, DbtModel
5
+
6
+
7
@dataclass
class DbtColumnChunkContent:
    """Content carried by the embeddable chunk created for a single dbt column.

    Bundles the column with the database, schema and name of the model it
    was taken from.
    """

    # Database of the model the column belongs to.
    database_name: str
    # Schema of the model the column belongs to.
    schema_name: str
    # Name of the model the column belongs to.
    model_name: str
    # The column described by the chunk.
    column: DbtColumn
13
+
14
+
15
def build_dbt_chunks(context: DbtContext) -> list[EmbeddableChunk]:
    """Flatten a dbt context into the list of chunks to embed.

    Every model contributes one model chunk, immediately followed by one
    chunk per column, in the order models and columns appear in ``context``.
    """
    result: list[EmbeddableChunk] = []

    for dbt_model in context.models:
        result.append(_create_model_chunk(dbt_model))
        result.extend(_create_column_chunk(dbt_model, dbt_column) for dbt_column in dbt_model.columns)

    return result
25
+
26
+
27
def _create_model_chunk(model: DbtModel) -> EmbeddableChunk:
    """Wrap a model in a chunk: generated text to embed, the model itself as content."""
    text = _build_model_chunk_text(model)
    return EmbeddableChunk(embeddable_text=text, content=model)
29
+
30
+
31
def _build_model_chunk_text(model: DbtModel) -> str:
    """Return the sentence embedded for a model: its name, database, schema and unique id."""
    # TODO: Use description and potentially other infos?
    text = f"Model {model.name} in database {model.database} and schema {model.schema}, with unique id {model.id}"
    return text
34
+
35
+
36
def _create_column_chunk(model: DbtModel, column: DbtColumn) -> EmbeddableChunk:
    """Build the chunk for one column of a model.

    The embedded text comes from ``_build_column_chunk_text``; the content is
    a ``DbtColumnChunkContent`` holding the column and its model's location.
    """
    text = _build_column_chunk_text(model, column)
    column_content = DbtColumnChunkContent(
        database_name=model.database,
        schema_name=model.schema,
        model_name=model.name,
        column=column,
    )
    return EmbeddableChunk(embeddable_text=text, content=column_content)
43
+
44
+
45
def _build_column_chunk_text(model: DbtModel, column: DbtColumn) -> str:
    """Return the sentence embedded for a column: its name and the unique id of its model."""
    # TODO: Use description and potentially other infos?
    text = f"Column {column.name} in model {model.id}"
    return text
@@ -0,0 +1,106 @@
1
+ from pathlib import Path
2
+
3
+ from databao_context_engine.plugins.dbt.types import (
4
+ DbtColumn,
5
+ DbtConfigFile,
6
+ DbtContext,
7
+ DbtMaterialization,
8
+ DbtModel,
9
+ )
10
+ from databao_context_engine.plugins.dbt.types_artifacts import (
11
+ DbtArtifacts,
12
+ DbtCatalog,
13
+ DbtCatalogColumn,
14
+ DbtCatalogNode,
15
+ DbtManifest,
16
+ DbtManifestColumn,
17
+ DbtManifestModel,
18
+ )
19
+
20
+
21
def check_connection(config_file: DbtConfigFile) -> None:
    """Check that the configured dbt target folder can be read.

    Reads the artifacts from the ``~``-expanded ``dbt_target_folder_path`` and
    discards them; any error raised while reading propagates to the caller.
    """
    target_folder = config_file.dbt_target_folder_path.expanduser()
    _read_dbt_artifacts(target_folder)
23
+
24
+
25
def extract_context(config_file: DbtConfigFile) -> DbtContext:
    """Read the dbt artifacts of the configured target folder and convert them to a ``DbtContext``.

    The folder path has ``~`` expanded before it is read.
    """
    target_folder = config_file.dbt_target_folder_path.expanduser()
    return _extract_context_from_artifacts(_read_dbt_artifacts(target_folder))
29
+
30
+
31
def _read_dbt_artifacts(dbt_target_folder_path: Path) -> DbtArtifacts:
    """Load the dbt artifacts found in a dbt target folder.

    ``manifest.json`` is mandatory. ``catalog.json`` is optional: when the file
    is absent the returned artifacts carry ``catalog=None``.

    Args:
        dbt_target_folder_path: Folder expected to contain the artifact files.

    Returns:
        The parsed manifest together with the parsed catalog, if any.

    Raises:
        ValueError: If the path is not a directory, or if it does not contain
            a ``manifest.json`` file.
    """
    if not dbt_target_folder_path.is_dir():
        raise ValueError(f'Invalid "dbt_target_folder_path": not a directory ({dbt_target_folder_path})')

    # TODO: Check the manifest schema version?
    manifest_file = dbt_target_folder_path.joinpath("manifest.json")
    if not manifest_file.is_file():
        raise ValueError(f'Invalid "dbt_target_folder_path": missing manifest.json file ({manifest_file})')

    # Decode explicitly as UTF-8: JSON artifacts must not depend on the
    # platform's locale encoding (which is not UTF-8 everywhere).
    manifest = DbtManifest.model_validate_json(manifest_file.read_text(encoding="utf-8"))

    catalog_file = dbt_target_folder_path.joinpath("catalog.json")
    catalog = (
        DbtCatalog.model_validate_json(catalog_file.read_text(encoding="utf-8")) if catalog_file.is_file() else None
    )

    return DbtArtifacts(manifest=manifest, catalog=catalog)
46
+
47
+
48
def _extract_context_from_artifacts(artifacts: DbtArtifacts) -> DbtContext:
    """Convert parsed dbt artifacts into a ``DbtContext``.

    Only manifest nodes that are ``DbtManifestModel`` instances are kept. Each
    one is paired with the catalog node stored under the same unique id, or
    with ``None`` when there is no catalog or no matching entry.
    """
    catalog_nodes = artifacts.catalog.nodes if artifacts.catalog else {}

    # TODO: Extract the stages? Or at least the "highest-level" models (= marts?)
    # TODO: Extract the constraints
    # TODO: Organize the models by schemas? Or by stages?
    models = []
    for node in artifacts.manifest.nodes.values():
        if not isinstance(node, DbtManifestModel):
            continue
        models.append(_manifest_model_to_dbt_model(node, catalog_nodes.get(node.unique_id)))

    return DbtContext(models=models)
66
+
67
+
68
def _manifest_model_to_dbt_model(manifest_model: DbtManifestModel, catalog_node: DbtCatalogNode | None) -> DbtModel:
    """Convert a manifest model, enriched with its optional catalog node, into a ``DbtModel``.

    Columns are taken from the manifest; the catalog column with the same
    name (if any) is handed to the column converter alongside each of them.
    A missing ``config`` yields no materialization and a missing
    ``depends_on`` yields an empty dependency list.
    """
    catalog_columns = catalog_node.columns if catalog_node else {}

    columns = [
        _manifest_column_to_dbt_column(manifest_column, catalog_columns.get(manifest_column.name))
        for manifest_column in manifest_model.columns.values()
    ]

    config = manifest_model.config
    materialization = _manifest_materialization_to_dbt_materializaton(config.materialized if config else None)

    depends_on = manifest_model.depends_on
    depends_on_nodes = depends_on.get("nodes", []) if depends_on else []

    return DbtModel(
        id=manifest_model.unique_id,
        name=manifest_model.name,
        database=manifest_model.database,
        schema=manifest_model.schema_,
        description=manifest_model.description,
        columns=columns,
        materialization=materialization,
        primary_key=manifest_model.primary_key,
        depends_on_nodes=depends_on_nodes,
    )
87
+
88
+
89
def _manifest_column_to_dbt_column(
    manifest_column: DbtManifestColumn, catalog_column: DbtCatalogColumn | None
) -> DbtColumn:
    """Convert a manifest column into a ``DbtColumn``.

    The type comes from the catalog column when one is given; otherwise the
    manifest's ``data_type`` is used.
    """
    if catalog_column:
        column_type = catalog_column.type
    else:
        column_type = manifest_column.data_type

    return DbtColumn(
        name=manifest_column.name,
        description=manifest_column.description,
        type=column_type,
    )
97
+
98
+
99
def _manifest_materialization_to_dbt_materializaton(materialized: str | None) -> DbtMaterialization | None:
    """Map a manifest ``materialized`` value to a ``DbtMaterialization`` member.

    Returns ``None`` when no value is given or when the value does not match
    any member of the enum.
    """
    if materialized is not None:
        try:
            return DbtMaterialization(materialized)
        except ValueError:
            # Not a value the enum knows about: treated the same as "not set".
            pass

    return None
@@ -0,0 +1,25 @@
1
+ from typing import Any
2
+
3
+ from databao_context_engine import BuildDatasourcePlugin
4
+ from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
5
+ from databao_context_engine.plugins.dbt.dbt_chunker import build_dbt_chunks
6
+ from databao_context_engine.plugins.dbt.dbt_context_extractor import check_connection, extract_context
7
+ from databao_context_engine.plugins.dbt.types import DbtConfigFile
8
+
9
+
10
class DbtPlugin(BuildDatasourcePlugin[DbtConfigFile]):
    # Datasource plugin for the "dbt" type; it delegates to the dbt context
    # extractor and the dbt chunker.

    id = "jetbrains/dbt"
    name = "Dbt Plugin"
    config_file_type = DbtConfigFile

    def supported_types(self) -> set[str]:
        """Return the datasource types handled by this plugin."""
        return {"dbt"}

    def build_context(self, full_type: str, datasource_name: str, file_config: DbtConfigFile) -> Any:
        """Extract the dbt context described by ``file_config``.

        ``full_type`` and ``datasource_name`` are not used.
        """
        context = extract_context(file_config)
        return context

    def check_connection(self, full_type: str, datasource_name: str, file_config: DbtConfigFile) -> None:
        """Run the extractor's connection check for ``file_config``.

        ``full_type`` and ``datasource_name`` are not used.
        """
        # Inside the method body this name resolves to the module-level
        # function imported from the extractor, not to this method.
        check_connection(file_config)

    def divide_context_into_chunks(self, context: Any) -> list[EmbeddableChunk]:
        """Split a context produced by ``build_context`` into embeddable chunks."""
        chunks = build_dbt_chunks(context)
        return chunks
@@ -0,0 +1,44 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from pathlib import Path
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class DbtConfigFile(BaseModel):
    # Configuration model for a dbt datasource.

    # Optional name; defaults to None.
    name: str | None = Field(default=None)
    # Datasource type; defaults to "dbt".
    type: str = Field(default="dbt")
    # Folder holding the dbt artifacts (manifest.json and, optionally,
    # catalog.json). The extractor expands "~" before reading it.
    dbt_target_folder_path: Path
12
+
13
+
14
class DbtMaterialization(str, Enum):
    """Materialization kinds recognised for a dbt model."""

    TABLE = "table"
    VIEW = "view"

    def __str__(self) -> str:
        """Return the raw value (e.g. ``"table"``) rather than the qualified member name."""
        return self.value
20
+
21
+
22
+ @dataclass(kw_only=True)
23
+ class DbtColumn:
24
+ name: str
25
+ type: str | None = None
26
+ description: str | None = None
27
+
28
+
29
+ @dataclass(kw_only=True)
30
+ class DbtModel:
31
+ id: str
32
+ name: str
33
+ database: str
34
+ schema: str
35
+ columns: list[DbtColumn]
36
+ description: str | None = None
37
+ materialization: DbtMaterialization | None = None
38
+ primary_key: list[str] | None = None
39
+ depends_on_nodes: list[str]
40
+
41
+
42
+ @dataclass(kw_only=True)
43
+ class DbtContext:
44
+ models: list[DbtModel]
@@ -0,0 +1,58 @@
1
+ from dataclasses import dataclass
2
+ from typing import Annotated, Literal
3
+
4
+ from pydantic import BaseModel, Discriminator, Field
5
+
6
+
7
class DbtManifestNodeConfig(BaseModel):
    # The part of a manifest node's "config" object that is read here.

    # Raw materialization name; required whenever a config object is present.
    materialized: str
9
+
10
+
11
class DbtManifestColumn(BaseModel):
    # A column entry of a manifest model.

    # Column name.
    name: str
    # Optional description; defaults to None.
    description: str | None = None
    # Optional data type; defaults to None.
    data_type: str | None = None
15
+
16
+
17
class DbtManifestModel(BaseModel):
    # A manifest node whose resource_type is "model".

    # Discriminator value selecting this class among the manifest node types.
    resource_type: Literal["model"]
    unique_id: str
    name: str
    database: str
    # Populated from the "schema" key through the alias.
    # NOTE(review): presumably named schema_ to avoid clashing with BaseModel's own "schema" attribute - confirm.
    schema_: str = Field(alias="schema")
    description: str | None = None
    config: DbtManifestNodeConfig | None = None
    # Columns keyed by string (presumably the column name - TODO confirm).
    columns: dict[str, DbtManifestColumn]
    # Optional mapping of dependency kind to ids; the extractor reads its "nodes" entry.
    depends_on: dict[str, list[str]] | None = None
    primary_key: list[str] | None = None
28
+
29
+
30
class DbtManifestOtherNode(BaseModel):
    # Any manifest node that is not a model; only its resource type is declared.

    # Discriminator values accepted for non-model nodes.
    resource_type: Literal["seed", "analysis", "test", "operation", "sql_operation", "snapshot"]
32
+
33
+
34
# A manifest node is either a model or one of the other listed resource types;
# the "resource_type" field is the discriminator that selects which class applies.
DbtManifestNode = Annotated[DbtManifestModel | DbtManifestOtherNode, Discriminator("resource_type")]
35
+
36
+
37
class DbtManifest(BaseModel):
    # The part of manifest.json that is read here.

    # Manifest nodes; the extractor pairs them with catalog nodes via each model's unique_id.
    nodes: dict[str, DbtManifestNode]
39
+
40
+
41
class DbtCatalogColumn(BaseModel):
    # A column entry of a catalog node.

    # Column name.
    name: str
    # Column type; the extractor prefers it over the manifest's data_type.
    type: str
44
+
45
+
46
class DbtCatalogNode(BaseModel):
    # A node entry of catalog.json.

    # Optional unique id; defaults to None.
    unique_id: str | None = None
    # Columns of the node; the extractor looks them up by manifest column name.
    columns: dict[str, DbtCatalogColumn]
49
+
50
+
51
class DbtCatalog(BaseModel):
    # The part of catalog.json that is read here.

    # Catalog nodes; the extractor looks them up by a manifest model's unique_id.
    nodes: dict[str, DbtCatalogNode]
53
+
54
+
55
+ @dataclass(kw_only=True)
56
+ class DbtArtifacts:
57
+ manifest: DbtManifest
58
+ catalog: DbtCatalog | None
@@ -45,6 +45,7 @@ def _load_builtin_datasource_plugins() -> list[BuildDatasourcePlugin]:
45
45
  """Statically register built-in plugins."""
46
46
  from databao_context_engine.plugins.databases.duckdb.duckdb_db_plugin import DuckDbPlugin
47
47
  from databao_context_engine.plugins.databases.sqlite.sqlite_db_plugin import SQLiteDbPlugin
48
+ from databao_context_engine.plugins.dbt.dbt_plugin import DbtPlugin
48
49
  from databao_context_engine.plugins.resources.parquet_plugin import ParquetPlugin
49
50
 
50
51
  # optional plugins are added to the python environment via extras
@@ -91,7 +92,7 @@ def _load_builtin_datasource_plugins() -> list[BuildDatasourcePlugin]:
91
92
  except ImportError:
92
93
  pass
93
94
 
94
- required_plugins: list[BuildDatasourcePlugin] = [DuckDbPlugin(), ParquetPlugin(), SQLiteDbPlugin()]
95
+ required_plugins: list[BuildDatasourcePlugin] = [DuckDbPlugin(), ParquetPlugin(), SQLiteDbPlugin(), DbtPlugin()]
95
96
  return required_plugins + optional_plugins
96
97
 
97
98
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: databao-context-engine
3
- Version: 0.1.5
3
+ Version: 0.1.6
4
4
  Summary: Semantic context for your LLMs — generated automatically
5
5
  Requires-Dist: click>=8.3.0
6
6
  Requires-Dist: duckdb>=1.4.3
@@ -82,10 +82,16 @@ databao_context_engine/plugins/databases/snowflake/snowflake_introspector.py,sha
82
82
  databao_context_engine/plugins/databases/sqlite/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
83
83
  databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py,sha256=MI896G7Yq8NcQ8sTWErAFD7YK8qfDpZZprxoszJ46l8,460
84
84
  databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py,sha256=kKFNzclp1NmQ6BF3ylGn86R5PuQh6aqevj6E7_zAKmQ,8361
85
+ databao_context_engine/plugins/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
86
+ databao_context_engine/plugins/dbt/dbt_chunker.py,sha256=5OQo0Y9ouelZV8KQBA0Kqksx72wRhTdRztUOSCKwLZc,1516
87
+ databao_context_engine/plugins/dbt/dbt_context_extractor.py,sha256=8ac2Fx5vv0t_AGqNbx9BXj73l9ZqjDK8r-VAAQukcP0,3748
88
+ databao_context_engine/plugins/dbt/dbt_plugin.py,sha256=-oAVh4u9YZ7j1b6w2K2lwcZT_bOdkN-lL1RRBWgB20k,1026
89
+ databao_context_engine/plugins/dbt/types.py,sha256=yJUVSZa5Ohhx6Znl_20gKwjvgxkBBcxlKrAFFV6tfgY,881
90
+ databao_context_engine/plugins/dbt/types_artifacts.py,sha256=OC2_DL7srcniVqrj9uj5XxNV-7VcA974xqgU1Rzfd5A,1349
85
91
  databao_context_engine/plugins/duckdb_tools.py,sha256=46rctnTxDPAhHtaiTp1DxMuuDuRKrtKWJFSSM2w7uUU,645
86
92
  databao_context_engine/plugins/files/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
93
  databao_context_engine/plugins/files/unstructured_files_plugin.py,sha256=eqs1anQhYBZh7xu4CwhfkqXQjGE5gJnKEwyJbtUR78E,2384
88
- databao_context_engine/plugins/plugin_loader.py,sha256=8j7JAKqtG6_VjX3GbfAD6kjrWuuJ7NnC2nnHPEujux8,4074
94
+ databao_context_engine/plugins/plugin_loader.py,sha256=x5cZ8pUwrEYbadqlPddZvBzU2BpdyBWYI4KlEtffFUY,4159
89
95
  databao_context_engine/plugins/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
96
  databao_context_engine/plugins/resources/parquet_chunker.py,sha256=R9WCOBqpKRTVN6t5eeOm_mmnKBOxvjIiQ9zTc8vnUb4,848
91
97
  databao_context_engine/plugins/resources/parquet_introspector.py,sha256=Cn_yh6E-dOTOZstlavEGAsV6ZRKZXJraVAl_pzJJuGs,5629
@@ -129,7 +135,7 @@ databao_context_engine/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
129
135
  databao_context_engine/system/properties.py,sha256=mQ7-_PZeYSESYn1cMUQ0IK7rJEnbhc7t4WesFjAgo-Q,429
130
136
  databao_context_engine/templating/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
131
137
  databao_context_engine/templating/renderer.py,sha256=W2-0IGStAp6oxANmsKs_Z-UoIR6Gt_c4ILYFa3Hruo4,662
132
- databao_context_engine-0.1.5.dist-info/WHEEL,sha256=5w2T7AS2mz1-rW9CNagNYWRCaB0iQqBMYLwKdlgiR4Q,78
133
- databao_context_engine-0.1.5.dist-info/entry_points.txt,sha256=5EeQJ1W8zEFh4HuF1bs2zBeoP408oiwuM9UrkJiurgI,138
134
- databao_context_engine-0.1.5.dist-info/METADATA,sha256=HfJBADmvbXEEK710S6zzybfWl-mqnDXfm_18LIgh7kk,7773
135
- databao_context_engine-0.1.5.dist-info/RECORD,,
138
+ databao_context_engine-0.1.6.dist-info/WHEEL,sha256=5w2T7AS2mz1-rW9CNagNYWRCaB0iQqBMYLwKdlgiR4Q,78
139
+ databao_context_engine-0.1.6.dist-info/entry_points.txt,sha256=5EeQJ1W8zEFh4HuF1bs2zBeoP408oiwuM9UrkJiurgI,138
140
+ databao_context_engine-0.1.6.dist-info/METADATA,sha256=weN9iS4ZtRnnt3tTJY-epHUgNdpMo-dWD4E3F-8dXyk,7773
141
+ databao_context_engine-0.1.6.dist-info/RECORD,,