databao-context-engine 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +35 -0
- databao_context_engine/build_sources/__init__.py +0 -0
- databao_context_engine/build_sources/internal/__init__.py +0 -0
- databao_context_engine/build_sources/internal/build_runner.py +111 -0
- databao_context_engine/build_sources/internal/build_service.py +77 -0
- databao_context_engine/build_sources/internal/build_wiring.py +52 -0
- databao_context_engine/build_sources/internal/export_results.py +43 -0
- databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
- databao_context_engine/build_sources/public/__init__.py +0 -0
- databao_context_engine/build_sources/public/api.py +4 -0
- databao_context_engine/cli/__init__.py +0 -0
- databao_context_engine/cli/add_datasource_config.py +130 -0
- databao_context_engine/cli/commands.py +256 -0
- databao_context_engine/cli/datasources.py +64 -0
- databao_context_engine/cli/info.py +32 -0
- databao_context_engine/config/__init__.py +0 -0
- databao_context_engine/config/log_config.yaml +16 -0
- databao_context_engine/config/logging.py +43 -0
- databao_context_engine/databao_context_project_manager.py +92 -0
- databao_context_engine/databao_engine.py +85 -0
- databao_context_engine/datasource_config/__init__.py +0 -0
- databao_context_engine/datasource_config/add_config.py +50 -0
- databao_context_engine/datasource_config/check_config.py +131 -0
- databao_context_engine/datasource_config/datasource_context.py +60 -0
- databao_context_engine/event_journal/__init__.py +0 -0
- databao_context_engine/event_journal/writer.py +29 -0
- databao_context_engine/generate_configs_schemas.py +92 -0
- databao_context_engine/init_project.py +18 -0
- databao_context_engine/introspection/__init__.py +0 -0
- databao_context_engine/introspection/property_extract.py +202 -0
- databao_context_engine/llm/__init__.py +0 -0
- databao_context_engine/llm/config.py +20 -0
- databao_context_engine/llm/descriptions/__init__.py +0 -0
- databao_context_engine/llm/descriptions/ollama.py +21 -0
- databao_context_engine/llm/descriptions/provider.py +10 -0
- databao_context_engine/llm/embeddings/__init__.py +0 -0
- databao_context_engine/llm/embeddings/ollama.py +37 -0
- databao_context_engine/llm/embeddings/provider.py +13 -0
- databao_context_engine/llm/errors.py +16 -0
- databao_context_engine/llm/factory.py +61 -0
- databao_context_engine/llm/install.py +227 -0
- databao_context_engine/llm/runtime.py +73 -0
- databao_context_engine/llm/service.py +159 -0
- databao_context_engine/main.py +19 -0
- databao_context_engine/mcp/__init__.py +0 -0
- databao_context_engine/mcp/all_results_tool.py +5 -0
- databao_context_engine/mcp/mcp_runner.py +16 -0
- databao_context_engine/mcp/mcp_server.py +63 -0
- databao_context_engine/mcp/retrieve_tool.py +22 -0
- databao_context_engine/pluginlib/__init__.py +0 -0
- databao_context_engine/pluginlib/build_plugin.py +107 -0
- databao_context_engine/pluginlib/config.py +37 -0
- databao_context_engine/pluginlib/plugin_utils.py +68 -0
- databao_context_engine/plugins/__init__.py +0 -0
- databao_context_engine/plugins/athena_db_plugin.py +12 -0
- databao_context_engine/plugins/base_db_plugin.py +45 -0
- databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/__init__.py +0 -0
- databao_context_engine/plugins/databases/athena_introspector.py +101 -0
- databao_context_engine/plugins/databases/base_introspector.py +144 -0
- databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
- databao_context_engine/plugins/databases/database_chunker.py +69 -0
- databao_context_engine/plugins/databases/databases_types.py +114 -0
- databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
- databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
- databao_context_engine/plugins/databases/introspection_scope.py +74 -0
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
- databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
- databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
- databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
- databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
- databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/mssql_db_plugin.py +12 -0
- databao_context_engine/plugins/mysql_db_plugin.py +12 -0
- databao_context_engine/plugins/parquet_plugin.py +32 -0
- databao_context_engine/plugins/plugin_loader.py +110 -0
- databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
- databao_context_engine/plugins/resources/__init__.py +0 -0
- databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
- databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
- databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
- databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
- databao_context_engine/project/__init__.py +0 -0
- databao_context_engine/project/datasource_discovery.py +141 -0
- databao_context_engine/project/info.py +44 -0
- databao_context_engine/project/init_project.py +102 -0
- databao_context_engine/project/layout.py +127 -0
- databao_context_engine/project/project_config.py +32 -0
- databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
- databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
- databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
- databao_context_engine/project/runs.py +39 -0
- databao_context_engine/project/types.py +134 -0
- databao_context_engine/retrieve_embeddings/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
- databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/public/api.py +3 -0
- databao_context_engine/serialisation/__init__.py +0 -0
- databao_context_engine/serialisation/yaml.py +35 -0
- databao_context_engine/services/__init__.py +0 -0
- databao_context_engine/services/chunk_embedding_service.py +104 -0
- databao_context_engine/services/embedding_shard_resolver.py +64 -0
- databao_context_engine/services/factories.py +88 -0
- databao_context_engine/services/models.py +12 -0
- databao_context_engine/services/persistence_service.py +61 -0
- databao_context_engine/services/run_name_policy.py +8 -0
- databao_context_engine/services/table_name_policy.py +15 -0
- databao_context_engine/storage/__init__.py +0 -0
- databao_context_engine/storage/connection.py +32 -0
- databao_context_engine/storage/exceptions/__init__.py +0 -0
- databao_context_engine/storage/exceptions/exceptions.py +6 -0
- databao_context_engine/storage/migrate.py +127 -0
- databao_context_engine/storage/migrations/V01__init.sql +63 -0
- databao_context_engine/storage/models.py +51 -0
- databao_context_engine/storage/repositories/__init__.py +0 -0
- databao_context_engine/storage/repositories/chunk_repository.py +130 -0
- databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
- databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
- databao_context_engine/storage/repositories/embedding_repository.py +113 -0
- databao_context_engine/storage/repositories/factories.py +35 -0
- databao_context_engine/storage/repositories/run_repository.py +157 -0
- databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
- databao_context_engine/storage/transaction.py +14 -0
- databao_context_engine/system/__init__.py +0 -0
- databao_context_engine/system/properties.py +13 -0
- databao_context_engine/templating/__init__.py +0 -0
- databao_context_engine/templating/renderer.py +29 -0
- databao_context_engine-0.1.1.dist-info/METADATA +186 -0
- databao_context_engine-0.1.1.dist-info/RECORD +135 -0
- databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
- databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from typing import Any, Iterable, cast
|
|
3
|
+
|
|
4
|
+
from databao_context_engine.plugins.databases.databases_types import (
|
|
5
|
+
CheckConstraint,
|
|
6
|
+
DatabaseColumn,
|
|
7
|
+
DatabasePartitionInfo,
|
|
8
|
+
DatabaseTable,
|
|
9
|
+
DatasetKind,
|
|
10
|
+
ForeignKey,
|
|
11
|
+
ForeignKeyColumnMap,
|
|
12
|
+
Index,
|
|
13
|
+
KeyConstraint,
|
|
14
|
+
DatabaseSchema,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class IntrospectionModelBuilder:
    """Assemble table/schema models from flat lists of introspection row dicts.

    Every ``apply_*`` method takes a list of row dicts (or ``None``), looks up
    the target table by the row's ``"table_name"`` key (creating it on demand)
    and attaches the corresponding piece of metadata. ``finish`` returns the
    accumulated tables sorted by name.
    """

    def __init__(self) -> None:
        # Tables built so far, keyed by table name.
        self.by_table: dict[str, DatabaseTable] = {}

    @classmethod
    def build_schemas_from_components(
        cls,
        *,
        schemas: list[str],
        rels: list[dict] | None = None,
        cols: list[dict] | None = None,
        pk_cols: list[dict] | None = None,
        uq_cols: list[dict] | None = None,
        checks: list[dict] | None = None,
        fk_cols: list[dict] | None = None,
        idx_cols: list[dict] | None = None,
        partitions: list[dict] | None = None,
        schema_field: str = "schema_name",
    ) -> list[DatabaseSchema]:
        """Build one ``DatabaseSchema`` per requested schema that has tables.

        Rows of every component are bucketed by the value stored under
        ``schema_field``; rows whose schema value is missing, empty or not a
        string are dropped. Schemas are emitted in the order given by
        ``schemas``, and a schema that ends up with no tables is omitted.
        """

        def group_by_schema(rows: list[dict] | None) -> dict[str, list[dict]]:
            g: dict[str, list[dict]] = defaultdict(list)
            for r in rows or []:
                s = r.get(schema_field)
                if isinstance(s, str) and s:
                    g[s].append(r)
            return g

        grouped = {
            "rels": group_by_schema(rels),
            "cols": group_by_schema(cols),
            "pk": group_by_schema(pk_cols),
            "uq": group_by_schema(uq_cols),
            "checks": group_by_schema(checks),
            "fks": group_by_schema(fk_cols),
            "idx": group_by_schema(idx_cols),
            "parts": group_by_schema(partitions),
        }

        out: list[DatabaseSchema] = []
        for schema in schemas:
            # ``.get`` (not indexing) so the defaultdicts are not grown for
            # schemas that have no rows.
            tables = cls.build_tables_from_components(
                rels=grouped["rels"].get(schema, []),
                cols=grouped["cols"].get(schema, []),
                pk_cols=grouped["pk"].get(schema, []),
                uq_cols=grouped["uq"].get(schema, []),
                checks=grouped["checks"].get(schema, []),
                fk_cols=grouped["fks"].get(schema, []),
                idx_cols=grouped["idx"].get(schema, []),
                partitions=grouped["parts"].get(schema, []),
            )

            if tables:
                out.append(DatabaseSchema(name=schema, tables=tables))

        return out

    @classmethod
    def build_tables_from_components(
        cls,
        *,
        rels: list[dict] | None = None,
        cols: list[dict] | None = None,
        pk_cols: list[dict] | None = None,
        uq_cols: list[dict] | None = None,
        checks: list[dict] | None = None,
        fk_cols: list[dict] | None = None,
        idx_cols: list[dict] | None = None,
        partitions: list[dict] | None = None,
    ) -> list[DatabaseTable]:
        """Run every ``apply_*`` step on a fresh builder and return its tables.

        All components are optional; ``None`` is treated as "no rows".
        """
        b = cls()
        b.apply_relations(rels)
        b.apply_columns(cols)
        b.apply_primary_keys(pk_cols)
        b.apply_unique_constraints(uq_cols)
        b.apply_checks(checks)
        b.apply_foreign_keys(fk_cols)
        b.apply_indexes(idx_cols)
        b.apply_partitions(partitions)
        return b.finish()

    def get_or_create_table(self, table_name: str) -> DatabaseTable:
        """Return the table registered under ``table_name``, creating an empty one if needed."""
        t = self.by_table.get(table_name)
        if t is None:
            t = self.by_table[table_name] = DatabaseTable(
                name=table_name,
                columns=[],
                samples=[],
                partition_info=None,
                description=None,
                kind=DatasetKind.TABLE,
            )
        return t

    def apply_relations(self, rels: list[dict] | None) -> None:
        """Register each relation row and set its kind and (non-empty) description.

        A missing or empty ``"kind"`` is treated as ``"table"``.
        """
        for r in rels or []:
            t = self.get_or_create_table(r["table_name"])
            t.kind = DatasetKind.from_raw((r.get("kind") or "table").lower())
            if desc := r.get("description"):
                t.description = desc

    def apply_columns(self, cols: list[dict] | None) -> None:
        """Append columns to their tables, ordered by ``"ordinal_position"``.

        Rows without an ordinal position are placed after the positioned ones.
        """
        cols_by_table = group_rows(cols, ("table_name",))
        for (table_name,), grp in cols_by_table.items():
            # Same ordering helper as the other apply_* methods, so positions
            # delivered as strings are compared numerically as well.
            grp.sort(key=lambda r: sort_position_by_key(r, "ordinal_position"))
            t = self.get_or_create_table(table_name)
            for c in grp:
                t.columns.append(
                    DatabaseColumn(
                        name=c["column_name"],
                        type=c["data_type"],
                        nullable=bool(coerce_bool(c.get("is_nullable"), default=True)),
                        description=c.get("description"),
                        default_expression=c.get("default_expression"),
                        generated=c.get("generated"),
                        checks=[],
                    )
                )

    def apply_primary_keys(self, pk_cols: list[dict] | None) -> None:
        """Set each table's primary key from rows grouped by (table, constraint).

        Columns are ordered by ``"position"``. If several constraint groups
        exist for the same table, the one processed last overwrites the others.
        """
        pk_groups = group_rows(pk_cols, ("table_name", "constraint_name"))
        for (table_name, cname), grp in pk_groups.items():
            grp.sort(key=lambda r: sort_position_by_key(r, "position"))
            self.get_or_create_table(table_name).primary_key = KeyConstraint(
                name=cname,
                columns=[r["column_name"] for r in grp if r.get("column_name") is not None],
                validated=True,
            )

    def apply_unique_constraints(self, uq_cols: list[dict] | None) -> None:
        """Replace each affected table's unique constraints with those found in ``uq_cols``."""
        uq_groups = group_rows(uq_cols, ("table_name", "constraint_name"))
        by_table: dict[str, list[KeyConstraint]] = defaultdict(list)

        for (table_name, cname), grp in uq_groups.items():
            grp.sort(key=lambda r: sort_position_by_key(r, "position"))
            by_table[table_name].append(
                KeyConstraint(
                    name=cname,
                    columns=[r["column_name"] for r in grp if r.get("column_name") is not None],
                    validated=True,
                )
            )

        for table_name, uqs in by_table.items():
            self.get_or_create_table(table_name).unique_constraints = uqs

    def apply_checks(self, checks: list[dict] | None) -> None:
        """Append one ``CheckConstraint`` per row to the row's table."""
        for r in checks or []:
            self.get_or_create_table(r["table_name"]).checks.append(
                CheckConstraint(
                    name=r["constraint_name"],
                    # The cast only satisfies the type checker; a missing
                    # expression is passed through as None.
                    expression=cast(str, r.get("expression")),
                    validated=coerce_bool(r.get("validated"), default=True),
                )
            )

    def apply_foreign_keys(self, fk_cols: list[dict] | None) -> None:
        """Replace each affected table's foreign keys with those found in ``fk_cols``.

        Rows are grouped by (table, constraint) and ordered by ``"position"``;
        constraint-level attributes are read from the first row of each group.
        """
        fk_groups = group_rows(fk_cols, ("table_name", "constraint_name"))
        by_table: dict[str, list[ForeignKey]] = defaultdict(list)

        for (table_name, cname), grp in fk_groups.items():
            grp.sort(key=lambda r: sort_position_by_key(r, "position"))
            first = grp[0]

            ref_schema = first.get("ref_schema")
            ref_table = first.get("ref_table")
            # NOTE(review): when only ref_table is present the reference is
            # recorded as "" rather than the bare table name - confirm intended.
            referenced = f"{ref_schema}.{ref_table}" if ref_schema and ref_table else ""

            by_table[table_name].append(
                ForeignKey(
                    name=cname,
                    mapping=[ForeignKeyColumnMap(from_column=r["from_column"], to_column=r["to_column"]) for r in grp],
                    referenced_table=referenced,
                    on_update=first.get("on_update"),
                    on_delete=first.get("on_delete"),
                    enforced=coerce_bool(first.get("enforced"), default=True),
                    validated=coerce_bool(first.get("validated"), default=True),
                )
            )

        for table_name, fks in by_table.items():
            self.get_or_create_table(table_name).foreign_keys = fks

    def apply_indexes(self, idx_cols: list[dict] | None) -> None:
        """Replace each affected table's indexes with those found in ``idx_cols``.

        Rows are grouped by (table, index) and ordered by ``"position"``;
        rows without an ``"expr"`` value contribute no column entry.
        """
        idx_groups = group_rows(idx_cols, ("table_name", "index_name"))
        by_table: dict[str, list[Index]] = defaultdict(list)

        for (table_name, idx_name), grp in idx_groups.items():
            grp.sort(key=lambda r: sort_position_by_key(r, "position"))
            first = grp[0]

            by_table[table_name].append(
                Index(
                    name=idx_name,
                    columns=[cast(str, r.get("expr")) for r in grp if r.get("expr") is not None],
                    unique=bool(coerce_bool(first.get("is_unique"), default=False)),
                    method=first.get("method"),
                    predicate=first.get("predicate"),
                )
            )

        for table_name, idxs in by_table.items():
            self.get_or_create_table(table_name).indexes = idxs

    def apply_partitions(self, partitions: list[dict] | None) -> None:
        """Attach partition info to each row's table.

        Every key other than ``table_name``, ``partition_tables`` and
        ``schema_name`` is copied into the partition ``meta`` dict; ``None``
        entries are removed from ``partition_tables``.
        """
        for r in partitions or []:
            t = self.get_or_create_table(r["table_name"])

            meta = {k: v for k, v in r.items() if k not in ("table_name", "partition_tables", "schema_name")}
            part_tables = r.get("partition_tables") or []
            part_tables_list = [p for p in part_tables if p is not None]

            t.partition_info = DatabasePartitionInfo(
                meta=meta,
                partition_tables=part_tables_list,
            )

    def finish(self) -> list[DatabaseTable]:
        """Return all accumulated tables ordered by table name."""
        return [self.by_table[k] for k in sorted(self.by_table)]
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def coerce_bool(value: Any, default: bool | None = None) -> bool | None:
|
|
242
|
+
if value is None:
|
|
243
|
+
return default
|
|
244
|
+
if isinstance(value, bool):
|
|
245
|
+
return value
|
|
246
|
+
if isinstance(value, int):
|
|
247
|
+
return bool(value)
|
|
248
|
+
if isinstance(value, str):
|
|
249
|
+
v = value.strip().lower()
|
|
250
|
+
if v in {"yes", "true", "1"}:
|
|
251
|
+
return True
|
|
252
|
+
if v in {"no", "false", "0"}:
|
|
253
|
+
return False
|
|
254
|
+
return bool(value)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def sort_position_by_key(r: dict, pos_field: str) -> tuple[bool, int]:
    """Build a sort key that orders rows by an integer position, missing positions last.

    The first tuple element is ``True`` only when the row has no value under
    ``pos_field``; the second is the position converted with ``int``, or ``0``
    when it is missing or cannot be converted.
    """
    raw = r.get(pos_field)
    if raw is None:
        return True, 0
    try:
        return False, int(raw)
    except (TypeError, ValueError):
        # Unparseable positions are kept among the positioned rows, at 0.
        return False, 0
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def group_rows(rows: Iterable[dict] | None, key_fields: tuple[str, ...]) -> dict[tuple[Any, ...], list[dict]]:
|
|
267
|
+
grouped: dict[tuple[Any, ...], list[dict]] = defaultdict(list)
|
|
268
|
+
for r in rows or []:
|
|
269
|
+
grouped[tuple(r.get(f) for f in key_fields)].append(r)
|
|
270
|
+
return grouped
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, field_validator, model_validator
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ScopeIncludeRule(BaseModel):
    """
    Allowlist selector.
    - catalog: optional glob pattern
    - schemas: optional list of glob patterns (string also accepted and normalized to a list)

    A rule must specify at least one of: catalog, schemas.
    """

    # Unknown keys are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    catalog: str | None = None
    schemas: list[str] | None = None

    @field_validator("schemas", mode="before")
    @classmethod
    def _normalize_schemas(cls, v: Any) -> Any:
        """Wrap a bare string in a one-element list; pass every other value (including None) through."""
        return [v] if isinstance(v, str) else v

    @model_validator(mode="after")
    def _validate_rule(self) -> ScopeIncludeRule:
        """Reject a rule in which both ``catalog`` and ``schemas`` are None."""
        has_selector = self.catalog is not None or self.schemas is not None
        if not has_selector:
            raise ValueError("Include rule must specify at least 'catalog' or 'schemas'")
        return self
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ScopeExcludeRule(BaseModel):
    """
    Denylist selector.
    - catalog: optional glob pattern
    - schemas: optional list of glob patterns (string also accepted)
    - except_schemas: optional list of glob patterns (string also accepted)

    If a target matches the rule but also matches except_schemas, it is NOT excluded by this rule.
    """

    # Unknown keys are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    catalog: str | None = None
    schemas: list[str] | None = None
    except_schemas: list[str] | None = None

    @field_validator("schemas", "except_schemas", mode="before")
    @classmethod
    def _normalize_lists(cls, v: Any) -> Any:
        """Wrap a bare string in a one-element list; pass every other value (including None) through."""
        return [v] if isinstance(v, str) else v

    @model_validator(mode="after")
    def _validate_rule(self) -> ScopeExcludeRule:
        """Reject a rule in which both ``catalog`` and ``schemas`` are None.

        ``except_schemas`` on its own does not satisfy this requirement.
        """
        has_selector = self.catalog is not None or self.schemas is not None
        if not has_selector:
            raise ValueError("Exclude rule must specify at least 'catalog' or 'schemas'")
        return self
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class IntrospectionScope(BaseModel):
    # Container for the include (allowlist) and exclude (denylist) rules of one
    # introspection scope. Both lists default to empty.

    # Unknown keys are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Allowlist rules.
    include: list[ScopeIncludeRule] = []
    # Denylist rules.
    exclude: list[ScopeExcludeRule] = []
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from databao_context_engine.plugins.databases.introspection_scope import (
|
|
5
|
+
IntrospectionScope,
|
|
6
|
+
ScopeExcludeRule,
|
|
7
|
+
ScopeIncludeRule,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class ScopeSelection:
    """
    The final catalog+schema scope to introspect.
    """

    # Catalog names that are part of the selection.
    catalogs: list[str]
    # Selected schema names, keyed by catalog name.
    schemas_per_catalog: dict[str, list[str]]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class IntrospectionScopeMatcher:
    """
    Narrows a discovered set of catalogs/schemas using include/exclude rules.

    Patterns are globs and are compared case-insensitively.

    Evaluation order for every (catalog, schema) pair:
    - schemas listed in ignored_schemas are always dropped
    - with no include rules everything is a candidate; otherwise only pairs
      matched by at least one include rule are
    - any matching exclude rule then removes the pair (exclude wins)
    - except_schemas spares a schema from the exclude rule it belongs to only
    """

    def __init__(
        self,
        scope: IntrospectionScope | None,
        *,
        ignored_schemas: set[str] | None = None,
    ) -> None:
        # A missing scope behaves like one with no rules at all.
        self._scope = scope or IntrospectionScope()
        # Lower-cased once so the membership test in filter_scopes is case-insensitive.
        self._ignored_schemas = {name.lower() for name in (ignored_schemas or set())}

    def filter_scopes(
        self,
        catalogs: list[str],
        schemas_per_catalog: dict[str, list[str]],
    ) -> ScopeSelection:
        """Return the catalogs and schemas that survive the configured rules.

        Catalogs left without any schema are omitted; the input order of
        catalogs and of schemas within a catalog is preserved.
        """
        includes = self._scope.include
        excludes = self._scope.exclude

        def survives(catalog: str, schema: str) -> bool:
            if schema.lower() in self._ignored_schemas:
                return False
            if includes and not self._is_included(includes, catalog, schema):
                return False
            return not self._is_excluded(excludes, catalog, schema)

        selected: dict[str, list[str]] = {}
        for catalog in catalogs:
            survivors = [s for s in schemas_per_catalog.get(catalog, []) if survives(catalog, s)]
            if survivors:
                selected[catalog] = survivors

        return ScopeSelection(
            catalogs=[c for c in catalogs if c in selected],
            schemas_per_catalog=selected,
        )

    @staticmethod
    def _glob_match(pattern: str, value: str) -> bool:
        """Glob-match ``value`` against ``pattern`` after lower-casing both."""
        folded_value = value.lower()
        folded_pattern = pattern.lower()
        return fnmatch.fnmatchcase(folded_value, folded_pattern)

    def _matches_any(self, patterns: list[str] | None, value: str) -> bool:
        """Return True when ``patterns`` is None or at least one pattern matches ``value``."""
        if patterns is None:
            return True
        for candidate in patterns:
            if self._glob_match(candidate, value):
                return True
        return False

    def _include_rule_matches(self, rule: ScopeIncludeRule, catalog: str, schema: str) -> bool:
        """Return True when every selector set on ``rule`` matches the pair."""
        catalog_ok = rule.catalog is None or self._glob_match(rule.catalog, catalog)
        return catalog_ok and (rule.schemas is None or self._matches_any(rule.schemas, schema))

    def _exclude_rule_excludes(self, rule: ScopeExcludeRule, catalog: str, schema: str) -> bool:
        """Return True when ``rule`` matches the pair and ``except_schemas`` does not spare it."""
        if not (rule.catalog is None or self._glob_match(rule.catalog, catalog)):
            return False
        if not (rule.schemas is None or self._matches_any(rule.schemas, schema)):
            return False

        spared = rule.except_schemas is not None and self._matches_any(rule.except_schemas, schema)
        return not spared

    def _is_included(self, rules: list[ScopeIncludeRule], catalog: str, schema: str) -> bool:
        """Return True when at least one include rule matches the pair."""
        for rule in rules:
            if self._include_rule_matches(rule, catalog, schema):
                return True
        return False

    def _is_excluded(self, rules: list[ScopeExcludeRule], catalog: str, schema: str) -> bool:
        """Return True when at least one exclude rule removes the pair."""
        for rule in rules:
            if self._exclude_rule_excludes(rule, catalog, schema):
                return True
        return False
|