databao-context-engine 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +35 -0
- databao_context_engine/build_sources/__init__.py +0 -0
- databao_context_engine/build_sources/internal/__init__.py +0 -0
- databao_context_engine/build_sources/internal/build_runner.py +111 -0
- databao_context_engine/build_sources/internal/build_service.py +77 -0
- databao_context_engine/build_sources/internal/build_wiring.py +52 -0
- databao_context_engine/build_sources/internal/export_results.py +43 -0
- databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
- databao_context_engine/build_sources/public/__init__.py +0 -0
- databao_context_engine/build_sources/public/api.py +4 -0
- databao_context_engine/cli/__init__.py +0 -0
- databao_context_engine/cli/add_datasource_config.py +130 -0
- databao_context_engine/cli/commands.py +256 -0
- databao_context_engine/cli/datasources.py +64 -0
- databao_context_engine/cli/info.py +32 -0
- databao_context_engine/config/__init__.py +0 -0
- databao_context_engine/config/log_config.yaml +16 -0
- databao_context_engine/config/logging.py +43 -0
- databao_context_engine/databao_context_project_manager.py +92 -0
- databao_context_engine/databao_engine.py +85 -0
- databao_context_engine/datasource_config/__init__.py +0 -0
- databao_context_engine/datasource_config/add_config.py +50 -0
- databao_context_engine/datasource_config/check_config.py +131 -0
- databao_context_engine/datasource_config/datasource_context.py +60 -0
- databao_context_engine/event_journal/__init__.py +0 -0
- databao_context_engine/event_journal/writer.py +29 -0
- databao_context_engine/generate_configs_schemas.py +92 -0
- databao_context_engine/init_project.py +18 -0
- databao_context_engine/introspection/__init__.py +0 -0
- databao_context_engine/introspection/property_extract.py +202 -0
- databao_context_engine/llm/__init__.py +0 -0
- databao_context_engine/llm/config.py +20 -0
- databao_context_engine/llm/descriptions/__init__.py +0 -0
- databao_context_engine/llm/descriptions/ollama.py +21 -0
- databao_context_engine/llm/descriptions/provider.py +10 -0
- databao_context_engine/llm/embeddings/__init__.py +0 -0
- databao_context_engine/llm/embeddings/ollama.py +37 -0
- databao_context_engine/llm/embeddings/provider.py +13 -0
- databao_context_engine/llm/errors.py +16 -0
- databao_context_engine/llm/factory.py +61 -0
- databao_context_engine/llm/install.py +227 -0
- databao_context_engine/llm/runtime.py +73 -0
- databao_context_engine/llm/service.py +159 -0
- databao_context_engine/main.py +19 -0
- databao_context_engine/mcp/__init__.py +0 -0
- databao_context_engine/mcp/all_results_tool.py +5 -0
- databao_context_engine/mcp/mcp_runner.py +16 -0
- databao_context_engine/mcp/mcp_server.py +63 -0
- databao_context_engine/mcp/retrieve_tool.py +22 -0
- databao_context_engine/pluginlib/__init__.py +0 -0
- databao_context_engine/pluginlib/build_plugin.py +107 -0
- databao_context_engine/pluginlib/config.py +37 -0
- databao_context_engine/pluginlib/plugin_utils.py +68 -0
- databao_context_engine/plugins/__init__.py +0 -0
- databao_context_engine/plugins/athena_db_plugin.py +12 -0
- databao_context_engine/plugins/base_db_plugin.py +45 -0
- databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/__init__.py +0 -0
- databao_context_engine/plugins/databases/athena_introspector.py +101 -0
- databao_context_engine/plugins/databases/base_introspector.py +144 -0
- databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
- databao_context_engine/plugins/databases/database_chunker.py +69 -0
- databao_context_engine/plugins/databases/databases_types.py +114 -0
- databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
- databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
- databao_context_engine/plugins/databases/introspection_scope.py +74 -0
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
- databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
- databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
- databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
- databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
- databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/mssql_db_plugin.py +12 -0
- databao_context_engine/plugins/mysql_db_plugin.py +12 -0
- databao_context_engine/plugins/parquet_plugin.py +32 -0
- databao_context_engine/plugins/plugin_loader.py +110 -0
- databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
- databao_context_engine/plugins/resources/__init__.py +0 -0
- databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
- databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
- databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
- databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
- databao_context_engine/project/__init__.py +0 -0
- databao_context_engine/project/datasource_discovery.py +141 -0
- databao_context_engine/project/info.py +44 -0
- databao_context_engine/project/init_project.py +102 -0
- databao_context_engine/project/layout.py +127 -0
- databao_context_engine/project/project_config.py +32 -0
- databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
- databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
- databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
- databao_context_engine/project/runs.py +39 -0
- databao_context_engine/project/types.py +134 -0
- databao_context_engine/retrieve_embeddings/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
- databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/public/api.py +3 -0
- databao_context_engine/serialisation/__init__.py +0 -0
- databao_context_engine/serialisation/yaml.py +35 -0
- databao_context_engine/services/__init__.py +0 -0
- databao_context_engine/services/chunk_embedding_service.py +104 -0
- databao_context_engine/services/embedding_shard_resolver.py +64 -0
- databao_context_engine/services/factories.py +88 -0
- databao_context_engine/services/models.py +12 -0
- databao_context_engine/services/persistence_service.py +61 -0
- databao_context_engine/services/run_name_policy.py +8 -0
- databao_context_engine/services/table_name_policy.py +15 -0
- databao_context_engine/storage/__init__.py +0 -0
- databao_context_engine/storage/connection.py +32 -0
- databao_context_engine/storage/exceptions/__init__.py +0 -0
- databao_context_engine/storage/exceptions/exceptions.py +6 -0
- databao_context_engine/storage/migrate.py +127 -0
- databao_context_engine/storage/migrations/V01__init.sql +63 -0
- databao_context_engine/storage/models.py +51 -0
- databao_context_engine/storage/repositories/__init__.py +0 -0
- databao_context_engine/storage/repositories/chunk_repository.py +130 -0
- databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
- databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
- databao_context_engine/storage/repositories/embedding_repository.py +113 -0
- databao_context_engine/storage/repositories/factories.py +35 -0
- databao_context_engine/storage/repositories/run_repository.py +157 -0
- databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
- databao_context_engine/storage/transaction.py +14 -0
- databao_context_engine/system/__init__.py +0 -0
- databao_context_engine/system/properties.py +13 -0
- databao_context_engine/templating/__init__.py +0 -0
- databao_context_engine/templating/renderer.py +29 -0
- databao_context_engine-0.1.1.dist-info/METADATA +186 -0
- databao_context_engine-0.1.1.dist-info/RECORD +135 -0
- databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
- databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import re

# Table names produced by TableNamePolicy.build must match this pattern before
# they are interpolated into SQL (see validate_table_name).
_TABLE_NAME_RE = re.compile(r"^embedding_[a-z0-9_]+$")


def _sanitize(part: str) -> str:
    """Map *part* onto the [a-z0-9_] alphabet accepted by _TABLE_NAME_RE."""
    return re.sub(r"[^a-z0-9_]", "_", part.lower())


class TableNamePolicy:
    """Builds and validates per-model embedding table names.

    The name encodes the embedder, the model id and the vector dimension so
    that each (embedder, model) pair gets its own embedding table.
    """

    @staticmethod
    def build(*, embedder: str, model_id: str, dim: int) -> str:
        """Return the embedding table name for (embedder, model_id, dim).

        BUG FIX: previously only ":", "-", " " and "." were replaced, so a
        model id containing any other character outside [a-z0-9_] (uppercase
        letters, "/", "+", ...) produced a name rejected by
        validate_table_name. Every disallowed character is now mapped to "_"
        (after lowercasing); inputs that produced valid names before produce
        the same names now.
        """
        safe_embedder = _sanitize(embedder)
        safe_model = _sanitize(model_id)
        return f"embedding_{safe_embedder}__{safe_model}__{dim}"

    @staticmethod
    def validate_table_name(*, table_name: str):
        """Raise ValueError unless *table_name* matches _TABLE_NAME_RE.

        Used as a guard before the name is interpolated into SQL statements.
        """
        if not _TABLE_NAME_RE.fullmatch(table_name):
            raise ValueError(f"invalid table_name {table_name!r}; expected pattern {_TABLE_NAME_RE.pattern}")
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import duckdb
|
|
6
|
+
|
|
7
|
+
from databao_context_engine.system.properties import get_db_path
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@contextmanager
def open_duckdb_connection(db_path: str | Path | None = None):
    """Context manager yielding a DuckDB connection with vector search ready.

    Resolves the database location (falling back to get_db_path()), loads the
    vss extension, turns on experimental HNSW persistence, and guarantees the
    connection is closed on exit.

    Usage:
        with open_duckdb_connection() as conn:
    """
    target = str(db_path or get_db_path())
    conn = duckdb.connect(target)
    logger.debug(f"Connected to DuckDB database at {target}")
    try:
        conn.execute("LOAD vss;")
        conn.execute("SET hnsw_enable_experimental_persistence = true;")
        logger.debug("Loaded Vector Similarity Search extension")
        yield conn
    finally:
        conn.close()
|
|
File without changes
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import logging
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from importlib.resources import files
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import LiteralString
|
|
8
|
+
|
|
9
|
+
import duckdb
|
|
10
|
+
|
|
11
|
+
from databao_context_engine.system.properties import get_db_path
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def migrate(db_path: str | Path | None = None, migration_files: list[Path] | None = None) -> None:
    """Apply all pending SQL migrations to the DuckDB database at *db_path*.

    When *migration_files* is omitted, every packaged ``*.sql`` file under
    ``databao_context_engine.storage.migrations`` is used. The database file
    (and its parent directory) is created if missing.
    """
    if migration_files is None:
        discovered: list[Path] = []
        for candidate in files("databao_context_engine.storage.migrations").iterdir():
            # importlib.resources may yield non-Path traversables (e.g. zip
            # imports); only real .sql files on disk are considered here.
            if isinstance(candidate, Path) and candidate.suffix == ".sql":
                discovered.append(candidate)
        migration_files = discovered

    target_db = Path(db_path or get_db_path()).expanduser().resolve()
    target_db.parent.mkdir(parents=True, exist_ok=True)
    logger.debug("Running migrations on database: %s", target_db)

    _MigrationManager(target_db, migration_files).migrate()
    logger.debug("Migration complete")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
class MigrationDTO:
    """A migration row as recorded in the migration_history table."""

    # Migration file name, e.g. "V01__init.sql".
    name: str
    # Numeric version parsed from the file name prefix.
    version: int
    # MD5 hex digest of the migration file's raw bytes.
    checksum: str
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class MigrationError(Exception):
    """Base class for migration errors.

    Raised when a migration fails to apply, or when a requested migration
    reuses a version number that has already been applied.
    """
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def load_migrations(conn) -> list[MigrationDTO]:
    """Read every row of migration_history and return them as MigrationDTOs.

    The applied_at column is selected but intentionally dropped: the DTO only
    carries name, version and checksum.
    """
    with conn.cursor() as cur:
        cur.execute(
            "SELECT name, version, checksum, applied_at FROM migration_history",
        )
        fetched = cur.fetchall()
    applied: list[MigrationDTO] = []
    for name, version, checksum, _applied_at in fetched:
        applied.append(MigrationDTO(name=name, version=version, checksum=checksum))
    return applied
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _extract_version_from_name(name: str) -> int:
|
|
57
|
+
version_groups = re.findall(r"(\d+)__", name)
|
|
58
|
+
if not version_groups:
|
|
59
|
+
raise ValueError(f"Invalid migration name: {name}")
|
|
60
|
+
return int(version_groups[0])
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass(frozen=True)
class _Migration:
    """A migration file loaded from disk, ready to be applied."""

    # Migration file name, e.g. "V01__init.sql".
    name: str
    # Numeric version parsed from the file name.
    version: int
    # MD5 hex digest of the raw file bytes; identifies already-applied files.
    checksum: str
    # Full SQL text of the migration.
    query: str
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _create_migration(file: Path) -> _Migration:
    """Load *file* from disk and build the corresponding _Migration record.

    The checksum is computed over the raw bytes (before UTF-8 decoding) so it
    matches what an earlier run would have stored for the same file.
    """
    raw = file.read_bytes()
    return _Migration(
        name=file.name,
        version=_extract_version_from_name(file.name),
        checksum=hashlib.md5(raw).hexdigest(),
        query=raw.decode("utf-8"),
    )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class _MigrationManager:
    """Applies pending SQL migration files to a DuckDB database.

    Already-applied migrations are tracked in the migration_history table by
    checksum: a file whose checksum is recorded is skipped, and a new file
    that reuses an already-applied version number is rejected.
    """

    # Bootstrap DDL: creates the history table itself (idempotent).
    _init_migration_table_sql: LiteralString = """
    CREATE SEQUENCE IF NOT EXISTS migration_history_id_seq START 1;

    CREATE TABLE IF NOT EXISTS migration_history (
        id BIGINT PRIMARY KEY DEFAULT nextval('migration_history_id_seq'),
        name TEXT NOT NULL,
        version INTEGER NOT NULL,
        checksum TEXT NOT NULL,
        applied_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
        UNIQUE (version)
    );
    """

    _insert_migration_sql: LiteralString = "INSERT INTO migration_history (name, version, checksum) VALUES (?, ?, ?)"

    def __init__(self, db_path: Path, migration_files: list[Path]):
        # Files are parsed eagerly, so a malformed file name fails here,
        # before any database work starts.
        self._migration_files = migration_files
        self._db_path = db_path
        self._requested_migrations = [_create_migration(file) for file in migration_files]

    def migrate(self) -> None:
        """Apply every requested migration that has not been applied yet.

        Raises:
            MigrationError: if a not-yet-applied migration reuses an
                already-applied version number, or if applying a migration
                fails (that migration is rolled back first).
        """
        applied_migrations: list[MigrationDTO] = self.init_db_and_load_applied_migrations()
        applied_checksums = [m.checksum for m in applied_migrations]
        applied_versions = [m.version for m in applied_migrations]
        # A migration's identity is its checksum: same bytes => already applied.
        migrations_to_apply = [m for m in self._requested_migrations if m.checksum not in applied_checksums]
        duplicated_versions = [
            migration.version for migration in migrations_to_apply if migration.version in applied_versions
        ]
        # NOTE(review): any() treats a duplicated version of 0 as falsy;
        # "if duplicated_versions:" would be the safer emptiness test.
        if any(duplicated_versions):
            raise MigrationError(f"Migrations with versions {duplicated_versions} already exist")
        with duckdb.connect(self._db_path) as conn:
            for migration in migrations_to_apply:
                logger.debug("Applying migration %s", migration.name)
                # Each migration runs in its own transaction so a failure
                # rolls back only that migration before aborting the run.
                with conn.cursor() as cur:
                    cur.execute("START TRANSACTION;")
                    try:
                        cur.execute(migration.query)
                        cur.execute(self._insert_migration_sql, [migration.name, migration.version, migration.checksum])
                        cur.commit()
                    except Exception:
                        cur.rollback()
                        raise MigrationError(f"Failed to apply migration {migration.name}. Aborting migration process.")

    def init_db_and_load_applied_migrations(self) -> list[MigrationDTO]:
        """Ensure the migration_history table exists, then return its rows."""
        with duckdb.connect(str(self._db_path)) as conn:
            conn.execute(self._init_migration_table_sql)
            conn.commit()
            return load_migrations(conn)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
-- NOTE(review): editing this file after it has been applied changes its MD5
-- checksum, which the migration manager uses to detect already-applied
-- migrations — make schema changes via a new migration file instead.

-- Enable the Vector Similarity Search extension and persistent HNSW indexes.
INSTALL vss;
LOAD vss;
SET hnsw_enable_experimental_persistence = true;

CREATE SEQUENCE IF NOT EXISTS run_id_seq START 1;
CREATE SEQUENCE IF NOT EXISTS datasource_run_id_seq START 1;
CREATE SEQUENCE IF NOT EXISTS chunk_id_seq START 1;

-- One row per build run of the engine.
CREATE TABLE IF NOT EXISTS run (
    run_id BIGINT PRIMARY KEY DEFAULT nextval('run_id_seq'),
    project_id TEXT NOT NULL,
    started_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    ended_at TIMESTAMP,
    -- NOTE(review): "nemory_version" looks like a typo, but it matches
    -- RunDTO.nemory_version — renaming needs a coordinated migration.
    nemory_version TEXT,
    run_name TEXT NOT NULL,
    -- NOTE(review): trailing comma above is DuckDB-specific, not standard SQL.
);

-- One row per plugin execution within a run.
CREATE TABLE IF NOT EXISTS datasource_run (
    datasource_run_id BIGINT PRIMARY KEY DEFAULT nextval('datasource_run_id_seq'),
    run_id BIGINT NOT NULL REFERENCES run(run_id),
    plugin TEXT NOT NULL,
    source_id TEXT NOT NULL,
    storage_directory TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    full_type TEXT NOT NULL,
);

-- Text chunks produced by plugins, pending embedding.
CREATE TABLE IF NOT EXISTS chunk (
    chunk_id BIGINT PRIMARY KEY DEFAULT nextval('chunk_id_seq'),
    datasource_run_id BIGINT NOT NULL REFERENCES datasource_run(datasource_run_id),
    embeddable_text TEXT NOT NULL,
    display_text TEXT,
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    generated_description TEXT,
);

-- Maps each (embedder, model) pair to its dedicated embedding table.
CREATE TABLE IF NOT EXISTS embedding_model_registry (
    embedder TEXT NOT NULL,
    model_id TEXT NOT NULL,
    dim INTEGER NOT NULL,
    table_name TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (embedder, model_id),
    UNIQUE (table_name)
);

-- Default embedding table for ollama / nomic-embed-text:v1.5 (768 dims),
-- with a cosine HNSW index for vector search.
CREATE TABLE IF NOT EXISTS embedding_ollama__nomic_embed_text_v1_5__768 (
    chunk_id BIGINT NOT NULL REFERENCES chunk(chunk_id),
    vec FLOAT[768] NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (chunk_id)
);
CREATE INDEX IF NOT EXISTS emb_hnsw_embedding_ollama__nomic_embed_text_v1_5__768 ON embedding_ollama__nomic_embed_text_v1_5__768 USING HNSW (vec) WITH (metric = 'cosine');

-- Register the default table; OR IGNORE makes the insert idempotent on re-run.
INSERT
OR IGNORE INTO
    embedding_model_registry(embedder, model_id, dim, table_name)
VALUES
    ('ollama', 'nomic-embed-text:v1.5', 768, 'embedding_ollama__nomic_embed_text_v1_5__768');

CREATE INDEX IF NOT EXISTS idx_datasource_run_run ON datasource_run(run_id);
CREATE INDEX IF NOT EXISTS idx_datasource_run_plugin_run ON datasource_run(plugin, run_id);
CREATE INDEX IF NOT EXISTS idx_chunk_datasource_run ON chunk(datasource_run_id);
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""DTOs mirroring rows of the storage tables (see V01__init.sql)."""

from collections.abc import Sequence
from dataclasses import dataclass
from datetime import datetime
from typing import Optional


@dataclass(frozen=True)
class RunDTO:
    """One row of the run table: a single build run of the engine."""

    run_id: int
    run_name: str
    project_id: str
    started_at: datetime
    # None while the run has not finished (ended_at is nullable in the schema).
    ended_at: Optional[datetime]
    # NOTE(review): the run.nemory_version column is nullable, so this value
    # may be None at runtime despite the non-optional annotation — confirm.
    nemory_version: str


@dataclass(frozen=True)
class DatasourceRunDTO:
    """One row of datasource_run: a plugin execution within a run."""

    datasource_run_id: int
    run_id: int
    plugin: str
    full_type: str
    source_id: str
    # Directory where the plugin stored its build artifacts.
    storage_directory: str
    created_at: datetime


@dataclass(frozen=True)
class ChunkDTO:
    """One row of chunk: a piece of text prepared for embedding."""

    chunk_id: int
    datasource_run_id: int
    # Text sent to the embedding model.
    embeddable_text: str
    # Optional human-facing rendering of the chunk.
    display_text: Optional[str]
    # NOTE(review): chunk.generated_description is nullable in the schema;
    # this may effectively be missing at runtime despite the annotation.
    generated_description: str
    created_at: datetime


@dataclass(frozen=True)
class EmbeddingModelRegistryDTO:
    """One row of embedding_model_registry: maps a model to its vector table."""

    embedder: str
    model_id: str
    # Vector dimensionality of the model's embeddings.
    dim: int
    # Name of the per-model embedding table (built by TableNamePolicy).
    table_name: str
    created_at: datetime


@dataclass(frozen=True)
class EmbeddingDTO:
    """One row of a per-model embedding table: a chunk's embedding vector."""

    chunk_id: int
    vec: Sequence[float]
    created_at: datetime
|
File without changes
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from typing import Any, Optional, Tuple
|
|
2
|
+
|
|
3
|
+
import duckdb
|
|
4
|
+
from _duckdb import ConstraintException
|
|
5
|
+
|
|
6
|
+
from databao_context_engine.storage.exceptions.exceptions import IntegrityError
|
|
7
|
+
from databao_context_engine.storage.models import ChunkDTO
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ChunkRepository:
    """CRUD access to the chunk table over a shared DuckDB connection."""

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        self._conn = conn

    def create(
        self,
        *,
        datasource_run_id: int,
        embeddable_text: str,
        display_text: Optional[str],
        generated_description: str,
    ) -> ChunkDTO:
        """Insert a new chunk and return the created row as a DTO.

        Raises:
            IntegrityError: if a constraint is violated (e.g. unknown
                datasource_run_id foreign key).
            RuntimeError: if INSERT ... RETURNING yields no row.
        """
        try:
            row = self._conn.execute(
                """
                INSERT INTO
                    chunk(datasource_run_id, embeddable_text, display_text, generated_description)
                VALUES
                    (?, ?, ?, ?)
                RETURNING
                    *
                """,
                [datasource_run_id, embeddable_text, display_text, generated_description],
            ).fetchone()
            if row is None:
                raise RuntimeError("chunk creation returned no object")
            return self._row_to_dto(row)
        except ConstraintException as e:
            raise IntegrityError from e

    def get(self, chunk_id: int) -> Optional[ChunkDTO]:
        """Return the chunk with the given id, or None if it does not exist."""
        row = self._conn.execute(
            """
            SELECT
                *
            FROM
                chunk
            WHERE
                chunk_id = ?
            """,
            [chunk_id],
        ).fetchone()
        return self._row_to_dto(row) if row else None

    def update(
        self,
        chunk_id: int,
        *,
        embeddable_text: Optional[str] = None,
        display_text: Optional[str] = None,
        generated_description: Optional[str] = None,
    ) -> Optional[ChunkDTO]:
        """Update the given columns of a chunk; None arguments are left as-is.

        Returns the refreshed DTO, or None when the row does not exist.
        """
        sets: list[Any] = []
        params: list[Any] = []

        if embeddable_text is not None:
            sets.append("embeddable_text = ?")
            params.append(embeddable_text)
        if display_text is not None:
            sets.append("display_text = ?")
            params.append(display_text)
        if generated_description is not None:
            sets.append("generated_description = ?")
            params.append(generated_description)

        if not sets:
            # Nothing to change: behave as a plain read.
            return self.get(chunk_id)

        params.append(chunk_id)
        self._conn.execute(
            f"""
            UPDATE
                chunk
            SET
                {", ".join(sets)}
            WHERE
                chunk_id = ?
            """,
            params,
        )

        return self.get(chunk_id)

    def delete(self, chunk_id: int) -> int:
        """Delete the chunk with the given id; return 1 if deleted, else 0."""
        row = self._conn.execute(
            """
            DELETE FROM
                chunk
            WHERE
                chunk_id = ?
            RETURNING
                chunk_id
            """,
            [chunk_id],
        ).fetchone()
        # BUG FIX: previously the relation object itself (always truthy) was
        # tested, so delete() reported 1 even when no row matched. fetchone()
        # yields None when nothing was deleted, matching
        # DatasourceRunRepository.delete.
        return 1 if row else 0

    def list(self) -> list[ChunkDTO]:
        """Return all chunks, newest (highest id) first."""
        rows = self._conn.execute(
            """
            SELECT
                *
            FROM
                chunk
            ORDER BY
                chunk_id DESC
            """
        ).fetchall()
        return [self._row_to_dto(r) for r in rows]

    @staticmethod
    def _row_to_dto(row: Tuple) -> ChunkDTO:
        # Column order must match the chunk table definition.
        chunk_id, datasource_run_id, embeddable_text, display_text, created_at, generated_description = row
        return ChunkDTO(
            chunk_id=int(chunk_id),
            datasource_run_id=int(datasource_run_id),
            embeddable_text=str(embeddable_text),
            display_text=display_text,
            # NOTE(review): str() on a NULL generated_description yields the
            # literal "None" — confirm the column is always populated.
            generated_description=str(generated_description),
            created_at=created_at,
        )
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from typing import Any, Optional, Tuple
|
|
2
|
+
|
|
3
|
+
import duckdb
|
|
4
|
+
from _duckdb import ConstraintException
|
|
5
|
+
|
|
6
|
+
from databao_context_engine.storage.exceptions.exceptions import IntegrityError
|
|
7
|
+
from databao_context_engine.storage.models import DatasourceRunDTO
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DatasourceRunRepository:
    """CRUD access to the datasource_run table over a DuckDB connection."""

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        self._conn = conn

    def create(
        self,
        *,
        run_id: int,
        plugin: str,
        full_type: str,
        source_id: str,
        storage_directory: str,
    ) -> DatasourceRunDTO:
        """Insert a datasource_run row and return it as a DTO.

        Raises IntegrityError on constraint violation and RuntimeError when
        the INSERT ... RETURNING produces no row.
        """
        try:
            created = self._conn.execute(
                """
                INSERT INTO
                    datasource_run(run_id, plugin, full_type, source_id, storage_directory)
                VALUES
                    (?, ?, ?, ?, ?)
                RETURNING
                    *
                """,
                [run_id, plugin, full_type, source_id, storage_directory],
            ).fetchone()
            if created is None:
                raise RuntimeError("datasource_run creation returned no object")
            return self._row_to_dto(created)
        except ConstraintException as e:
            raise IntegrityError from e

    def get(self, datasource_run_id: int) -> Optional[DatasourceRunDTO]:
        """Return the datasource_run with the given id, or None."""
        record = self._conn.execute(
            """
            SELECT
                *
            FROM
                datasource_run
            WHERE
                datasource_run_id = ?
            """,
            [datasource_run_id],
        ).fetchone()
        if not record:
            return None
        return self._row_to_dto(record)

    def update(
        self,
        datasource_run_id: int,
        *,
        plugin: Optional[str] = None,
        full_type: Optional[str] = None,
        source_id: Optional[str] = None,
        storage_directory: Optional[str] = None,
    ) -> Optional[DatasourceRunDTO]:
        """Update the supplied columns; None arguments are left unchanged.

        Returns the refreshed DTO, or None when the row does not exist.
        """
        candidates = [
            ("plugin = ?", plugin),
            ("full_type = ?", full_type),
            ("source_id = ?", source_id),
            ("storage_directory = ?", storage_directory),
        ]
        assignments = [clause for clause, value in candidates if value is not None]
        params: list[Any] = [value for _clause, value in candidates if value is not None]

        if not assignments:
            # No columns requested: behave as a plain read.
            return self.get(datasource_run_id)

        params.append(datasource_run_id)
        self._conn.execute(
            f"""
            UPDATE
                datasource_run
            SET
                {", ".join(assignments)}
            WHERE
                datasource_run_id = ?
            """,
            params,
        )

        return self.get(datasource_run_id)

    def delete(self, datasource_run_id: int) -> int:
        """Delete the datasource_run with the given id; 1 if deleted, else 0."""
        deleted = self._conn.execute(
            """
            DELETE FROM
                datasource_run
            WHERE
                datasource_run_id = ?
            RETURNING
                datasource_run_id
            """,
            [datasource_run_id],
        ).fetchone()
        return 0 if not deleted else 1

    def list(self) -> list[DatasourceRunDTO]:
        """Return all datasource_runs, newest (highest id) first."""
        fetched = self._conn.execute(
            """
            SELECT
                *
            FROM
                datasource_run
            ORDER BY
                datasource_run_id DESC
            """
        ).fetchall()
        return [self._row_to_dto(record) for record in fetched]

    @staticmethod
    def _row_to_dto(row: Tuple) -> DatasourceRunDTO:
        # Column order follows the datasource_run table definition.
        datasource_run_id, run_id, plugin, source_id, storage_directory, created_at, full_type = row
        return DatasourceRunDTO(
            datasource_run_id=int(datasource_run_id),
            run_id=int(run_id),
            plugin=str(plugin),
            full_type=str(full_type),
            source_id=str(source_id),
            storage_directory=str(storage_directory),
            created_at=created_at,
        )
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import duckdb
|
|
5
|
+
|
|
6
|
+
from databao_context_engine.services.table_name_policy import TableNamePolicy
|
|
7
|
+
from databao_context_engine.storage.models import EmbeddingModelRegistryDTO
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class EmbeddingModelRegistryRepository:
    """Access to embedding_model_registry: which table holds which model's vectors."""

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        self._conn = conn

    def create(
        self,
        *,
        embedder: str,
        model_id: str,
        dim: int,
        table_name: str,
    ) -> EmbeddingModelRegistryDTO:
        """Register an embedding table for (embedder, model_id) and return the row.

        Raises:
            ValueError: if table_name does not satisfy TableNamePolicy.
            RuntimeError: if INSERT ... RETURNING yields no row.
        """
        # Validate up front: table_name gets interpolated into SQL elsewhere.
        TableNamePolicy.validate_table_name(table_name=table_name)
        # NOTE(review): unlike the sibling repositories, ConstraintException is
        # not wrapped into IntegrityError here; kept as-is so the exception
        # type callers observe does not change.
        row = self._conn.execute(
            """
            INSERT INTO
                embedding_model_registry(embedder, model_id, dim, table_name)
            VALUES
                (?, ?, ?, ?)
            RETURNING
                *
            """,
            [embedder, model_id, dim, table_name],
        ).fetchone()
        if row is None:
            # BUG FIX: corrected garbled error message ("creatuib").
            raise RuntimeError("Embedding_model_registry creation returned no object")
        return self._row_to_dto(row)

    def get(
        self,
        *,
        embedder: str,
        model_id: str,
    ) -> Optional[EmbeddingModelRegistryDTO]:
        """Return the registry row for (embedder, model_id), or None."""
        row = self._conn.execute(
            """
            SELECT
                *
            FROM
                embedding_model_registry
            WHERE
                embedder = ?
                AND model_id = ?
            """,
            [embedder, model_id],
        ).fetchone()
        return self._row_to_dto(row) if row else None

    def delete(
        self,
        *,
        embedder: str,
        model_id: str,
    ) -> int:
        """Delete the registry row for (embedder, model_id); 1 if deleted, else 0."""
        row = self._conn.execute(
            """
            DELETE FROM
                embedding_model_registry
            WHERE
                embedder = ?
                AND model_id = ?
            RETURNING
                model_id
            """,
            [embedder, model_id],
        ).fetchone()
        return 1 if row else 0

    @staticmethod
    def _row_to_dto(row: tuple) -> EmbeddingModelRegistryDTO:
        # Column order follows the embedding_model_registry table definition.
        embedder, model_id, dim, table_name, created_at = row
        return EmbeddingModelRegistryDTO(
            embedder=str(embedder),
            model_id=str(model_id),
            dim=int(dim),
            table_name=str(table_name),
            # BUG FIX: removed a no-op ternary whose both branches returned
            # created_at unchanged.
            created_at=created_at,
        )
|