databao-context-engine 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. databao_context_engine/__init__.py +35 -0
  2. databao_context_engine/build_sources/__init__.py +0 -0
  3. databao_context_engine/build_sources/internal/__init__.py +0 -0
  4. databao_context_engine/build_sources/internal/build_runner.py +111 -0
  5. databao_context_engine/build_sources/internal/build_service.py +77 -0
  6. databao_context_engine/build_sources/internal/build_wiring.py +52 -0
  7. databao_context_engine/build_sources/internal/export_results.py +43 -0
  8. databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
  9. databao_context_engine/build_sources/public/__init__.py +0 -0
  10. databao_context_engine/build_sources/public/api.py +4 -0
  11. databao_context_engine/cli/__init__.py +0 -0
  12. databao_context_engine/cli/add_datasource_config.py +130 -0
  13. databao_context_engine/cli/commands.py +256 -0
  14. databao_context_engine/cli/datasources.py +64 -0
  15. databao_context_engine/cli/info.py +32 -0
  16. databao_context_engine/config/__init__.py +0 -0
  17. databao_context_engine/config/log_config.yaml +16 -0
  18. databao_context_engine/config/logging.py +43 -0
  19. databao_context_engine/databao_context_project_manager.py +92 -0
  20. databao_context_engine/databao_engine.py +85 -0
  21. databao_context_engine/datasource_config/__init__.py +0 -0
  22. databao_context_engine/datasource_config/add_config.py +50 -0
  23. databao_context_engine/datasource_config/check_config.py +131 -0
  24. databao_context_engine/datasource_config/datasource_context.py +60 -0
  25. databao_context_engine/event_journal/__init__.py +0 -0
  26. databao_context_engine/event_journal/writer.py +29 -0
  27. databao_context_engine/generate_configs_schemas.py +92 -0
  28. databao_context_engine/init_project.py +18 -0
  29. databao_context_engine/introspection/__init__.py +0 -0
  30. databao_context_engine/introspection/property_extract.py +202 -0
  31. databao_context_engine/llm/__init__.py +0 -0
  32. databao_context_engine/llm/config.py +20 -0
  33. databao_context_engine/llm/descriptions/__init__.py +0 -0
  34. databao_context_engine/llm/descriptions/ollama.py +21 -0
  35. databao_context_engine/llm/descriptions/provider.py +10 -0
  36. databao_context_engine/llm/embeddings/__init__.py +0 -0
  37. databao_context_engine/llm/embeddings/ollama.py +37 -0
  38. databao_context_engine/llm/embeddings/provider.py +13 -0
  39. databao_context_engine/llm/errors.py +16 -0
  40. databao_context_engine/llm/factory.py +61 -0
  41. databao_context_engine/llm/install.py +227 -0
  42. databao_context_engine/llm/runtime.py +73 -0
  43. databao_context_engine/llm/service.py +159 -0
  44. databao_context_engine/main.py +19 -0
  45. databao_context_engine/mcp/__init__.py +0 -0
  46. databao_context_engine/mcp/all_results_tool.py +5 -0
  47. databao_context_engine/mcp/mcp_runner.py +16 -0
  48. databao_context_engine/mcp/mcp_server.py +63 -0
  49. databao_context_engine/mcp/retrieve_tool.py +22 -0
  50. databao_context_engine/pluginlib/__init__.py +0 -0
  51. databao_context_engine/pluginlib/build_plugin.py +107 -0
  52. databao_context_engine/pluginlib/config.py +37 -0
  53. databao_context_engine/pluginlib/plugin_utils.py +68 -0
  54. databao_context_engine/plugins/__init__.py +0 -0
  55. databao_context_engine/plugins/athena_db_plugin.py +12 -0
  56. databao_context_engine/plugins/base_db_plugin.py +45 -0
  57. databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
  58. databao_context_engine/plugins/databases/__init__.py +0 -0
  59. databao_context_engine/plugins/databases/athena_introspector.py +101 -0
  60. databao_context_engine/plugins/databases/base_introspector.py +144 -0
  61. databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
  62. databao_context_engine/plugins/databases/database_chunker.py +69 -0
  63. databao_context_engine/plugins/databases/databases_types.py +114 -0
  64. databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
  65. databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
  66. databao_context_engine/plugins/databases/introspection_scope.py +74 -0
  67. databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
  68. databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
  69. databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
  70. databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
  71. databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
  72. databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
  73. databao_context_engine/plugins/mssql_db_plugin.py +12 -0
  74. databao_context_engine/plugins/mysql_db_plugin.py +12 -0
  75. databao_context_engine/plugins/parquet_plugin.py +32 -0
  76. databao_context_engine/plugins/plugin_loader.py +110 -0
  77. databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
  78. databao_context_engine/plugins/resources/__init__.py +0 -0
  79. databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
  80. databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
  81. databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
  82. databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
  83. databao_context_engine/project/__init__.py +0 -0
  84. databao_context_engine/project/datasource_discovery.py +141 -0
  85. databao_context_engine/project/info.py +44 -0
  86. databao_context_engine/project/init_project.py +102 -0
  87. databao_context_engine/project/layout.py +127 -0
  88. databao_context_engine/project/project_config.py +32 -0
  89. databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
  90. databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
  91. databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
  92. databao_context_engine/project/runs.py +39 -0
  93. databao_context_engine/project/types.py +134 -0
  94. databao_context_engine/retrieve_embeddings/__init__.py +0 -0
  95. databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
  96. databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
  97. databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
  98. databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
  99. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
  100. databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
  101. databao_context_engine/retrieve_embeddings/public/api.py +3 -0
  102. databao_context_engine/serialisation/__init__.py +0 -0
  103. databao_context_engine/serialisation/yaml.py +35 -0
  104. databao_context_engine/services/__init__.py +0 -0
  105. databao_context_engine/services/chunk_embedding_service.py +104 -0
  106. databao_context_engine/services/embedding_shard_resolver.py +64 -0
  107. databao_context_engine/services/factories.py +88 -0
  108. databao_context_engine/services/models.py +12 -0
  109. databao_context_engine/services/persistence_service.py +61 -0
  110. databao_context_engine/services/run_name_policy.py +8 -0
  111. databao_context_engine/services/table_name_policy.py +15 -0
  112. databao_context_engine/storage/__init__.py +0 -0
  113. databao_context_engine/storage/connection.py +32 -0
  114. databao_context_engine/storage/exceptions/__init__.py +0 -0
  115. databao_context_engine/storage/exceptions/exceptions.py +6 -0
  116. databao_context_engine/storage/migrate.py +127 -0
  117. databao_context_engine/storage/migrations/V01__init.sql +63 -0
  118. databao_context_engine/storage/models.py +51 -0
  119. databao_context_engine/storage/repositories/__init__.py +0 -0
  120. databao_context_engine/storage/repositories/chunk_repository.py +130 -0
  121. databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
  122. databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
  123. databao_context_engine/storage/repositories/embedding_repository.py +113 -0
  124. databao_context_engine/storage/repositories/factories.py +35 -0
  125. databao_context_engine/storage/repositories/run_repository.py +157 -0
  126. databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
  127. databao_context_engine/storage/transaction.py +14 -0
  128. databao_context_engine/system/__init__.py +0 -0
  129. databao_context_engine/system/properties.py +13 -0
  130. databao_context_engine/templating/__init__.py +0 -0
  131. databao_context_engine/templating/renderer.py +29 -0
  132. databao_context_engine-0.1.1.dist-info/METADATA +186 -0
  133. databao_context_engine-0.1.1.dist-info/RECORD +135 -0
  134. databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
  135. databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,15 @@
1
+ import re
2
+
3
# Allowed shape for embedding table names: "embedding_" followed by
# lowercase alphanumerics/underscores only.
_TABLE_NAME_RE = re.compile(r"^embedding_[a-z0-9_]+$")


class TableNamePolicy:
    """Builds and validates the per-model embedding table names.

    NOTE(review): build() only sanitises ':', '-', ' ' and '.' in model_id;
    an embedder or model id containing other characters (e.g. uppercase or
    '/') yields a name that validate_table_name() rejects — confirm inputs
    are pre-normalised by callers.
    """

    @staticmethod
    def build(*, embedder: str, model_id: str, dim: int) -> str:
        """Return the canonical table name for an (embedder, model_id, dim) triple."""
        sanitized = model_id.translate(str.maketrans(":- .", "____"))
        return f"embedding_{embedder}__{sanitized}__{dim}"

    @staticmethod
    def validate_table_name(*, table_name: str):
        """Raise ValueError unless table_name matches the allowed pattern."""
        if _TABLE_NAME_RE.fullmatch(table_name):
            return
        raise ValueError(f"invalid table_name {table_name!r}; expected pattern {_TABLE_NAME_RE.pattern}")
File without changes
@@ -0,0 +1,32 @@
1
+ import logging
2
+ from contextlib import contextmanager
3
+ from pathlib import Path
4
+
5
+ import duckdb
6
+
7
+ from databao_context_engine.system.properties import get_db_path
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
@contextmanager
def open_duckdb_connection(db_path: str | Path | None = None):
    """
    Open a DuckDB connection with vector search enabled and close it on exit.

    Loads the vss extension and enables HNSW experimental persistence.
    Falls back to get_db_path() when db_path is not given.

    Yields:
        An open duckdb connection; it is always closed when the context
        exits, even if loading the vss extension fails.

    Usage:
        with open_duckdb_connection() as conn:
    """
    path = str(db_path or get_db_path())
    conn = duckdb.connect(path)
    logger.debug(f"Connected to DuckDB database at {path}")

    try:
        # NOTE(review): assumes the vss extension is already installed
        # (V01__init.sql runs INSTALL vss) — confirm for fresh databases.
        conn.execute("LOAD vss;")
        conn.execute("SET hnsw_enable_experimental_persistence = true;")

        logger.debug("Loaded Vector Similarity Search extension")
        yield conn
    finally:
        conn.close()
File without changes
@@ -0,0 +1,6 @@
1
class RepositoryError(Exception):
    """Base exception for repository errors."""


class IntegrityError(RepositoryError):
    """Raised when a DB constraint is violated (unique, foreign key, ...)."""
@@ -0,0 +1,127 @@
1
+ import hashlib
2
+ import logging
3
+ import re
4
+ from dataclasses import dataclass
5
+ from importlib.resources import files
6
+ from pathlib import Path
7
+ from typing import LiteralString
8
+
9
+ import duckdb
10
+
11
+ from databao_context_engine.system.properties import get_db_path
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def migrate(db_path: str | Path | None = None, migration_files: list[Path] | None = None) -> None:
    """Apply pending SQL migrations to the DuckDB database.

    When migration_files is omitted, every *.sql file packaged under
    databao_context_engine.storage.migrations is used.
    NOTE(review): the isinstance(entry, Path) filter silently drops entries
    when the package is loaded from a zip (Traversable, not Path) — confirm
    the distribution is always installed unpacked.
    """
    if migration_files is None:
        packaged = files("databao_context_engine.storage.migrations")
        migration_files = [entry for entry in packaged.iterdir() if isinstance(entry, Path) and entry.suffix == ".sql"]

    database = Path(db_path or get_db_path()).expanduser().resolve()
    database.parent.mkdir(parents=True, exist_ok=True)
    logger.debug("Running migrations on database: %s", database)

    _MigrationManager(database, migration_files).migrate()
    logger.debug("Migration complete")
31
+
32
+
33
@dataclass(frozen=True)
class MigrationDTO:
    """One row of migration_history: an already-applied migration."""

    name: str
    version: int
    checksum: str


class MigrationError(Exception):
    """Base class for migration errors."""


def load_migrations(conn) -> list[MigrationDTO]:
    """Read every applied migration from migration_history.

    The applied_at column is fetched but intentionally dropped: the DTO
    only carries what the migration planner compares against.
    """
    with conn.cursor() as cur:
        cur.execute(
            "SELECT name, version, checksum, applied_at FROM migration_history",
        )
        rows = cur.fetchall()
    applied: list[MigrationDTO] = []
    for name, version, checksum, _applied_at in rows:
        applied.append(MigrationDTO(name=name, version=version, checksum=checksum))
    return applied
54
+
55
+
56
+ def _extract_version_from_name(name: str) -> int:
57
+ version_groups = re.findall(r"(\d+)__", name)
58
+ if not version_groups:
59
+ raise ValueError(f"Invalid migration name: {name}")
60
+ return int(version_groups[0])
61
+
62
+
63
+ @dataclass(frozen=True)
64
+ class _Migration:
65
+ name: str
66
+ version: int
67
+ checksum: str
68
+ query: str
69
+
70
+
71
+ def _create_migration(file: Path) -> _Migration:
72
+ query_bytes = file.read_bytes()
73
+ query = query_bytes.decode("utf-8")
74
+ checksum = hashlib.md5(query_bytes).hexdigest()
75
+ version = _extract_version_from_name(file.name)
76
+ return _Migration(name=file.name, version=version, checksum=checksum, query=query)
77
+
78
+
79
class _MigrationManager:
    """Applies SQL migration files to a DuckDB database.

    Applied migrations are tracked in migration_history; a migration is
    identified by its content checksum, and versions must be unique.
    """

    _init_migration_table_sql: LiteralString = """
    CREATE SEQUENCE IF NOT EXISTS migration_history_id_seq START 1;

    CREATE TABLE IF NOT EXISTS migration_history (
        id BIGINT PRIMARY KEY DEFAULT nextval('migration_history_id_seq'),
        name TEXT NOT NULL,
        version INTEGER NOT NULL,
        checksum TEXT NOT NULL,
        applied_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
        UNIQUE (version)
    );
    """

    _insert_migration_sql: LiteralString = "INSERT INTO migration_history (name, version, checksum) VALUES (?, ?, ?)"

    def __init__(self, db_path: Path, migration_files: list[Path]):
        self._migration_files = migration_files
        self._db_path = db_path
        self._requested_migrations = [_create_migration(file) for file in migration_files]

    def migrate(self) -> None:
        """Apply every requested migration whose checksum is not yet recorded.

        Raises:
            MigrationError: if a pending migration reuses an already-applied
                version number, or if applying any migration fails (each
                migration runs in its own transaction and is rolled back).
        """
        applied_migrations: list[MigrationDTO] = self.init_db_and_load_applied_migrations()
        # Sets instead of lists: membership tests below are O(1).
        applied_checksums = {m.checksum for m in applied_migrations}
        applied_versions = {m.version for m in applied_migrations}
        migrations_to_apply = [m for m in self._requested_migrations if m.checksum not in applied_checksums]
        duplicated_versions = [
            migration.version for migration in migrations_to_apply if migration.version in applied_versions
        ]
        # BUG FIX: was `if any(duplicated_versions)`, which evaluates the
        # version *values* for truthiness and so misses a duplicate version 0.
        if duplicated_versions:
            raise MigrationError(f"Migrations with versions {duplicated_versions} already exist")
        # str() for consistency with init_db_and_load_applied_migrations.
        with duckdb.connect(str(self._db_path)) as conn:
            for migration in migrations_to_apply:
                logger.debug("Applying migration %s", migration.name)
                with conn.cursor() as cur:
                    cur.execute("START TRANSACTION;")
                    try:
                        cur.execute(migration.query)
                        cur.execute(self._insert_migration_sql, [migration.name, migration.version, migration.checksum])
                        cur.commit()
                    except Exception as exc:
                        cur.rollback()
                        # Chain the cause so the underlying SQL error is kept.
                        raise MigrationError(
                            f"Failed to apply migration {migration.name}. Aborting migration process."
                        ) from exc

    def init_db_and_load_applied_migrations(self) -> list[MigrationDTO]:
        """Ensure migration_history exists, then return the applied migrations."""
        with duckdb.connect(str(self._db_path)) as conn:
            conn.execute(self._init_migration_table_sql)
            conn.commit()
            return load_migrations(conn)
@@ -0,0 +1,63 @@
1
-- Initial schema for the context-engine store (DuckDB).
-- The vss extension provides the HNSW vector index used below.
INSTALL vss;
LOAD vss;
-- HNSW indexes on persistent databases are experimental in DuckDB.
SET hnsw_enable_experimental_persistence = true;

-- Explicit sequences back the BIGINT surrogate keys.
CREATE SEQUENCE IF NOT EXISTS run_id_seq START 1;
CREATE SEQUENCE IF NOT EXISTS datasource_run_id_seq START 1;
CREATE SEQUENCE IF NOT EXISTS chunk_id_seq START 1;

-- One build/ingestion run.
-- NOTE(review): trailing commas before ')' rely on DuckDB's lenient parser —
-- confirm they parse on the minimum supported DuckDB version.
CREATE TABLE IF NOT EXISTS run (
    run_id BIGINT PRIMARY KEY DEFAULT nextval('run_id_seq'),
    project_id TEXT NOT NULL,
    started_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    ended_at TIMESTAMP,
    -- NOTE(review): spelled "nemory_version" here and in RunDTO — confirm the
    -- name is intentional and not a typo for "memory_version".
    nemory_version TEXT,
    run_name TEXT NOT NULL,
);

-- One datasource processed within a run.
CREATE TABLE IF NOT EXISTS datasource_run (
    datasource_run_id BIGINT PRIMARY KEY DEFAULT nextval('datasource_run_id_seq'),
    run_id BIGINT NOT NULL REFERENCES run(run_id),
    plugin TEXT NOT NULL,
    source_id TEXT NOT NULL,
    storage_directory TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    full_type TEXT NOT NULL,
);

-- A chunk of embeddable text produced from a datasource run.
CREATE TABLE IF NOT EXISTS chunk (
    chunk_id BIGINT PRIMARY KEY DEFAULT nextval('chunk_id_seq'),
    datasource_run_id BIGINT NOT NULL REFERENCES datasource_run(datasource_run_id),
    embeddable_text TEXT NOT NULL,
    display_text TEXT,
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    generated_description TEXT,
);

-- Registry mapping an (embedder, model) pair to its per-model vector table.
CREATE TABLE IF NOT EXISTS embedding_model_registry (
    embedder TEXT NOT NULL,
    model_id TEXT NOT NULL,
    dim INTEGER NOT NULL,
    table_name TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (embedder, model_id),
    UNIQUE (table_name)
);

-- Default embedding table for ollama / nomic-embed-text:v1.5 (768 dims),
-- named per TableNamePolicy.build().
CREATE TABLE IF NOT EXISTS embedding_ollama__nomic_embed_text_v1_5__768 (
    chunk_id BIGINT NOT NULL REFERENCES chunk(chunk_id),
    vec FLOAT[768] NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (chunk_id)
);
-- Cosine-metric HNSW index for vector similarity search.
CREATE INDEX IF NOT EXISTS emb_hnsw_embedding_ollama__nomic_embed_text_v1_5__768 ON embedding_ollama__nomic_embed_text_v1_5__768 USING HNSW (vec) WITH (metric = 'cosine');

-- Register the default model; OR IGNORE keeps re-runs idempotent.
INSERT
OR IGNORE INTO
	embedding_model_registry(embedder, model_id, dim, table_name)
VALUES
	('ollama', 'nomic-embed-text:v1.5', 768, 'embedding_ollama__nomic_embed_text_v1_5__768');

-- Lookup indexes for the common join paths.
CREATE INDEX IF NOT EXISTS idx_datasource_run_run ON datasource_run(run_id);
CREATE INDEX IF NOT EXISTS idx_datasource_run_plugin_run ON datasource_run(plugin, run_id);
CREATE INDEX IF NOT EXISTS idx_chunk_datasource_run ON chunk(datasource_run_id);
@@ -0,0 +1,51 @@
1
+ from collections.abc import Sequence
2
+ from dataclasses import dataclass
3
+ from datetime import datetime
4
+ from typing import Optional
5
+
6
+
7
@dataclass(frozen=True)
class RunDTO:
    """A row of the ``run`` table."""

    run_id: int
    run_name: str
    project_id: str
    started_at: datetime
    ended_at: Optional[datetime]  # NULL until the run finishes
    # NOTE(review): column is nullable in V01__init.sql but typed str here —
    # confirm rows always carry a value before relying on it.
    nemory_version: str


@dataclass(frozen=True)
class DatasourceRunDTO:
    """A row of the ``datasource_run`` table (one datasource within a run)."""

    datasource_run_id: int
    run_id: int  # FK to run.run_id
    plugin: str
    full_type: str
    source_id: str
    storage_directory: str
    created_at: datetime


@dataclass(frozen=True)
class ChunkDTO:
    """A row of the ``chunk`` table."""

    chunk_id: int
    datasource_run_id: int  # FK to datasource_run.datasource_run_id
    embeddable_text: str
    display_text: Optional[str]
    # NOTE(review): column is nullable in V01__init.sql but typed str here —
    # confirm a description is always written.
    generated_description: str
    created_at: datetime


@dataclass(frozen=True)
class EmbeddingModelRegistryDTO:
    """A row of ``embedding_model_registry`` mapping a model to its vector table."""

    embedder: str
    model_id: str
    dim: int
    table_name: str
    created_at: datetime


@dataclass(frozen=True)
class EmbeddingDTO:
    """One embedding vector for a chunk (row of a per-model embedding table)."""

    chunk_id: int  # FK to chunk.chunk_id
    vec: Sequence[float]
    created_at: datetime
@@ -0,0 +1,130 @@
1
+ from typing import Any, Optional, Tuple
2
+
3
+ import duckdb
4
+ from _duckdb import ConstraintException
5
+
6
+ from databao_context_engine.storage.exceptions.exceptions import IntegrityError
7
+ from databao_context_engine.storage.models import ChunkDTO
8
+
9
+
10
class ChunkRepository:
    """CRUD access to the ``chunk`` table over a caller-owned DuckDB connection.

    The caller owns the connection's lifecycle and transaction scope; this
    repository only issues statements on it.
    """

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        self._conn = conn

    def create(
        self,
        *,
        datasource_run_id: int,
        embeddable_text: str,
        display_text: Optional[str],
        generated_description: str,
    ) -> ChunkDTO:
        """Insert a chunk and return the created row as a DTO.

        Raises:
            IntegrityError: if a DB constraint is violated (e.g. unknown
                datasource_run_id).
            RuntimeError: if the INSERT ... RETURNING yields no row.
        """
        try:
            row = self._conn.execute(
                """
                INSERT INTO
                    chunk(datasource_run_id, embeddable_text, display_text, generated_description)
                VALUES
                    (?, ?, ?, ?)
                RETURNING
                    *
                """,
                [datasource_run_id, embeddable_text, display_text, generated_description],
            ).fetchone()
            if row is None:
                raise RuntimeError("chunk creation returned no object")
            return self._row_to_dto(row)
        except ConstraintException as e:
            raise IntegrityError from e

    def get(self, chunk_id: int) -> Optional[ChunkDTO]:
        """Return the chunk with the given id, or None if it does not exist."""
        row = self._conn.execute(
            """
            SELECT
                *
            FROM
                chunk
            WHERE
                chunk_id = ?
            """,
            [chunk_id],
        ).fetchone()
        return self._row_to_dto(row) if row else None

    def update(
        self,
        chunk_id: int,
        *,
        embeddable_text: Optional[str] = None,
        display_text: Optional[str] = None,
        generated_description: Optional[str] = None,
    ) -> Optional[ChunkDTO]:
        """Update the given fields of a chunk; None means "leave unchanged".

        Returns the (possibly unchanged) row, or None if it does not exist.
        """
        sets: list[Any] = []
        params: list[Any] = []

        if embeddable_text is not None:
            sets.append("embeddable_text = ?")
            params.append(embeddable_text)
        if display_text is not None:
            sets.append("display_text = ?")
            params.append(display_text)
        if generated_description is not None:
            sets.append("generated_description = ?")
            params.append(generated_description)

        if not sets:
            # Nothing to change; behave as a read.
            return self.get(chunk_id)

        params.append(chunk_id)
        self._conn.execute(
            f"""
            UPDATE
                chunk
            SET
                {", ".join(sets)}
            WHERE
                chunk_id = ?
            """,
            params,
        )

        return self.get(chunk_id)

    def delete(self, chunk_id: int) -> int:
        """Delete a chunk; return 1 if a row was deleted, else 0."""
        # BUG FIX: fetchone() was missing, so `row` was the (always truthy)
        # statement result and delete() reported 1 even for missing ids.
        # Sibling repositories (e.g. DatasourceRunRepository) already fetch.
        row = self._conn.execute(
            """
            DELETE FROM
                chunk
            WHERE
                chunk_id = ?
            RETURNING
                chunk_id
            """,
            [chunk_id],
        ).fetchone()
        return 1 if row else 0

    def list(self) -> list[ChunkDTO]:
        """Return all chunks, newest (highest id) first."""
        rows = self._conn.execute(
            """
            SELECT
                *
            FROM
                chunk
            ORDER BY
                chunk_id DESC
            """
        ).fetchall()
        return [self._row_to_dto(r) for r in rows]

    @staticmethod
    def _row_to_dto(row: Tuple) -> ChunkDTO:
        """Map a SELECT * row (physical column order of ``chunk``) onto ChunkDTO."""
        chunk_id, datasource_run_id, embeddable_text, display_text, created_at, generated_description = row
        return ChunkDTO(
            chunk_id=int(chunk_id),
            datasource_run_id=int(datasource_run_id),
            embeddable_text=str(embeddable_text),
            display_text=display_text,
            generated_description=str(generated_description),
            created_at=created_at,
        )
@@ -0,0 +1,136 @@
1
+ from typing import Any, Optional, Tuple
2
+
3
+ import duckdb
4
+ from _duckdb import ConstraintException
5
+
6
+ from databao_context_engine.storage.exceptions.exceptions import IntegrityError
7
+ from databao_context_engine.storage.models import DatasourceRunDTO
8
+
9
+
10
class DatasourceRunRepository:
    """CRUD access to the ``datasource_run`` table over a caller-owned DuckDB connection."""

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        self._conn = conn

    def create(
        self,
        *,
        run_id: int,
        plugin: str,
        full_type: str,
        source_id: str,
        storage_directory: str,
    ) -> DatasourceRunDTO:
        """Insert a datasource_run and return the created row as a DTO.

        Raises:
            IntegrityError: if a DB constraint is violated (e.g. unknown run_id).
            RuntimeError: if the INSERT ... RETURNING yields no row.
        """
        try:
            inserted = self._conn.execute(
                """
                INSERT INTO
                    datasource_run(run_id, plugin, full_type, source_id, storage_directory)
                VALUES
                    (?, ?, ?, ?, ?)
                RETURNING
                    *
                """,
                [run_id, plugin, full_type, source_id, storage_directory],
            ).fetchone()
        except ConstraintException as e:
            raise IntegrityError from e
        if inserted is None:
            raise RuntimeError("datasource_run creation returned no object")
        return self._row_to_dto(inserted)

    def get(self, datasource_run_id: int) -> Optional[DatasourceRunDTO]:
        """Return the datasource_run with the given id, or None if absent."""
        record = self._conn.execute(
            """
            SELECT
                *
            FROM
                datasource_run
            WHERE
                datasource_run_id = ?
            """,
            [datasource_run_id],
        ).fetchone()
        if not record:
            return None
        return self._row_to_dto(record)

    def update(
        self,
        datasource_run_id: int,
        *,
        plugin: Optional[str] = None,
        full_type: Optional[str] = None,
        source_id: Optional[str] = None,
        storage_directory: Optional[str] = None,
    ) -> Optional[DatasourceRunDTO]:
        """Update the given fields; None means "leave unchanged".

        Returns the (possibly unchanged) row, or None if it does not exist.
        """
        candidates: list[tuple[str, Any]] = [
            ("plugin = ?", plugin),
            ("full_type = ?", full_type),
            ("source_id = ?", source_id),
            ("storage_directory = ?", storage_directory),
        ]
        sets = [clause for clause, value in candidates if value is not None]
        params: list[Any] = [value for _, value in candidates if value is not None]

        if not sets:
            # Nothing to change; behave as a read.
            return self.get(datasource_run_id)

        params.append(datasource_run_id)
        self._conn.execute(
            f"""
            UPDATE
                datasource_run
            SET
                {", ".join(sets)}
            WHERE
                datasource_run_id = ?
            """,
            params,
        )

        return self.get(datasource_run_id)

    def delete(self, datasource_run_id: int) -> int:
        """Delete a datasource_run; return 1 if a row was deleted, else 0."""
        deleted = self._conn.execute(
            """
            DELETE FROM
                datasource_run
            WHERE
                datasource_run_id = ?
            RETURNING
                datasource_run_id
            """,
            [datasource_run_id],
        ).fetchone()
        return 1 if deleted else 0

    def list(self) -> list[DatasourceRunDTO]:
        """Return all datasource_runs, newest (highest id) first."""
        records = self._conn.execute(
            """
            SELECT
                *
            FROM
                datasource_run
            ORDER BY
                datasource_run_id DESC
            """
        ).fetchall()
        return [self._row_to_dto(record) for record in records]

    @staticmethod
    def _row_to_dto(row: Tuple) -> DatasourceRunDTO:
        """Map a SELECT * row (physical column order of ``datasource_run``) onto the DTO."""
        ds_run_id, run_id, plugin, source_id, storage_dir, created, full_type = row
        return DatasourceRunDTO(
            datasource_run_id=int(ds_run_id),
            run_id=int(run_id),
            plugin=str(plugin),
            full_type=str(full_type),
            source_id=str(source_id),
            storage_directory=str(storage_dir),
            created_at=created,
        )
@@ -0,0 +1,87 @@
1
+ from datetime import datetime
2
+ from typing import Optional
3
+
4
+ import duckdb
5
+
6
+ from databao_context_engine.services.table_name_policy import TableNamePolicy
7
+ from databao_context_engine.storage.models import EmbeddingModelRegistryDTO
8
+
9
+
10
class EmbeddingModelRegistryRepository:
    """CRUD access to ``embedding_model_registry`` over a caller-owned DuckDB connection."""

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        self._conn = conn

    def create(
        self,
        *,
        embedder: str,
        model_id: str,
        dim: int,
        table_name: str,
    ) -> EmbeddingModelRegistryDTO:
        """Register an embedding model and return the created row as a DTO.

        The table name is validated against TableNamePolicy before insert.
        NOTE(review): unlike the sibling repositories this does not translate
        ConstraintException into IntegrityError — confirm callers rely on that.

        Raises:
            ValueError: if table_name fails TableNamePolicy validation.
            RuntimeError: if the INSERT ... RETURNING yields no row.
        """
        TableNamePolicy.validate_table_name(table_name=table_name)
        row = self._conn.execute(
            """
            INSERT INTO
                embedding_model_registry(embedder, model_id, dim, table_name)
            VALUES
                (?, ?, ?, ?)
            RETURNING
                *
            """,
            [embedder, model_id, dim, table_name],
        ).fetchone()
        if row is None:
            # BUG FIX: message previously read "Embedding_model_registry creatuib".
            raise RuntimeError("embedding_model_registry creation returned no object")
        return self._row_to_dto(row)

    def get(
        self,
        *,
        embedder: str,
        model_id: str,
    ) -> Optional[EmbeddingModelRegistryDTO]:
        """Return the registry row for (embedder, model_id), or None if absent."""
        row = self._conn.execute(
            """
            SELECT
                *
            FROM
                embedding_model_registry
            WHERE
                embedder = ?
                AND model_id = ?
            """,
            [embedder, model_id],
        ).fetchone()
        return self._row_to_dto(row) if row else None

    def delete(
        self,
        *,
        embedder: str,
        model_id: str,
    ) -> int:
        """Delete the registry row for (embedder, model_id); return 1 if deleted, else 0."""
        row = self._conn.execute(
            """
            DELETE FROM
                embedding_model_registry
            WHERE
                embedder = ?
                AND model_id = ?
            RETURNING
                model_id
            """,
            [embedder, model_id],
        ).fetchone()
        return 1 if row else 0

    @staticmethod
    def _row_to_dto(row: tuple) -> EmbeddingModelRegistryDTO:
        """Map a SELECT * row (physical column order of the table) onto the DTO."""
        embedder, model_id, dim, table_name, created_at = row
        return EmbeddingModelRegistryDTO(
            embedder=str(embedder),
            model_id=str(model_id),
            dim=int(dim),
            table_name=str(table_name),
            # BUG FIX: was `created_at if isinstance(created_at, datetime) else
            # created_at` — both branches were identical, so the check was a no-op.
            created_at=created_at,
        )
+ )