sirchmunk 0.0.0-py3-none-any.whl → 0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sirchmunk/__init__.py +8 -0
- sirchmunk/base.py +17 -0
- sirchmunk/insight/__init__.py +4 -0
- sirchmunk/insight/text_insights.py +292 -0
- sirchmunk/learnings/__init__.py +1 -0
- sirchmunk/learnings/evidence_processor.py +525 -0
- sirchmunk/learnings/knowledge_base.py +232 -0
- sirchmunk/llm/__init__.py +2 -0
- sirchmunk/llm/openai_chat.py +247 -0
- sirchmunk/llm/prompts.py +216 -0
- sirchmunk/retrieve/__init__.py +1 -0
- sirchmunk/retrieve/base.py +25 -0
- sirchmunk/retrieve/text_retriever.py +1026 -0
- sirchmunk/scan/__init__.py +1 -0
- sirchmunk/scan/base.py +18 -0
- sirchmunk/scan/file_scanner.py +373 -0
- sirchmunk/scan/web_scanner.py +18 -0
- sirchmunk/scheduler/__init__.py +0 -0
- sirchmunk/schema/__init__.py +2 -0
- sirchmunk/schema/cognition.py +106 -0
- sirchmunk/schema/context.py +25 -0
- sirchmunk/schema/knowledge.py +318 -0
- sirchmunk/schema/metadata.py +658 -0
- sirchmunk/schema/request.py +221 -0
- sirchmunk/schema/response.py +20 -0
- sirchmunk/schema/snapshot.py +346 -0
- sirchmunk/search.py +475 -0
- sirchmunk/storage/__init__.py +7 -0
- sirchmunk/storage/duckdb.py +676 -0
- sirchmunk/storage/knowledge_manager.py +720 -0
- sirchmunk/utils/__init__.py +15 -0
- sirchmunk/utils/constants.py +15 -0
- sirchmunk/utils/deps.py +23 -0
- sirchmunk/utils/file_utils.py +70 -0
- sirchmunk/utils/install_rga.py +124 -0
- sirchmunk/utils/log_utils.py +360 -0
- sirchmunk/utils/tokenizer_util.py +55 -0
- sirchmunk/utils/utils.py +108 -0
- sirchmunk/version.py +1 -1
- sirchmunk-0.0.1.dist-info/METADATA +416 -0
- sirchmunk-0.0.1.dist-info/RECORD +45 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
- sirchmunk-0.0.0.dist-info/METADATA +0 -26
- sirchmunk-0.0.0.dist-info/RECORD +0 -8
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
sirchmunk/storage/duckdb.py (new file)
@@ -0,0 +1,676 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
"""
DuckDB database manager for Sirchmunk
Provides a comprehensive interface for DuckDB operations including
connection management, table operations, data manipulation, and analytics
"""

import duckdb
import pandas as pd
from typing import Any, Dict, List, Optional, Union, Tuple
from pathlib import Path
import logging
from contextlib import contextmanager
from datetime import datetime

logger = logging.getLogger(__name__)


class DuckDBManager:
    """
    A comprehensive DuckDB database manager providing common operations
    for data storage, retrieval, and analytics in the Sirchmunk system.
    """

    def __init__(self, db_path: Optional[str] = None, read_only: bool = False):
        """
        Initialize DuckDB connection

        Args:
            db_path: Path to database file. If None, creates in-memory database
            read_only: Whether to open database in read-only mode
        """
        self.db_path = db_path
        self.read_only = read_only
        self.connection = None
        self._connect()

    def _connect(self):
        """Establish database connection"""
        try:
            if self.db_path:
                self.connection = duckdb.connect(self.db_path, read_only=self.read_only)
                logger.info(f"Connected to DuckDB at {self.db_path}")
            else:
                self.connection = duckdb.connect(":memory:")
                logger.info("Connected to in-memory DuckDB")
        except Exception as e:
            logger.error(f"Failed to connect to DuckDB: {e}")
            raise

    def close(self):
        """Close database connection"""
        if self.connection:
            self.connection.close()
            self.connection = None
            logger.info("DuckDB connection closed")

    @contextmanager
    def transaction(self):
        """Context manager for database transactions"""
        try:
            self.connection.begin()
            yield self.connection
            self.connection.commit()
        except Exception as e:
            self.connection.rollback()
            logger.error(f"Transaction rolled back: {e}")
            raise

    def execute(self, query: str, parameters: Optional[List] = None):
        """
        Execute SQL query

        Args:
            query: SQL query string
            parameters: Optional query parameters

        Returns:
            Query result
        """
        try:
            if parameters:
                return self.connection.execute(query, parameters)
            return self.connection.execute(query)
        except Exception as e:
            logger.error(f"Query execution failed: {e}")
            logger.error(f"Query: {query}")
            raise

    def fetch_all(self, query: str, parameters: Optional[List] = None) -> List[Tuple]:
        """
        Execute query and fetch all results

        Args:
            query: SQL query string
            parameters: Optional query parameters

        Returns:
            List of result tuples
        """
        result = self.execute(query, parameters)
        return result.fetchall()

    def fetch_one(self, query: str, parameters: Optional[List] = None) -> Optional[Tuple]:
        """
        Execute query and fetch one result

        Args:
            query: SQL query string
            parameters: Optional query parameters

        Returns:
            Single result tuple or None
        """
        result = self.execute(query, parameters)
        return result.fetchone()

    def fetch_df(self, query: str, parameters: Optional[List] = None) -> pd.DataFrame:
        """
        Execute query and return results as pandas DataFrame

        Args:
            query: SQL query string
            parameters: Optional query parameters

        Returns:
            Results as DataFrame
        """
        result = self.execute(query, parameters)
        return result.df()

    def create_table(self, table_name: str, schema: Dict[str, str], if_not_exists: bool = True):
        """
        Create table with specified schema

        Args:
            table_name: Name of the table
            schema: Dictionary mapping column names to types
            if_not_exists: Whether to use IF NOT EXISTS clause
        """
        columns = ", ".join([f"{col} {dtype}" for col, dtype in schema.items()])
        if_not_exists_clause = "IF NOT EXISTS" if if_not_exists else ""

        query = f"CREATE TABLE {if_not_exists_clause} {table_name} ({columns})"
        self.execute(query)
        logger.info(f"Table {table_name} created successfully")

    def drop_table(self, table_name: str, if_exists: bool = True):
        """
        Drop table

        Args:
            table_name: Name of the table to drop
            if_exists: Whether to use IF EXISTS clause
        """
        if_exists_clause = "IF EXISTS" if if_exists else ""
        query = f"DROP TABLE {if_exists_clause} {table_name}"
        self.execute(query)
        logger.info(f"Table {table_name} dropped successfully")

    def insert_data(self, table_name: str, data: Union[Dict, List[Dict], pd.DataFrame]):
        """
        Insert data into table

        Args:
            table_name: Target table name
            data: Data to insert (dict, list of dicts, or DataFrame)
        """
        if isinstance(data, dict):
            data = [data]

        if isinstance(data, list):
            if not data:
                return

            columns = list(data[0].keys())
            placeholders = ", ".join(["?" for _ in columns])
            column_names = ", ".join(columns)

            query = f"INSERT INTO {table_name} ({column_names}) VALUES ({placeholders})"

            for row in data:
                values = [row.get(col) for col in columns]
                self.execute(query, values)

        elif isinstance(data, pd.DataFrame):
            # Use DuckDB's efficient DataFrame insertion
            self.connection.register("temp_df", data)
            self.execute(f"INSERT INTO {table_name} SELECT * FROM temp_df")
            self.connection.unregister("temp_df")

        logger.info(f"Data inserted into {table_name}")

    def update_data(self, table_name: str, set_clause: Dict[str, Any],
                    where_clause: str, where_params: Optional[List] = None):
        """
        Update data in table

        Args:
            table_name: Target table name
            set_clause: Dictionary of column-value pairs to update
            where_clause: WHERE condition
            where_params: Parameters for WHERE clause
        """
        set_parts = [f"{col} = ?" for col in set_clause.keys()]
        set_string = ", ".join(set_parts)

        query = f"UPDATE {table_name} SET {set_string} WHERE {where_clause}"
        params = list(set_clause.values())
        if where_params:
            params.extend(where_params)

        self.execute(query, params)
        logger.info(f"Data updated in {table_name}")

    def delete_data(self, table_name: str, where_clause: str, where_params: Optional[List] = None):
        """
        Delete data from table

        Args:
            table_name: Target table name
            where_clause: WHERE condition
            where_params: Parameters for WHERE clause
        """
        query = f"DELETE FROM {table_name} WHERE {where_clause}"
        self.execute(query, where_params)
        logger.info(f"Data deleted from {table_name}")

    def table_exists(self, table_name: str) -> bool:
        """
        Check if table exists

        Args:
            table_name: Name of the table to check

        Returns:
            True if table exists, False otherwise
        """
        query = """
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_name = ?
        """
        result = self.fetch_one(query, [table_name])
        return result[0] > 0 if result else False

    def get_table_info(self, table_name: str) -> List[Dict]:
        """
        Get table schema information

        Args:
            table_name: Name of the table

        Returns:
            List of column information dictionaries
        """
        query = f"DESCRIBE {table_name}"
        result = self.fetch_all(query)

        columns = []
        for row in result:
            columns.append({
                "column_name": row[0],
                "column_type": row[1],
                "null": row[2],
                "key": row[3] if len(row) > 3 else None,
                "default": row[4] if len(row) > 4 else None,
                "extra": row[5] if len(row) > 5 else None
            })

        return columns

    def get_table_count(self, table_name: str) -> int:
        """
        Get row count for table

        Args:
            table_name: Name of the table

        Returns:
            Number of rows in table
        """
        query = f"SELECT COUNT(*) FROM {table_name}"
        result = self.fetch_one(query)
        return result[0] if result else 0

    def list_tables(self) -> List[str]:
        """
        Get list of all tables in database

        Returns:
            List of table names
        """
        query = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'"
        result = self.fetch_all(query)
        return [row[0] for row in result]

    def export_to_csv(self, table_name: str, file_path: str, delimiter: str = ","):
        """
        Export table data to CSV file

        Args:
            table_name: Source table name
            file_path: Output CSV file path
            delimiter: CSV delimiter
        """
        query = f"COPY {table_name} TO '{file_path}' (DELIMITER '{delimiter}', HEADER)"
        self.execute(query)
        logger.info(f"Table {table_name} exported to {file_path}")

    def import_from_csv(self, table_name: str, file_path: str,
                        delimiter: str = ",", header: bool = True,
                        create_table: bool = True):
        """
        Import data from CSV file

        Args:
            table_name: Target table name
            file_path: CSV file path
            delimiter: CSV delimiter
            header: Whether CSV has header row
            create_table: Whether to create table automatically
        """
        if create_table:
            # Let DuckDB auto-detect schema and create table
            query = f"""
                CREATE TABLE {table_name} AS
                SELECT * FROM read_csv_auto('{file_path}', delim='{delimiter}', header={header})
            """
        else:
            # Insert into existing table
            query = f"""
                INSERT INTO {table_name}
                SELECT * FROM read_csv_auto('{file_path}', delim='{delimiter}', header={header})
            """

        self.execute(query)
        logger.info(f"Data imported from {file_path} to {table_name}")

    def export_to_parquet(self, table_name: str, file_path: str):
        """
        Export table data to Parquet file

        Args:
            table_name: Source table name
            file_path: Output Parquet file path
        """
        query = f"COPY {table_name} TO '{file_path}' (FORMAT PARQUET)"
        self.execute(query)
        logger.info(f"Table {table_name} exported to Parquet: {file_path}")

    def import_from_parquet(self, table_name: str, file_path: str, create_table: bool = True):
        """
        Import data from Parquet file

        Args:
            table_name: Target table name
            file_path: Parquet file path
            create_table: Whether to create table automatically
        """
        if create_table:
            query = f"CREATE TABLE {table_name} AS SELECT * FROM read_parquet('{file_path}')"
        else:
            query = f"INSERT INTO {table_name} SELECT * FROM read_parquet('{file_path}')"

        self.execute(query)
        logger.info(f"Data imported from Parquet {file_path} to {table_name}")

    def create_index(self, table_name: str, column_names: Union[str, List[str]],
                     index_name: Optional[str] = None):
        """
        Create index on table columns

        Args:
            table_name: Target table name
            column_names: Column name(s) for index
            index_name: Optional custom index name
        """
        if isinstance(column_names, str):
            column_names = [column_names]

        columns_str = ", ".join(column_names)

        if not index_name:
            index_name = f"idx_{table_name}_{'_'.join(column_names)}"

        query = f"CREATE INDEX {index_name} ON {table_name} ({columns_str})"
        self.execute(query)
        logger.info(f"Index {index_name} created on {table_name}({columns_str})")

    def analyze_table(self, table_name: str) -> Dict[str, Any]:
        """
        Get comprehensive table statistics

        Args:
            table_name: Name of the table to analyze

        Returns:
            Dictionary containing table statistics
        """
        # Basic table info
        row_count = self.get_table_count(table_name)
        columns = self.get_table_info(table_name)

        # Column statistics
        column_stats = {}
        for col in columns:
            col_name = col["column_name"]
            col_type = col["column_type"]

            if "INT" in col_type.upper() or "FLOAT" in col_type.upper() or "DOUBLE" in col_type.upper():
                # Numeric column statistics
                stats_query = f"""
                    SELECT
                        MIN({col_name}) as min_val,
                        MAX({col_name}) as max_val,
                        AVG({col_name}) as avg_val,
                        COUNT(DISTINCT {col_name}) as distinct_count,
                        COUNT({col_name}) as non_null_count
                    FROM {table_name}
                """
                stats = self.fetch_one(stats_query)
                if stats:
                    column_stats[col_name] = {
                        "type": "numeric",
                        "min": stats[0],
                        "max": stats[1],
                        "avg": stats[2],
                        "distinct_count": stats[3],
                        "non_null_count": stats[4],
                        "null_count": row_count - stats[4]
                    }
            else:
                # Text/other column statistics
                stats_query = f"""
                    SELECT
                        COUNT(DISTINCT {col_name}) as distinct_count,
                        COUNT({col_name}) as non_null_count
                    FROM {table_name}
                """
                stats = self.fetch_one(stats_query)
                if stats:
                    column_stats[col_name] = {
                        "type": "categorical",
                        "distinct_count": stats[0],
                        "non_null_count": stats[1],
                        "null_count": row_count - stats[1]
                    }

        return {
            "table_name": table_name,
            "row_count": row_count,
            "column_count": len(columns),
            "columns": columns,
            "column_statistics": column_stats,
            "analyzed_at": datetime.now().isoformat()
        }

    def search_tables(self, search_term: str) -> List[Dict[str, Any]]:
        """
        Search for tables and columns containing the search term

        Args:
            search_term: Term to search for

        Returns:
            List of matching tables and columns
        """
        search_term = search_term.lower()
        results = []

        # Search table names
        tables = self.list_tables()
        for table in tables:
            if search_term in table.lower():
                results.append({
                    "type": "table",
                    "table_name": table,
                    "match_type": "table_name",
                    "match_value": table
                })

        # Search column names
        for table in tables:
            columns = self.get_table_info(table)
            for col in columns:
                if search_term in col["column_name"].lower():
                    results.append({
                        "type": "column",
                        "table_name": table,
                        "column_name": col["column_name"],
                        "column_type": col["column_type"],
                        "match_type": "column_name",
                        "match_value": col["column_name"]
                    })

        return results

    def backup_database(self, backup_path: str):
        """
        Create database backup

        Args:
            backup_path: Path for backup file
        """
        if not self.db_path:
            raise ValueError("Cannot backup in-memory database")

        query = f"EXPORT DATABASE '{backup_path}'"
        self.execute(query)
        logger.info(f"Database backed up to {backup_path}")

    def restore_database(self, backup_path: str):
        """
        Restore database from backup

        Args:
            backup_path: Path to backup file
        """
        query = f"IMPORT DATABASE '{backup_path}'"
        self.execute(query)
        logger.info(f"Database restored from {backup_path}")

    def optimize_database(self):
        """Run database optimization operations"""
        try:
            # Analyze all tables for query optimization
            tables = self.list_tables()
            for table in tables:
                self.execute(f"ANALYZE {table}")

            # Run VACUUM to reclaim space
            self.execute("VACUUM")

            logger.info("Database optimization completed")
        except Exception as e:
            logger.error(f"Database optimization failed: {e}")
            raise

    def get_database_size(self) -> Dict[str, Any]:
        """
        Get database size information

        Returns:
            Dictionary with size information
        """
        if not self.db_path:
            return {"type": "in_memory", "size": "N/A"}

        try:
            db_file = Path(self.db_path)
            if db_file.exists():
                size_bytes = db_file.stat().st_size
                size_mb = size_bytes / (1024 * 1024)
                return {
                    "type": "file",
                    "path": str(db_file),
                    "size_bytes": size_bytes,
                    "size_mb": round(size_mb, 2),
                    "size_human": f"{size_mb:.2f} MB" if size_mb < 1024 else f"{size_mb/1024:.2f} GB"
                }
            else:
                return {"type": "file", "path": str(db_file), "exists": False}
        except Exception as e:
            logger.error(f"Failed to get database size: {e}")
            return {"type": "error", "error": str(e)}

    def execute_script(self, script_path: str):
        """
        Execute SQL script from file

        Args:
            script_path: Path to SQL script file
        """
        script_file = Path(script_path)
        if not script_file.exists():
            raise FileNotFoundError(f"Script file not found: {script_path}")

        with open(script_file, 'r', encoding='utf-8') as f:
            script_content = f.read()

        # Split script into individual statements
        statements = [stmt.strip() for stmt in script_content.split(';') if stmt.strip()]

        for statement in statements:
            self.execute(statement)

        logger.info(f"SQL script executed: {script_path}")

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()

    def __del__(self):
        """Destructor to ensure connection is closed"""
        if hasattr(self, 'connection') and self.connection:
            self.close()


# Utility functions for common operations
def create_knowledge_base_tables(db_manager: DuckDBManager):
    """Create standard tables for knowledge base operations"""

    # Documents table
    documents_schema = {
        "id": "VARCHAR PRIMARY KEY",
        "kb_name": "VARCHAR NOT NULL",
        "filename": "VARCHAR NOT NULL",
        "file_path": "VARCHAR",
        "file_size": "BIGINT",
        "file_type": "VARCHAR",
        "content": "TEXT",
        "metadata": "JSON",
        "created_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP",
        "updated_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
    }
    db_manager.create_table("documents", documents_schema)

    # Chunks table for RAG
    chunks_schema = {
        "id": "VARCHAR PRIMARY KEY",
        "document_id": "VARCHAR NOT NULL",
        "kb_name": "VARCHAR NOT NULL",
        "chunk_index": "INTEGER NOT NULL",
        "content": "TEXT NOT NULL",
        "embedding": "FLOAT[]",
        "metadata": "JSON",
        "created_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
    }
    db_manager.create_table("chunks", chunks_schema)

    # Search history table
    search_history_schema = {
        "id": "VARCHAR PRIMARY KEY",
        "kb_name": "VARCHAR NOT NULL",
        "query": "TEXT NOT NULL",
        "results_count": "INTEGER",
        "response_time_ms": "INTEGER",
        "created_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
    }
    db_manager.create_table("search_history", search_history_schema)

    logger.info("Knowledge base tables created successfully")


def create_analytics_tables(db_manager: DuckDBManager):
    """Create tables for analytics and monitoring"""

    # User activities table
    activities_schema = {
        "id": "VARCHAR PRIMARY KEY",
        "user_id": "VARCHAR",
        "activity_type": "VARCHAR NOT NULL",
        "activity_data": "JSON",
        "duration_ms": "INTEGER",
        "success": "BOOLEAN DEFAULT TRUE",
        "created_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
    }
    db_manager.create_table("user_activities", activities_schema)

    # System metrics table
    metrics_schema = {
        "id": "VARCHAR PRIMARY KEY",
        "metric_name": "VARCHAR NOT NULL",
        "metric_value": "DOUBLE NOT NULL",
        "metric_unit": "VARCHAR",
        "tags": "JSON",
        "recorded_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
    }
    db_manager.create_table("system_metrics", metrics_schema)

    logger.info("Analytics tables created successfully")