sirchmunk 0.0.0__py3-none-any.whl → 0.0.1.post1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
  41. sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
sirchmunk/storage/duckdb.py (new file)
@@ -0,0 +1,676 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ """
+ DuckDB database manager for Sirchmunk
+ Provides a comprehensive interface for DuckDB operations including
+ connection management, table operations, data manipulation, and analytics
+ """
+
+ import duckdb
+ import pandas as pd
+ from typing import Any, Dict, List, Optional, Union, Tuple
+ from pathlib import Path
+ import logging
+ from contextlib import contextmanager
+ from datetime import datetime
+
+ logger = logging.getLogger(__name__)
+
+
+ class DuckDBManager:
+     """
+     A comprehensive DuckDB database manager providing common operations
+     for data storage, retrieval, and analytics in the Sirchmunk system.
+     """
+
+     def __init__(self, db_path: Optional[str] = None, read_only: bool = False):
+         """
+         Initialize DuckDB connection
+
+         Args:
+             db_path: Path to database file. If None, creates in-memory database
+             read_only: Whether to open database in read-only mode
+         """
+         self.db_path = db_path
+         self.read_only = read_only
+         self.connection = None
+         self._connect()
+
+     def _connect(self):
+         """Establish database connection"""
+         try:
+             if self.db_path:
+                 self.connection = duckdb.connect(self.db_path, read_only=self.read_only)
+                 logger.info(f"Connected to DuckDB at {self.db_path}")
+             else:
+                 self.connection = duckdb.connect(":memory:")
+                 logger.info("Connected to in-memory DuckDB")
+         except Exception as e:
+             logger.error(f"Failed to connect to DuckDB: {e}")
+             raise
+
+     def close(self):
+         """Close database connection"""
+         if self.connection:
+             self.connection.close()
+             self.connection = None
+             logger.info("DuckDB connection closed")
+
+     @contextmanager
+     def transaction(self):
+         """Context manager for database transactions"""
+         try:
+             self.connection.begin()
+             yield self.connection
+             self.connection.commit()
+         except Exception as e:
+             self.connection.rollback()
+             logger.error(f"Transaction rolled back: {e}")
+             raise
+
+     def execute(self, query: str, parameters: Optional[List] = None):
+         """
+         Execute SQL query
+
+         Args:
+             query: SQL query string
+             parameters: Optional query parameters
+
+         Returns:
+             Query result
+         """
+         try:
+             if parameters:
+                 return self.connection.execute(query, parameters)
+             return self.connection.execute(query)
+         except Exception as e:
+             logger.error(f"Query execution failed: {e}")
+             logger.error(f"Query: {query}")
+             raise
+
+     def fetch_all(self, query: str, parameters: Optional[List] = None) -> List[Tuple]:
+         """
+         Execute query and fetch all results
+
+         Args:
+             query: SQL query string
+             parameters: Optional query parameters
+
+         Returns:
+             List of result tuples
+         """
+         result = self.execute(query, parameters)
+         return result.fetchall()
+
+     def fetch_one(self, query: str, parameters: Optional[List] = None) -> Optional[Tuple]:
+         """
+         Execute query and fetch one result
+
+         Args:
+             query: SQL query string
+             parameters: Optional query parameters
+
+         Returns:
+             Single result tuple or None
+         """
+         result = self.execute(query, parameters)
+         return result.fetchone()
+
+     def fetch_df(self, query: str, parameters: Optional[List] = None) -> pd.DataFrame:
+         """
+         Execute query and return results as pandas DataFrame
+
+         Args:
+             query: SQL query string
+             parameters: Optional query parameters
+
+         Returns:
+             Results as DataFrame
+         """
+         result = self.execute(query, parameters)
+         return result.df()
+
+     def create_table(self, table_name: str, schema: Dict[str, str], if_not_exists: bool = True):
+         """
+         Create table with specified schema
+
+         Args:
+             table_name: Name of the table
+             schema: Dictionary mapping column names to types
+             if_not_exists: Whether to use IF NOT EXISTS clause
+         """
+         columns = ", ".join([f"{col} {dtype}" for col, dtype in schema.items()])
+         if_not_exists_clause = "IF NOT EXISTS" if if_not_exists else ""
+
+         query = f"CREATE TABLE {if_not_exists_clause} {table_name} ({columns})"
+         self.execute(query)
+         logger.info(f"Table {table_name} created successfully")
+
+     def drop_table(self, table_name: str, if_exists: bool = True):
+         """
+         Drop table
+
+         Args:
+             table_name: Name of the table to drop
+             if_exists: Whether to use IF EXISTS clause
+         """
+         if_exists_clause = "IF EXISTS" if if_exists else ""
+         query = f"DROP TABLE {if_exists_clause} {table_name}"
+         self.execute(query)
+         logger.info(f"Table {table_name} dropped successfully")
+
+     def insert_data(self, table_name: str, data: Union[Dict, List[Dict], pd.DataFrame]):
+         """
+         Insert data into table
+
+         Args:
+             table_name: Target table name
+             data: Data to insert (dict, list of dicts, or DataFrame)
+         """
+         if isinstance(data, dict):
+             data = [data]
+
+         if isinstance(data, list):
+             if not data:
+                 return
+
+             columns = list(data[0].keys())
+             placeholders = ", ".join(["?" for _ in columns])
+             column_names = ", ".join(columns)
+
+             query = f"INSERT INTO {table_name} ({column_names}) VALUES ({placeholders})"
+
+             for row in data:
+                 values = [row.get(col) for col in columns]
+                 self.execute(query, values)
+
+         elif isinstance(data, pd.DataFrame):
+             # Use DuckDB's efficient DataFrame insertion
+             self.connection.register("temp_df", data)
+             self.execute(f"INSERT INTO {table_name} SELECT * FROM temp_df")
+             self.connection.unregister("temp_df")
+
+         logger.info(f"Data inserted into {table_name}")
+
+     def update_data(self, table_name: str, set_clause: Dict[str, Any],
+                     where_clause: str, where_params: Optional[List] = None):
+         """
+         Update data in table
+
+         Args:
+             table_name: Target table name
+             set_clause: Dictionary of column-value pairs to update
+             where_clause: WHERE condition
+             where_params: Parameters for WHERE clause
+         """
+         set_parts = [f"{col} = ?" for col in set_clause.keys()]
+         set_string = ", ".join(set_parts)
+
+         query = f"UPDATE {table_name} SET {set_string} WHERE {where_clause}"
+         params = list(set_clause.values())
+         if where_params:
+             params.extend(where_params)
+
+         self.execute(query, params)
+         logger.info(f"Data updated in {table_name}")
+
+     def delete_data(self, table_name: str, where_clause: str, where_params: Optional[List] = None):
+         """
+         Delete data from table
+
+         Args:
+             table_name: Target table name
+             where_clause: WHERE condition
+             where_params: Parameters for WHERE clause
+         """
+         query = f"DELETE FROM {table_name} WHERE {where_clause}"
+         self.execute(query, where_params)
+         logger.info(f"Data deleted from {table_name}")
+
+     def table_exists(self, table_name: str) -> bool:
+         """
+         Check if table exists
+
+         Args:
+             table_name: Name of the table to check
+
+         Returns:
+             True if table exists, False otherwise
+         """
+         query = """
+             SELECT COUNT(*)
+             FROM information_schema.tables
+             WHERE table_name = ?
+         """
+         result = self.fetch_one(query, [table_name])
+         return result[0] > 0 if result else False
+
+     def get_table_info(self, table_name: str) -> List[Dict]:
+         """
+         Get table schema information
+
+         Args:
+             table_name: Name of the table
+
+         Returns:
+             List of column information dictionaries
+         """
+         query = f"DESCRIBE {table_name}"
+         result = self.fetch_all(query)
+
+         columns = []
+         for row in result:
+             columns.append({
+                 "column_name": row[0],
+                 "column_type": row[1],
+                 "null": row[2],
+                 "key": row[3] if len(row) > 3 else None,
+                 "default": row[4] if len(row) > 4 else None,
+                 "extra": row[5] if len(row) > 5 else None
+             })
+
+         return columns
+
+     def get_table_count(self, table_name: str) -> int:
+         """
+         Get row count for table
+
+         Args:
+             table_name: Name of the table
+
+         Returns:
+             Number of rows in table
+         """
+         query = f"SELECT COUNT(*) FROM {table_name}"
+         result = self.fetch_one(query)
+         return result[0] if result else 0
+
+     def list_tables(self) -> List[str]:
+         """
+         Get list of all tables in database
+
+         Returns:
+             List of table names
+         """
+         query = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'"
+         result = self.fetch_all(query)
+         return [row[0] for row in result]
+
+     def export_to_csv(self, table_name: str, file_path: str, delimiter: str = ","):
+         """
+         Export table data to CSV file
+
+         Args:
+             table_name: Source table name
+             file_path: Output CSV file path
+             delimiter: CSV delimiter
+         """
+         query = f"COPY {table_name} TO '{file_path}' (DELIMITER '{delimiter}', HEADER)"
+         self.execute(query)
+         logger.info(f"Table {table_name} exported to {file_path}")
+
+     def import_from_csv(self, table_name: str, file_path: str,
+                         delimiter: str = ",", header: bool = True,
+                         create_table: bool = True):
+         """
+         Import data from CSV file
+
+         Args:
+             table_name: Target table name
+             file_path: CSV file path
+             delimiter: CSV delimiter
+             header: Whether CSV has header row
+             create_table: Whether to create table automatically
+         """
+         if create_table:
+             # Let DuckDB auto-detect schema and create table
+             query = f"""
+                 CREATE TABLE {table_name} AS
+                 SELECT * FROM read_csv_auto('{file_path}', delim='{delimiter}', header={header})
+             """
+         else:
+             # Insert into existing table
+             query = f"""
+                 INSERT INTO {table_name}
+                 SELECT * FROM read_csv_auto('{file_path}', delim='{delimiter}', header={header})
+             """
+
+         self.execute(query)
+         logger.info(f"Data imported from {file_path} to {table_name}")
+
+     def export_to_parquet(self, table_name: str, file_path: str):
+         """
+         Export table data to Parquet file
+
+         Args:
+             table_name: Source table name
+             file_path: Output Parquet file path
+         """
+         query = f"COPY {table_name} TO '{file_path}' (FORMAT PARQUET)"
+         self.execute(query)
+         logger.info(f"Table {table_name} exported to Parquet: {file_path}")
+
+     def import_from_parquet(self, table_name: str, file_path: str, create_table: bool = True):
+         """
+         Import data from Parquet file
+
+         Args:
+             table_name: Target table name
+             file_path: Parquet file path
+             create_table: Whether to create table automatically
+         """
+         if create_table:
+             query = f"CREATE TABLE {table_name} AS SELECT * FROM read_parquet('{file_path}')"
+         else:
+             query = f"INSERT INTO {table_name} SELECT * FROM read_parquet('{file_path}')"
+
+         self.execute(query)
+         logger.info(f"Data imported from Parquet {file_path} to {table_name}")
+
+     def create_index(self, table_name: str, column_names: Union[str, List[str]],
+                      index_name: Optional[str] = None):
+         """
+         Create index on table columns
+
+         Args:
+             table_name: Target table name
+             column_names: Column name(s) for index
+             index_name: Optional custom index name
+         """
+         if isinstance(column_names, str):
+             column_names = [column_names]
+
+         columns_str = ", ".join(column_names)
+
+         if not index_name:
+             index_name = f"idx_{table_name}_{'_'.join(column_names)}"
+
+         query = f"CREATE INDEX {index_name} ON {table_name} ({columns_str})"
+         self.execute(query)
+         logger.info(f"Index {index_name} created on {table_name}({columns_str})")
+
+     def analyze_table(self, table_name: str) -> Dict[str, Any]:
+         """
+         Get comprehensive table statistics
+
+         Args:
+             table_name: Name of the table to analyze
+
+         Returns:
+             Dictionary containing table statistics
+         """
+         # Basic table info
+         row_count = self.get_table_count(table_name)
+         columns = self.get_table_info(table_name)
+
+         # Column statistics
+         column_stats = {}
+         for col in columns:
+             col_name = col["column_name"]
+             col_type = col["column_type"]
+
+             if "INT" in col_type.upper() or "FLOAT" in col_type.upper() or "DOUBLE" in col_type.upper():
+                 # Numeric column statistics
+                 stats_query = f"""
+                     SELECT
+                         MIN({col_name}) as min_val,
+                         MAX({col_name}) as max_val,
+                         AVG({col_name}) as avg_val,
+                         COUNT(DISTINCT {col_name}) as distinct_count,
+                         COUNT({col_name}) as non_null_count
+                     FROM {table_name}
+                 """
+                 stats = self.fetch_one(stats_query)
+                 if stats:
+                     column_stats[col_name] = {
+                         "type": "numeric",
+                         "min": stats[0],
+                         "max": stats[1],
+                         "avg": stats[2],
+                         "distinct_count": stats[3],
+                         "non_null_count": stats[4],
+                         "null_count": row_count - stats[4]
+                     }
+             else:
+                 # Text/other column statistics
+                 stats_query = f"""
+                     SELECT
+                         COUNT(DISTINCT {col_name}) as distinct_count,
+                         COUNT({col_name}) as non_null_count
+                     FROM {table_name}
+                 """
+                 stats = self.fetch_one(stats_query)
+                 if stats:
+                     column_stats[col_name] = {
+                         "type": "categorical",
+                         "distinct_count": stats[0],
+                         "non_null_count": stats[1],
+                         "null_count": row_count - stats[1]
+                     }
+
+         return {
+             "table_name": table_name,
+             "row_count": row_count,
+             "column_count": len(columns),
+             "columns": columns,
+             "column_statistics": column_stats,
+             "analyzed_at": datetime.now().isoformat()
+         }
+
+     def search_tables(self, search_term: str) -> List[Dict[str, Any]]:
+         """
+         Search for tables and columns containing the search term
+
+         Args:
+             search_term: Term to search for
+
+         Returns:
+             List of matching tables and columns
+         """
+         search_term = search_term.lower()
+         results = []
+
+         # Search table names
+         tables = self.list_tables()
+         for table in tables:
+             if search_term in table.lower():
+                 results.append({
+                     "type": "table",
+                     "table_name": table,
+                     "match_type": "table_name",
+                     "match_value": table
+                 })
+
+         # Search column names
+         for table in tables:
+             columns = self.get_table_info(table)
+             for col in columns:
+                 if search_term in col["column_name"].lower():
+                     results.append({
+                         "type": "column",
+                         "table_name": table,
+                         "column_name": col["column_name"],
+                         "column_type": col["column_type"],
+                         "match_type": "column_name",
+                         "match_value": col["column_name"]
+                     })
+
+         return results
+
+     def backup_database(self, backup_path: str):
+         """
+         Create database backup
+
+         Args:
+             backup_path: Path for backup file
+         """
+         if not self.db_path:
+             raise ValueError("Cannot backup in-memory database")
+
+         query = f"EXPORT DATABASE '{backup_path}'"
+         self.execute(query)
+         logger.info(f"Database backed up to {backup_path}")
+
+     def restore_database(self, backup_path: str):
+         """
+         Restore database from backup
+
+         Args:
+             backup_path: Path to backup file
+         """
+         query = f"IMPORT DATABASE '{backup_path}'"
+         self.execute(query)
+         logger.info(f"Database restored from {backup_path}")
+
+     def optimize_database(self):
+         """Run database optimization operations"""
+         try:
+             # Analyze all tables for query optimization
+             tables = self.list_tables()
+             for table in tables:
+                 self.execute(f"ANALYZE {table}")
+
+             # Run VACUUM to reclaim space
+             self.execute("VACUUM")
+
+             logger.info("Database optimization completed")
+         except Exception as e:
+             logger.error(f"Database optimization failed: {e}")
+             raise
+
539
+
540
+ def get_database_size(self) -> Dict[str, Any]:
541
+ """
542
+ Get database size information
543
+
544
+ Returns:
545
+ Dictionary with size information
546
+ """
547
+ if not self.db_path:
548
+ return {"type": "in_memory", "size": "N/A"}
549
+
550
+ try:
551
+ db_file = Path(self.db_path)
552
+ if db_file.exists():
553
+ size_bytes = db_file.stat().st_size
554
+ size_mb = size_bytes / (1024 * 1024)
555
+ return {
556
+ "type": "file",
557
+ "path": str(db_file),
558
+ "size_bytes": size_bytes,
559
+ "size_mb": round(size_mb, 2),
560
+ "size_human": f"{size_mb:.2f} MB" if size_mb < 1024 else f"{size_mb/1024:.2f} GB"
561
+ }
562
+ else:
563
+ return {"type": "file", "path": str(db_file), "exists": False}
564
+ except Exception as e:
565
+ logger.error(f"Failed to get database size: {e}")
566
+ return {"type": "error", "error": str(e)}
567
+
568
+ def execute_script(self, script_path: str):
569
+ """
570
+ Execute SQL script from file
571
+
572
+ Args:
573
+ script_path: Path to SQL script file
574
+ """
575
+ script_file = Path(script_path)
576
+ if not script_file.exists():
577
+ raise FileNotFoundError(f"Script file not found: {script_path}")
578
+
579
+ with open(script_file, 'r', encoding='utf-8') as f:
580
+ script_content = f.read()
581
+
582
+ # Split script into individual statements
583
+ statements = [stmt.strip() for stmt in script_content.split(';') if stmt.strip()]
584
+
585
+ for statement in statements:
586
+ self.execute(statement)
587
+
588
+ logger.info(f"SQL script executed: {script_path}")
589
+
590
+ def __enter__(self):
591
+ """Context manager entry"""
592
+ return self
593
+
594
+ def __exit__(self, exc_type, exc_val, exc_tb):
595
+ """Context manager exit"""
596
+ self.close()
597
+
598
+ def __del__(self):
599
+ """Destructor to ensure connection is closed"""
600
+ if hasattr(self, 'connection') and self.connection:
601
+ self.close()
602
+
603
+
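For orientation, a minimal usage sketch of DuckDBManager follows. This is an editorial illustration, not part of the published wheel; the database path, table name, and values are made up, and it assumes only the methods defined above.

# Hypothetical usage sketch (not part of the package source).
from sirchmunk.storage.duckdb import DuckDBManager

# DuckDBManager is a context manager: __exit__ closes the connection.
with DuckDBManager("example.db") as db:
    db.create_table("events", {"id": "INTEGER", "name": "VARCHAR"})
    db.insert_data("events", [{"id": 1, "name": "scan"},
                              {"id": 2, "name": "search"}])
    # Parameterized query; fetch_df returns a pandas DataFrame.
    print(db.fetch_df("SELECT * FROM events WHERE id = ?", [1]))
    # Explicit transaction: commits on success, rolls back on error.
    with db.transaction():
        db.update_data("events", {"name": "rescan"}, "id = ?", [1])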
604
+ # Utility functions for common operations
605
+ def create_knowledge_base_tables(db_manager: DuckDBManager):
606
+ """Create standard tables for knowledge base operations"""
607
+
608
+ # Documents table
609
+ documents_schema = {
610
+ "id": "VARCHAR PRIMARY KEY",
611
+ "kb_name": "VARCHAR NOT NULL",
612
+ "filename": "VARCHAR NOT NULL",
613
+ "file_path": "VARCHAR",
614
+ "file_size": "BIGINT",
615
+ "file_type": "VARCHAR",
616
+ "content": "TEXT",
617
+ "metadata": "JSON",
618
+ "created_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP",
619
+ "updated_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
620
+ }
621
+ db_manager.create_table("documents", documents_schema)
622
+
623
+ # Chunks table for RAG
624
+ chunks_schema = {
625
+ "id": "VARCHAR PRIMARY KEY",
626
+ "document_id": "VARCHAR NOT NULL",
627
+ "kb_name": "VARCHAR NOT NULL",
628
+ "chunk_index": "INTEGER NOT NULL",
629
+ "content": "TEXT NOT NULL",
630
+ "embedding": "FLOAT[]",
631
+ "metadata": "JSON",
632
+ "created_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
633
+ }
634
+ db_manager.create_table("chunks", chunks_schema)
635
+
636
+ # Search history table
637
+ search_history_schema = {
638
+ "id": "VARCHAR PRIMARY KEY",
639
+ "kb_name": "VARCHAR NOT NULL",
640
+ "query": "TEXT NOT NULL",
641
+ "results_count": "INTEGER",
642
+ "response_time_ms": "INTEGER",
643
+ "created_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
644
+ }
645
+ db_manager.create_table("search_history", search_history_schema)
646
+
647
+ logger.info("Knowledge base tables created successfully")
+
+
+ def create_analytics_tables(db_manager: DuckDBManager):
+     """Create tables for analytics and monitoring"""
+
+     # User activities table
+     activities_schema = {
+         "id": "VARCHAR PRIMARY KEY",
+         "user_id": "VARCHAR",
+         "activity_type": "VARCHAR NOT NULL",
+         "activity_data": "JSON",
+         "duration_ms": "INTEGER",
+         "success": "BOOLEAN DEFAULT TRUE",
+         "created_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
+     }
+     db_manager.create_table("user_activities", activities_schema)
+
+     # System metrics table
+     metrics_schema = {
+         "id": "VARCHAR PRIMARY KEY",
+         "metric_name": "VARCHAR NOT NULL",
+         "metric_value": "DOUBLE NOT NULL",
+         "metric_unit": "VARCHAR",
+         "tags": "JSON",
+         "recorded_at": "TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
+     }
+     db_manager.create_table("system_metrics", metrics_schema)
+
+     logger.info("Analytics tables created successfully")