offagent 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. offagent/__init__.py +3 -0
  2. offagent/__main__.py +5 -0
  3. offagent/adapters/__init__.py +1 -0
  4. offagent/adapters/docx_adapter.py +1237 -0
  5. offagent/adapters/embedding_provider.py +132 -0
  6. offagent/adapters/pptx_adapter.py +940 -0
  7. offagent/adapters/xlsx_adapter.py +1266 -0
  8. offagent/app/__init__.py +1 -0
  9. offagent/app/progress.py +52 -0
  10. offagent/app/services.py +4267 -0
  11. offagent/config.py +287 -0
  12. offagent/domain/__init__.py +1 -0
  13. offagent/domain/locators.py +444 -0
  14. offagent/domain/models.py +477 -0
  15. offagent/domain/text_fragments.py +136 -0
  16. offagent/errors.py +29 -0
  17. offagent/indexing/__init__.py +1 -0
  18. offagent/indexing/store.py +795 -0
  19. offagent/interfaces/__init__.py +1 -0
  20. offagent/interfaces/cli.py +438 -0
  21. offagent/interfaces/cli_output.py +139 -0
  22. offagent/interfaces/cli_progress.py +120 -0
  23. offagent/interfaces/mcp.py +1145 -0
  24. offagent/interfaces/mcp_converters.py +80 -0
  25. offagent/interfaces/mcp_models.py +923 -0
  26. offagent/objects/__init__.py +3 -0
  27. offagent/objects/base.py +26 -0
  28. offagent/objects/docx_objects.py +951 -0
  29. offagent/objects/pptx_objects.py +895 -0
  30. offagent/objects/xlsx_objects.py +962 -0
  31. offagent/path_policy.py +42 -0
  32. offagent/storage/__init__.py +1 -0
  33. offagent/storage/versioning.py +31 -0
  34. offagent-0.10.0.dist-info/METADATA +546 -0
  35. offagent-0.10.0.dist-info/RECORD +39 -0
  36. offagent-0.10.0.dist-info/WHEEL +5 -0
  37. offagent-0.10.0.dist-info/entry_points.txt +2 -0
  38. offagent-0.10.0.dist-info/licenses/LICENSE +21 -0
  39. offagent-0.10.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,795 @@
1
+ from __future__ import annotations
2
+
3
+ import datetime as dt
4
+ import json
5
+ import sqlite3
6
+ from pathlib import Path
7
+ from typing import Sequence
8
+
9
+ from offagent.domain.models import DocumentRef, IndexedItem
10
+
11
# DDL for the `documents` table: one row per document, keyed by `document_id`,
# with a UNIQUE `path`. `is_active` defaults to 1; the read queries in this
# module only return rows where it is 1.
DOCUMENTS_SQL = """
CREATE TABLE IF NOT EXISTS documents (
    document_id TEXT PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    file_type TEXT NOT NULL,
    display_name TEXT NOT NULL,
    modified_time REAL NOT NULL,
    content_hash TEXT,
    is_active INTEGER NOT NULL DEFAULT 1
);
"""
22
+
23
# DDL for the `items` table. `storage_id` is the primary key (built by
# `make_storage_id` as "<document_id>:<item_id>"); `(document_id, item_id)` is
# additionally UNIQUE. `metadata_json` holds a JSON-encoded object.
ITEMS_SQL = """
CREATE TABLE IF NOT EXISTS items (
    storage_id TEXT PRIMARY KEY,
    document_id TEXT NOT NULL,
    item_id TEXT NOT NULL,
    item_type TEXT NOT NULL,
    locator TEXT NOT NULL,
    preview TEXT NOT NULL,
    content_text TEXT NOT NULL DEFAULT '',
    metadata_json TEXT NOT NULL DEFAULT '{}',
    UNIQUE(document_id, item_id),
    FOREIGN KEY (document_id) REFERENCES documents(document_id)
);
"""
37
+
38
# DDL for the FTS5 full-text table over item text. Only `content_text` is
# indexed; the three id columns are stored UNINDEXED so matches can be joined
# back to `items`. There is deliberately no IF NOT EXISTS here:
# `_rebuild_items_fts` drops the table before executing this statement.
ITEMS_FTS_SQL = """
CREATE VIRTUAL TABLE items_fts USING fts5(
    storage_id UNINDEXED,
    item_id UNINDEXED,
    document_id UNINDEXED,
    content_text
);
"""
46
+
47
# DDL for per-item embeddings: at most one embedding BLOB per `items.storage_id`,
# together with the model name, vector dimensions and an `updated_at` timestamp
# (written as an ISO-8601 UTC string by `replace_document_embeddings`).
ITEM_EMBEDDINGS_SQL = """
CREATE TABLE IF NOT EXISTS item_embeddings (
    storage_id TEXT PRIMARY KEY REFERENCES items(storage_id),
    model_name TEXT NOT NULL,
    dimensions INTEGER NOT NULL,
    embedding BLOB NOT NULL,
    updated_at TEXT NOT NULL
);
"""
56
+
57
# DDL for row-level spreadsheet embeddings: one embedding per
# (document_id, sheet_name, row_number). `representative_storage_id` points at
# the `items` row that `fetch_xlsx_row_embeddings` joins in for item details.
XLSX_ROW_EMBEDDINGS_SQL = """
CREATE TABLE IF NOT EXISTS xlsx_row_embeddings (
    embedding_id TEXT PRIMARY KEY,
    document_id TEXT NOT NULL,
    sheet_name TEXT NOT NULL,
    row_number INTEGER NOT NULL,
    representative_storage_id TEXT NOT NULL REFERENCES items(storage_id),
    content_text TEXT NOT NULL,
    preview TEXT NOT NULL,
    model_name TEXT NOT NULL,
    dimensions INTEGER NOT NULL,
    embedding BLOB NOT NULL,
    updated_at TEXT NOT NULL,
    UNIQUE(document_id, sheet_name, row_number),
    FOREIGN KEY (document_id) REFERENCES documents(document_id)
);
"""
74
+
75
# DDL for the link table between a row embedding and the item rows (cells)
# that contributed to it. `cell_order` is the sort key used when the cells are
# read back; `is_representative` is stored as 0/1.
XLSX_ROW_EMBEDDING_CELLS_SQL = """
CREATE TABLE IF NOT EXISTS xlsx_row_embedding_cells (
    embedding_id TEXT NOT NULL REFERENCES xlsx_row_embeddings(embedding_id),
    storage_id TEXT NOT NULL REFERENCES items(storage_id),
    cell_coordinate TEXT NOT NULL,
    cell_order INTEGER NOT NULL,
    is_representative INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (embedding_id, storage_id)
);
"""
85
+
86
# DDL for a small key/value table describing the embeddings stored in this
# index (see EMBEDDING_META_KEYS for the keys that are written).
EMBEDDING_META_SQL = """
CREATE TABLE IF NOT EXISTS embedding_meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);
"""
92
+
93
# Exact key set expected in `embedding_meta`; `ensure_embedding_meta` raises
# when the stored keys differ from this set.
EMBEDDING_META_KEYS = {
    "model_name",
    "dimensions",
    "similarity_metric",
    "schema_version",
}
# Value written to / compared against the "schema_version" metadata key.
EMBEDDING_SCHEMA_VERSION = "1"
# Value written to / compared against the "similarity_metric" metadata key.
SIMILARITY_METRIC = "cosine"
101
+
102
+
103
class StoreCapabilityError(RuntimeError):
    """Signal that the SQLite runtime lacks a feature the store depends on.

    ``initialize_schema`` raises this when the FTS5 probe fails.
    """
105
+
106
+
107
def connect(index_path: Path) -> sqlite3.Connection:
    """Open the SQLite database at ``index_path``.

    Missing parent directories are created first. The returned connection
    yields ``sqlite3.Row`` objects, so columns can be read by name.
    """
    parent_dir = index_path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)

    db = sqlite3.connect(index_path)
    db.row_factory = sqlite3.Row
    return db
112
+
113
+
114
def supports_fts5(connection: sqlite3.Connection) -> bool:
    """Return whether this SQLite build can create FTS5 virtual tables.

    The check creates and immediately drops a throw-away FTS5 table. The
    probe lives in the connection-private ``temp`` schema, so it never
    writes to the index database itself and cannot collide with a table
    named ``fts5_probe`` in the main schema (for example one left behind by
    an interrupted earlier probe), which would otherwise make the check
    report a false negative.

    Returns:
        ``True`` when the probe table could be created and dropped,
        ``False`` when SQLite raised ``sqlite3.OperationalError``.
    """
    probe = "temp.fts5_probe"
    try:
        # Clear any probe left over from an earlier failed attempt on this
        # same connection before creating a fresh one.
        connection.execute(f"DROP TABLE IF EXISTS {probe}")
        connection.execute(f"CREATE VIRTUAL TABLE {probe} USING fts5(content_text)")
        connection.execute(f"DROP TABLE {probe}")
    except sqlite3.OperationalError:
        return False
    return True
124
+
125
+
126
def initialize_schema(connection: sqlite3.Connection) -> None:
    """Create or upgrade every table the store uses, then commit.

    Steps, in order: verify FTS5 support, run the ``CREATE TABLE IF NOT
    EXISTS`` scripts, apply the ``documents`` and ``items`` migrations, and
    rebuild the ``items_fts`` table from ``items``.

    Raises:
        StoreCapabilityError: if the FTS5 probe fails.
    """
    if not supports_fts5(connection):
        raise StoreCapabilityError("SQLite FTS5 support is required.")

    table_scripts = (
        DOCUMENTS_SQL,
        ITEMS_SQL,
        ITEM_EMBEDDINGS_SQL,
        XLSX_ROW_EMBEDDINGS_SQL,
        XLSX_ROW_EMBEDDING_CELLS_SQL,
        EMBEDDING_META_SQL,
    )
    for script in table_scripts:
        connection.executescript(script)

    follow_up_steps = (
        _migrate_documents_table,
        _migrate_items_table,
        _rebuild_items_fts,
    )
    for step in follow_up_steps:
        step(connection)

    connection.commit()
140
+
141
+
142
def ensure_ready(index_path: Path) -> sqlite3.Connection:
    """Open the index at ``index_path`` and make sure its schema is in place.

    If schema initialisation raises, the connection is closed before the
    exception is re-raised, so no open handle is leaked to the caller.
    """
    db = connect(index_path)
    try:
        initialize_schema(db)
    except Exception:
        db.close()
        raise
    else:
        return db
150
+
151
+
152
def make_storage_id(document_id: str, item_id: str) -> str:
    """Build the ``items.storage_id`` key: ``"<document_id>:<item_id>"``."""
    return "{}:{}".format(document_id, item_id)
154
+
155
+
156
def make_xlsx_row_embedding_id(
    document_id: str, sheet_name: str, row_number: int
) -> str:
    """Build a row-embedding key: ``"<document_id>:xlsx-row:<sheet>!<row>"``."""
    return "{}:xlsx-row:{}!{}".format(document_id, sheet_name, row_number)
160
+
161
+
162
def upsert_document(connection: sqlite3.Connection, document: DocumentRef) -> None:
    """Insert ``document`` or refresh the existing row with the same id.

    On a ``document_id`` conflict every stored field is overwritten with the
    new values. In both cases the row ends up with ``is_active = 1``. No
    commit is issued here.
    """
    upsert_sql = """
        INSERT INTO documents (
            document_id,
            path,
            file_type,
            display_name,
            modified_time,
            content_hash,
            is_active
        )
        VALUES (?, ?, ?, ?, ?, ?, 1)
        ON CONFLICT(document_id) DO UPDATE SET
            path = excluded.path,
            file_type = excluded.file_type,
            display_name = excluded.display_name,
            modified_time = excluded.modified_time,
            content_hash = excluded.content_hash,
            is_active = 1
    """
    values = (
        document.document_id,
        str(document.path),
        document.file_type,
        document.display_name,
        document.modified_time,
        document.content_hash,
    )
    connection.execute(upsert_sql, values)
192
+
193
+
194
def replace_document_items(
    connection: sqlite3.Connection,
    document_id: str,
    items: Sequence[IndexedItem],
) -> None:
    """Replace everything stored for ``document_id`` with ``items``.

    Existing rows for the document are removed from both ``items`` and the
    ``items_fts`` full-text table; each new item is then written to both
    tables under the same ``storage_id``. Item metadata is serialised as
    JSON with sorted keys. No commit is issued here.
    """
    insert_item_sql = """
        INSERT INTO items (
            storage_id,
            document_id,
            item_id,
            item_type,
            locator,
            preview,
            content_text,
            metadata_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """
    insert_fts_sql = """
        INSERT INTO items_fts (
            storage_id,
            item_id,
            document_id,
            content_text
        )
        VALUES (?, ?, ?, ?)
    """
    owner = (document_id,)

    connection.execute("DELETE FROM items WHERE document_id = ?", owner)
    connection.execute("DELETE FROM items_fts WHERE document_id = ?", owner)

    for entry in items:
        key = make_storage_id(document_id, entry.item_id)
        item_row = (
            key,
            document_id,
            entry.item_id,
            entry.item_type,
            entry.locator,
            entry.preview,
            entry.content_text,
            json.dumps(entry.metadata, sort_keys=True),
        )
        fts_row = (key, entry.item_id, document_id, entry.content_text)
        connection.execute(insert_item_sql, item_row)
        connection.execute(insert_fts_sql, fts_row)
241
+
242
+
243
def fetch_document_by_path(
    connection: sqlite3.Connection, document_path: Path
) -> sqlite3.Row | None:
    """Look up an active document by path.

    ``document_path`` is resolved before it is compared with the stored
    ``path`` column. The returned row carries every ``documents`` column plus
    ``item_count``; ``None`` is returned when no active document matches.
    """
    lookup_sql = """
        SELECT d.*, COUNT(i.storage_id) AS item_count
        FROM documents AS d
        LEFT JOIN items AS i ON i.document_id = d.document_id
        WHERE d.path = ? AND d.is_active = 1
        GROUP BY d.document_id
    """
    resolved = str(document_path.resolve())
    cursor = connection.execute(lookup_sql, (resolved,))
    return cursor.fetchone()
256
+
257
+
258
def fetch_document_by_id(
    connection: sqlite3.Connection, document_id: str
) -> sqlite3.Row | None:
    """Look up an active document by its id.

    The returned row carries every ``documents`` column plus ``item_count``;
    ``None`` is returned when the id is unknown or the document is inactive.
    """
    lookup_sql = """
        SELECT d.*, COUNT(i.storage_id) AS item_count
        FROM documents AS d
        LEFT JOIN items AS i ON i.document_id = d.document_id
        WHERE d.document_id = ? AND d.is_active = 1
        GROUP BY d.document_id
    """
    cursor = connection.execute(lookup_sql, (document_id,))
    return cursor.fetchone()
271
+
272
+
273
def fetch_documents(connection: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return every active document, ordered by path.

    Each row carries all ``documents`` columns plus ``item_count``.
    """
    listing_sql = """
        SELECT d.*, COUNT(i.storage_id) AS item_count
        FROM documents AS d
        LEFT JOIN items AS i ON i.document_id = d.document_id
        WHERE d.is_active = 1
        GROUP BY d.document_id
        ORDER BY d.path
    """
    return connection.execute(listing_sql).fetchall()
286
+
287
+
288
def fetch_item_by_id(
    connection: sqlite3.Connection,
    document_id: str,
    item_id: str,
) -> sqlite3.Row | None:
    """Return the ``items`` row for ``(document_id, item_id)``, or ``None``."""
    lookup_sql = """
        SELECT *
        FROM items
        WHERE document_id = ? AND item_id = ?
    """
    cursor = connection.execute(lookup_sql, (document_id, item_id))
    return cursor.fetchone()
301
+
302
+
303
def fetch_items_for_document(
    connection: sqlite3.Connection,
    document_id: str,
) -> list[sqlite3.Row]:
    """Return all ``items`` rows of one document, ordered by ``item_id``."""
    listing_sql = """
        SELECT *
        FROM items
        WHERE document_id = ?
        ORDER BY item_id
    """
    return connection.execute(listing_sql, (document_id,)).fetchall()
318
+
319
+
320
def search_items(
    connection: sqlite3.Connection,
    query: str,
    *,
    file_type: str | None = None,
    document_path: Path | None = None,
    limit: int = 20,
) -> list[sqlite3.Row]:
    """Run a full-text search over item text in active documents.

    Args:
        connection: Open index connection.
        query: Expression passed as-is to the FTS5 ``MATCH`` operator.
        file_type: When given, only documents of this type are searched.
        document_path: When given, only the document at this (resolved)
            path is searched.
        limit: Maximum number of rows returned.

    Returns:
        Item rows joined with the owning document's ``path`` and
        ``display_name`` plus the ``bm25`` ``score``, ordered by score, then
        path, then item id.
    """
    fragments = [
        """
        SELECT
            i.storage_id,
            i.document_id,
            i.item_id,
            i.item_type,
            i.locator,
            i.preview,
            i.content_text,
            i.metadata_json,
            d.path,
            d.display_name,
            bm25(items_fts) AS score
        FROM items_fts
        JOIN items AS i ON i.storage_id = items_fts.storage_id
        JOIN documents AS d ON d.document_id = i.document_id
        WHERE items_fts MATCH ?
            AND d.is_active = 1
        """
    ]
    bindings: list[object] = [query]

    if file_type is not None:
        fragments.append(" AND d.file_type = ?")
        bindings.append(file_type)

    if document_path is not None:
        fragments.append(" AND d.path = ?")
        bindings.append(str(document_path.resolve()))

    fragments.append(" ORDER BY score, d.path, i.item_id LIMIT ?")
    bindings.append(limit)

    return connection.execute("".join(fragments), bindings).fetchall()
361
+
362
+
363
def fetch_item_embeddings(
    connection: sqlite3.Connection,
    *,
    file_type: str | None = None,
    document_path: Path | None = None,
) -> list[sqlite3.Row]:
    """Return stored item embeddings for active documents.

    Each row combines the embedding record with its item and the owning
    document's ``path`` and ``display_name``. ``file_type`` and
    ``document_path`` (resolved before comparison) narrow the result when
    given. Rows are ordered by document path, then item id.
    """
    fragments = [
        """
        SELECT
            e.storage_id,
            e.model_name,
            e.dimensions,
            e.embedding,
            e.updated_at,
            i.document_id,
            i.item_id,
            i.item_type,
            i.locator,
            i.preview,
            i.content_text,
            i.metadata_json,
            d.path,
            d.display_name
        FROM item_embeddings AS e
        JOIN items AS i ON i.storage_id = e.storage_id
        JOIN documents AS d ON d.document_id = i.document_id
        WHERE d.is_active = 1
        """
    ]
    bindings: list[object] = []

    if file_type is not None:
        fragments.append(" AND d.file_type = ?")
        bindings.append(file_type)

    if document_path is not None:
        fragments.append(" AND d.path = ?")
        bindings.append(str(document_path.resolve()))

    fragments.append(" ORDER BY d.path, i.item_id")
    return connection.execute("".join(fragments), bindings).fetchall()
402
+
403
+
404
def fetch_xlsx_row_embeddings(
    connection: sqlite3.Connection,
    *,
    file_type: str | None = None,
    document_path: Path | None = None,
) -> list[sqlite3.Row]:
    """Return stored spreadsheet row embeddings for active xlsx documents.

    Any ``file_type`` other than ``None`` or ``"xlsx"`` yields an empty list
    without querying. Each row joins the embedding with its representative
    item and owning document; the row-level preview is exposed as
    ``row_preview`` and the item's text as ``item_content_text``. Rows are
    ordered by document path, sheet name and row number.
    """
    if file_type is not None and file_type != "xlsx":
        return []

    fragments = [
        """
        SELECT
            e.embedding_id,
            e.document_id,
            e.sheet_name,
            e.row_number,
            e.representative_storage_id,
            e.content_text,
            e.preview AS row_preview,
            e.model_name,
            e.dimensions,
            e.embedding,
            e.updated_at,
            i.item_id,
            i.item_type,
            i.locator,
            i.preview,
            i.content_text AS item_content_text,
            i.metadata_json,
            d.path,
            d.display_name
        FROM xlsx_row_embeddings AS e
        JOIN documents AS d ON d.document_id = e.document_id
        JOIN items AS i ON i.storage_id = e.representative_storage_id
        WHERE d.is_active = 1
            AND d.file_type = 'xlsx'
        """
    ]
    bindings: list[object] = []

    if document_path is not None:
        fragments.append(" AND d.path = ?")
        bindings.append(str(document_path.resolve()))

    fragments.append(" ORDER BY d.path, e.sheet_name, e.row_number")
    return connection.execute("".join(fragments), bindings).fetchall()
448
+
449
+
450
def fetch_xlsx_row_embedding_cells(
    connection: sqlite3.Connection,
    embedding_id: str,
) -> list[sqlite3.Row]:
    """Return the cells linked to one row embedding.

    Rows are ordered by ``cell_order`` and then ``cell_coordinate``.
    """
    listing_sql = """
        SELECT
            embedding_id,
            storage_id,
            cell_coordinate,
            cell_order,
            is_representative
        FROM xlsx_row_embedding_cells
        WHERE embedding_id = ?
        ORDER BY cell_order, cell_coordinate
    """
    return connection.execute(listing_sql, (embedding_id,)).fetchall()
470
+
471
+
472
def has_item_embeddings(
    connection: sqlite3.Connection,
    *,
    file_type: str | None = None,
    document_path: Path | None = None,
) -> bool:
    """Report whether any embedding exists for active documents.

    ``"xlsx"`` is answered from ``xlsx_row_embeddings``; every other
    explicit ``file_type`` is answered from ``item_embeddings``. With
    ``file_type=None`` the check is repeated for ``"docx"``, ``"pptx"`` and
    ``"xlsx"`` in that order and stops at the first hit. ``document_path``
    (resolved before comparison) restricts the check to one document.
    """
    if file_type is None:
        return any(
            has_item_embeddings(
                connection, file_type=kind, document_path=document_path
            )
            for kind in ("docx", "pptx", "xlsx")
        )

    bindings: list[object] = []
    if file_type == "xlsx":
        sql = """
            SELECT 1
            FROM xlsx_row_embeddings AS e
            JOIN documents AS d ON d.document_id = e.document_id
            WHERE d.is_active = 1
                AND d.file_type = 'xlsx'
        """
    else:
        sql = """
            SELECT 1
            FROM item_embeddings AS e
            JOIN items AS i ON i.storage_id = e.storage_id
            JOIN documents AS d ON d.document_id = i.document_id
            WHERE d.is_active = 1
        """
        sql += " AND d.file_type = ?"
        bindings.append(file_type)

    if document_path is not None:
        sql += " AND d.path = ?"
        bindings.append(str(document_path.resolve()))

    sql += " LIMIT 1"
    return connection.execute(sql, bindings).fetchone() is not None
525
+
526
+
527
def delete_document_embeddings(
    connection: sqlite3.Connection, document_id: str
) -> None:
    """Remove every stored embedding that belongs to ``document_id``.

    Spreadsheet row embeddings (and their cell links) are deleted first,
    followed by the per-item embeddings whose ``storage_id`` starts with
    ``"<document_id>:"``. No commit is issued here.

    The prefix is compared with ``substr`` rather than ``LIKE``. ``LIKE``
    treats ``%`` and ``_`` inside ``document_id`` as wildcards and, by
    default, ignores ASCII case, so it could delete embeddings owned by a
    different document. Matching on the key prefix (instead of joining
    ``items``) keeps the delete working even after the document's item rows
    have already been removed.
    """
    delete_document_xlsx_row_embeddings(connection, document_id)
    prefix = f"{document_id}:"
    connection.execute(
        """
        DELETE FROM item_embeddings
        WHERE substr(storage_id, 1, length(?)) = ?
        """,
        (prefix, prefix),
    )
538
+
539
+
540
def delete_document_xlsx_row_embeddings(
    connection: sqlite3.Connection, document_id: str
) -> None:
    """Remove the spreadsheet row embeddings of one document.

    Cell links are deleted before the embeddings they refer to, because the
    first statement selects them through ``xlsx_row_embeddings``. No commit
    is issued here.
    """
    delete_cells_sql = """
        DELETE FROM xlsx_row_embedding_cells
        WHERE embedding_id IN (
            SELECT embedding_id
            FROM xlsx_row_embeddings
            WHERE document_id = ?
        )
    """
    delete_rows_sql = """
        DELETE FROM xlsx_row_embeddings
        WHERE document_id = ?
    """
    owner = (document_id,)
    connection.execute(delete_cells_sql, owner)
    connection.execute(delete_rows_sql, owner)
561
+
562
+
563
def replace_document_embeddings(
    connection: sqlite3.Connection,
    *,
    document_id: str,
    model_name: str,
    dimensions: int,
    embeddings: Sequence[tuple[str, bytes]],
) -> None:
    """Replace the stored item embeddings of one document.

    All existing embeddings for ``document_id`` are deleted, then one
    ``item_embeddings`` row is written per ``(storage_id, embedding)`` pair.
    Every new row shares the same UTC ISO-8601 ``updated_at`` timestamp. No
    commit is issued here.
    """
    delete_document_embeddings(connection, document_id)

    stamp = dt.datetime.now(dt.timezone.utc).isoformat()
    insert_sql = """
        INSERT INTO item_embeddings (
            storage_id,
            model_name,
            dimensions,
            embedding,
            updated_at
        )
        VALUES (?, ?, ?, ?, ?)
    """
    for storage_id, vector in embeddings:
        record = (storage_id, model_name, dimensions, vector, stamp)
        connection.execute(insert_sql, record)
587
+
588
+
589
def replace_xlsx_row_embeddings(
    connection: sqlite3.Connection,
    *,
    document_id: str,
    model_name: str,
    dimensions: int,
    row_embeddings: Sequence[
        tuple[
            str,
            str,
            int,
            str,
            str,
            str,
            bytes,
            Sequence[tuple[str, str, int, bool]],
        ]
    ],
) -> None:
    """Replace the stored spreadsheet row embeddings of one document.

    Existing row embeddings and cell links for ``document_id`` are deleted
    first. Each entry of ``row_embeddings`` is an 8-tuple of
    ``(embedding_id, sheet_name, row_number, representative_storage_id,
    content_text, preview, embedding, contributing_cells)``, where every
    contributing cell is ``(storage_id, cell_coordinate, cell_order,
    is_representative)``. All new rows share one UTC ISO-8601 ``updated_at``
    timestamp. No commit is issued here.
    """
    delete_document_xlsx_row_embeddings(connection, document_id)

    stamp = dt.datetime.now(dt.timezone.utc).isoformat()
    insert_row_sql = """
        INSERT INTO xlsx_row_embeddings (
            embedding_id,
            document_id,
            sheet_name,
            row_number,
            representative_storage_id,
            content_text,
            preview,
            model_name,
            dimensions,
            embedding,
            updated_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """
    insert_cell_sql = """
        INSERT INTO xlsx_row_embedding_cells (
            embedding_id,
            storage_id,
            cell_coordinate,
            cell_order,
            is_representative
        )
        VALUES (?, ?, ?, ?, ?)
    """

    for record in row_embeddings:
        (
            embedding_id,
            sheet_name,
            row_number,
            representative_storage_id,
            content_text,
            preview,
            embedding,
            contributing_cells,
        ) = record
        row_values = (
            embedding_id,
            document_id,
            sheet_name,
            row_number,
            representative_storage_id,
            content_text,
            preview,
            model_name,
            dimensions,
            embedding,
            stamp,
        )
        connection.execute(insert_row_sql, row_values)

        for storage_id, cell_coordinate, cell_order, is_representative in contributing_cells:
            # The flag is stored as an integer 0/1.
            flag = int(bool(is_representative))
            connection.execute(
                insert_cell_sql,
                (embedding_id, storage_id, cell_coordinate, cell_order, flag),
            )
676
+
677
+
678
def fetch_embedding_meta(connection: sqlite3.Connection) -> dict[str, str]:
    """Return the contents of ``embedding_meta`` as a ``{key: value}`` dict.

    Rows are read by column name, so the connection must produce rows that
    support name-based access (such as ``sqlite3.Row``).
    """
    select_sql = """
        SELECT key, value
        FROM embedding_meta
    """
    meta: dict[str, str] = {}
    for record in connection.execute(select_sql).fetchall():
        meta[record["key"]] = record["value"]
    return meta
688
+
689
+
690
def ensure_embedding_meta(
    connection: sqlite3.Connection,
    *,
    model_name: str,
    dimensions: int,
) -> None:
    """Record the embedding configuration, or verify it against what is stored.

    When ``embedding_meta`` is empty the expected values are inserted. When
    it already has rows, they must contain exactly the keys in
    ``EMBEDDING_META_KEYS`` and every value must equal the expected one.

    Raises:
        RuntimeError: if the stored key set differs from the expected set,
            or if any stored value differs from the expected value.
    """
    expected = {
        "model_name": model_name,
        "dimensions": str(dimensions),
        "similarity_metric": SIMILARITY_METRIC,
        "schema_version": EMBEDDING_SCHEMA_VERSION,
    }
    stored = fetch_embedding_meta(connection)

    if not stored:
        insert_sql = """
            INSERT INTO embedding_meta (key, value)
            VALUES (?, ?)
        """
        for key, value in expected.items():
            connection.execute(insert_sql, (key, value))
        return

    if EMBEDDING_META_KEYS != set(stored):
        raise RuntimeError("Stored embedding metadata is incomplete or unsupported.")

    for key, expected_value in expected.items():
        actual_value = stored.get(key)
        if actual_value == expected_value:
            continue
        raise RuntimeError(
            f"Embedding metadata mismatch for {key}: expected {expected_value}, found {actual_value}."
        )
724
+
725
+
726
def _migrate_documents_table(connection: sqlite3.Connection) -> None:
    """Add the ``is_active`` column to ``documents`` when it is missing."""
    if "is_active" in _table_columns(connection, "documents"):
        return
    connection.execute(
        "ALTER TABLE documents ADD COLUMN is_active INTEGER NOT NULL DEFAULT 1"
    )
732
+
733
+
734
def _migrate_items_table(connection: sqlite3.Connection) -> None:
    """Rebuild ``items`` in the current layout when required columns are missing.

    If the existing table already has every required column, or has no
    columns at all, nothing happens. Otherwise the table is renamed to
    ``items_legacy``, a fresh ``items`` table is created from ``ITEMS_SQL``,
    the legacy rows are copied over with ``storage_id`` synthesised as
    ``document_id || ':' || item_id``, and the legacy table is dropped.

    The two columns that carry a schema default (``content_text`` and
    ``metadata_json``) may be absent from the legacy table; their defaults
    are substituted. Previously only a missing ``metadata_json`` was
    tolerated, so a legacy table without ``content_text`` made the copy fail
    after the rename and left the data stranded in ``items_legacy``.
    """
    required_columns = {
        "storage_id",
        "document_id",
        "item_id",
        "item_type",
        "locator",
        "preview",
        "content_text",
        "metadata_json",
    }
    item_columns = _table_columns(connection, "items")
    if not item_columns or required_columns.issubset(item_columns):
        return

    connection.execute("ALTER TABLE items RENAME TO items_legacy")
    connection.executescript(ITEMS_SQL)
    legacy_columns = _table_columns(connection, "items_legacy")

    # These expressions are fixed SQL fragments chosen here, never caller
    # input, so interpolating them into the statement is safe.
    content_expr = "content_text" if "content_text" in legacy_columns else "''"
    metadata_expr = "metadata_json" if "metadata_json" in legacy_columns else "'{}'"
    connection.execute(
        f"""
        INSERT INTO items (
            storage_id,
            document_id,
            item_id,
            item_type,
            locator,
            preview,
            content_text,
            metadata_json
        )
        SELECT
            document_id || ':' || item_id,
            document_id,
            item_id,
            item_type,
            locator,
            preview,
            {content_expr},
            {metadata_expr}
        FROM items_legacy
        """
    )
    connection.execute("DROP TABLE items_legacy")
779
+
780
+
781
def _rebuild_items_fts(connection: sqlite3.Connection) -> None:
    """Drop ``items_fts``, recreate it, and refill it from ``items``."""
    refill_sql = """
        INSERT INTO items_fts (storage_id, item_id, document_id, content_text)
        SELECT storage_id, item_id, document_id, content_text
        FROM items
    """
    connection.execute("DROP TABLE IF EXISTS items_fts")
    connection.executescript(ITEMS_FTS_SQL)
    connection.execute(refill_sql)
791
+
792
+
793
def _table_columns(connection: sqlite3.Connection, table_name: str) -> set[str]:
    """Return the column names of ``table_name`` (empty if it does not exist).

    ``table_name`` is interpolated directly into the ``PRAGMA`` statement,
    so it must be a trusted identifier.
    """
    cursor = connection.execute(f"PRAGMA table_info({table_name})")
    names: set[str] = set()
    for column in cursor.fetchall():
        # Index 1 of each table_info row is the column name.
        names.add(column[1])
    return names