offagent 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- offagent/__init__.py +3 -0
- offagent/__main__.py +5 -0
- offagent/adapters/__init__.py +1 -0
- offagent/adapters/docx_adapter.py +1237 -0
- offagent/adapters/embedding_provider.py +132 -0
- offagent/adapters/pptx_adapter.py +940 -0
- offagent/adapters/xlsx_adapter.py +1266 -0
- offagent/app/__init__.py +1 -0
- offagent/app/progress.py +52 -0
- offagent/app/services.py +4267 -0
- offagent/config.py +287 -0
- offagent/domain/__init__.py +1 -0
- offagent/domain/locators.py +444 -0
- offagent/domain/models.py +477 -0
- offagent/domain/text_fragments.py +136 -0
- offagent/errors.py +29 -0
- offagent/indexing/__init__.py +1 -0
- offagent/indexing/store.py +795 -0
- offagent/interfaces/__init__.py +1 -0
- offagent/interfaces/cli.py +438 -0
- offagent/interfaces/cli_output.py +139 -0
- offagent/interfaces/cli_progress.py +120 -0
- offagent/interfaces/mcp.py +1145 -0
- offagent/interfaces/mcp_converters.py +80 -0
- offagent/interfaces/mcp_models.py +923 -0
- offagent/objects/__init__.py +3 -0
- offagent/objects/base.py +26 -0
- offagent/objects/docx_objects.py +951 -0
- offagent/objects/pptx_objects.py +895 -0
- offagent/objects/xlsx_objects.py +962 -0
- offagent/path_policy.py +42 -0
- offagent/storage/__init__.py +1 -0
- offagent/storage/versioning.py +31 -0
- offagent-0.10.0.dist-info/METADATA +546 -0
- offagent-0.10.0.dist-info/RECORD +39 -0
- offagent-0.10.0.dist-info/WHEEL +5 -0
- offagent-0.10.0.dist-info/entry_points.txt +2 -0
- offagent-0.10.0.dist-info/licenses/LICENSE +21 -0
- offagent-0.10.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,795 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import datetime as dt
|
|
4
|
+
import json
|
|
5
|
+
import sqlite3
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Sequence
|
|
8
|
+
|
|
9
|
+
from offagent.domain.models import DocumentRef, IndexedItem
|
|
10
|
+
|
|
11
|
+
# One row per indexed document. ``is_active`` defaults to 1; the read queries
# in this module only return documents whose ``is_active`` is 1.
DOCUMENTS_SQL = """
CREATE TABLE IF NOT EXISTS documents (
    document_id TEXT PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    file_type TEXT NOT NULL,
    display_name TEXT NOT NULL,
    modified_time REAL NOT NULL,
    content_hash TEXT,
    is_active INTEGER NOT NULL DEFAULT 1
);
"""

# One row per indexed item of a document. ``storage_id`` is the value produced
# by ``make_storage_id`` ("<document_id>:<item_id>").
ITEMS_SQL = """
CREATE TABLE IF NOT EXISTS items (
    storage_id TEXT PRIMARY KEY,
    document_id TEXT NOT NULL,
    item_id TEXT NOT NULL,
    item_type TEXT NOT NULL,
    locator TEXT NOT NULL,
    preview TEXT NOT NULL,
    content_text TEXT NOT NULL DEFAULT '',
    metadata_json TEXT NOT NULL DEFAULT '{}',
    UNIQUE(document_id, item_id),
    FOREIGN KEY (document_id) REFERENCES documents(document_id)
);
"""

# FTS5 table over ``items.content_text``; the id columns are stored UNINDEXED.
# There is no IF NOT EXISTS here: ``_rebuild_items_fts`` drops the table before
# running this statement.
ITEMS_FTS_SQL = """
CREATE VIRTUAL TABLE items_fts USING fts5(
    storage_id UNINDEXED,
    item_id UNINDEXED,
    document_id UNINDEXED,
    content_text
);
"""

# At most one embedding blob per item, keyed by the item's ``storage_id``.
ITEM_EMBEDDINGS_SQL = """
CREATE TABLE IF NOT EXISTS item_embeddings (
    storage_id TEXT PRIMARY KEY REFERENCES items(storage_id),
    model_name TEXT NOT NULL,
    dimensions INTEGER NOT NULL,
    embedding BLOB NOT NULL,
    updated_at TEXT NOT NULL
);
"""

# One embedding per (document, sheet, row). ``representative_storage_id``
# points at the ``items`` row that is joined in when the embedding is fetched.
XLSX_ROW_EMBEDDINGS_SQL = """
CREATE TABLE IF NOT EXISTS xlsx_row_embeddings (
    embedding_id TEXT PRIMARY KEY,
    document_id TEXT NOT NULL,
    sheet_name TEXT NOT NULL,
    row_number INTEGER NOT NULL,
    representative_storage_id TEXT NOT NULL REFERENCES items(storage_id),
    content_text TEXT NOT NULL,
    preview TEXT NOT NULL,
    model_name TEXT NOT NULL,
    dimensions INTEGER NOT NULL,
    embedding BLOB NOT NULL,
    updated_at TEXT NOT NULL,
    UNIQUE(document_id, sheet_name, row_number),
    FOREIGN KEY (document_id) REFERENCES documents(document_id)
);
"""

# Links a row embedding to the item rows (cells) recorded for it, with their
# coordinate, ordering and a 0/1 representative flag.
XLSX_ROW_EMBEDDING_CELLS_SQL = """
CREATE TABLE IF NOT EXISTS xlsx_row_embedding_cells (
    embedding_id TEXT NOT NULL REFERENCES xlsx_row_embeddings(embedding_id),
    storage_id TEXT NOT NULL REFERENCES items(storage_id),
    cell_coordinate TEXT NOT NULL,
    cell_order INTEGER NOT NULL,
    is_representative INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (embedding_id, storage_id)
);
"""

# Key/value table holding the embedding configuration of this index.
EMBEDDING_META_SQL = """
CREATE TABLE IF NOT EXISTS embedding_meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);
"""

# Exact set of keys ``ensure_embedding_meta`` requires in ``embedding_meta``
# once the table is non-empty.
EMBEDDING_META_KEYS = {
    "model_name",
    "dimensions",
    "similarity_metric",
    "schema_version",
}
# Values stored under "schema_version" and "similarity_metric" by
# ``ensure_embedding_meta`` and compared against on later calls.
EMBEDDING_SCHEMA_VERSION = "1"
SIMILARITY_METRIC = "cosine"
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class StoreCapabilityError(RuntimeError):
    """Signal that the SQLite runtime lacks a feature the store depends on.

    ``initialize_schema`` raises this when FTS5 cannot be used.
    """
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def connect(index_path: Path) -> sqlite3.Connection:
    """Open the SQLite database at *index_path*.

    Missing parent directories are created first. The returned connection
    uses ``sqlite3.Row`` as its row factory, so columns can be read by name.
    """
    parent_dir = index_path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)

    db = sqlite3.connect(index_path)
    db.row_factory = sqlite3.Row
    return db
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def supports_fts5(connection: sqlite3.Connection) -> bool:
    """Report whether this SQLite runtime can create FTS5 virtual tables.

    The check creates and immediately drops a throwaway FTS5 table. The
    table lives in the ``temp`` schema so that:

    * a table named ``fts5_probe`` already present in the main database
      (for example left behind by an interrupted earlier probe) cannot make
      the ``CREATE`` fail and be misreported as "FTS5 unavailable";
    * the probe never writes to the index database file itself.

    Returns ``True`` when both statements succeed and ``False`` when SQLite
    raises ``sqlite3.OperationalError``.
    """
    probe_table = "temp.fts5_probe"
    try:
        connection.execute(
            f"CREATE VIRTUAL TABLE {probe_table} USING fts5(content_text)"
        )
        connection.execute(f"DROP TABLE {probe_table}")
    except sqlite3.OperationalError:
        return False
    return True
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def initialize_schema(connection: sqlite3.Connection) -> None:
    """Create or upgrade every table the store uses, then commit.

    Raises:
        StoreCapabilityError: if ``supports_fts5`` reports no FTS5 support.
    """
    fts5_available = supports_fts5(connection)
    if not fts5_available:
        raise StoreCapabilityError("SQLite FTS5 support is required.")

    # Plain tables first, in the same order as the original statements.
    table_scripts = (
        DOCUMENTS_SQL,
        ITEMS_SQL,
        ITEM_EMBEDDINGS_SQL,
        XLSX_ROW_EMBEDDINGS_SQL,
        XLSX_ROW_EMBEDDING_CELLS_SQL,
        EMBEDDING_META_SQL,
    )
    for script in table_scripts:
        connection.executescript(script)

    # Migrations run before the FTS table is rebuilt from ``items``.
    follow_up_steps = (
        _migrate_documents_table,
        _migrate_items_table,
        _rebuild_items_fts,
    )
    for step in follow_up_steps:
        step(connection)

    connection.commit()
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def ensure_ready(index_path: Path) -> sqlite3.Connection:
    """Open the index at *index_path* and make sure its schema is in place.

    If schema initialization raises, the connection is closed and the
    exception is re-raised unchanged.
    """
    db = connect(index_path)
    try:
        initialize_schema(db)
    except Exception:
        # Do not leak the handle when setup fails.
        db.close()
        raise
    else:
        return db
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def make_storage_id(document_id: str, item_id: str) -> str:
    """Build the ``items.storage_id`` value: ``"<document_id>:<item_id>"``."""
    return "{}:{}".format(document_id, item_id)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def make_xlsx_row_embedding_id(
    document_id: str, sheet_name: str, row_number: int
) -> str:
    """Build the id of a row embedding.

    The result has the form ``"<document_id>:xlsx-row:<sheet_name>!<row_number>"``.
    """
    return "{}:xlsx-row:{}!{}".format(document_id, sheet_name, row_number)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def upsert_document(connection: sqlite3.Connection, document: DocumentRef) -> None:
    """Insert *document* into ``documents`` or refresh the existing row.

    A conflict on ``document_id`` overwrites every other column with the new
    values. In both cases the row ends up with ``is_active = 1``. The caller
    is responsible for committing.
    """
    statement = """
        INSERT INTO documents (
            document_id,
            path,
            file_type,
            display_name,
            modified_time,
            content_hash,
            is_active
        )
        VALUES (?, ?, ?, ?, ?, ?, 1)
        ON CONFLICT(document_id) DO UPDATE SET
            path = excluded.path,
            file_type = excluded.file_type,
            display_name = excluded.display_name,
            modified_time = excluded.modified_time,
            content_hash = excluded.content_hash,
            is_active = 1
        """
    # The path is stored as text; everything else is passed through as is.
    bindings = (
        document.document_id,
        str(document.path),
        document.file_type,
        document.display_name,
        document.modified_time,
        document.content_hash,
    )
    connection.execute(statement, bindings)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def replace_document_items(
    connection: sqlite3.Connection,
    document_id: str,
    items: Sequence[IndexedItem],
) -> None:
    """Replace every stored item of *document_id* with *items*.

    Existing rows for the document are removed from both ``items`` and
    ``items_fts``. Each new item is then written to ``items`` and mirrored
    into ``items_fts``. Item metadata is stored as JSON with sorted keys.
    The caller is responsible for committing.
    """
    insert_item_sql = """
        INSERT INTO items (
            storage_id,
            document_id,
            item_id,
            item_type,
            locator,
            preview,
            content_text,
            metadata_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """
    insert_fts_sql = """
        INSERT INTO items_fts (
            storage_id,
            item_id,
            document_id,
            content_text
        )
        VALUES (?, ?, ?, ?)
        """

    owner = (document_id,)
    connection.execute("DELETE FROM items WHERE document_id = ?", owner)
    connection.execute("DELETE FROM items_fts WHERE document_id = ?", owner)

    for entry in items:
        storage_id = make_storage_id(document_id, entry.item_id)
        item_row = (
            storage_id,
            document_id,
            entry.item_id,
            entry.item_type,
            entry.locator,
            entry.preview,
            entry.content_text,
            json.dumps(entry.metadata, sort_keys=True),
        )
        connection.execute(insert_item_sql, item_row)

        fts_row = (storage_id, entry.item_id, document_id, entry.content_text)
        connection.execute(insert_fts_sql, fts_row)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def fetch_document_by_path(
    connection: sqlite3.Connection, document_path: Path
) -> sqlite3.Row | None:
    """Return the active document stored under *document_path*, if any.

    The path is resolved (``Path.resolve``) before it is compared with
    ``documents.path``. The row carries all ``documents`` columns plus
    ``item_count``, the number of ``items`` rows for that document.
    ``None`` is returned when no active document matches.
    """
    resolved_path = str(document_path.resolve())
    query = """
        SELECT d.*, COUNT(i.storage_id) AS item_count
        FROM documents AS d
        LEFT JOIN items AS i ON i.document_id = d.document_id
        WHERE d.path = ? AND d.is_active = 1
        GROUP BY d.document_id
        """
    cursor = connection.execute(query, (resolved_path,))
    return cursor.fetchone()
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def fetch_document_by_id(
    connection: sqlite3.Connection, document_id: str
) -> sqlite3.Row | None:
    """Return the active document with id *document_id*, if any.

    The row carries all ``documents`` columns plus ``item_count``, the
    number of ``items`` rows for that document. ``None`` is returned when
    the id is unknown or the document is not active.
    """
    query = """
        SELECT d.*, COUNT(i.storage_id) AS item_count
        FROM documents AS d
        LEFT JOIN items AS i ON i.document_id = d.document_id
        WHERE d.document_id = ? AND d.is_active = 1
        GROUP BY d.document_id
        """
    cursor = connection.execute(query, (document_id,))
    return cursor.fetchone()
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def fetch_documents(connection: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return every active document, ordered by path.

    Each row carries all ``documents`` columns plus ``item_count``, the
    number of ``items`` rows for that document.
    """
    query = """
        SELECT d.*, COUNT(i.storage_id) AS item_count
        FROM documents AS d
        LEFT JOIN items AS i ON i.document_id = d.document_id
        WHERE d.is_active = 1
        GROUP BY d.document_id
        ORDER BY d.path
        """
    cursor = connection.execute(query)
    return cursor.fetchall()
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def fetch_item_by_id(
    connection: sqlite3.Connection,
    document_id: str,
    item_id: str,
) -> sqlite3.Row | None:
    """Return the ``items`` row for (*document_id*, *item_id*), or ``None``."""
    query = """
        SELECT *
        FROM items
        WHERE document_id = ? AND item_id = ?
        """
    lookup_key = (document_id, item_id)
    return connection.execute(query, lookup_key).fetchone()
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def fetch_items_for_document(
    connection: sqlite3.Connection,
    document_id: str,
) -> list[sqlite3.Row]:
    """Return all ``items`` rows of *document_id*, ordered by ``item_id``."""
    query = """
        SELECT *
        FROM items
        WHERE document_id = ?
        ORDER BY item_id
        """
    cursor = connection.execute(query, (document_id,))
    return cursor.fetchall()
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def search_items(
    connection: sqlite3.Connection,
    query: str,
    *,
    file_type: str | None = None,
    document_path: Path | None = None,
    limit: int = 20,
) -> list[sqlite3.Row]:
    """Run a full-text search over ``items_fts`` for active documents.

    *query* is passed unchanged as the FTS5 ``MATCH`` expression. Results
    can be narrowed to one *file_type* and/or to one *document_path* (which
    is resolved before comparison). Rows are ordered by the ``bm25`` score,
    then document path, then item id, and capped at *limit*.
    """
    fragments = [
        """
        SELECT
            i.storage_id,
            i.document_id,
            i.item_id,
            i.item_type,
            i.locator,
            i.preview,
            i.content_text,
            i.metadata_json,
            d.path,
            d.display_name,
            bm25(items_fts) AS score
        FROM items_fts
        JOIN items AS i ON i.storage_id = items_fts.storage_id
        JOIN documents AS d ON d.document_id = i.document_id
        WHERE items_fts MATCH ?
            AND d.is_active = 1
        """
    ]
    bindings: list[object] = [query]

    # Optional filters; each adds one clause and one bound value.
    if file_type is not None:
        fragments.append(" AND d.file_type = ?")
        bindings.append(file_type)
    if document_path is not None:
        fragments.append(" AND d.path = ?")
        bindings.append(str(document_path.resolve()))

    fragments.append(" ORDER BY score, d.path, i.item_id LIMIT ?")
    bindings.append(limit)

    cursor = connection.execute("".join(fragments), bindings)
    return cursor.fetchall()
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def fetch_item_embeddings(
    connection: sqlite3.Connection,
    *,
    file_type: str | None = None,
    document_path: Path | None = None,
) -> list[sqlite3.Row]:
    """Return stored item embeddings joined with their item and document.

    Only active documents are considered. Results can be narrowed to one
    *file_type* and/or one *document_path* (resolved before comparison) and
    are ordered by document path, then item id.
    """
    fragments = [
        """
        SELECT
            e.storage_id,
            e.model_name,
            e.dimensions,
            e.embedding,
            e.updated_at,
            i.document_id,
            i.item_id,
            i.item_type,
            i.locator,
            i.preview,
            i.content_text,
            i.metadata_json,
            d.path,
            d.display_name
        FROM item_embeddings AS e
        JOIN items AS i ON i.storage_id = e.storage_id
        JOIN documents AS d ON d.document_id = i.document_id
        WHERE d.is_active = 1
        """
    ]
    bindings: list[object] = []

    if file_type is not None:
        fragments.append(" AND d.file_type = ?")
        bindings.append(file_type)
    if document_path is not None:
        fragments.append(" AND d.path = ?")
        bindings.append(str(document_path.resolve()))

    fragments.append(" ORDER BY d.path, i.item_id")
    cursor = connection.execute("".join(fragments), bindings)
    return cursor.fetchall()
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def fetch_xlsx_row_embeddings(
    connection: sqlite3.Connection,
    *,
    file_type: str | None = None,
    document_path: Path | None = None,
) -> list[sqlite3.Row]:
    """Return row-level embeddings of active xlsx documents.

    Any *file_type* other than ``None`` or ``"xlsx"`` yields an empty list
    without querying. Each row combines the embedding with its document and
    its representative item. The embedding's own preview is exposed as
    ``row_preview`` and the item's text as ``item_content_text``. Results
    are ordered by document path, sheet name and row number.
    """
    if file_type is not None and file_type != "xlsx":
        return []

    fragments = [
        """
        SELECT
            e.embedding_id,
            e.document_id,
            e.sheet_name,
            e.row_number,
            e.representative_storage_id,
            e.content_text,
            e.preview AS row_preview,
            e.model_name,
            e.dimensions,
            e.embedding,
            e.updated_at,
            i.item_id,
            i.item_type,
            i.locator,
            i.preview,
            i.content_text AS item_content_text,
            i.metadata_json,
            d.path,
            d.display_name
        FROM xlsx_row_embeddings AS e
        JOIN documents AS d ON d.document_id = e.document_id
        JOIN items AS i ON i.storage_id = e.representative_storage_id
        WHERE d.is_active = 1
            AND d.file_type = 'xlsx'
        """
    ]
    bindings: list[object] = []

    if document_path is not None:
        fragments.append(" AND d.path = ?")
        bindings.append(str(document_path.resolve()))

    fragments.append(" ORDER BY d.path, e.sheet_name, e.row_number")
    cursor = connection.execute("".join(fragments), bindings)
    return cursor.fetchall()
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def fetch_xlsx_row_embedding_cells(
    connection: sqlite3.Connection,
    embedding_id: str,
) -> list[sqlite3.Row]:
    """Return the cell links recorded for *embedding_id*.

    Rows are ordered by ``cell_order`` and then ``cell_coordinate``.
    """
    query = """
        SELECT
            embedding_id,
            storage_id,
            cell_coordinate,
            cell_order,
            is_representative
        FROM xlsx_row_embedding_cells
        WHERE embedding_id = ?
        ORDER BY cell_order, cell_coordinate
        """
    cursor = connection.execute(query, (embedding_id,))
    return cursor.fetchall()
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def has_item_embeddings(
    connection: sqlite3.Connection,
    *,
    file_type: str | None = None,
    document_path: Path | None = None,
) -> bool:
    """Tell whether any embedding exists for active documents.

    * ``file_type="xlsx"`` looks only at ``xlsx_row_embeddings``.
    * Any other concrete *file_type* looks at ``item_embeddings`` of
      documents with that type.
    * ``file_type=None`` checks ``"docx"``, ``"pptx"`` and ``"xlsx"`` in
      that order and stops at the first hit.

    *document_path*, when given, is resolved and restricts the check to that
    document.
    """
    if file_type is None:
        # ``any`` short-circuits, so the probing order matches the sequence.
        return any(
            has_item_embeddings(
                connection, file_type=kind, document_path=document_path
            )
            for kind in ("docx", "pptx", "xlsx")
        )

    bindings: list[object] = []
    if file_type == "xlsx":
        statement = """
            SELECT 1
            FROM xlsx_row_embeddings AS e
            JOIN documents AS d ON d.document_id = e.document_id
            WHERE d.is_active = 1
                AND d.file_type = 'xlsx'
            """
    else:
        statement = """
            SELECT 1
            FROM item_embeddings AS e
            JOIN items AS i ON i.storage_id = e.storage_id
            JOIN documents AS d ON d.document_id = i.document_id
            WHERE d.is_active = 1
            """
        statement += " AND d.file_type = ?"
        bindings.append(file_type)

    if document_path is not None:
        statement += " AND d.path = ?"
        bindings.append(str(document_path.resolve()))

    statement += " LIMIT 1"
    return connection.execute(statement, bindings).fetchone() is not None
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def delete_document_embeddings(
    connection: sqlite3.Connection, document_id: str
) -> None:
    """Delete every embedding stored for *document_id*.

    Row-level xlsx embeddings (and their cell links) are removed first, then
    all ``item_embeddings`` rows whose ``storage_id`` starts with
    ``"<document_id>:"``. The caller is responsible for committing.

    The prefix test uses ``substr``/``length`` instead of ``LIKE``. ``LIKE``
    would interpret ``%`` and ``_`` inside *document_id* as wildcards and is
    case-insensitive for ASCII by default, so it could delete embeddings
    that belong to other documents.
    """
    delete_document_xlsx_row_embeddings(connection, document_id)

    storage_prefix = f"{document_id}:"
    connection.execute(
        """
        DELETE FROM item_embeddings
        WHERE substr(storage_id, 1, length(?)) = ?
        """,
        (storage_prefix, storage_prefix),
    )
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def delete_document_xlsx_row_embeddings(
    connection: sqlite3.Connection, document_id: str
) -> None:
    """Delete the row-level xlsx embeddings of *document_id*.

    Cell links are removed before the embeddings themselves because the
    first statement finds them through ``xlsx_row_embeddings``. The caller
    is responsible for committing.
    """
    remove_cell_links = """
        DELETE FROM xlsx_row_embedding_cells
        WHERE embedding_id IN (
            SELECT embedding_id
            FROM xlsx_row_embeddings
            WHERE document_id = ?
        )
        """
    remove_row_embeddings = """
        DELETE FROM xlsx_row_embeddings
        WHERE document_id = ?
        """
    owner = (document_id,)
    for statement in (remove_cell_links, remove_row_embeddings):
        connection.execute(statement, owner)
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def replace_document_embeddings(
    connection: sqlite3.Connection,
    *,
    document_id: str,
    model_name: str,
    dimensions: int,
    embeddings: Sequence[tuple[str, bytes]],
) -> None:
    """Replace all embeddings of *document_id* with *embeddings*.

    Existing embeddings of the document are deleted first. Each
    ``(storage_id, embedding)`` pair is then inserted into
    ``item_embeddings`` with the given *model_name* and *dimensions* and one
    shared UTC ISO-8601 timestamp. The caller is responsible for committing.
    """
    insert_sql = """
        INSERT INTO item_embeddings (
            storage_id,
            model_name,
            dimensions,
            embedding,
            updated_at
        )
        VALUES (?, ?, ?, ?, ?)
        """

    delete_document_embeddings(connection, document_id)

    # One timestamp for the whole batch.
    timestamp = dt.datetime.now(dt.timezone.utc).isoformat()
    for storage_id, vector_blob in embeddings:
        row = (storage_id, model_name, dimensions, vector_blob, timestamp)
        connection.execute(insert_sql, row)
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
def replace_xlsx_row_embeddings(
    connection: sqlite3.Connection,
    *,
    document_id: str,
    model_name: str,
    dimensions: int,
    row_embeddings: Sequence[
        tuple[
            str,
            str,
            int,
            str,
            str,
            str,
            bytes,
            Sequence[tuple[str, str, int, bool]],
        ]
    ],
) -> None:
    """Replace the row-level xlsx embeddings of *document_id*.

    Existing row embeddings and cell links of the document are deleted
    first. Each entry of *row_embeddings* is the tuple
    ``(embedding_id, sheet_name, row_number, representative_storage_id,
    content_text, preview, embedding, contributing_cells)``. Each element of
    ``contributing_cells`` is
    ``(storage_id, cell_coordinate, cell_order, is_representative)``.
    All rows share one UTC ISO-8601 timestamp. The caller is responsible for
    committing.
    """
    insert_row_sql = """
        INSERT INTO xlsx_row_embeddings (
            embedding_id,
            document_id,
            sheet_name,
            row_number,
            representative_storage_id,
            content_text,
            preview,
            model_name,
            dimensions,
            embedding,
            updated_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
    insert_cell_sql = """
        INSERT INTO xlsx_row_embedding_cells (
            embedding_id,
            storage_id,
            cell_coordinate,
            cell_order,
            is_representative
        )
        VALUES (?, ?, ?, ?, ?)
        """

    delete_document_xlsx_row_embeddings(connection, document_id)
    timestamp = dt.datetime.now(dt.timezone.utc).isoformat()

    for entry in row_embeddings:
        (
            embedding_id,
            sheet_name,
            row_number,
            representative_storage_id,
            content_text,
            preview,
            embedding,
            contributing_cells,
        ) = entry

        row_values = (
            embedding_id,
            document_id,
            sheet_name,
            row_number,
            representative_storage_id,
            content_text,
            preview,
            model_name,
            dimensions,
            embedding,
            timestamp,
        )
        connection.execute(insert_row_sql, row_values)

        for storage_id, cell_coordinate, cell_order, is_representative in (
            contributing_cells
        ):
            cell_values = (
                embedding_id,
                storage_id,
                cell_coordinate,
                cell_order,
                # Stored as 0/1 regardless of the truthy value passed in.
                int(bool(is_representative)),
            )
            connection.execute(insert_cell_sql, cell_values)
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
def fetch_embedding_meta(connection: sqlite3.Connection) -> dict[str, str]:
    """Return the contents of ``embedding_meta`` as a ``{key: value}`` dict.

    Columns are read by name, so the connection must produce rows that
    support that (``connect`` sets ``sqlite3.Row``).
    """
    query = """
        SELECT key, value
        FROM embedding_meta
        """
    meta: dict[str, str] = {}
    for record in connection.execute(query).fetchall():
        meta[record["key"]] = record["value"]
    return meta
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
def ensure_embedding_meta(
    connection: sqlite3.Connection,
    *,
    model_name: str,
    dimensions: int,
) -> None:
    """Record the embedding configuration, or verify it matches what is stored.

    With an empty ``embedding_meta`` table the expected values are inserted.
    Otherwise the stored key set must equal ``EMBEDDING_META_KEYS`` and every
    stored value must equal the expected one.

    Raises:
        RuntimeError: if the stored keys differ from the supported set or
            any stored value differs from the expected value.
    """
    wanted = {
        "model_name": model_name,
        "dimensions": str(dimensions),
        "similarity_metric": SIMILARITY_METRIC,
        "schema_version": EMBEDDING_SCHEMA_VERSION,
    }
    stored = fetch_embedding_meta(connection)

    if stored:
        # Existing metadata: validate only, never overwrite.
        if set(stored) != EMBEDDING_META_KEYS:
            raise RuntimeError(
                "Stored embedding metadata is incomplete or unsupported."
            )
        for key, expected_value in wanted.items():
            actual_value = stored.get(key)
            if actual_value != expected_value:
                raise RuntimeError(
                    f"Embedding metadata mismatch for {key}: expected {expected_value}, found {actual_value}."
                )
        return

    # First use of this index: persist the configuration.
    insert_sql = """
        INSERT INTO embedding_meta (key, value)
        VALUES (?, ?)
        """
    for pair in wanted.items():
        connection.execute(insert_sql, pair)
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
def _migrate_documents_table(connection: sqlite3.Connection) -> None:
    """Add the ``is_active`` column to ``documents`` when it is missing."""
    if "is_active" in _table_columns(connection, "documents"):
        return
    connection.execute(
        "ALTER TABLE documents ADD COLUMN is_active INTEGER NOT NULL DEFAULT 1"
    )
|
|
732
|
+
|
|
733
|
+
|
|
734
|
+
def _migrate_items_table(connection: sqlite3.Connection) -> None:
    """Rebuild ``items`` from an older layout when required columns are missing.

    Nothing happens when ``items`` already has every required column, or
    when the table does not exist. Otherwise the table is renamed to
    ``items_legacy``, recreated from ``ITEMS_SQL``, filled from the legacy
    rows (``storage_id`` is derived as ``document_id || ':' || item_id``)
    and the legacy table is dropped.

    Columns that have a default in the current schema (``content_text`` and
    ``metadata_json``) may be absent from the legacy table; they are filled
    with that default. Previously only ``metadata_json`` was handled, so a
    legacy table without ``content_text`` triggered the migration and then
    failed on the copy statement after the rename had already happened.
    """
    item_columns = _table_columns(connection, "items")
    required_columns = {
        "storage_id",
        "document_id",
        "item_id",
        "item_type",
        "locator",
        "preview",
        "content_text",
        "metadata_json",
    }
    if required_columns.issubset(item_columns):
        return
    if not item_columns:
        # No table at all: there is nothing to migrate.
        return

    connection.execute("ALTER TABLE items RENAME TO items_legacy")
    connection.executescript(ITEMS_SQL)
    legacy_columns = _table_columns(connection, "items_legacy")

    # Fall back to the schema defaults for columns the legacy table lacks.
    content_expr = "content_text" if "content_text" in legacy_columns else "''"
    metadata_expr = "metadata_json" if "metadata_json" in legacy_columns else "'{}'"

    connection.execute(
        f"""
        INSERT INTO items (
            storage_id,
            document_id,
            item_id,
            item_type,
            locator,
            preview,
            content_text,
            metadata_json
        )
        SELECT
            document_id || ':' || item_id,
            document_id,
            item_id,
            item_type,
            locator,
            preview,
            {content_expr},
            {metadata_expr}
        FROM items_legacy
        """
    )
    connection.execute("DROP TABLE items_legacy")
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
def _rebuild_items_fts(connection: sqlite3.Connection) -> None:
    """Drop ``items_fts``, recreate it and refill it from ``items``."""
    copy_items_sql = """
        INSERT INTO items_fts (storage_id, item_id, document_id, content_text)
        SELECT storage_id, item_id, document_id, content_text
        FROM items
        """
    connection.execute("DROP TABLE IF EXISTS items_fts")
    connection.executescript(ITEMS_FTS_SQL)
    connection.execute(copy_items_sql)
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
def _table_columns(connection: sqlite3.Connection, table_name: str) -> set[str]:
    """Return the column names of *table_name* (empty set if it does not exist).

    *table_name* is interpolated into the PRAGMA statement as is. Callers in
    this module only pass fixed table names.
    """
    pragma = f"PRAGMA table_info({table_name})"
    column_info = connection.execute(pragma).fetchall()
    # Index 1 of each ``table_info`` row is the column name.
    return {info[1] for info in column_info}
|