@arabold/docs-mcp-server 1.26.0 → 1.26.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/db/migrations/009-add-pages-table.sql +124 -0
- package/dist/index.js +5374 -5201
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
-- Migration: Add pages table to normalize page-level metadata and support ETag tracking
|
|
2
|
+
-- This migration introduces a pages table to store page-level metadata once per URL
|
|
3
|
+
-- and links document chunks to their parent pages via page_id foreign key
|
|
4
|
+
|
|
5
|
+
-- 1. pages: one row per (version_id, url) holding the page-level metadata
--    that is shared by all chunks of that page.
CREATE TABLE IF NOT EXISTS pages (
    id            INTEGER PRIMARY KEY AUTOINCREMENT,
    version_id    INTEGER NOT NULL REFERENCES versions(id),  -- owning version
    url           TEXT NOT NULL,
    title         TEXT,
    etag          TEXT,
    last_modified TEXT,
    content_type  TEXT,
    created_at    DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at    DATETIME DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (version_id, url)  -- a URL is stored at most once per version
);
|
|
18
|
+
|
|
19
|
+
-- 2. Lookup indexes on pages: by owning version, by URL, and by etag value.
CREATE INDEX IF NOT EXISTS idx_pages_version_id
    ON pages (version_id);
CREATE INDEX IF NOT EXISTS idx_pages_url
    ON pages (url);
CREATE INDEX IF NOT EXISTS idx_pages_etag
    ON pages (etag);
|
|
23
|
+
|
|
24
|
+
-- 3. Replacement documents table: every chunk references its parent page
--    through page_id. It is built under a temporary name so the existing
--    documents table stays readable while data is copied.
CREATE TABLE documents_new (
    id         INTEGER PRIMARY KEY AUTOINCREMENT,
    page_id    INTEGER NOT NULL REFERENCES pages(id),
    content    TEXT,
    metadata   JSON,     -- chunk-specific metadata only (level, path)
    sort_order INTEGER NOT NULL,
    embedding  BLOB,     -- embeddings are stored directly on the chunk row
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
|
|
34
|
+
|
|
35
|
+
-- 4. Indexes for the new documents table (created while it is still named
--    documents_new): chunks by page, and chunks by page in sort order.
CREATE INDEX IF NOT EXISTS idx_documents_page_id
    ON documents_new (page_id);
CREATE INDEX IF NOT EXISTS idx_documents_sort_order
    ON documents_new (page_id, sort_order);
|
|
38
|
+
|
|
39
|
+
-- 5. Populate pages from the old documents table: one row per
--    (version_id, url). MAX() collapses the chunks of a page into a single
--    title and a single timestamp.
--    COALESCE is applied AFTER aggregating: the fallback to CURRENT_TIMESTAMP
--    is used only when no chunk of the page carries an indexed_at value, so a
--    single chunk with a NULL indexed_at cannot replace the page's real
--    timestamp with the migration time.
INSERT INTO pages (version_id, url, title, created_at, updated_at)
SELECT
    version_id,
    url,
    MAX(json_extract(metadata, '$.title')) AS title,
    COALESCE(MAX(indexed_at), CURRENT_TIMESTAMP) AS created_at,
    COALESCE(MAX(indexed_at), CURRENT_TIMESTAMP) AS updated_at
FROM documents
GROUP BY version_id, url;
|
|
51
|
+
|
|
52
|
+
-- 6. Copy every chunk into documents_new, resolving its page through
--    (version_id, url). Chunk ids are preserved. The page-level keys url,
--    title, library and version are removed from the chunk metadata because
--    they are now stored in the pages and versions tables; all other metadata
--    keys are kept.
--    NOTE: embedding is not part of the column list, so migrated rows start
--    with a NULL embedding.
INSERT INTO documents_new (id, page_id, content, metadata, sort_order, created_at)
SELECT
    d.id,
    p.id AS page_id,
    d.content,
    json_remove(d.metadata, '$.url', '$.title', '$.library', '$.version') AS metadata,
    d.sort_order,
    COALESCE(d.indexed_at, CURRENT_TIMESTAMP)
FROM documents AS d
INNER JOIN pages AS p
    ON p.version_id = d.version_id
   AND p.url = d.url;
|
|
74
|
+
|
|
75
|
+
-- 7. Remove the old documents table; its rows have already been copied
--    into documents_new.
DROP TABLE documents;

-- 8. Promote the new table to the documents name.
ALTER TABLE documents_new RENAME TO documents;
|
|
80
|
+
|
|
81
|
+
-- 9. Rebuild the FTS5 virtual table for the new structure.
--    Remove the previous FTS triggers and table first.
DROP TRIGGER IF EXISTS documents_fts_after_delete;
DROP TRIGGER IF EXISTS documents_fts_after_update;
DROP TRIGGER IF EXISTS documents_fts_after_insert;
DROP TABLE IF EXISTS documents_fts;

-- Full-text table over chunk content plus title, url and path.
CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
    content,
    title,
    url,
    path,
    tokenize = 'porter unicode61'
);
|
|
96
|
+
|
|
97
|
+
-- 10. FTS synchronisation triggers on documents.
-- Deleting a chunk removes the FTS row whose rowid equals the chunk id.
CREATE TRIGGER IF NOT EXISTS documents_fts_after_delete
AFTER DELETE ON documents
BEGIN
    DELETE FROM documents_fts WHERE rowid = old.id;
END;
|
|
101
|
+
|
|
102
|
+
-- Updating a chunk replaces its FTS row: the stale entry is deleted, then a
-- new one is inserted using the title and url of the chunk's page and the
-- path taken from the chunk metadata.
CREATE TRIGGER IF NOT EXISTS documents_fts_after_update
AFTER UPDATE ON documents
BEGIN
    DELETE FROM documents_fts WHERE rowid = old.id;
    INSERT INTO documents_fts (rowid, content, title, url, path)
    SELECT
        new.id,
        new.content,
        p.title,
        p.url,
        json_extract(new.metadata, '$.path')
    FROM pages AS p
    WHERE p.id = new.page_id;
END;
|
|
108
|
+
|
|
109
|
+
-- Inserting a chunk adds an FTS row keyed by the chunk id, using the title
-- and url of the chunk's page and the path taken from the chunk metadata.
CREATE TRIGGER IF NOT EXISTS documents_fts_after_insert
AFTER INSERT ON documents
BEGIN
    INSERT INTO documents_fts (rowid, content, title, url, path)
    SELECT
        new.id,
        new.content,
        p.title,
        p.url,
        json_extract(new.metadata, '$.path')
    FROM pages AS p
    WHERE p.id = new.page_id;
END;
|
|
114
|
+
|
|
115
|
+
-- 11. Keep pages.updated_at current whenever a page's data columns change.
--     The trigger is limited to the data columns (UPDATE OF ...) so that its
--     own "SET updated_at" statement can never re-fire it, even when
--     PRAGMA recursive_triggers is enabled. An UPDATE that touches only
--     updated_at is therefore left as written by the caller.
CREATE TRIGGER IF NOT EXISTS pages_updated_at_trigger
AFTER UPDATE OF version_id, url, title, etag, last_modified, content_type ON pages
BEGIN
    UPDATE pages SET updated_at = CURRENT_TIMESTAMP WHERE id = new.id;
END;
|
|
119
|
+
|
|
120
|
+
-- 12. Seed the FTS index with every migrated chunk, taking title and url
--     from the chunk's page and path from the chunk metadata.
INSERT INTO documents_fts (rowid, content, title, url, path)
SELECT
    d.id,
    d.content,
    p.title,
    p.url,
    json_extract(d.metadata, '$.path')
FROM documents AS d
INNER JOIN pages AS p
    ON p.id = d.page_id;
|