@anmol-srv/sigil 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +417 -0
- package/dist/cli.js +1019 -0
- package/dist/hooks/post-tool-use.js +70 -0
- package/dist/hooks/session-end.js +222 -0
- package/dist/hooks/stop.js +259 -0
- package/dist/hooks/user-prompt-submit.js +279 -0
- package/dist/server.js +573 -0
- package/integrations/hermes/README.md +41 -0
- package/integrations/hermes/plugin/README.md +72 -0
- package/integrations/hermes/plugin/__init__.py +353 -0
- package/integrations/hermes/plugin/plugin.yaml +10 -0
- package/knexfile.js +15 -0
- package/package.json +100 -0
- package/prompts/audm-decision.md +31 -0
- package/prompts/chunk-context.md +23 -0
- package/prompts/default-extraction.md +35 -0
- package/prompts/entity-extraction.md +37 -0
- package/prompts/input-classifier.md +23 -0
- package/prompts/query-router.md +18 -0
- package/src/db/migrations/20260310120000_create-cortex-document-table.cjs +21 -0
- package/src/db/migrations/20260310120001_create-cortex-chunk-table.cjs +37 -0
- package/src/db/migrations/20260310120002_create-cortex-fact-table.cjs +37 -0
- package/src/db/migrations/20260310120003_create-cortex-entity-table.cjs +26 -0
- package/src/db/migrations/20260310120004_create-cortex-relation-table.cjs +27 -0
- package/src/db/migrations/20260310120005_create-cortex-history-table.cjs +16 -0
- package/src/db/migrations/20260311120000_add-entity-namespace-and-relation-indexes.cjs +32 -0
- package/src/db/migrations/20260312120000_add-fact-entity-linking.cjs +22 -0
- package/src/db/migrations/20260313093130_create-api-key-table.cjs +15 -0
- package/src/db/migrations/20260313120000_add-entity-dedup-support.cjs +13 -0
- package/src/db/migrations/20260313150000_create-connector-tables.cjs +46 -0
- package/src/db/migrations/20260318120000_add-contextual-chunk-prefix.cjs +11 -0
- package/src/db/migrations/20260318120001_add-fact-temporal-validity.cjs +15 -0
- package/src/db/migrations/20260318120002_add-fact-importance.cjs +11 -0
- package/src/db/migrations/20260318120003_add-fact-access-tracking.cjs +13 -0
- package/src/db/migrations/20260405120000_add-unique-constraints.cjs +58 -0
- package/src/db/migrations/20260405140000_create-llm-log-table.cjs +21 -0
- package/src/db/migrations/20260424120000_split-fact-lifecycle.cjs +86 -0
- package/src/db/migrations/20260424120002_create-embedding-cache.cjs +26 -0
- package/src/db/migrations/20260429120000_halfvec-index-compression.cjs +34 -0
- package/src/db/migrations/20260429120100_create-hebbian-edge-table.cjs +37 -0
- package/src/db/migrations/20260429120200_upgrade-embedding-dim-1024.cjs +68 -0
- package/src/db/migrations/20260504120000_scope-document-source-path-uniqueness.cjs +45 -0
- package/src/db/migrations/20260508001733_add-entity-aliases.cjs +42 -0
- package/src/db/migrations/20260512120000_create-entity-hebbian-edge.cjs +42 -0
- package/src/db/migrations/20260512120000_create-pod-tables.cjs +71 -0
- package/src/db/migrations/20260512120100_create-pod-membership.cjs +50 -0
- package/src/db/migrations/20260512120200_add-document-source-metadata.cjs +32 -0
- package/src/db/migrations/20260514023428_rewrite-session-pods-and-add-fact-attribution-columns.cjs +86 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
You are extracting structured facts from a document in a personal knowledge base. Extract every discrete, atomic fact that would be useful for someone querying this knowledge base later.
|
|
2
|
+
|
|
3
|
+
## Categories
|
|
4
|
+
|
|
5
|
+
1. **preference** — Personal likes, dislikes, favorites, preferred tools/foods/methods
|
|
6
|
+
2. **opinion** — Personal views, assessments, evaluations of tools/concepts/approaches
|
|
7
|
+
3. **personal** — Personal facts: birthday, workplace, location, biographical details
|
|
8
|
+
4. **experience** — Personal experiences: projects built, tools used, skills acquired, years of use
|
|
9
|
+
5. **business_rule** — Organizational rules, policies, constraints, requirements
|
|
10
|
+
6. **workflow** — Process flows, state transitions, step-by-step procedures
|
|
11
|
+
7. **architecture** — System design, service interactions, infrastructure decisions
|
|
12
|
+
8. **convention** — Coding patterns, naming rules, team standards, style guidelines
|
|
13
|
+
9. **decision** — Why choices were made, tradeoffs considered, alternatives rejected
|
|
14
|
+
10. **domain_knowledge** — Domain-specific terminology, concepts, definitions
|
|
15
|
+
11. **key_insight** — Important takeaways, notable explanations, lessons learned
|
|
16
|
+
12. **metric** — Quantitative data, measurements, statistics, benchmarks
|
|
17
|
+
13. **issue** — Known problems, bugs, limitations, risks, caveats
|
|
18
|
+
14. **action_item** — Tasks, follow-ups, assignments, deadlines, TODOs
|
|
19
|
+
|
|
20
|
+
## Rules
|
|
21
|
+
|
|
22
|
+
- Each fact must be **self-contained** — include enough context (names, identifiers) so the fact makes sense without the source document.
|
|
23
|
+
- Facts should be **atomic** — one idea per fact. Don't combine multiple facts into one.
|
|
24
|
+
- Include **specific details** — numbers, names, identifiers, exact values when available. **Never paraphrase or drop specific terms** (brand names, technical terms, model names, proper nouns). "sodium vapor lamps" must appear as "sodium vapor lamps", not just "lamps".
|
|
25
|
+
- Do NOT extract generic knowledge (widely known programming concepts, common definitions). Only extract facts specific to THIS document and THIS person/organization.
|
|
26
|
+
- Use personal categories (preference, opinion, personal, experience) when the content expresses subjective or biographical information. Use knowledge categories for objective/organizational information.
|
|
27
|
+
- Set confidence to "high" when the fact comes directly from explicit statements. Set to "medium" for facts inferred or summarized. Set to "low" for uncertain or speculative information.
|
|
28
|
+
- Set importance to "vital" if the fact is essential to understanding the topic — core preferences, key decisions, critical constraints. Set to "supplementary" for supporting details, examples, or background context.
|
|
29
|
+
- Aim for 5-20 facts per document depending on length and density. Don't pad with low-value facts, but don't miss important details either.
|
|
30
|
+
|
|
31
|
+
## Anti-Redundancy Rules
|
|
32
|
+
|
|
33
|
+
- **No rephrased duplicates.** If you already extracted a fact about a topic, don't extract the same thing in different words.
|
|
34
|
+
- **Combine related items.** If multiple items convey the same point, combine into one fact.
|
|
35
|
+
- **Be specific, not generic.** "I prefer Fastify over Express for Node.js APIs" is good. "The user has framework preferences" is bad.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
You are extracting topic entities from a set of facts in an organizational knowledge base.
|
|
2
|
+
A "topic" is a distinct concept, technology, system, process, or subject referenced in the facts.
|
|
3
|
+
|
|
4
|
+
## Rules
|
|
5
|
+
|
|
6
|
+
- Extract 3-8 topics. Only meaningful, distinct topics.
|
|
7
|
+
- Use canonical names: "normalization" not "database normalization concepts", "React hooks" not "React.js hooks pattern".
|
|
8
|
+
- If two facts mention the same topic with different wording, extract it once with the canonical name.
|
|
9
|
+
- Include a brief description (1 sentence) for context.
|
|
10
|
+
- Do NOT extract generic terms like "programming", "coding", "software". Be specific.
|
|
11
|
+
- Do NOT extract people names or document titles — those are handled separately.
|
|
12
|
+
- Topics should be reusable across documents — "database indexing" not "the indexing discussion in doc 12".
|
|
13
|
+
|
|
14
|
+
## Rename contexts — ALWAYS extract BOTH names
|
|
15
|
+
|
|
16
|
+
When the source mentions a rename ("X is now named Y", "X has been renamed to Y", "X used to be called Y", "we renamed X to Y", etc.) — extract **both** the old and new names as separate topic entries. Do not collapse the rename into a single topic. The downstream entity resolver needs both names so it can recognise the rename and merge them into one entity with the old name preserved as an alias. Skipping the old name will cause the system to create a duplicate entity.
|
|
17
|
+
|
|
18
|
+
## Output Format
|
|
19
|
+
|
|
20
|
+
Respond with ONLY a JSON array. Each item:
|
|
21
|
+
- "name" (string): canonical topic name, lowercase
|
|
22
|
+
- "description" (string): one-sentence description of what this topic covers in context
|
|
23
|
+
|
|
24
|
+
Example:
|
|
25
|
+
[
|
|
26
|
+
{ "name": "3nf normalization", "description": "Third normal form and eliminating transitive dependencies in relational databases" },
|
|
27
|
+
{ "name": "foreign key cascades", "description": "CASCADE vs SET NULL behavior when deleting referenced rows" },
|
|
28
|
+
{ "name": "query optimization", "description": "Techniques for improving SQL query performance including indexing and query plans" }
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
Rename example — note BOTH names are extracted:
|
|
32
|
+
Input: "Smara is now named Sigil"
|
|
33
|
+
Output:
|
|
34
|
+
[
|
|
35
|
+
{ "name": "smara", "description": "the project's previous name (renamed)" },
|
|
36
|
+
{ "name": "sigil", "description": "the project's current name; was previously called Smara" }
|
|
37
|
+
]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Classify this input for a personal knowledge base. Determine how it should be stored.
|
|
2
|
+
|
|
3
|
+
## Routes
|
|
4
|
+
|
|
5
|
+
- **thought**: Short personal statement — preference, opinion, note, personal fact, experience. Extract facts directly. No chunking needed.
|
|
6
|
+
- **knowledge**: Structured information with substance — explanations, rules, procedures, technical content. Needs full extraction pipeline.
|
|
7
|
+
- **noise**: Not worth storing — greetings, incomplete fragments, test input, nonsense. Skip entirely.
|
|
8
|
+
|
|
9
|
+
## Categories for extracted facts
|
|
10
|
+
|
|
11
|
+
preference, opinion, personal, experience, business_rule, workflow, architecture, convention, decision, domain_knowledge, key_insight, metric, issue, action_item
|
|
12
|
+
|
|
13
|
+
## Rules
|
|
14
|
+
|
|
15
|
+
- If input expresses a preference, opinion, personal fact, or experience, route as "thought"
|
|
16
|
+
- For "thought" route: extract 1-3 atomic facts. Each fact must be self-contained.
|
|
17
|
+
- **Preserve all specific details verbatim** — technical terms, brand names, proper nouns, measurements, model names. Never paraphrase, generalize, or drop specifics. "sodium vapor streetlamps" must stay "sodium vapor streetlamps", not become "streetlamps".
|
|
18
|
+
- If the input mentions multiple specific things (e.g. "amber, sodium vapor, streetlamps"), include all of them in the fact content.
|
|
19
|
+
- For "knowledge" route: set facts to empty array (the pipeline will extract them)
|
|
20
|
+
- For "noise" route: set facts to empty array
|
|
21
|
+
- Always list mentioned entities (people, tools, technologies, places, concepts)
|
|
22
|
+
- Confidence: "high" for explicit statements, "medium" for inferred
|
|
23
|
+
- Importance: "vital" for core preferences/facts, "supplementary" for casual mentions
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Classify the intent of this search query for a personal knowledge base.
|
|
2
|
+
|
|
3
|
+
## Intents
|
|
4
|
+
|
|
5
|
+
- **preference**: Asking about likes, dislikes, preferences, opinions ("which fruit do I like", "what's my favorite tool", "do I prefer X or Y")
|
|
6
|
+
- **factual**: Asking for specific information ("how does X work", "what is the rule for Y", "what BLEU score did the model achieve")
|
|
7
|
+
- **entity_lookup**: Looking up a specific person, tool, system, or concept by name ("tell me about Redis", "what do I know about John")
|
|
8
|
+
- **exploratory**: Broad exploration of a topic ("everything about auth", "what do I know about databases", "summarize my knowledge on X")
|
|
9
|
+
- **temporal**: Time-dependent query ("what changed last month", "what was the status in January")
|
|
10
|
+
|
|
11
|
+
## Rules
|
|
12
|
+
|
|
13
|
+
- For preference: set categories to ["preference", "opinion", "personal"]
|
|
14
|
+
- For factual: set categories to [] (search all categories)
|
|
15
|
+
- For entity_lookup: set entities to the entity name(s), categories to []
|
|
16
|
+
- For exploratory: set expand to true, categories to []
|
|
17
|
+
- For temporal: extract the time reference as an ISO date in pointInTime
|
|
18
|
+
- Always list entity names mentioned in the query
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
exports.up = function (knex) {
|
|
2
|
+
return knex.raw('CREATE EXTENSION IF NOT EXISTS vector').then(() =>
|
|
3
|
+
knex.schema.createTable('document', (table) => {
|
|
4
|
+
table.increments('id').primary();
|
|
5
|
+
table.text('uid').notNullable().unique();
|
|
6
|
+
table.text('source_path').notNullable();
|
|
7
|
+
table.text('source_type').notNullable();
|
|
8
|
+
table.text('title');
|
|
9
|
+
table.text('content_hash');
|
|
10
|
+
table.text('namespace').notNullable().index();
|
|
11
|
+
table.integer('chunk_count').defaultTo(0);
|
|
12
|
+
table.integer('fact_count').defaultTo(0);
|
|
13
|
+
table.timestamp('last_ingested_at');
|
|
14
|
+
table.timestamps(false, true);
|
|
15
|
+
})
|
|
16
|
+
);
|
|
17
|
+
};
|
|
18
|
+
|
|
19
|
+
/**
 * Reverts the migration by dropping the `document` table.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the drop (awaitable).
 */
exports.down = (knex) => knex.schema.dropTable('document');
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
exports.up = function (knex) {
|
|
2
|
+
return knex.schema
|
|
3
|
+
.createTable('chunk', (table) => {
|
|
4
|
+
table.increments('id').primary();
|
|
5
|
+
table
|
|
6
|
+
.integer('document_id')
|
|
7
|
+
.notNullable()
|
|
8
|
+
.references('id')
|
|
9
|
+
.inTable('document')
|
|
10
|
+
.onDelete('CASCADE');
|
|
11
|
+
table.integer('chunk_index').notNullable();
|
|
12
|
+
table.text('content').notNullable();
|
|
13
|
+
table.text('section_heading');
|
|
14
|
+
table.text('namespace').notNullable().index();
|
|
15
|
+
table.specificType('search_vector', 'tsvector');
|
|
16
|
+
table.timestamps(false, true);
|
|
17
|
+
})
|
|
18
|
+
.then(() =>
|
|
19
|
+
knex.raw(
|
|
20
|
+
`ALTER TABLE chunk ADD COLUMN embedding vector(768)`
|
|
21
|
+
)
|
|
22
|
+
)
|
|
23
|
+
.then(() =>
|
|
24
|
+
knex.raw(
|
|
25
|
+
`CREATE INDEX chunk_embedding_idx ON chunk USING hnsw (embedding vector_cosine_ops)`
|
|
26
|
+
)
|
|
27
|
+
)
|
|
28
|
+
.then(() =>
|
|
29
|
+
knex.raw(
|
|
30
|
+
`CREATE INDEX chunk_search_idx ON chunk USING gin (search_vector)`
|
|
31
|
+
)
|
|
32
|
+
);
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
/**
 * Reverts the migration by dropping the `chunk` table.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the drop (awaitable).
 */
exports.down = (knex) => knex.schema.dropTable('chunk');
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
exports.up = function (knex) {
|
|
2
|
+
return knex.schema
|
|
3
|
+
.createTable('fact', (table) => {
|
|
4
|
+
table.increments('id').primary();
|
|
5
|
+
table.text('uid').notNullable().unique();
|
|
6
|
+
table.text('content').notNullable();
|
|
7
|
+
table.text('category').notNullable().index();
|
|
8
|
+
table.text('confidence').defaultTo('medium');
|
|
9
|
+
table.text('namespace').notNullable().index();
|
|
10
|
+
table.text('status').notNullable().defaultTo('active').index();
|
|
11
|
+
table.integer('contradicted_by_id').references('id').inTable('fact');
|
|
12
|
+
table.integer('superseded_by_id').references('id').inTable('fact');
|
|
13
|
+
table.specificType('source_document_ids', 'integer[]');
|
|
14
|
+
table.text('source_section');
|
|
15
|
+
table.specificType('search_vector', 'tsvector');
|
|
16
|
+
table.timestamps(false, true);
|
|
17
|
+
})
|
|
18
|
+
.then(() =>
|
|
19
|
+
knex.raw(
|
|
20
|
+
`ALTER TABLE fact ADD COLUMN embedding vector(768)`
|
|
21
|
+
)
|
|
22
|
+
)
|
|
23
|
+
.then(() =>
|
|
24
|
+
knex.raw(
|
|
25
|
+
`CREATE INDEX fact_embedding_idx ON fact USING hnsw (embedding vector_cosine_ops)`
|
|
26
|
+
)
|
|
27
|
+
)
|
|
28
|
+
.then(() =>
|
|
29
|
+
knex.raw(
|
|
30
|
+
`CREATE INDEX fact_search_idx ON fact USING gin (search_vector)`
|
|
31
|
+
)
|
|
32
|
+
);
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
/**
 * Reverts the migration by dropping the `fact` table.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the drop (awaitable).
 */
exports.down = (knex) => knex.schema.dropTable('fact');
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
exports.up = function (knex) {
|
|
2
|
+
return knex.schema
|
|
3
|
+
.createTable('entity', (table) => {
|
|
4
|
+
table.increments('id').primary();
|
|
5
|
+
table.text('uid').notNullable().unique();
|
|
6
|
+
table.text('name').notNullable();
|
|
7
|
+
table.text('entity_type').notNullable().index();
|
|
8
|
+
table.text('description');
|
|
9
|
+
table.integer('mention_count').defaultTo(0);
|
|
10
|
+
table.timestamps(false, true);
|
|
11
|
+
})
|
|
12
|
+
.then(() =>
|
|
13
|
+
knex.raw(
|
|
14
|
+
`ALTER TABLE entity ADD COLUMN embedding vector(768)`
|
|
15
|
+
)
|
|
16
|
+
)
|
|
17
|
+
.then(() =>
|
|
18
|
+
knex.raw(
|
|
19
|
+
`CREATE INDEX entity_embedding_idx ON entity USING hnsw (embedding vector_cosine_ops)`
|
|
20
|
+
)
|
|
21
|
+
);
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
/**
 * Reverts the migration by dropping the `entity` table.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the drop (awaitable).
 */
exports.down = (knex) => knex.schema.dropTable('entity');
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
 * Creates the `relation` table linking two `entity` rows.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the create (awaitable).
 */
exports.up = (knex) =>
  knex.schema.createTable('relation', (t) => {
    t.increments('id').primary();
    // Both endpoints cascade: deleting an entity removes its relations.
    t.integer('source_id').notNullable().references('id').inTable('entity').onDelete('CASCADE');
    t.integer('target_id').notNullable().references('id').inTable('entity').onDelete('CASCADE');
    t.text('relation_type').notNullable().index();
    t.integer('source_fact_id').references('id').inTable('fact');
    t.integer('mention_count').defaultTo(1);
    t.timestamp('valid_at');
    t.timestamp('invalid_at');
    t.timestamps(false, true);
  });
|
|
24
|
+
|
|
25
|
+
/**
 * Reverts the migration by dropping the `relation` table.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the drop (awaitable).
 */
exports.down = (knex) => knex.schema.dropTable('relation');
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * Creates the `history` table: one row per event, identified by a
 * (`target_type`, `target_id`) pair with optional old/new content.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the create (awaitable).
 */
exports.up = (knex) =>
  knex.schema.createTable('history', (t) => {
    t.increments('id').primary();
    t.text('target_type').notNullable();
    t.integer('target_id').notNullable();
    t.text('event').notNullable();
    t.text('old_content');
    t.text('new_content');
    t.text('triggered_by');
    t.timestamps(false, true);
  });
|
|
13
|
+
|
|
14
|
+
/**
 * Reverts the migration by dropping the `history` table.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the drop (awaitable).
 */
exports.down = (knex) => knex.schema.dropTable('history');
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
exports.up = function (knex) {
|
|
2
|
+
return knex.schema
|
|
3
|
+
.alterTable('entity', (table) => {
|
|
4
|
+
table.text('namespace').defaultTo('product/lms').index();
|
|
5
|
+
table.text('external_id');
|
|
6
|
+
table.unique(['name', 'entity_type', 'namespace']);
|
|
7
|
+
})
|
|
8
|
+
.then(() =>
|
|
9
|
+
knex.raw(`
|
|
10
|
+
CREATE INDEX relation_source_type_idx
|
|
11
|
+
ON relation (source_id, relation_type)
|
|
12
|
+
WHERE invalid_at IS NULL;
|
|
13
|
+
CREATE INDEX relation_target_type_idx
|
|
14
|
+
ON relation (target_id, relation_type)
|
|
15
|
+
WHERE invalid_at IS NULL;
|
|
16
|
+
`)
|
|
17
|
+
);
|
|
18
|
+
};
|
|
19
|
+
|
|
20
|
+
exports.down = function (knex) {
|
|
21
|
+
return knex.raw(`
|
|
22
|
+
DROP INDEX IF EXISTS relation_target_type_idx;
|
|
23
|
+
DROP INDEX IF EXISTS relation_source_type_idx;
|
|
24
|
+
`)
|
|
25
|
+
.then(() =>
|
|
26
|
+
knex.schema.alterTable('entity', (table) => {
|
|
27
|
+
table.dropUnique(['name', 'entity_type', 'namespace']);
|
|
28
|
+
table.dropColumn('external_id');
|
|
29
|
+
table.dropColumn('namespace');
|
|
30
|
+
})
|
|
31
|
+
);
|
|
32
|
+
};
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
exports.up = function (knex) {
|
|
2
|
+
return knex.schema
|
|
3
|
+
.createTable('fact_entity', (table) => {
|
|
4
|
+
table.increments('id').primary();
|
|
5
|
+
table.integer('fact_id').notNullable().references('id').inTable('fact').onDelete('CASCADE');
|
|
6
|
+
table.integer('entity_id').notNullable().references('id').inTable('entity').onDelete('CASCADE');
|
|
7
|
+
table.text('mention_type').defaultTo('content');
|
|
8
|
+
table.integer('mention_count').defaultTo(1);
|
|
9
|
+
table.timestamps(false, true);
|
|
10
|
+
})
|
|
11
|
+
.then(() =>
|
|
12
|
+
knex.raw(`
|
|
13
|
+
CREATE INDEX fact_entity_fact_id_idx ON fact_entity (fact_id);
|
|
14
|
+
CREATE INDEX fact_entity_entity_id_idx ON fact_entity (entity_id);
|
|
15
|
+
CREATE UNIQUE INDEX fact_entity_unique_idx ON fact_entity (fact_id, entity_id, mention_type);
|
|
16
|
+
`)
|
|
17
|
+
);
|
|
18
|
+
};
|
|
19
|
+
|
|
20
|
+
/**
 * Reverts the migration by dropping the `fact_entity` table.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the drop (awaitable).
 */
exports.down = (knex) => knex.schema.dropTable('fact_entity');
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
 * Creates the `api_key` table. Only a hash of the key is stored (`key_hash`),
 * alongside a name, a `text[]` of namespaces, a role and an active flag.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the create (awaitable).
 */
exports.up = (knex) =>
  knex.schema.createTable('api_key', (t) => {
    t.increments('id').primary();
    t.text('key_hash').notNullable().unique();
    t.text('name').notNullable();
    // '{}' is the Postgres literal for an empty array.
    t.specificType('namespaces', 'text[]').notNullable().defaultTo('{}');
    t.text('role').notNullable().defaultTo('reader');
    t.boolean('active').notNullable().defaultTo(true);
    t.timestamps(false, true);
  });
|
|
12
|
+
|
|
13
|
+
/**
 * Reverts the migration by dropping the `api_key` table.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the drop (awaitable).
 */
exports.down = (knex) => knex.schema.dropTable('api_key');
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Adds two nullable columns to `entity`: `entity_types` (text) and
 * `merged_with`, a self-referencing foreign key to `entity.id`.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the alteration (awaitable).
 */
exports.up = (knex) =>
  knex.schema.table('entity', (t) => {
    t.text('entity_types').nullable();
    t.integer('merged_with').nullable().references('id').inTable('entity');
  });
|
|
7
|
+
|
|
8
|
+
/**
 * Removes the `entity_types` and `merged_with` columns from `entity`.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the alteration (awaitable).
 */
exports.down = (knex) =>
  knex.schema.table('entity', (t) => {
    t.dropColumn('entity_types');
    t.dropColumn('merged_with');
  });
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
exports.up = function (knex) {
|
|
2
|
+
return knex.schema
|
|
3
|
+
.createTable('connection', (table) => {
|
|
4
|
+
table.increments('id').primary();
|
|
5
|
+
table.text('uid').notNullable().unique();
|
|
6
|
+
table.text('name').notNullable();
|
|
7
|
+
table.text('connector_type').notNullable();
|
|
8
|
+
table.jsonb('config').notNullable().defaultTo('{}');
|
|
9
|
+
table.binary('credentials_encrypted');
|
|
10
|
+
table.text('namespace').notNullable();
|
|
11
|
+
table.text('status').notNullable().defaultTo('pending');
|
|
12
|
+
table.timestamp('last_check_at');
|
|
13
|
+
table.timestamps(false, true);
|
|
14
|
+
})
|
|
15
|
+
.createTable('sync_run', (table) => {
|
|
16
|
+
table.increments('id').primary();
|
|
17
|
+
table.text('uid').notNullable().unique();
|
|
18
|
+
table.integer('connection_id').notNullable().references('id').inTable('connection').onDelete('CASCADE');
|
|
19
|
+
table.text('pipeline_type').notNullable();
|
|
20
|
+
table.text('sync_type').notNullable().defaultTo('full');
|
|
21
|
+
table.text('status').notNullable().defaultTo('pending');
|
|
22
|
+
table.jsonb('state_before');
|
|
23
|
+
table.jsonb('state_after');
|
|
24
|
+
table.integer('records_read').defaultTo(0);
|
|
25
|
+
table.integer('records_written').defaultTo(0);
|
|
26
|
+
table.text('error_message');
|
|
27
|
+
table.timestamp('started_at');
|
|
28
|
+
table.timestamp('completed_at');
|
|
29
|
+
table.timestamps(false, true);
|
|
30
|
+
})
|
|
31
|
+
.createTable('sync_schedule', (table) => {
|
|
32
|
+
table.increments('id').primary();
|
|
33
|
+
table.integer('connection_id').notNullable().references('id').inTable('connection').onDelete('CASCADE');
|
|
34
|
+
table.text('cron_expression');
|
|
35
|
+
table.text('sync_type').notNullable().defaultTo('incremental');
|
|
36
|
+
table.boolean('enabled').notNullable().defaultTo(true);
|
|
37
|
+
table.timestamps(false, true);
|
|
38
|
+
});
|
|
39
|
+
};
|
|
40
|
+
|
|
41
|
+
/**
 * Drops the connector tables if they exist, children first so the foreign
 * keys to `connection` never block the drop.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The chained schema builder (awaitable).
 */
exports.down = (knex) =>
  knex.schema
    .dropTableIfExists('sync_schedule')
    .dropTableIfExists('sync_run')
    .dropTableIfExists('connection');
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Adds the nullable text column `contextual_prefix` to `chunk`.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the alteration (awaitable).
 */
exports.up = (knex) =>
  knex.schema.alterTable('chunk', (t) => {
    t.text('contextual_prefix');
  });
|
|
6
|
+
|
|
7
|
+
/**
 * Removes the `contextual_prefix` column from `chunk`.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the alteration (awaitable).
 */
exports.down = (knex) =>
  knex.schema.alterTable('chunk', (t) => {
    t.dropColumn('contextual_prefix');
  });
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
exports.up = function (knex) {
|
|
2
|
+
return knex.schema
|
|
3
|
+
.alterTable('fact', (table) => {
|
|
4
|
+
table.timestamp('valid_from');
|
|
5
|
+
table.timestamp('valid_until');
|
|
6
|
+
})
|
|
7
|
+
.then(() => knex.raw('UPDATE fact SET valid_from = created_at'));
|
|
8
|
+
};
|
|
9
|
+
|
|
10
|
+
/**
 * Removes the `valid_from` and `valid_until` columns from `fact`.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the alteration (awaitable).
 */
exports.down = (knex) =>
  knex.schema.alterTable('fact', (t) => {
    t.dropColumn('valid_from');
    t.dropColumn('valid_until');
  });
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Adds the text column `importance` to `fact`, defaulting to 'supplementary'.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the alteration (awaitable).
 */
exports.up = (knex) =>
  knex.schema.alterTable('fact', (t) => {
    t.text('importance').defaultTo('supplementary');
  });
|
|
6
|
+
|
|
7
|
+
/**
 * Removes the `importance` column from `fact`.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the alteration (awaitable).
 */
exports.down = (knex) =>
  knex.schema.alterTable('fact', (t) => {
    t.dropColumn('importance');
  });
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Adds access-tracking columns to `fact`: `access_count` (default 0) and
 * `last_accessed_at`.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the alteration (awaitable).
 */
exports.up = (knex) =>
  knex.schema.alterTable('fact', (t) => {
    t.integer('access_count').defaultTo(0);
    t.timestamp('last_accessed_at');
  });
|
|
7
|
+
|
|
8
|
+
/**
 * Removes the `access_count` and `last_accessed_at` columns from `fact`.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the alteration (awaitable).
 */
exports.down = (knex) =>
  knex.schema.alterTable('fact', (t) => {
    t.dropColumn('access_count');
    t.dropColumn('last_accessed_at');
  });
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
exports.up = async function (knex) {
|
|
2
|
+
// Count duplicates before deleting so the operator knows what was removed
|
|
3
|
+
const docDupes = await knex.raw(`
|
|
4
|
+
SELECT COUNT(*) AS cnt FROM document
|
|
5
|
+
WHERE id NOT IN (SELECT MAX(id) FROM document GROUP BY source_path)
|
|
6
|
+
`);
|
|
7
|
+
const docCount = parseInt(docDupes.rows?.[0]?.cnt ?? 0, 10);
|
|
8
|
+
if (docCount > 0) {
|
|
9
|
+
console.warn(`[migration] Removing ${docCount} duplicate document rows (keeping latest per source_path)`);
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
await knex.raw(`
|
|
13
|
+
DELETE FROM document
|
|
14
|
+
WHERE id NOT IN (
|
|
15
|
+
SELECT MAX(id) FROM document GROUP BY source_path
|
|
16
|
+
)
|
|
17
|
+
`);
|
|
18
|
+
|
|
19
|
+
await knex.schema.alterTable('document', (table) => {
|
|
20
|
+
table.unique('source_path');
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
const relDupes = await knex.raw(`
|
|
24
|
+
SELECT COUNT(*) AS cnt FROM relation
|
|
25
|
+
WHERE id NOT IN (
|
|
26
|
+
SELECT MAX(id) FROM relation
|
|
27
|
+
WHERE invalid_at IS NULL
|
|
28
|
+
GROUP BY source_id, target_id, relation_type
|
|
29
|
+
)
|
|
30
|
+
`);
|
|
31
|
+
const relCount = parseInt(relDupes.rows?.[0]?.cnt ?? 0, 10);
|
|
32
|
+
if (relCount > 0) {
|
|
33
|
+
console.warn(`[migration] Removing ${relCount} duplicate relation rows (keeping latest per source/target/type)`);
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
await knex.raw(`
|
|
37
|
+
DELETE FROM relation
|
|
38
|
+
WHERE id NOT IN (
|
|
39
|
+
SELECT MAX(id) FROM relation
|
|
40
|
+
WHERE invalid_at IS NULL
|
|
41
|
+
GROUP BY source_id, target_id, relation_type
|
|
42
|
+
)
|
|
43
|
+
`);
|
|
44
|
+
|
|
45
|
+
await knex.schema.alterTable('relation', (table) => {
|
|
46
|
+
table.unique(['source_id', 'target_id', 'relation_type']);
|
|
47
|
+
});
|
|
48
|
+
};
|
|
49
|
+
|
|
50
|
+
exports.down = async function (knex) {
|
|
51
|
+
await knex.schema.alterTable('relation', (table) => {
|
|
52
|
+
table.dropUnique(['source_id', 'target_id', 'relation_type']);
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
await knex.schema.alterTable('document', (table) => {
|
|
56
|
+
table.dropUnique('source_path');
|
|
57
|
+
});
|
|
58
|
+
};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Creates the `llm_log` table: one row per logged call with provider, model,
 * caller, input/response text, token counts, cost, duration and status.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the create (awaitable).
 */
exports.up = (knex) =>
  knex.schema.createTable('llm_log', (t) => {
    t.increments('id').primary();
    t.text('provider').notNullable().index();
    t.text('model').notNullable().index();
    t.text('caller').index();
    t.text('input');
    t.text('response');
    t.integer('input_tokens').defaultTo(0);
    t.integer('output_tokens').defaultTo(0);
    t.decimal('cost', 10, 6).defaultTo(0);
    t.integer('duration_ms').defaultTo(0);
    t.text('status').defaultTo('success').index();
    t.text('error');
    // Only created_at is declared here; there is no updated_at column.
    t.timestamp('created_at').defaultTo(knex.fn.now());
  });
|
|
18
|
+
|
|
19
|
+
/**
 * Reverts the migration by dropping the `llm_log` table.
 *
 * @param knex - Knex instance handed in by the migration runner.
 * @returns The schema builder for the drop (awaitable).
 */
exports.down = (knex) => knex.schema.dropTable('llm_log');
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Split volatile lifecycle state (access_count, last_accessed_at) off the fact row
|
|
3
|
+
* into a dedicated fact_lifecycle table.
|
|
4
|
+
*
|
|
5
|
+
* Reason: Postgres HNSW indexes cannot do HOT updates. When a column on a row with
|
|
6
|
+
* an HNSW index is UPDATEd — even a column unrelated to the embedding — Postgres
|
|
7
|
+
* creates a new tuple and rewrites the index entry. At high search volume, this
|
|
8
|
+
* causes catastrophic HNSW index bloat and autovacuum pressure.
|
|
9
|
+
*
|
|
10
|
+
* Fix: keep the fact row read-mostly. Lifecycle state (access_count, last_accessed_at)
|
|
11
|
+
* lives in fact_lifecycle with a FK and a trigger for auto-insert on new facts.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
exports.up = async function (knex) {
|
|
15
|
+
await knex.schema.createTable('fact_lifecycle', (table) => {
|
|
16
|
+
table.bigInteger('fact_id').primary().references('id').inTable('fact').onDelete('CASCADE');
|
|
17
|
+
table.integer('access_count').notNullable().defaultTo(0);
|
|
18
|
+
table.timestamp('last_accessed_at');
|
|
19
|
+
table.string('stage').notNullable().defaultTo('fresh'); // fresh | stable | editing
|
|
20
|
+
table.timestamp('stage_entered_at').notNullable().defaultTo(knex.fn.now());
|
|
21
|
+
table.timestamp('created_at').notNullable().defaultTo(knex.fn.now());
|
|
22
|
+
|
|
23
|
+
table.index('last_accessed_at');
|
|
24
|
+
table.index(['stage', 'stage_entered_at']);
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
// Backfill existing facts into fact_lifecycle
|
|
28
|
+
await knex.raw(`
|
|
29
|
+
INSERT INTO fact_lifecycle (fact_id, access_count, last_accessed_at, stage, stage_entered_at, created_at)
|
|
30
|
+
SELECT
|
|
31
|
+
id,
|
|
32
|
+
COALESCE(access_count, 0),
|
|
33
|
+
last_accessed_at,
|
|
34
|
+
'stable' AS stage,
|
|
35
|
+
COALESCE(created_at, NOW()) AS stage_entered_at,
|
|
36
|
+
COALESCE(created_at, NOW()) AS created_at
|
|
37
|
+
FROM fact
|
|
38
|
+
ON CONFLICT (fact_id) DO NOTHING
|
|
39
|
+
`);
|
|
40
|
+
|
|
41
|
+
// Trigger to auto-insert a lifecycle row when a new fact is inserted.
|
|
42
|
+
await knex.raw(`
|
|
43
|
+
CREATE OR REPLACE FUNCTION fact_init_lifecycle() RETURNS trigger AS $$
|
|
44
|
+
BEGIN
|
|
45
|
+
INSERT INTO fact_lifecycle (fact_id, access_count, last_accessed_at, stage, stage_entered_at, created_at)
|
|
46
|
+
VALUES (NEW.id, 0, NULL, 'fresh', NOW(), NOW())
|
|
47
|
+
ON CONFLICT (fact_id) DO NOTHING;
|
|
48
|
+
RETURN NEW;
|
|
49
|
+
END;
|
|
50
|
+
$$ LANGUAGE plpgsql;
|
|
51
|
+
`);
|
|
52
|
+
|
|
53
|
+
await knex.raw(`
|
|
54
|
+
DROP TRIGGER IF EXISTS fact_init_lifecycle_trigger ON fact;
|
|
55
|
+
CREATE TRIGGER fact_init_lifecycle_trigger
|
|
56
|
+
AFTER INSERT ON fact
|
|
57
|
+
FOR EACH ROW EXECUTE FUNCTION fact_init_lifecycle();
|
|
58
|
+
`);
|
|
59
|
+
|
|
60
|
+
// Drop the old columns from fact — these have moved to fact_lifecycle.
|
|
61
|
+
await knex.schema.alterTable('fact', (table) => {
|
|
62
|
+
table.dropColumn('access_count');
|
|
63
|
+
table.dropColumn('last_accessed_at');
|
|
64
|
+
});
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
exports.down = async function (knex) {
|
|
68
|
+
// Re-add columns to fact
|
|
69
|
+
await knex.schema.alterTable('fact', (table) => {
|
|
70
|
+
table.integer('access_count').defaultTo(0);
|
|
71
|
+
table.timestamp('last_accessed_at');
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
// Copy data back
|
|
75
|
+
await knex.raw(`
|
|
76
|
+
UPDATE fact f
|
|
77
|
+
SET access_count = fl.access_count,
|
|
78
|
+
last_accessed_at = fl.last_accessed_at
|
|
79
|
+
FROM fact_lifecycle fl
|
|
80
|
+
WHERE f.id = fl.fact_id
|
|
81
|
+
`);
|
|
82
|
+
|
|
83
|
+
await knex.raw('DROP TRIGGER IF EXISTS fact_init_lifecycle_trigger ON fact');
|
|
84
|
+
await knex.raw('DROP FUNCTION IF EXISTS fact_init_lifecycle()');
|
|
85
|
+
await knex.schema.dropTable('fact_lifecycle');
|
|
86
|
+
};
|