@steno-ai/engine 0.1.16 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,8 @@ import type {
12
12
  import { extractHeuristic } from './heuristic.js';
13
13
  import { extractWithLLM, normalizeEntityName } from './llm-extractor.js';
14
14
  import { extractCodebaseFacts } from './codebase-extractor.js';
15
+ import { isStructuredInput, extractStructured } from './structured-extractor.js';
16
+ import { linkHighConfidenceMatches } from './structured-cross-linker.js';
15
17
  import { deduplicateFacts } from './dedup.js';
16
18
  import { processContradictions } from './contradiction.js';
17
19
  import { buildEntityIdMap, persistEdges } from './entity-extractor.js';
@@ -164,7 +166,15 @@ async function executeExtraction(
164
166
  let mergedEntities: ExtractedEntity[];
165
167
  let mergedEdges: ExtractedEdge[];
166
168
 
167
- if (isCodebaseMode) {
169
+ if (isStructuredInput(input.inputType)) {
170
+ // ── STRUCTURED MODE: No LLM, direct entity/edge creation from known fields ──
171
+ console.error(`[steno-pipeline] Structured mode: ${input.inputType}`);
172
+ const structuredResult = extractStructured(input.inputType, input.data);
173
+ mergedFacts = structuredResult.facts;
174
+ mergedEntities = structuredResult.entities;
175
+ mergedEdges = structuredResult.edges;
176
+ tiersUsed.push('heuristic'); // no LLM cost
177
+ } else if (isCodebaseMode) {
168
178
  // ── CODEBASE MODE: Skip heuristic, use codebase-specific extraction ──
169
179
  const llmToUse = tier === 'smart_only' ? (config.smartLLM ?? config.cheapLLM) : config.cheapLLM;
170
180
  const llmTier = tier === 'smart_only' ? 'smart_llm' : 'cheap_llm';
@@ -527,6 +537,24 @@ async function executeExtraction(
527
537
  );
528
538
  }
529
539
 
540
+ // High-confidence cross-linking for structured inputs —
541
+ // checks if newly created entities bridge multiple data sources
542
+ if (isStructuredInput(input.inputType) && entityIdMap.size > 0) {
543
+ try {
544
+ const highConfLinks = await linkHighConfidenceMatches(
545
+ config.storage,
546
+ input.tenantId,
547
+ entityIdMap,
548
+ input.inputType,
549
+ );
550
+ if (highConfLinks > 0) {
551
+ console.error(`[steno] Structured cross-link: ${highConfLinks} high-confidence bridges found`);
552
+ }
553
+ } catch (err) {
554
+ console.error('[steno] Structured cross-linking failed:', err instanceof Error ? err.message : err);
555
+ }
556
+ }
557
+
530
558
  const durationMs = Date.now() - startTime;
531
559
 
532
560
  // Update extraction record to 'completed'
@@ -0,0 +1,259 @@
1
+ /**
2
+ * Structured cross-linker — connects structured entities to existing graph.
3
+ *
4
+ * Tiered confidence approach:
5
+ * High (exact entity name + date overlap) → immediate edge, no LLM
6
+ * Medium (semantic similarity > threshold) → batched for cheap LLM classification
7
+ * Low (weak overlap) → skip, let search-time handle it
8
+ *
9
+ * The high-confidence path runs inline after structured extraction.
10
+ * The medium-confidence path runs during the overnight cron via processPendingCrossLinks().
11
+ */
12
+
13
+ import type { StorageAdapter } from '../adapters/storage.js';
14
+ import type { EmbeddingAdapter } from '../adapters/embedding.js';
15
+ import type { LLMAdapter } from '../adapters/llm.js';
16
+ import type { Entity } from '../models/entity.js';
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // High-confidence immediate linking
20
+ // ---------------------------------------------------------------------------
21
+
22
/**
 * Counts entities that bridge two or more structured data sources.
 *
 * For every `canonicalName → entityId` pair, up to 10 facts attached to the
 * entity are loaded and their `sourceType` values inspected. When facts from
 * at least two of the structured source types (`structured_event`,
 * `structured_vault`, `structured_email`, `structured_task`) are present, the
 * entity is treated as a high-confidence bridge: it is logged and counted.
 *
 * NOTE(review): despite the function name, this function does not write any
 * edge or fact to storage — it only logs and counts. If persisted `same_as`
 * links are expected, that still has to be implemented.
 *
 * Call this inline after buildEntityIdMap() for structured inputs.
 *
 * @param storage      Storage adapter; only `getFactsForEntity` is used.
 * @param tenantId     Tenant whose facts are inspected.
 * @param newEntityIds Map of canonical entity name → entity id.
 * @param inputType    Structured input type of the current extraction.
 *                     Currently unused; kept for interface stability.
 * @returns Number of entities found to bridge two or more structured sources.
 *          Per-entity storage errors are logged and do not abort the scan.
 */
export async function linkHighConfidenceMatches(
  storage: StorageAdapter,
  tenantId: string,
  newEntityIds: Map<string, string>, // canonicalName → entityId
  inputType: string,
): Promise<number> {
  // Structured source types paired with the label used in the log line.
  // Order matters: it fixes the order of labels in the logged message.
  const structuredSources: ReadonlyArray<readonly [string, string]> = [
    ['structured_event', 'calendar'],
    ['structured_vault', 'vault'],
    ['structured_email', 'email'],
    ['structured_task', 'tasks'],
  ];

  let bridgesFound = 0;

  for (const [canonicalName, entityId] of newEntityIds) {
    // Names shorter than 4 characters are considered too generic to be
    // meaningful bridges.
    if (canonicalName.length < 4) continue;

    try {
      // buildEntityIdMap already resolves equal names to one entity, so a
      // bridge shows up as ONE entity carrying facts from several sources.
      // Only a sample of 10 facts is inspected.
      const factsResult = await storage.getFactsForEntity(tenantId, entityId, { limit: 10 });
      const sourceTypes = new Set<string>(factsResult.data.map((f) => f.sourceType));

      const presentLabels = structuredSources
        .filter(([sourceType]) => sourceTypes.has(sourceType))
        .map(([, label]) => label);

      if (presentLabels.length >= 2) {
        const bridgeFact = `"${canonicalName}" appears in multiple user data sources: ${presentLabels.join(', ')}`;

        console.error(`[steno-structured-xlink] High-confidence bridge: ${bridgeFact}`);
        bridgesFound++;
      }
    } catch (err) {
      // Best-effort: one failing lookup must not abort the remaining entities.
      console.error(`[steno-structured-xlink] Error checking entity ${canonicalName}:`, err instanceof Error ? err.message : err);
    }
  }

  return bridgesFound;
}
78
+
79
+ // ---------------------------------------------------------------------------
80
+ // Medium-confidence batch processing (overnight cron)
81
+ // ---------------------------------------------------------------------------
82
+
83
/**
 * A medium-confidence candidate pair: one structured fact and one
 * semantically similar fact from a different source type.
 *
 * The four entity fields are placeholders: findPendingCrossLinks emits them
 * as empty strings and leaves it to the caller to fill them in.
 */
export interface PendingCrossLink {
  /** Id of the entity behind the structured fact ('' until filled by the caller). */
  entityId: string;
  /** Name of the entity behind the structured fact ('' until filled by the caller). */
  entityName: string;
  /** Id of the structured fact the search started from. */
  factId: string;
  /** Text content of the structured fact. */
  factContent: string;
  /** Source type of the structured fact. */
  sourceType: string;
  /** Id of the entity behind the similar fact ('' until filled by the caller). */
  candidateEntityId: string;
  /** Name of the entity behind the similar fact ('' until filled by the caller). */
  candidateEntityName: string;
  /** Id of the similar fact returned by the vector search. */
  candidateFactId: string;
  /** Text content of the similar fact. */
  candidateFactContent: string;
  /** Source type of the similar fact; differs from `sourceType`. */
  candidateSourceType: string;
  /** Similarity score reported by the vector search for this pair. */
  similarity: number;
}
96
+
97
/**
 * Find medium-confidence cross-link candidates across the tenant.
 *
 * Steps:
 *  1. Keyword-search up to 100 facts matching "structured event task email vault"
 *     (used because there is no listFacts method) and keep only those tagged
 *     `structured`.
 *  2. Embed each kept fact and run a vector search (top 5, `minSimilarity`).
 *  3. Emit one candidate per unordered pair of facts with different source
 *     types. Self-matches, same-source matches and mirrored duplicates
 *     (A→B after B→A) are dropped so the candidate budget is not wasted.
 *
 * No check for already-existing edges is done here; entity fields of the
 * returned candidates are left as empty strings for the caller to fill.
 *
 * @param storage   Storage adapter; uses `keywordSearch` and `vectorSearch`.
 * @param embedding Embedding adapter; uses `embed`.
 * @param tenantId  Tenant to search within.
 * @param scope     Scope passed through to both searches.
 * @param scopeId   Scope id passed through to both searches.
 * @param options   `minSimilarity` (default 0.6) and `maxCandidates`
 *                  (default 50; values <= 0 yield an empty result).
 * @returns At most `maxCandidates` candidate pairs, in discovery order.
 */
export async function findPendingCrossLinks(
  storage: StorageAdapter,
  embedding: EmbeddingAdapter,
  tenantId: string,
  scope: string,
  scopeId: string,
  options?: { minSimilarity?: number; maxCandidates?: number },
): Promise<PendingCrossLink[]> {
  const minSim = options?.minSimilarity ?? 0.6;
  const maxCandidates = options?.maxCandidates ?? 50;

  // A non-positive cap can never be satisfied; avoid any search work.
  if (maxCandidates <= 0) return [];

  // Find structured facts by searching for the "structured" tag content.
  // We use keyword search since there's no listFacts method.
  const recentStructuredFacts = await storage.keywordSearch({
    query: 'structured event task email vault',
    tenantId,
    scope,
    scopeId,
    limit: 100,
  });

  if (recentStructuredFacts.length === 0) return [];

  const candidates: PendingCrossLink[] = [];
  // Unordered fact-id pairs already emitted, so A→B and B→A count once.
  const seenPairs = new Set<string>();

  for (const keywordHit of recentStructuredFacts) {
    const fact = keywordHit.fact;
    // Keyword search is fuzzy; only facts explicitly tagged count as structured.
    if (!fact.tags?.includes('structured')) continue;

    // Embed the fact content to find similar facts.
    const factEmbedding = await embedding.embed(fact.content);

    const similar = await storage.vectorSearch({
      embedding: factEmbedding,
      tenantId,
      scope,
      scopeId,
      limit: 5,
      minSimilarity: minSim,
    });

    for (const neighbour of similar) {
      const other = neighbour.fact;

      // Skip self-matches and same-source matches.
      if (other.id === fact.id) continue;
      if (other.sourceType === fact.sourceType) continue;

      // Skip pairs already emitted in either direction.
      const pairKey = JSON.stringify([fact.id, other.id].sort());
      if (seenPairs.has(pairKey)) continue;
      seenPairs.add(pairKey);

      candidates.push({
        entityId: '', // filled by caller
        entityName: '',
        factId: fact.id,
        factContent: fact.content,
        sourceType: fact.sourceType,
        candidateEntityId: '',
        candidateEntityName: '',
        candidateFactId: other.id,
        candidateFactContent: other.content,
        candidateSourceType: other.sourceType,
        similarity: neighbour.similarity,
      });

      if (candidates.length >= maxCandidates) return candidates;
    }
  }

  return candidates;
}
172
+
173
/**
 * Process pending cross-links with a single cheap LLM call.
 *
 * Collects candidates via findPendingCrossLinks, asks the LLM to classify
 * every pair in one prompt, and creates an edge between the first entity of
 * each fact for pairs classified `same_as` (edge type `same_as`) or
 * `related_to` (edge type `associative`).
 *
 * LLM output is treated as untrusted:
 *  - entries that are not objects, reference an unknown pair number, or carry
 *    a relation other than `same_as` / `related_to` are ignored;
 *  - each pair is acted on at most once, even if classified repeatedly;
 *  - pairs whose facts resolve to the same entity are skipped (no self-loops).
 *
 * Per the module header this is intended to be run by the overnight cron.
 *
 * @param storage   Storage adapter; uses `getEntitiesForFact` and `createEdge`.
 * @param embedding Embedding adapter, forwarded to findPendingCrossLinks.
 * @param llm       LLM adapter; `complete` is called once with JSON output.
 * @param tenantId  Tenant the edges are created for.
 * @param scope     Scope forwarded to findPendingCrossLinks.
 * @param scopeId   Scope id forwarded to findPendingCrossLinks.
 * @returns `processed` = number of candidate pairs sent to the LLM,
 *          `edgesCreated` = number of edges successfully written.
 *          LLM/parse failures are logged and yield `edgesCreated: 0`.
 */
export async function processPendingCrossLinks(
  storage: StorageAdapter,
  embedding: EmbeddingAdapter,
  llm: LLMAdapter,
  tenantId: string,
  scope: string,
  scopeId: string,
): Promise<{ processed: number; edgesCreated: number }> {
  const candidates = await findPendingCrossLinks(storage, embedding, tenantId, scope, scopeId);

  if (candidates.length === 0) return { processed: 0, edgesCreated: 0 };

  // Build a single LLM prompt with all candidate pairs (content truncated to
  // 150 chars per fact to bound prompt size).
  const pairsText = candidates.map((c, i) =>
    `${i + 1}. Fact A (${c.sourceType}): "${c.factContent.slice(0, 150)}"\n Fact B (${c.candidateSourceType}): "${c.candidateFactContent.slice(0, 150)}"`
  ).join('\n\n');

  const prompt = `You are analyzing pairs of user data items to determine if they are related.
For each pair, respond with ONE of:
- "same_as" — they refer to the same real-world thing (e.g., a vault save and a calendar event for the same event)
- "related_to" — they are topically connected but not the same thing
- "unrelated" — no meaningful connection

Respond as JSON array: [{"pair": 1, "relation": "same_as"}, ...]

Pairs:
${pairsText}`;

  let edgesCreated = 0;

  try {
    const response = await llm.complete(
      [{ role: 'user', content: prompt }],
      { temperature: 0, responseFormat: 'json' },
    );

    // Accept either a bare array or an object wrapping it under
    // `pairs` / `results`; anything else is treated as "no classifications".
    const parsed: unknown = JSON.parse(response.content);
    const wrapper = (typeof parsed === 'object' && parsed !== null ? parsed : {}) as Record<string, unknown>;
    const rawList: unknown = Array.isArray(parsed) ? parsed : wrapper.pairs ?? wrapper.results ?? [];
    const classifications: unknown[] = Array.isArray(rawList) ? rawList : [];

    // Candidate indices already acted on, so a repeated classification
    // cannot create a duplicate edge.
    const handled = new Set<number>();

    for (const entry of classifications) {
      if (typeof entry !== 'object' || entry === null) continue;
      const classification = entry as Record<string, unknown>;

      // Only the two relations the prompt asks for may produce an edge.
      const rawRelation = classification.relation ?? classification.type;
      const relation = typeof rawRelation === 'string' ? rawRelation.trim().toLowerCase() : '';
      if (relation !== 'same_as' && relation !== 'related_to') continue;

      // Pair numbers in the prompt are 1-based.
      const idx = Number(classification.pair ?? classification.index ?? 0) - 1;
      if (!Number.isInteger(idx) || handled.has(idx)) continue;
      const candidate = candidates[idx];
      if (!candidate) continue;
      handled.add(idx);

      // Get entities for both facts to create the edge.
      const entitiesA = await storage.getEntitiesForFact(candidate.factId);
      const entitiesB = await storage.getEntitiesForFact(candidate.candidateFactId);
      const source = entitiesA[0];
      const target = entitiesB[0];
      if (!source || !target) continue;

      // Both facts already hang off the same entity — an edge would be a self-loop.
      if (source.id === target.id) continue;

      const edgeType = relation === 'same_as' ? ('same_as' as const) : ('associative' as const);
      try {
        await storage.createEdge({
          id: crypto.randomUUID(),
          tenantId,
          sourceId: source.id,
          targetId: target.id,
          relation,
          edgeType,
          weight: candidate.similarity,
          confidence: 0.7,
          metadata: {
            autoLinked: true,
            sourceFactId: candidate.factId,
            targetFactId: candidate.candidateFactId,
            method: 'batch_llm_classification',
          },
        });
        edgesCreated++;
      } catch (err) {
        // Edge may already exist — keep going, but leave a trace.
        console.error(
          `[steno-structured-xlink] Skipped edge ${source.id} -> ${target.id}:`,
          err instanceof Error ? err.message : err,
        );
      }
    }
  } catch (err) {
    console.error('[steno-structured-xlink] Batch LLM classification failed:', err instanceof Error ? err.message : err);
  }

  return { processed: candidates.length, edgesCreated };
}