npm - @steno-ai/engine - Versions diffs - 0.1.16 → 0.1.17 - Mend

@steno-ai/engine 0.1.16 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/dist/config.d.ts +3 -3
package/dist/config.d.ts.map +1 -1
package/dist/config.js +9 -0
package/dist/config.js.map +1 -1
package/dist/extraction/index.d.ts +2 -0
package/dist/extraction/index.d.ts.map +1 -1
package/dist/extraction/index.js +2 -0
package/dist/extraction/index.js.map +1 -1
package/dist/extraction/pipeline.d.ts.map +1 -1
package/dist/extraction/pipeline.js +25 -1
package/dist/extraction/pipeline.js.map +1 -1
package/dist/extraction/structured-cross-linker.d.ts +55 -0
package/dist/extraction/structured-cross-linker.d.ts.map +1 -0
package/dist/extraction/structured-cross-linker.js +195 -0
package/dist/extraction/structured-cross-linker.js.map +1 -0
package/dist/extraction/structured-extractor.d.ts +59 -0
package/dist/extraction/structured-extractor.d.ts.map +1 -0
package/dist/extraction/structured-extractor.js +389 -0
package/dist/extraction/structured-extractor.js.map +1 -0
package/dist/extraction/types.d.ts +1 -1
package/dist/extraction/types.d.ts.map +1 -1
package/dist/models/edge.d.ts +6 -6
package/dist/models/extraction.d.ts +6 -6
package/dist/models/fact.d.ts +6 -6
package/package.json +1 -1
package/src/config.ts +9 -0
package/src/extraction/index.ts +2 -0
package/src/extraction/pipeline.ts +29 -1
package/src/extraction/structured-cross-linker.ts +259 -0
package/src/extraction/structured-extractor.ts +463 -0
package/src/extraction/types.ts +1 -1

package/src/extraction/pipeline.ts CHANGED Viewed

@@ -12,6 +12,8 @@ import type {
 import { extractHeuristic } from './heuristic.js';
 import { extractWithLLM, normalizeEntityName } from './llm-extractor.js';
 import { extractCodebaseFacts } from './codebase-extractor.js';
+import { isStructuredInput, extractStructured } from './structured-extractor.js';
+import { linkHighConfidenceMatches } from './structured-cross-linker.js';
 import { deduplicateFacts } from './dedup.js';
 import { processContradictions } from './contradiction.js';
 import { buildEntityIdMap, persistEdges } from './entity-extractor.js';
@@ -164,7 +166,15 @@ async function executeExtraction(
   let mergedEntities: ExtractedEntity[];
   let mergedEdges: ExtractedEdge[];
-  if (isCodebaseMode) {
+  if (isStructuredInput(input.inputType)) {
+    // ── STRUCTURED MODE: No LLM, direct entity/edge creation from known fields ──
+    console.error(`[steno-pipeline] Structured mode: ${input.inputType}`);
+    const structuredResult = extractStructured(input.inputType, input.data);
+    mergedFacts = structuredResult.facts;
+    mergedEntities = structuredResult.entities;
+    mergedEdges = structuredResult.edges;
+    tiersUsed.push('heuristic'); // no LLM cost
+  } else if (isCodebaseMode) {
     // ── CODEBASE MODE: Skip heuristic, use codebase-specific extraction ──
     const llmToUse = tier === 'smart_only' ? (config.smartLLM ?? config.cheapLLM) : config.cheapLLM;
     const llmTier = tier === 'smart_only' ? 'smart_llm' : 'cheap_llm';
@@ -527,6 +537,24 @@ async function executeExtraction(
     );
   }
+  // High-confidence cross-linking for structured inputs —
+  // checks if newly created entities bridge multiple data sources
+  if (isStructuredInput(input.inputType) && entityIdMap.size > 0) {
+    try {
+      const highConfLinks = await linkHighConfidenceMatches(
+        config.storage,
+        input.tenantId,
+        entityIdMap,
+        input.inputType,
+      );
+      if (highConfLinks > 0) {
+        console.error(`[steno] Structured cross-link: ${highConfLinks} high-confidence bridges found`);
+      }
+    } catch (err) {
+      console.error('[steno] Structured cross-linking failed:', err instanceof Error ? err.message : err);
+    }
+  }
   const durationMs = Date.now() - startTime;
   // Update extraction record to 'completed'

package/src/extraction/structured-cross-linker.ts ADDED Viewed

@@ -0,0 +1,259 @@
+/**
+ * Structured cross-linker — connects structured entities to existing graph.
+ *
+ * Tiered confidence approach:
+ *   High   (one entity carrying facts from 2+ structured source types) → detected inline and logged, no LLM
+ *   Medium (semantic similarity > threshold)    → batched for cheap LLM classification
+ *   Low    (weak overlap)                       → skip, let search-time handle it
+ *
+ * The high-confidence path (linkHighConfidenceMatches) runs inline after structured extraction.
+ * The medium-confidence path (processPendingCrossLinks) is meant to be run by the overnight cron.
+ */
+import type { StorageAdapter } from '../adapters/storage.js';
+import type { EmbeddingAdapter } from '../adapters/embedding.js';
+import type { LLMAdapter } from '../adapters/llm.js';
+import type { Entity } from '../models/entity.js';
+// ---------------------------------------------------------------------------
+// High-confidence immediate linking
+// ---------------------------------------------------------------------------
+/**
+ * Count how many of the given entities bridge two or more structured data sources.
+ *
+ * For every entry of `newEntityIds`, this requests at most 10 facts attached to the
+ * entity and inspects their `sourceType`. An entity whose facts come from at least
+ * two different structured source types (calendar / vault / email / tasks) is logged
+ * as a high-confidence bridge and counted.
+ *
+ * No edge or fact is written here: both sources already resolve to the same entity,
+ * so the function only detects and reports the bridge. Failures while loading facts
+ * for one entity are logged and do not stop the remaining entities.
+ *
+ * Call this inline after buildEntityIdMap() for structured inputs.
+ *
+ * @param storage - Storage adapter used to load the facts of each entity.
+ * @param tenantId - Tenant whose entities are inspected.
+ * @param newEntityIds - Map of canonicalName → entityId.
+ * @param inputType - Currently unused; kept so existing callers keep compiling.
+ * @returns Number of entities found to bridge two or more structured sources.
+ */
+export async function linkHighConfidenceMatches(
+  storage: StorageAdapter,
+  tenantId: string,
+  newEntityIds: Map<string, string>,   // canonicalName → entityId
+  inputType: string,
+): Promise<number> {
+  // Structured source types, paired with the label used for each in the log line.
+  // The order here is the order in which labels appear in the message.
+  const structuredSources: ReadonlyArray<readonly [string, string]> = [
+    ['structured_event', 'calendar'],
+    ['structured_vault', 'vault'],
+    ['structured_email', 'email'],
+    ['structured_task', 'tasks'],
+  ];
+  let bridgesFound = 0;
+  for (const [canonicalName, entityId] of newEntityIds) {
+    // Skip names shorter than 4 characters — too generic to be a meaningful match key.
+    // (Four-character names such as "task" are NOT skipped by this threshold.)
+    if (canonicalName.length < 4) continue;
+    try {
+      // buildEntityIdMap already resolves equal canonical names to a single entity,
+      // so a bridge shows up as one entity whose facts come from different sources.
+      // Only the first 10 facts are inspected, so a source type that appears solely
+      // in later facts is not seen.
+      const factsResult = await storage.getFactsForEntity(tenantId, entityId, { limit: 10 });
+      const sourceTypes = new Set<string>(factsResult.data.map(f => f.sourceType));
+      const presentLabels = structuredSources
+        .filter(([sourceType]) => sourceTypes.has(sourceType))
+        .map(([, label]) => label);
+      if (presentLabels.length >= 2) {
+        const bridgeFact = `"${canonicalName}" appears in multiple user data sources: ${presentLabels.join(', ')}`;
+        console.error(`[steno-structured-xlink] High-confidence bridge: ${bridgeFact}`);
+        bridgesFound++;
+      }
+    } catch (err) {
+      console.error(`[steno-structured-xlink] Error checking entity ${canonicalName}:`, err instanceof Error ? err.message : err);
+    }
+  }
+  return bridgesFound;
+}
+// ---------------------------------------------------------------------------
+// Medium-confidence batch processing (overnight cron)
+// ---------------------------------------------------------------------------
+/**
+ * A candidate pair of facts that may describe related (or identical) things.
+ *
+ * The un-prefixed members describe the structured fact the search started from;
+ * the `candidate*` members describe the fact returned as similar to it.
+ */
+export interface PendingCrossLink {
+  /** Similarity score reported for this pair by the vector search. */
+  similarity: number;
+
+  /** Id of the starting fact. */
+  factId: string;
+  /** Text of the starting fact. */
+  factContent: string;
+  /** Source type of the starting fact. */
+  sourceType: string;
+  /** Id of the entity behind the starting fact. */
+  entityId: string;
+  /** Name of the entity behind the starting fact. */
+  entityName: string;
+
+  /** Id of the similar fact. */
+  candidateFactId: string;
+  /** Text of the similar fact. */
+  candidateFactContent: string;
+  /** Source type of the similar fact. */
+  candidateSourceType: string;
+  /** Id of the entity behind the similar fact. */
+  candidateEntityId: string;
+  /** Name of the entity behind the similar fact. */
+  candidateEntityName: string;
+}
+/**
+ * Find medium-confidence cross-link candidates across the tenant.
+ *
+ * Runs a keyword search for structured facts, keeps the hits tagged `structured`,
+ * embeds each one and asks the vector index for its nearest facts. A neighbour is
+ * reported when it is a different fact with a different `sourceType`. Each unordered
+ * pair is reported once, even if it is reached from both of its facts.
+ *
+ * The `entityId` / `entityName` / `candidateEntityId` / `candidateEntityName` members
+ * of every result are left as empty strings; this function does not resolve entities,
+ * and it does not check whether the two sides are already linked.
+ *
+ * Errors from the storage or embedding adapter are not caught here and propagate
+ * to the caller.
+ *
+ * @param storage - Storage adapter providing keyword and vector search.
+ * @param embedding - Embedding adapter used to embed each structured fact.
+ * @param tenantId - Tenant to search in.
+ * @param scope - Scope passed through to both searches.
+ * @param scopeId - Scope id passed through to both searches.
+ * @param options - `minSimilarity` (default 0.6) is forwarded to the vector search;
+ *   `maxCandidates` (default 50) caps the number of returned pairs.
+ * @returns Candidate pairs, at most `maxCandidates` of them.
+ */
+export async function findPendingCrossLinks(
+  storage: StorageAdapter,
+  embedding: EmbeddingAdapter,
+  tenantId: string,
+  scope: string,
+  scopeId: string,
+  options?: { minSimilarity?: number; maxCandidates?: number },
+): Promise<PendingCrossLink[]> {
+  const minSim = options?.minSimilarity ?? 0.6;
+  const maxCandidates = options?.maxCandidates ?? 50;
+  // A non-positive cap means "no candidates"; bail out before doing any I/O.
+  if (maxCandidates <= 0) return [];
+  // Find structured facts by searching for the "structured" tag content
+  // We use keyword search since there's no listFacts method
+  const recentStructuredFacts = await storage.keywordSearch({
+    query: 'structured event task email vault',
+    tenantId,
+    scope,
+    scopeId,
+    limit: 100,
+  });
+  const candidates: PendingCrossLink[] = [];
+  // Unordered pair keys already emitted, so A↔B is not reported again as B↔A.
+  const seenPairs = new Set<string>();
+  for (const hit of recentStructuredFacts) {
+    const fact = hit.fact;
+    // The keyword query is only a coarse filter; the tag is the real criterion.
+    if (!fact.tags?.includes('structured')) continue;
+    // Embed the fact content to find similar facts
+    const factEmbedding = await embedding.embed(fact.content);
+    const similar = await storage.vectorSearch({
+      embedding: factEmbedding,
+      tenantId,
+      scope,
+      scopeId,
+      limit: 5,
+      minSimilarity: minSim,
+    });
+    for (const neighbour of similar) {
+      const other = neighbour.fact;
+      // Skip self-matches and same-source matches
+      if (other.id === fact.id) continue;
+      if (other.sourceType === fact.sourceType) continue;
+      const pairKey = fact.id < other.id ? `${fact.id}|${other.id}` : `${other.id}|${fact.id}`;
+      if (seenPairs.has(pairKey)) continue;
+      seenPairs.add(pairKey);
+      candidates.push({
+        entityId: '', // not resolved here
+        entityName: '',
+        factId: fact.id,
+        factContent: fact.content,
+        sourceType: fact.sourceType,
+        candidateEntityId: '',
+        candidateEntityName: '',
+        candidateFactId: other.id,
+        candidateFactContent: other.content,
+        candidateSourceType: other.sourceType,
+        similarity: neighbour.similarity,
+      });
+      if (candidates.length >= maxCandidates) return candidates;
+    }
+  }
+  return candidates;
+}
+/**
+ * Parse the classifier's JSON reply into validated (zero-based index, relation) entries.
+ *
+ * Accepts either a bare array or an object wrapping the array under `pairs` or
+ * `results`. Entries whose pair number is not an integer, or whose relation is
+ * anything other than "same_as" / "related_to", are dropped.
+ * Throws if `raw` is not valid JSON.
+ */
+function parseCrossLinkClassifications(
+  raw: string,
+): Array<{ index: number; relation: 'same_as' | 'related_to' }> {
+  const parsed: unknown = JSON.parse(raw);
+  let items: unknown = parsed;
+  if (!Array.isArray(parsed) && typeof parsed === 'object' && parsed !== null) {
+    const wrapper = parsed as Record<string, unknown>;
+    items = wrapper.pairs ?? wrapper.results;
+  }
+  if (!Array.isArray(items)) return [];
+  const valid: Array<{ index: number; relation: 'same_as' | 'related_to' }> = [];
+  for (const item of items) {
+    if (typeof item !== 'object' || item === null) continue;
+    const entry = item as Record<string, unknown>;
+    const pairNumber = Number(entry.pair ?? entry.index);
+    const relation = entry.relation ?? entry.type;
+    if (!Number.isInteger(pairNumber)) continue;
+    if (relation !== 'same_as' && relation !== 'related_to') continue;
+    // Pair numbers in the prompt are 1-based.
+    valid.push({ index: pairNumber - 1, relation });
+  }
+  return valid;
+}
+/**
+ * Process pending cross-links with a single cheap LLM call.
+ * Classifies relationship type for each candidate pair and creates an edge
+ * between the first entity of each fact for pairs classified as "same_as"
+ * or "related_to".
+ *
+ * A pair is skipped when its number is out of range, when it was already handled
+ * earlier in the same reply, when either fact has no entity, or when both facts
+ * resolve to the same entity (which would produce a self-loop). A failure while
+ * linking one pair is logged and does not stop the remaining pairs. A failure of
+ * the LLM call or an unparseable reply is logged and yields zero edges.
+ *
+ * Errors thrown while collecting candidates are not caught here.
+ *
+ * Called by the overnight cron.
+ *
+ * @returns `processed` — number of candidate pairs sent to the LLM;
+ *   `edgesCreated` — number of edges successfully created.
+ */
+export async function processPendingCrossLinks(
+  storage: StorageAdapter,
+  embedding: EmbeddingAdapter,
+  llm: LLMAdapter,
+  tenantId: string,
+  scope: string,
+  scopeId: string,
+): Promise<{ processed: number; edgesCreated: number }> {
+  const candidates = await findPendingCrossLinks(storage, embedding, tenantId, scope, scopeId);
+  if (candidates.length === 0) return { processed: 0, edgesCreated: 0 };
+  // Build a single LLM prompt with all candidate pairs
+  const pairsText = candidates.map((c, i) =>
+    `${i + 1}. Fact A (${c.sourceType}): "${c.factContent.slice(0, 150)}"\n   Fact B (${c.candidateSourceType}): "${c.candidateFactContent.slice(0, 150)}"`
+  ).join('\n\n');
+  const prompt = `You are analyzing pairs of user data items to determine if they are related.
+For each pair, respond with ONE of:
+- "same_as" — they refer to the same real-world thing (e.g., a vault save and a calendar event for the same event)
+- "related_to" — they are topically connected but not the same thing
+- "unrelated" — no meaningful connection
+Respond as JSON array: [{"pair": 1, "relation": "same_as"}, ...]
+Pairs:
+${pairsText}`;
+  let edgesCreated = 0;
+  try {
+    const response = await llm.complete(
+      [{ role: 'user', content: prompt }],
+      { temperature: 0, responseFormat: 'json' },
+    );
+    // Pair indices already handled, so a repeated pair number cannot create a second edge.
+    const handled = new Set<number>();
+    for (const { index, relation } of parseCrossLinkClassifications(response.content)) {
+      const candidate = candidates[index];
+      if (!candidate || handled.has(index)) continue;
+      handled.add(index);
+      try {
+        // Get entities for both facts to create the edge
+        const entitiesA = await storage.getEntitiesForFact(candidate.factId);
+        const entitiesB = await storage.getEntitiesForFact(candidate.candidateFactId);
+        const source = entitiesA[0];
+        const target = entitiesB[0];
+        if (!source || !target) continue;
+        // Both facts already hang off the same entity — nothing to link.
+        if (source.id === target.id) continue;
+        const edgeType = relation === 'same_as' ? 'same_as' as const : 'associative' as const;
+        await storage.createEdge({
+          id: crypto.randomUUID(),
+          tenantId,
+          sourceId: source.id,
+          targetId: target.id,
+          relation,
+          edgeType,
+          weight: candidate.similarity,
+          confidence: 0.7,
+          metadata: {
+            autoLinked: true,
+            sourceFactId: candidate.factId,
+            targetFactId: candidate.candidateFactId,
+            method: 'batch_llm_classification',
+          },
+        });
+        edgesCreated++;
+      } catch (err) {
+        // Best effort per pair (the edge may already exist) — report it and keep going.
+        console.error(`[steno-structured-xlink] Could not link pair ${index + 1}:`, err instanceof Error ? err.message : err);
+      }
+    }
+  } catch (err) {
+    console.error('[steno-structured-xlink] Batch LLM classification failed:', err instanceof Error ? err.message : err);
+  }
+  return { processed: candidates.length, edgesCreated };
+}