npm - @soleri/core - Versions diffs - 9.14.0 → 9.15.0 - Mend

@soleri/core 9.14.0 → 9.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (138) hide show

package/src/chat/chat-transport.test.ts CHANGED Viewed

@@ -4,7 +4,7 @@
  */
 import { describe, test, expect, beforeEach, afterEach } from 'vitest';
-import { mkdtempSync, rmSync } from 'node:fs';
+import { mkdtempSync, rmSync, readFileSync, writeFileSync, existsSync } from 'node:fs';
 import { join } from 'node:path';
 import { tmpdir } from 'node:os';
 import { ChatSessionManager } from './chat-session.js';
@@ -136,6 +136,19 @@ describe('ChatSessionManager', () => {
       expect(all).toContain('chat-2');
     });
+    test('listAll ignores non-session JSON files in the storage root', () => {
+      manager.getOrCreate('chat-1');
+      writeFileSync(
+        join(dir, 'plans.json'),
+        JSON.stringify({ version: '1.0', plans: [] }),
+        'utf-8',
+      );
+      const all = manager.listAll();
+      expect(all).toContain('chat-1');
+      expect(all).not.toContain('plans');
+    });
     test('setMeta updates metadata', () => {
       manager.getOrCreate('chat-1');
       manager.setMeta('chat-1', { mood: 'happy' });
@@ -162,6 +175,23 @@ describe('ChatSessionManager', () => {
       manager2.close();
     });
+    test('session files are namespaced away from plans.json collisions', () => {
+      writeFileSync(
+        join(dir, 'plans.json'),
+        JSON.stringify({ version: '1.0', plans: [{ id: 'plan-1' }] }),
+        'utf-8',
+      );
+      const session = manager.getOrCreate('plans');
+      expect(session.messages).toEqual([]);
+      expect(JSON.parse(readFileSync(join(dir, 'plans.json'), 'utf-8'))).toEqual({
+        version: '1.0',
+        plans: [{ id: 'plan-1' }],
+      });
+      expect(existsSync(join(dir, 'sessions', 'plans.json'))).toBe(true);
+    });
     test('delete removes from disk', () => {
       manager.getOrCreate('chat-1');
       manager.delete('chat-1');

package/src/curator/curator.ts CHANGED Viewed

@@ -40,6 +40,10 @@ import {
 import { initializeTables } from './schema.js';
 import { computeHealthAudit, type HealthDataProvider } from './health-audit.js';
 import { enrichEntryMetadata } from './metadata-enricher.js';
+import {
+  computeEditDistance,
+  normalizeTags as normalizeTagsCanonical,
+} from '../vault/tag-normalizer.js';
 // ─── Constants ──────────────────────────────────────────────────────
@@ -359,15 +363,141 @@ export class Curator {
       if (batch.length < DEFAULT_BATCH_SIZE) break;
       offset += DEFAULT_BATCH_SIZE;
     }
+    // Synonym merge: detect tag pairs with edit-distance ≤ 1 and merge lower-frequency into higher
+    const synonymMerges = this.mergeSynonymTags();
     return {
       totalEntries,
       groomedCount: totalEntries,
       tagsNormalized,
       staleCount,
       durationMs: Date.now() - start,
+      synonymMerges,
     };
   }
+  /**
+   * Detect tag pairs whose edit distance is ≤ 1 (e.g. 'workflow'/'workflows') and
+   * rewrite the lower-frequency tag to the higher-frequency one across all entries.
+   *
+   * Tags are visited in descending frequency order. A tag that is still unmerged when
+   * its turn comes therefore has no unmerged, more frequent neighbour, so a surviving
+   * ("major") tag is never itself merged into another tag later in the pass. Visiting
+   * tags in arbitrary order allows chains such as b → a followed by a → c; because
+   * replacements are applied one level deep, entries that carried `b` would then be
+   * rewritten to the already-retired tag `a`.
+   *
+   * Rows whose `tags` column is malformed JSON or is not a JSON array are skipped.
+   *
+   * @returns The number of entries whose tag list was rewritten (not the number of
+   *   distinct tags that were merged).
+   */
+  private mergeSynonymTags(): number {
+    // Parse a stored tags column. Malformed JSON and non-array payloads (for example the
+    // JSON literal `null`) yield null so the row is skipped instead of throwing later.
+    const parseTags = (raw: string): unknown[] | null => {
+      try {
+        const parsed: unknown = JSON.parse(raw);
+        return Array.isArray(parsed) ? parsed : null;
+      } catch {
+        return null;
+      }
+    };
+    // Load and parse every tagged entry once; the same rows drive counting and rewriting.
+    const entries: Array<{ id: string; tags: unknown[] }> = [];
+    for (const row of this.provider.all<{ id: string; tags: string }>(
+      'SELECT id, tags FROM entries WHERE tags IS NOT NULL',
+    )) {
+      const tags = parseTags(row.tags);
+      if (tags) entries.push({ id: row.id, tags });
+    }
+    // Usage count per distinct, non-empty string tag
+    const tagCounts = new Map<string, number>();
+    for (const { tags } of entries) {
+      for (const tag of tags) {
+        if (typeof tag === 'string' && tag.length > 0) {
+          tagCounts.set(tag, (tagCounts.get(tag) ?? 0) + 1);
+        }
+      }
+    }
+    if (tagCounts.size < 2) return 0;
+    // Most frequent first. Array.prototype.sort is stable, so ties keep first-seen order.
+    const tagsByFrequency = Array.from(tagCounts.keys()).sort(
+      (x, y) => (tagCounts.get(y) ?? 0) - (tagCounts.get(x) ?? 0),
+    );
+    // Bucket tags by length: an edit distance ≤ 1 implies the lengths differ by at most 1,
+    // so only the same-length and adjacent-length buckets need to be compared.
+    const buckets = new Map<number, string[]>();
+    for (const tag of tagsByFrequency) {
+      const bucket = buckets.get(tag.length);
+      if (bucket) {
+        bucket.push(tag);
+      } else {
+        buckets.set(tag.length, [tag]);
+      }
+    }
+    // minor → major. A key in this map is a tag that has been retired.
+    const mergeMap = new Map<string, string>();
+    for (const major of tagsByFrequency) {
+      if (mergeMap.has(major)) continue;
+      for (const delta of [0, -1, 1]) {
+        for (const minor of buckets.get(major.length + delta) ?? []) {
+          if (minor === major || mergeMap.has(minor)) continue;
+          // Any unmerged neighbour reached here is no more frequent than `major`
+          // (see the ordering note in the doc comment), so it is always the minor side.
+          if (computeEditDistance(major, minor) <= 1) {
+            mergeMap.set(minor, major);
+          }
+        }
+      }
+    }
+    if (mergeMap.size === 0) return 0;
+    // Apply merges to all affected entries
+    let mergeCount = 0;
+    for (const { id, tags } of entries) {
+      let changed = false;
+      // The Set drops duplicates created when a minor and its major sat on the same entry.
+      const updated = [
+        ...new Set(
+          tags.map((tag) => {
+            const replacement = typeof tag === 'string' ? mergeMap.get(tag) : undefined;
+            if (replacement === undefined) return tag;
+            changed = true;
+            return replacement;
+          }),
+        ),
+      ];
+      if (!changed) continue;
+      this.provider.run('UPDATE entries SET tags = ?, updated_at = unixepoch() WHERE id = ?', [
+        JSON.stringify(updated),
+        id,
+      ]);
+      this.logChange(
+        'synonym_merge',
+        id,
+        JSON.stringify(tags),
+        JSON.stringify(updated),
+        'Synonym tag merge (edit-distance ≤ 1)',
+      );
+      mergeCount++;
+    }
+    return mergeCount;
+  }
   // ─── Consolidation ───────────────────────────────────────────
   consolidate(options?: ConsolidationOptions): ConsolidationResult {
@@ -419,6 +549,55 @@ export class Curator {
         }
       }
     }
+    // Retag: run all entries through canonical normalization if requested
+    let retagged: number | undefined;
+    if (options?.retag && options.canonicalTags && options.canonicalTags.length > 0) {
+      const tagMode = options.tagConstraintMode ?? 'suggest';
+      const metaPrefixes = options.metadataTagPrefixes ?? ['source:'];
+      retagged = 0;
+      const entryRows = this.provider.all<{ id: string; tags: string }>(
+        'SELECT id, tags FROM entries WHERE tags IS NOT NULL',
+      );
+      for (const row of entryRows) {
+        let tags: string[];
+        try {
+          tags = JSON.parse(row.tags) as string[];
+        } catch {
+          continue;
+        }
+        const normalized = normalizeTagsCanonical(
+          tags,
+          options.canonicalTags,
+          tagMode,
+          metaPrefixes,
+        );
+        const tagsChanged =
+          normalized.length !== tags.length || normalized.some((t, i) => t !== tags[i]);
+        if (tagsChanged) {
+          if (!dryRun) {
+            this.provider.run(
+              'UPDATE entries SET tags = ?, updated_at = unixepoch() WHERE id = ?',
+              [JSON.stringify(normalized), row.id],
+            );
+            this.logChange(
+              'retag',
+              row.id,
+              JSON.stringify(tags),
+              JSON.stringify(normalized),
+              'Canonical retag during consolidation',
+            );
+            mutations++;
+          }
+          retagged++;
+        }
+      }
+    }
     return {
       dryRun,
       duplicates,
@@ -426,6 +605,7 @@ export class Curator {
       contradictions,
       mutations,
       durationMs: Date.now() - start,
+      retagged,
     };
   }

package/src/curator/types.ts CHANGED Viewed

@@ -62,6 +62,7 @@ export interface GroomAllResult {
   tagsNormalized: number;
   staleCount: number;
   durationMs: number;
+  synonymMerges: number;
 }
 // ─── Consolidation ──────────────────────────────────────────────────
@@ -71,6 +72,14 @@ export interface ConsolidationOptions {
   staleDaysThreshold?: number;
   duplicateThreshold?: number;
   contradictionThreshold?: number;
+  /** When true, run all entries through canonical tag normalization. Dry-run by default. */
+  retag?: boolean;
+  /** Canonical tag list for retag operation. Required when retag is true. */
+  canonicalTags?: string[];
+  /** Tag constraint mode for retag. Default: 'suggest'. */
+  tagConstraintMode?: 'enforce' | 'suggest' | 'off';
+  /** Metadata tag prefixes exempt from canonical normalization. Default: ['source:']. */
+  metadataTagPrefixes?: string[];
 }
 export interface ConsolidationResult {
@@ -80,6 +89,7 @@ export interface ConsolidationResult {
   contradictions: Contradiction[];
   mutations: number;
   durationMs: number;
+  retagged?: number;
 }
 // ─── Changelog & Health ─────────────────────────────────────────────

package/src/index.ts CHANGED Viewed

@@ -101,6 +101,13 @@ export type {
 } from './vault/vault-types.js';
 export { validatePlaybook, parsePlaybookFromEntry } from './vault/playbook.js';
 export type { Playbook, PlaybookStep, PlaybookValidationResult } from './vault/playbook.js';
+export { DEFAULT_CANONICAL_TAGS } from './vault/default-canonical-tags.js';
+export {
+  normalizeTag as normalizeTagCanonical,
+  normalizeTags as normalizeTagsCanonical,
+  isMetadataTag,
+  computeEditDistance,
+} from './vault/tag-normalizer.js';
 // ─── Playbook System (registry, matching, seeding) ─────────────────
 export {

package/src/intake/content-classifier.ts CHANGED Viewed

@@ -45,22 +45,40 @@ Rules:
 // CLASSIFIER
 // =============================================================================
+/**
+ * Produce the system prompt used for chunk classification.
+ *
+ * With no canonical tags (argument omitted or an empty list) the base
+ * CLASSIFICATION_PROMPT is returned unchanged. Otherwise a "Tag guidance" paragraph
+ * listing the tags, comma-separated, is appended to steer the LLM toward them.
+ *
+ * @param canonicalTags - Optional approved tag list to inject into the prompt
+ * @returns The prompt text to pass as the system prompt
+ */
+export function buildClassificationPrompt(canonicalTags?: string[]): string {
+  if (canonicalTags == null || canonicalTags.length === 0) return CLASSIFICATION_PROMPT;
+  const approved = canonicalTags.join(', ');
+  const guidance = `\n\nTag guidance: Use only tags from this approved list where possible: ${approved}. Create a new tag only when nothing from the list fits the concept.`;
+  return CLASSIFICATION_PROMPT + guidance;
+}
 /**
  * Classify a text chunk into structured knowledge items using an LLM.
  *
- * @param llm      - LLMClient instance
- * @param chunkText - The text to classify
- * @param citation  - Source citation (e.g. "book.pdf, pages 12-15")
+ * @param llm          - LLMClient instance
+ * @param chunkText    - The text to classify
+ * @param citation     - Source citation (e.g. "book.pdf, pages 12-15")
+ * @param canonicalTags - Optional canonical tag list to inject into the prompt
  * @returns Classified items, or [] on any error
  */
 export async function classifyChunk(
   llm: LLMClient,
   chunkText: string,
   citation: string,
+  canonicalTags?: string[],
 ): Promise<ClassifiedItem[]> {
   try {
+    const systemPrompt = buildClassificationPrompt(canonicalTags);
     const result = await llm.complete({
-      systemPrompt: CLASSIFICATION_PROMPT,
+      systemPrompt,
       userPrompt: chunkText,
       maxTokens: 4096,
       temperature: 0.3,

package/src/intake/text-ingester.ts CHANGED Viewed

@@ -11,6 +11,7 @@ import type { IntelligenceEntry } from '../intelligence/types.js';
 import type { ClassifiedItem } from './types.js';
 import { classifyChunk } from './content-classifier.js';
 import { dedupItems } from './dedup-gate.js';
+import { normalizeTags as normalizeTagsCanonical } from '../vault/tag-normalizer.js';
 // ─── Types ───────────────────────────────────────────────────────────
@@ -26,6 +27,12 @@ export interface IngestOptions {
   tags?: string[];
   /** Max chars per chunk for LLM classification. Default 4000. */
   chunkSize?: number;
+  /** Canonical tag list for normalization. If omitted, no canonical normalization. */
+  canonicalTags?: string[];
+  /** Tag constraint mode. Default: 'suggest'. */
+  tagConstraintMode?: 'enforce' | 'suggest' | 'off';
+  /** Metadata tag prefixes exempt from canonical normalization. Default: ['source:']. */
+  metadataTagPrefixes?: string[];
 }
 export interface IngestResult {
@@ -42,15 +49,30 @@ const FETCH_TIMEOUT_MS = 15000;
 // ─── Class ───────────────────────────────────────────────────────────
+/**
+ * Canonical tag settings held by a TextIngester as defaults for ingest calls.
+ * Mirrors the optional canonical-tag fields of IngestOptions, but with every field required.
+ */
+interface CanonicalTagConfig {
+  /** Canonical tag list used for normalization. */
+  canonicalTags: string[];
+  /** Tag constraint mode. */
+  tagConstraintMode: 'enforce' | 'suggest' | 'off';
+  /** Metadata tag prefixes exempt from canonical normalization. */
+  metadataTagPrefixes: string[];
+}
 export class TextIngester {
   private vault: Vault;
   private llm: LLMClient | null;
+  private canonicalTagConfig: CanonicalTagConfig | null = null;
   constructor(vault: Vault, llm: LLMClient | null) {
     this.vault = vault;
     this.llm = llm;
   }
+  /**
+   * Wire canonical tag config from runtime — used as defaults for all ingest calls.
+   * Caller-provided options in ingestText/ingestUrl/ingestBatch still take precedence.
+   *
+   * The object is stored by reference (no copy is made) and replaces any previously
+   * stored config.
+   * NOTE(review): the visible ingest path reads only `canonicalTags` and
+   * `tagConstraintMode` from the stored config; confirm whether `metadataTagPrefixes`
+   * is consumed elsewhere.
+   *
+   * @param cfg - Canonical tag list, constraint mode and metadata prefixes to use as defaults
+   */
+  setCanonicalTagConfig(cfg: CanonicalTagConfig): void {
+    this.canonicalTagConfig = cfg;
+  }
   /**
    * Ingest a URL — fetch, strip HTML, classify, dedup, store.
    */
@@ -101,11 +123,19 @@ export class TextIngester {
     const domain = opts?.domain ?? 'general';
     const extraTags = opts?.tags ?? [];
+    // Resolve canonical config — caller opts take precedence over runtime-wired config
+    const canonicalTagsForClassify = opts?.canonicalTags ?? this.canonicalTagConfig?.canonicalTags;
     // Classify all chunks
     const allItems: ClassifiedItem[] = [];
     for (const chunk of chunks) {
       // oxlint-disable-next-line eslint(no-await-in-loop)
-      const items = await classifyChunk(this.llm, chunk, `${source.type}: ${source.title}`);
+      const items = await classifyChunk(
+        this.llm,
+        chunk,
+        `${source.type}: ${source.title}`,
+        canonicalTagsForClassify,
+      );
       allItems.push(...items);
     }
@@ -121,18 +151,37 @@ export class TextIngester {
     // Build source attribution for context field
     const attribution = buildAttribution(source);
+    // Metadata tags use 'source:' prefix so they're exempt from canonical normalization
+    const metadataTags = [`source:ingested`, `source:${source.type}`];
+    // Apply canonical tag normalization if configured
+    // Caller-provided options take precedence over runtime-wired config
+    const canonicalTags = opts?.canonicalTags ?? this.canonicalTagConfig?.canonicalTags;
+    const tagMode =
+      opts?.tagConstraintMode ?? this.canonicalTagConfig?.tagConstraintMode ?? 'suggest';
     // Store in vault
-    const entries: IntelligenceEntry[] = unique.map((item, i) => ({
-      id: `ingest-${source.type}-${Date.now()}-${i}-${Math.random().toString(36).slice(2, 6)}`,
-      type: mapType(item.type),
-      domain,
-      title: item.title,
-      description: item.description,
-      severity: mapSeverity(item.severity),
-      tags: [...(item.tags ?? []), ...extraTags, 'ingested', source.type],
-      context: attribution,
-      origin: 'user' as const,
-    }));
+    const entries: IntelligenceEntry[] = unique.map((item, i) => {
+      const rawTags = [...(item.tags ?? []), ...extraTags];
+      // metaPrefixes not passed here — source: tags are added after normalization,
+      // so there is nothing to exempt at this point.
+      const normalizedTags =
+        canonicalTags && tagMode !== 'off'
+          ? normalizeTagsCanonical(rawTags, canonicalTags, tagMode)
+          : rawTags;
+      return {
+        id: `ingest-${source.type}-${Date.now()}-${i}-${Math.random().toString(36).slice(2, 6)}`,
+        type: mapType(item.type),
+        domain,
+        title: item.title,
+        description: item.description,
+        severity: mapSeverity(item.severity),
+        tags: [...normalizedTags, ...metadataTags],
+        context: attribution,
+        origin: 'user' as const,
+      };
+    });
     if (entries.length > 0) {
       this.vault.seed(entries);