@semiont/make-meaning 0.4.18 → 0.4.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10218,7 +10218,8 @@ var GraphDBConsumer = class _GraphDBConsumer {
10218
10218
  archived: false,
10219
10219
  dateCreated: (/* @__PURE__ */ new Date()).toISOString(),
10220
10220
  wasAttributedTo: didToAgent(event.userId),
10221
- creationMethod: event.payload.creationMethod
10221
+ creationMethod: event.payload.creationMethod,
10222
+ ...event.payload.storageUri ? { storageUri: event.payload.storageUri } : {}
10222
10223
  };
10223
10224
  }
10224
10225
  /**
@@ -10417,23 +10418,25 @@ var GraphDBConsumer = class _GraphDBConsumer {
10417
10418
  // src/smelter.ts
10418
10419
  var import_rxjs2 = __toESM(require_cjs(), 1);
10419
10420
  var import_operators2 = __toESM(require_operators(), 1);
10420
- import { EventQuery as EventQuery2 } from "@semiont/event-sourcing";
10421
10421
  import { burstBuffer as burstBuffer2 } from "@semiont/core";
10422
10422
  import { resourceId as makeResourceId2, annotationId as makeAnnotationId2 } from "@semiont/core";
10423
10423
  import { chunkText, DEFAULT_CHUNKING_CONFIG } from "@semiont/vectors";
10424
10424
  import { getExactText, getTargetSelector } from "@semiont/api-client";
10425
10425
  var Smelter = class _Smelter {
10426
- constructor(eventStore, eventBus, vectorStore, embeddingProvider, contentStore, logger, chunkingConfig) {
10427
- this.eventStore = eventStore;
10426
+ constructor(_eventStore, eventBus, vectorStore, embeddingProvider, contentStore, embeddingStore, viewStorage, logger, chunkingConfig) {
10428
10427
  this.eventBus = eventBus;
10429
10428
  this.vectorStore = vectorStore;
10430
10429
  this.embeddingProvider = embeddingProvider;
10431
10430
  this.contentStore = contentStore;
10431
+ this.embeddingStore = embeddingStore;
10432
+ this.viewStorage = viewStorage;
10432
10433
  this.logger = logger;
10433
10434
  this.chunkingConfig = chunkingConfig ?? DEFAULT_CHUNKING_CONFIG;
10434
10435
  }
10435
10436
  static SMELTER_RELEVANT_EVENTS = /* @__PURE__ */ new Set([
10436
10437
  "yield:created",
10438
+ "yield:updated",
10439
+ "yield:representation-added",
10437
10440
  "mark:archived",
10438
10441
  "mark:added",
10439
10442
  "mark:removed"
@@ -10485,68 +10488,121 @@ var Smelter = class _Smelter {
10485
10488
  this.logger.info("Smelter actor stopped");
10486
10489
  }
10487
10490
  /**
10488
- * Rebuild the vector store from persisted embedding events in the event log.
10489
- * Reads all embedding:computed / embedding:deleted events and replays them.
10490
- * Bypasses the live pipeline reads directly from the event store.
10491
+ * Rebuild the vector store from the EmbeddingStore (.semiont/embeddings/).
10492
+ *
10493
+ * For each stored file, checks whether the model matches the configured
10494
+ * provider. On mismatch, re-embeds from the stored text and overwrites the
10495
+ * file before upserting into Qdrant. On match, loads the stored vectors
10496
+ * directly — no embedding provider calls needed.
10491
10497
  */
10492
10498
  async rebuildAll() {
10493
- this.logger.info("Rebuilding vector store from events");
10499
+ this.logger.info("Rebuilding vector store from EmbeddingStore");
10494
10500
  await this.vectorStore.clearAll();
10495
- const allResourceIds = await this.eventStore.log.getAllResourceIds();
10496
- this.logger.info("Found resources to scan", { count: allResourceIds.length });
10497
- const query = new EventQuery2(this.eventStore.log.storage);
10498
- let indexed = 0;
10499
- for (const rid of allResourceIds) {
10500
- const events = await query.getResourceEvents(makeResourceId2(rid));
10501
- const embeddingEvents = events.filter(
10502
- (e) => e.type === "embedding:computed" || e.type === "embedding:deleted"
10503
- );
10504
- if (embeddingEvents.length === 0) continue;
10505
- const lastEvent = embeddingEvents[embeddingEvents.length - 1];
10506
- if (lastEvent.type === "embedding:deleted" && !lastEvent.payload.annotationId) {
10507
- continue;
10508
- }
10509
- const deletedAnnotations = /* @__PURE__ */ new Set();
10510
- for (const e of embeddingEvents) {
10511
- if (e.type === "embedding:deleted") {
10512
- const payload = e.payload;
10513
- if (payload.annotationId) deletedAnnotations.add(String(payload.annotationId));
10514
- }
10515
- }
10516
- const resourceChunks = [];
10517
- for (const e of embeddingEvents) {
10518
- if (e.type !== "embedding:computed") continue;
10519
- const payload = e.payload;
10520
- if (payload.annotationId) {
10521
- if (deletedAnnotations.has(String(payload.annotationId))) continue;
10522
- await this.vectorStore.upsertAnnotationVector(
10523
- makeAnnotationId2(String(payload.annotationId)),
10524
- payload.embedding,
10525
- {
10526
- annotationId: makeAnnotationId2(String(payload.annotationId)),
10527
- resourceId: makeResourceId2(e.resourceId),
10528
- motivation: "",
10529
- entityTypes: [],
10530
- exactText: payload.chunkText
10531
- }
10532
- );
10533
- } else {
10534
- resourceChunks.push({
10535
- chunkIndex: payload.chunkIndex,
10536
- text: payload.chunkText,
10537
- embedding: payload.embedding
10538
- });
10539
- }
10540
- }
10541
- if (resourceChunks.length > 0) {
10542
- await this.vectorStore.upsertResourceVectors(
10543
- makeResourceId2(rid),
10544
- resourceChunks
10501
+ const currentModel = this.embeddingProvider.model();
10502
+ const currentDimensions = this.embeddingProvider.dimensions();
10503
+ const resourceIds = await this.embeddingStore.getAllResourceIds();
10504
+ this.logger.info("Found resource embedding files", { count: resourceIds.length });
10505
+ let resourcesIndexed = 0;
10506
+ for (const rid of resourceIds) {
10507
+ const resourceId7 = makeResourceId2(rid);
10508
+ const stored = await this.embeddingStore.readResourceEmbeddings(resourceId7);
10509
+ if (!stored || stored.chunks.length === 0) continue;
10510
+ let chunks;
10511
+ if (stored.model !== currentModel) {
10512
+ this.logger.info("Re-embedding resource (model mismatch)", {
10513
+ resourceId: rid,
10514
+ storedModel: stored.model,
10515
+ currentModel
10516
+ });
10517
+ const texts = stored.chunks.map((c) => c.text);
10518
+ const embeddings = await this.embeddingProvider.embedBatch(texts);
10519
+ chunks = stored.chunks.map((c, i) => ({
10520
+ chunkIndex: c.chunkIndex,
10521
+ text: c.text,
10522
+ embedding: embeddings[i]
10523
+ }));
10524
+ await this.embeddingStore.writeResourceChunks(resourceId7, currentModel, currentDimensions, chunks);
10525
+ } else {
10526
+ chunks = stored.chunks;
10527
+ }
10528
+ await this.vectorStore.upsertResourceVectors(resourceId7, chunks);
10529
+ resourcesIndexed++;
10530
+ }
10531
+ const annotationIds = await this.embeddingStore.getAllAnnotationIds();
10532
+ this.logger.info("Found annotation embedding files", { count: annotationIds.length });
10533
+ let annotationsIndexed = 0;
10534
+ for (const aid of annotationIds) {
10535
+ const annotationId3 = makeAnnotationId2(aid);
10536
+ const stored = await this.embeddingStore.readAnnotationEmbedding(annotationId3);
10537
+ if (!stored) continue;
10538
+ let embedding;
10539
+ if (stored.model !== currentModel) {
10540
+ this.logger.info("Re-embedding annotation (model mismatch)", {
10541
+ annotationId: aid,
10542
+ storedModel: stored.model,
10543
+ currentModel
10544
+ });
10545
+ embedding = await this.embeddingProvider.embed(stored.text);
10546
+ await this.embeddingStore.writeAnnotationEmbedding(
10547
+ annotationId3,
10548
+ makeResourceId2(stored.resourceId),
10549
+ currentModel,
10550
+ currentDimensions,
10551
+ stored.text,
10552
+ embedding,
10553
+ stored.motivation,
10554
+ stored.entityTypes
10545
10555
  );
10556
+ } else {
10557
+ embedding = stored.embedding;
10558
+ }
10559
+ const payload = {
10560
+ annotationId: annotationId3,
10561
+ resourceId: makeResourceId2(stored.resourceId),
10562
+ motivation: stored.motivation,
10563
+ entityTypes: stored.entityTypes,
10564
+ exactText: stored.text
10565
+ };
10566
+ await this.vectorStore.upsertAnnotationVector(annotationId3, embedding, payload);
10567
+ annotationsIndexed++;
10568
+ }
10569
+ const storedResourceIdSet = new Set(resourceIds);
10570
+ const allViews = await this.viewStorage.getAll();
10571
+ let backfilled = 0;
10572
+ for (const view of allViews) {
10573
+ const ridStr = view.resource["@id"];
10574
+ if (storedResourceIdSet.has(ridStr)) continue;
10575
+ if (view.resource.archived) continue;
10576
+ if (!view.resource.storageUri) continue;
10577
+ let content;
10578
+ try {
10579
+ content = await this.contentStore.retrieve(view.resource.storageUri);
10580
+ } catch (err) {
10581
+ this.logger.warn("Smelter back-fill skipped \u2014 content missing", {
10582
+ resourceId: ridStr,
10583
+ storageUri: view.resource.storageUri,
10584
+ error: err instanceof Error ? err.message : String(err)
10585
+ });
10586
+ continue;
10546
10587
  }
10547
- indexed++;
10588
+ const text = new TextDecoder().decode(content);
10589
+ if (!text.trim()) continue;
10590
+ const chunks = chunkText(text, this.chunkingConfig);
10591
+ if (chunks.length === 0) continue;
10592
+ const rid = makeResourceId2(ridStr);
10593
+ const embeddings = await this.embeddingProvider.embedBatch(chunks);
10594
+ const embeddingChunks = chunks.map((chunkText2, i) => ({
10595
+ chunkIndex: i,
10596
+ text: chunkText2,
10597
+ embedding: embeddings[i]
10598
+ }));
10599
+ await this.embeddingStore.writeResourceChunks(rid, currentModel, currentDimensions, embeddingChunks);
10600
+ await this.vectorStore.upsertResourceVectors(rid, embeddingChunks);
10601
+ backfilled++;
10602
+ resourcesIndexed++;
10603
+ this.logger.info("Smelter back-filled missing resource embedding", { resourceId: ridStr });
10548
10604
  }
10549
- this.logger.info("Vector store rebuild complete", { resourcesIndexed: indexed });
10605
+ this.logger.info("Vector store rebuild complete", { resourcesIndexed, annotationsIndexed, backfilled });
10550
10606
  }
10551
10607
  async processBatch(events) {
10552
10608
  const runs = partitionByType(events);
@@ -10568,8 +10624,6 @@ var Smelter = class _Smelter {
10568
10624
  }
10569
10625
  /**
10570
10626
  * Batch-optimized processing for consecutive events of the same type.
10571
- * Collects all texts across events, embeds in a single embedBatch() call,
10572
- * then distributes results back to their respective resources/annotations.
10573
10627
  */
10574
10628
  async applyBatchByType(events) {
10575
10629
  const type = events[0].type;
@@ -10587,8 +10641,8 @@ var Smelter = class _Smelter {
10587
10641
  }
10588
10642
  }
10589
10643
  /**
10590
- * Batch-embed chunks from multiple resource.created events in a single
10591
- * embedBatch() call, then emit events and index per resource.
10644
+ * Batch-embed chunks from multiple yield:created events in a single
10645
+ * embedBatch() call, then write to EmbeddingStore and index per resource.
10592
10646
  */
10593
10647
  async batchResourceCreated(events) {
10594
10648
  const resourceData = [];
@@ -10613,18 +10667,12 @@ var Smelter = class _Smelter {
10613
10667
  const dimensions = this.embeddingProvider.dimensions();
10614
10668
  let offset = 0;
10615
10669
  for (const { rid, chunks } of resourceData) {
10616
- const embeddingChunks = chunks.map((text, i) => {
10617
- const embedding = allEmbeddings[offset + i];
10618
- this.eventBus.get("embedding:compute").next({
10619
- resourceId: rid,
10620
- chunkIndex: i,
10621
- chunkText: text,
10622
- embedding,
10623
- model,
10624
- dimensions
10625
- });
10626
- return { chunkIndex: i, text, embedding };
10627
- });
10670
+ const embeddingChunks = chunks.map((text, i) => ({
10671
+ chunkIndex: i,
10672
+ text,
10673
+ embedding: allEmbeddings[offset + i]
10674
+ }));
10675
+ await this.embeddingStore.writeResourceChunks(rid, model, dimensions, embeddingChunks);
10628
10676
  await this.vectorStore.upsertResourceVectors(rid, embeddingChunks);
10629
10677
  this.logger.debug("Smelter batch-indexed resource", {
10630
10678
  resourceId: String(rid),
@@ -10634,8 +10682,8 @@ var Smelter = class _Smelter {
10634
10682
  }
10635
10683
  }
10636
10684
  /**
10637
- * Batch-embed exact texts from multiple annotation.added events in a
10638
- * single embedBatch() call, then emit events and index per annotation.
10685
+ * Batch-embed exact texts from multiple mark:added events in a single
10686
+ * embedBatch() call, then write to EmbeddingStore and index per annotation.
10639
10687
  */
10640
10688
  async batchAnnotationAdded(events) {
10641
10689
  const annotationData = [];
@@ -10650,30 +10698,34 @@ var Smelter = class _Smelter {
10650
10698
  rid: makeResourceId2(event.resourceId),
10651
10699
  aid: makeAnnotationId2(annotation.id),
10652
10700
  exactText,
10653
- annotation
10701
+ motivation: annotation.motivation ?? "",
10702
+ entityTypes: annotation.entityTypes ?? []
10654
10703
  });
10655
10704
  }
10656
10705
  if (annotationData.length === 0) return;
10657
10706
  const allEmbeddings = await this.embeddingProvider.embedBatch(
10658
10707
  annotationData.map((a) => a.exactText)
10659
10708
  );
10709
+ const model = this.embeddingProvider.model();
10710
+ const dimensions = this.embeddingProvider.dimensions();
10660
10711
  for (let i = 0; i < annotationData.length; i++) {
10661
- const { rid, aid, exactText, annotation } = annotationData[i];
10712
+ const { rid, aid, exactText, motivation, entityTypes } = annotationData[i];
10662
10713
  const embedding = allEmbeddings[i];
10663
- this.eventBus.get("embedding:compute").next({
10664
- resourceId: rid,
10665
- annotationId: aid,
10666
- chunkIndex: 0,
10667
- chunkText: exactText,
10714
+ await this.embeddingStore.writeAnnotationEmbedding(
10715
+ aid,
10716
+ rid,
10717
+ model,
10718
+ dimensions,
10719
+ exactText,
10668
10720
  embedding,
10669
- model: this.embeddingProvider.model(),
10670
- dimensions: this.embeddingProvider.dimensions()
10671
- });
10721
+ motivation,
10722
+ entityTypes
10723
+ );
10672
10724
  const payload = {
10673
10725
  annotationId: aid,
10674
10726
  resourceId: rid,
10675
- motivation: annotation.motivation ?? "",
10676
- entityTypes: annotation.entityTypes ?? [],
10727
+ motivation,
10728
+ entityTypes,
10677
10729
  exactText
10678
10730
  };
10679
10731
  await this.vectorStore.upsertAnnotationVector(aid, embedding, payload);
@@ -10696,19 +10748,24 @@ var Smelter = class _Smelter {
10696
10748
  }
10697
10749
  }
10698
10750
  async processEvent(storedEvent) {
10699
- const event = storedEvent;
10700
- switch (event.type) {
10751
+ switch (storedEvent.type) {
10701
10752
  case "yield:created":
10702
- await this.handleResourceCreated(event);
10753
+ await this.handleResourceCreated(storedEvent);
10754
+ break;
10755
+ case "yield:updated":
10756
+ await this.handleResourceUpdated(storedEvent);
10757
+ break;
10758
+ case "yield:representation-added":
10759
+ await this.handleRepresentationAdded(storedEvent);
10703
10760
  break;
10704
10761
  case "mark:archived":
10705
- await this.handleResourceArchived(event);
10762
+ await this.handleResourceArchived(storedEvent);
10706
10763
  break;
10707
10764
  case "mark:added":
10708
- await this.handleAnnotationAdded(event);
10765
+ await this.handleAnnotationAdded(storedEvent);
10709
10766
  break;
10710
10767
  case "mark:removed":
10711
- await this.handleAnnotationRemoved(event);
10768
+ await this.handleAnnotationRemoved(storedEvent);
10712
10769
  break;
10713
10770
  }
10714
10771
  }
@@ -10743,18 +10800,13 @@ var Smelter = class _Smelter {
10743
10800
  dimensions,
10744
10801
  heapMB: Math.round(process.memoryUsage().heapUsed / 1024 / 1024)
10745
10802
  });
10746
- const embeddingChunks = chunks.map((text2, i) => {
10747
- this.eventBus.get("embedding:compute").next({
10748
- resourceId: rid,
10749
- chunkIndex: i,
10750
- chunkText: text2,
10751
- embedding: embeddings[i],
10752
- model,
10753
- dimensions
10754
- });
10755
- return { chunkIndex: i, text: text2, embedding: embeddings[i] };
10756
- });
10757
- this.logger.info("Smelter emitted events", {
10803
+ const embeddingChunks = chunks.map((text2, i) => ({
10804
+ chunkIndex: i,
10805
+ text: text2,
10806
+ embedding: embeddings[i]
10807
+ }));
10808
+ await this.embeddingStore.writeResourceChunks(rid, model, dimensions, embeddingChunks);
10809
+ this.logger.info("Smelter wrote resource embeddings to store", {
10758
10810
  resourceId: String(rid),
10759
10811
  chunkCount: embeddingChunks.length,
10760
10812
  heapMB: Math.round(process.memoryUsage().heapUsed / 1024 / 1024)
@@ -10766,13 +10818,51 @@ var Smelter = class _Smelter {
10766
10818
  heapMB: Math.round(process.memoryUsage().heapUsed / 1024 / 1024)
10767
10819
  });
10768
10820
  }
10821
+ /**
10822
+ * Re-embed a resource whose content has changed in-place.
10823
+ *
10824
+ * Used by yield:updated and yield:representation-added handlers. Reads the
10825
+ * current storageUri from the materialized view (which is updated before the
10826
+ * EventBus fires), deletes stale Qdrant vectors, and overwrites the
10827
+ * EmbeddingStore file with fresh chunks.
10828
+ */
10829
+ async reembedResource(rid) {
10830
+ const view = await this.viewStorage.get(rid);
10831
+ const storageUri = view?.resource.storageUri;
10832
+ if (!storageUri) return;
10833
+ const content = await this.contentStore.retrieve(storageUri);
10834
+ if (!content) return;
10835
+ const text = new TextDecoder().decode(content);
10836
+ if (!text.trim()) return;
10837
+ const chunks = chunkText(text, this.chunkingConfig);
10838
+ if (chunks.length === 0) return;
10839
+ const embeddings = await this.embeddingProvider.embedBatch(chunks);
10840
+ const model = this.embeddingProvider.model();
10841
+ const dimensions = this.embeddingProvider.dimensions();
10842
+ const embeddingChunks = chunks.map((chunkText2, i) => ({
10843
+ chunkIndex: i,
10844
+ text: chunkText2,
10845
+ embedding: embeddings[i]
10846
+ }));
10847
+ await this.embeddingStore.writeResourceChunks(rid, model, dimensions, embeddingChunks);
10848
+ await this.vectorStore.deleteResourceVectors(rid);
10849
+ await this.vectorStore.upsertResourceVectors(rid, embeddingChunks);
10850
+ this.logger.debug("Smelter re-embedded resource", {
10851
+ resourceId: String(rid),
10852
+ chunks: embeddingChunks.length
10853
+ });
10854
+ }
10855
+ async handleResourceUpdated(event) {
10856
+ await this.reembedResource(makeResourceId2(event.resourceId));
10857
+ }
10858
+ async handleRepresentationAdded(event) {
10859
+ await this.reembedResource(makeResourceId2(event.resourceId));
10860
+ }
10769
10861
  async handleResourceArchived(event) {
10770
10862
  const rid = makeResourceId2(event.resourceId);
10771
10863
  await this.vectorStore.deleteResourceVectors(rid);
10772
- this.eventBus.get("embedding:delete").next({ resourceId: rid });
10773
- this.logger.debug("Smelter deleted resource vectors", {
10774
- resourceId: String(rid)
10775
- });
10864
+ await this.embeddingStore.deleteResourceEmbeddings(rid);
10865
+ this.logger.debug("Smelter deleted resource vectors", { resourceId: String(rid) });
10776
10866
  }
10777
10867
  async handleAnnotationAdded(event) {
10778
10868
  await new Promise((resolve2) => setTimeout(resolve2, 0));
@@ -10790,20 +10880,25 @@ var Smelter = class _Smelter {
10790
10880
  heapMB: Math.round(process.memoryUsage().heapUsed / 1024 / 1024)
10791
10881
  });
10792
10882
  const embedding = await this.embeddingProvider.embed(exactText);
10793
- this.eventBus.get("embedding:compute").next({
10794
- resourceId: rid,
10795
- annotationId: aid,
10796
- chunkIndex: 0,
10797
- chunkText: exactText,
10883
+ const model = this.embeddingProvider.model();
10884
+ const dimensions = this.embeddingProvider.dimensions();
10885
+ const motivation = annotation.motivation ?? "";
10886
+ const entityTypes = annotation.entityTypes ?? [];
10887
+ await this.embeddingStore.writeAnnotationEmbedding(
10888
+ aid,
10889
+ rid,
10890
+ model,
10891
+ dimensions,
10892
+ exactText,
10798
10893
  embedding,
10799
- model: this.embeddingProvider.model(),
10800
- dimensions: this.embeddingProvider.dimensions()
10801
- });
10894
+ motivation,
10895
+ entityTypes
10896
+ );
10802
10897
  const payload = {
10803
10898
  annotationId: aid,
10804
10899
  resourceId: rid,
10805
- motivation: annotation.motivation ?? "",
10806
- entityTypes: annotation.entityTypes ?? [],
10900
+ motivation,
10901
+ entityTypes,
10807
10902
  exactText
10808
10903
  };
10809
10904
  await this.vectorStore.upsertAnnotationVector(aid, embedding, payload);
@@ -10815,16 +10910,134 @@ var Smelter = class _Smelter {
10815
10910
  async handleAnnotationRemoved(event) {
10816
10911
  const annotationId3 = String(event.payload.annotationId);
10817
10912
  if (!annotationId3) return;
10818
- const rid = makeResourceId2(event.resourceId);
10819
10913
  const aid = makeAnnotationId2(annotationId3);
10820
10914
  await this.vectorStore.deleteAnnotationVector(aid);
10821
- this.eventBus.get("embedding:delete").next({
10822
- resourceId: rid,
10823
- annotationId: aid
10824
- });
10825
- this.logger.debug("Smelter deleted annotation vector", {
10826
- annotationId: String(aid)
10827
- });
10915
+ await this.embeddingStore.deleteAnnotationEmbedding(aid);
10916
+ this.logger.debug("Smelter deleted annotation vector", { annotationId: String(aid) });
10917
+ }
10918
+ };
10919
+
10920
+ // src/embedding-store.ts
10921
+ import { promises as fs } from "fs";
10922
+ import * as path from "path";
10923
+ import { getShardPath } from "@semiont/event-sourcing";
10924
+ var EmbeddingStore = class {
10925
+ constructor(project) {
10926
+ this.project = project;
10927
+ }
10928
+ // ── Path helpers ────────────────────────────────────────────────────────────
10929
+ resourceFilePath(resourceId7) {
10930
+ const [ab, cd] = getShardPath(String(resourceId7));
10931
+ return path.join(this.project.embeddingsDir, ab, cd, `${String(resourceId7)}.jsonl`);
10932
+ }
10933
+ annotationFilePath(annotationId3) {
10934
+ const [ab, cd] = getShardPath(String(annotationId3));
10935
+ return path.join(this.project.embeddingsDir, ab, cd, `${String(annotationId3)}.json`);
10936
+ }
10937
+ // ── Resource embeddings ─────────────────────────────────────────────────────
10938
+ async writeResourceChunks(resourceId7, model, dimensions, chunks) {
10939
+ const filePath = this.resourceFilePath(resourceId7);
10940
+ await fs.mkdir(path.dirname(filePath), { recursive: true });
10941
+ const header = JSON.stringify({ model, dimensions });
10942
+ const lines = chunks.map(
10943
+ (c) => JSON.stringify({ chunkIndex: c.chunkIndex, text: c.text, embedding: c.embedding })
10944
+ );
10945
+ await fs.writeFile(filePath, [header, ...lines].join("\n") + "\n", "utf-8");
10946
+ }
10947
+ async readResourceEmbeddings(resourceId7) {
10948
+ const filePath = this.resourceFilePath(resourceId7);
10949
+ try {
10950
+ const content = await fs.readFile(filePath, "utf-8");
10951
+ const lines = content.trim().split("\n").filter((l) => l.trim());
10952
+ if (lines.length === 0) return null;
10953
+ const header = JSON.parse(lines[0]);
10954
+ const chunks = lines.slice(1).map((l) => JSON.parse(l));
10955
+ return { model: header.model, dimensions: header.dimensions, chunks };
10956
+ } catch (err) {
10957
+ if (err.code === "ENOENT") return null;
10958
+ throw err;
10959
+ }
10960
+ }
10961
+ async deleteResourceEmbeddings(resourceId7) {
10962
+ const filePath = this.resourceFilePath(resourceId7);
10963
+ try {
10964
+ await fs.unlink(filePath);
10965
+ } catch (err) {
10966
+ if (err.code !== "ENOENT") throw err;
10967
+ }
10968
+ }
10969
+ // ── Annotation embeddings ───────────────────────────────────────────────────
10970
+ async writeAnnotationEmbedding(annotationId3, resourceId7, model, dimensions, text, embedding, motivation, entityTypes) {
10971
+ const filePath = this.annotationFilePath(annotationId3);
10972
+ await fs.mkdir(path.dirname(filePath), { recursive: true });
10973
+ const data = {
10974
+ model,
10975
+ dimensions,
10976
+ resourceId: String(resourceId7),
10977
+ text,
10978
+ embedding,
10979
+ motivation,
10980
+ entityTypes
10981
+ };
10982
+ await fs.writeFile(filePath, JSON.stringify(data), "utf-8");
10983
+ }
10984
+ async readAnnotationEmbedding(annotationId3) {
10985
+ const filePath = this.annotationFilePath(annotationId3);
10986
+ try {
10987
+ const content = await fs.readFile(filePath, "utf-8");
10988
+ return JSON.parse(content);
10989
+ } catch (err) {
10990
+ if (err.code === "ENOENT") return null;
10991
+ throw err;
10992
+ }
10993
+ }
10994
+ async deleteAnnotationEmbedding(annotationId3) {
10995
+ const filePath = this.annotationFilePath(annotationId3);
10996
+ try {
10997
+ await fs.unlink(filePath);
10998
+ } catch (err) {
10999
+ if (err.code !== "ENOENT") throw err;
11000
+ }
11001
+ }
11002
+ // ── Scan ────────────────────────────────────────────────────────────────────
11003
+ /**
11004
+ * Scan embeddings directory and return all resource IDs (from *.jsonl files).
11005
+ */
11006
+ async getAllResourceIds() {
11007
+ return this.scanIds((name) => name.endsWith(".jsonl"), ".jsonl");
11008
+ }
11009
+ /**
11010
+ * Scan embeddings directory and return all annotation IDs (from *.json files).
11011
+ */
11012
+ async getAllAnnotationIds() {
11013
+ return this.scanIds((name) => name.endsWith(".json"), ".json");
11014
+ }
11015
+ async scanIds(filter, ext) {
11016
+ const base = this.project.embeddingsDir;
11017
+ try {
11018
+ await fs.access(base);
11019
+ } catch {
11020
+ return [];
11021
+ }
11022
+ const results = [];
11023
+ const scan = async (dir) => {
11024
+ let entries;
11025
+ try {
11026
+ entries = await fs.readdir(dir, { withFileTypes: true });
11027
+ } catch {
11028
+ return;
11029
+ }
11030
+ for (const entry of entries) {
11031
+ const full = path.join(dir, entry.name);
11032
+ if (entry.isDirectory()) {
11033
+ await scan(full);
11034
+ } else if (filter(entry.name)) {
11035
+ results.push(entry.name.slice(0, -ext.length));
11036
+ }
11037
+ }
11038
+ };
11039
+ await scan(base);
11040
+ return results;
10828
11041
  }
10829
11042
  };
10830
11043
 
@@ -10856,12 +11069,15 @@ async function createKnowledgeBase(eventStore, project, graphDb, eventBus, logge
10856
11069
  };
10857
11070
  if (options?.vectorStore && options?.embeddingProvider) {
10858
11071
  kb.vectors = options.vectorStore;
11072
+ const embeddingStore = new EmbeddingStore(project);
10859
11073
  kb.smelter = new Smelter(
10860
11074
  eventStore,
10861
11075
  eventBus,
10862
11076
  options.vectorStore,
10863
11077
  options.embeddingProvider,
10864
11078
  content,
11079
+ embeddingStore,
11080
+ views,
10865
11081
  logger.child({ component: "smelter" }),
10866
11082
  options.chunkingConfig
10867
11083
  );
@@ -12041,7 +12257,7 @@ For each candidate, output a line with the number and score, like:
12041
12257
  // src/stower.ts
12042
12258
  var import_rxjs5 = __toESM(require_cjs(), 1);
12043
12259
  var import_operators5 = __toESM(require_operators(), 1);
12044
- import { promises as fs } from "fs";
12260
+ import { promises as fs2 } from "fs";
12045
12261
  import { resourceId as resourceId3, userId as makeUserId, annotationId as makeAnnotationId4, CREATION_METHODS, generateUuid } from "@semiont/core";
12046
12262
  import { resolveStorageUri } from "@semiont/event-sourcing";
12047
12263
  var Stower = class {
@@ -12069,9 +12285,7 @@ var Stower = class {
12069
12285
  pipe("job:start", (e) => this.handleJobStart(e)),
12070
12286
  pipe("job:report-progress", (e) => this.handleJobReportProgress(e)),
12071
12287
  pipe("job:complete", (e) => this.handleJobComplete(e)),
12072
- pipe("job:fail", (e) => this.handleJobFail(e)),
12073
- pipe("embedding:compute", (e) => this.handleEmbeddingComputed(e)),
12074
- pipe("embedding:delete", (e) => this.handleEmbeddingDeleted(e))
12288
+ pipe("job:fail", (e) => this.handleJobFail(e))
12075
12289
  ).subscribe({
12076
12290
  error: (err) => this.logger.error("Stower pipeline error", { error: err })
12077
12291
  });
@@ -12275,7 +12489,7 @@ var Stower = class {
12275
12489
  if (event.storageUri) {
12276
12490
  const absPath = this.kb.content.resolveUri(event.storageUri);
12277
12491
  try {
12278
- await fs.access(absPath);
12492
+ await fs2.access(absPath);
12279
12493
  } catch {
12280
12494
  this.logger.warn("Unarchive failed: file not found at storageUri", { storageUri: event.storageUri });
12281
12495
  return;
@@ -12375,33 +12589,6 @@ var Stower = class {
12375
12589
  }
12376
12590
  });
12377
12591
  }
12378
- async handleEmbeddingComputed(event) {
12379
- await this.kb.eventStore.appendEvent({
12380
- type: "embedding:computed",
12381
- resourceId: resourceId3(event.resourceId),
12382
- userId: makeUserId("did:web:system:smelter"),
12383
- version: 1,
12384
- payload: {
12385
- annotationId: event.annotationId,
12386
- chunkIndex: event.chunkIndex,
12387
- chunkText: event.chunkText,
12388
- embedding: event.embedding,
12389
- model: event.model,
12390
- dimensions: event.dimensions
12391
- }
12392
- });
12393
- }
12394
- async handleEmbeddingDeleted(event) {
12395
- await this.kb.eventStore.appendEvent({
12396
- type: "embedding:deleted",
12397
- resourceId: resourceId3(event.resourceId),
12398
- userId: makeUserId("did:web:system:smelter"),
12399
- version: 1,
12400
- payload: {
12401
- annotationId: event.annotationId
12402
- }
12403
- });
12404
- }
12405
12592
  async stop() {
12406
12593
  this.subscription?.unsubscribe();
12407
12594
  this.subscription = null;
@@ -12412,25 +12599,25 @@ var Stower = class {
12412
12599
  // src/browser.ts
12413
12600
  var import_rxjs6 = __toESM(require_cjs(), 1);
12414
12601
  var import_operators6 = __toESM(require_operators(), 1);
12415
- import { promises as fs3 } from "fs";
12416
- import * as path2 from "path";
12602
+ import { promises as fs4 } from "fs";
12603
+ import * as path3 from "path";
12417
12604
  import { resourceId as resourceId4, annotationId } from "@semiont/core";
12418
12605
  import { getExactText as getExactText2, getTargetSource as getTargetSource2, getTargetSelector as getTargetSelector3, getResourceEntityTypes as getResourceEntityTypes5, getBodySource as getBodySource2 } from "@semiont/api-client";
12419
- import { EventQuery as EventQuery3 } from "@semiont/event-sourcing";
12606
+ import { EventQuery as EventQuery2 } from "@semiont/event-sourcing";
12420
12607
  import { getEntityTypes as getEntityTypes2 } from "@semiont/ontology";
12421
12608
 
12422
12609
  // src/views/entity-types-reader.ts
12423
- import { promises as fs2 } from "fs";
12424
- import * as path from "path";
12610
+ import { promises as fs3 } from "fs";
12611
+ import * as path2 from "path";
12425
12612
  async function readEntityTypesProjection(project) {
12426
- const entityTypesPath = path.join(
12613
+ const entityTypesPath = path2.join(
12427
12614
  project.stateDir,
12428
12615
  "projections",
12429
12616
  "__system__",
12430
12617
  "entitytypes.json"
12431
12618
  );
12432
12619
  try {
12433
- const content = await fs2.readFile(entityTypesPath, "utf-8");
12620
+ const content = await fs3.readFile(entityTypesPath, "utf-8");
12434
12621
  const projection = JSON.parse(content);
12435
12622
  return projection.entityTypes || [];
12436
12623
  } catch (error) {
@@ -12473,7 +12660,7 @@ var Browser = class {
12473
12660
  // ========================================================================
12474
12661
  async handleBrowseResource(event) {
12475
12662
  try {
12476
- const eventQuery = new EventQuery3(this.kb.eventStore.log.storage);
12663
+ const eventQuery = new EventQuery2(this.kb.eventStore.log.storage);
12477
12664
  const events = await eventQuery.getResourceEvents(resourceId4(event.resourceId));
12478
12665
  const stored = await this.kb.eventStore.views.materializer.materialize(events, resourceId4(event.resourceId));
12479
12666
  if (!stored) {
@@ -12586,7 +12773,7 @@ var Browser = class {
12586
12773
  }
12587
12774
  async handleBrowseEvents(event) {
12588
12775
  try {
12589
- const eventQuery = new EventQuery3(this.kb.eventStore.log.storage);
12776
+ const eventQuery = new EventQuery2(this.kb.eventStore.log.storage);
12590
12777
  const filters = {
12591
12778
  resourceId: resourceId4(event.resourceId)
12592
12779
  };
@@ -12626,7 +12813,7 @@ var Browser = class {
12626
12813
  });
12627
12814
  return;
12628
12815
  }
12629
- const eventQuery = new EventQuery3(this.kb.eventStore.log.storage);
12816
+ const eventQuery = new EventQuery2(this.kb.eventStore.log.storage);
12630
12817
  const allEvents = await eventQuery.queryEvents({ resourceId: resourceId4(event.resourceId) });
12631
12818
  const annotationEvents = allEvents.filter((stored) => {
12632
12819
  const p = stored.payload;
@@ -12715,8 +12902,8 @@ var Browser = class {
12715
12902
  async handleBrowseDirectory(event) {
12716
12903
  const { correlationId, path: reqPath, sort = "name" } = event;
12717
12904
  const projectRoot = this.project.root;
12718
- const resolved = path2.resolve(projectRoot, reqPath);
12719
- if (!resolved.startsWith(projectRoot + path2.sep) && resolved !== projectRoot) {
12905
+ const resolved = path3.resolve(projectRoot, reqPath);
12906
+ if (!resolved.startsWith(projectRoot + path3.sep) && resolved !== projectRoot) {
12720
12907
  this.eventBus.get("browse:directory-failed").next({
12721
12908
  correlationId,
12722
12909
  path: reqPath,
@@ -12726,7 +12913,7 @@ var Browser = class {
12726
12913
  }
12727
12914
  let dirents;
12728
12915
  try {
12729
- dirents = await fs3.readdir(resolved, { withFileTypes: true, encoding: "utf8" });
12916
+ dirents = await fs4.readdir(resolved, { withFileTypes: true, encoding: "utf8" });
12730
12917
  } catch (err) {
12731
12918
  const msg = err.code === "ENOENT" ? "path not found" : String(err);
12732
12919
  this.eventBus.get("browse:directory-failed").next({
@@ -12740,16 +12927,16 @@ var Browser = class {
12740
12927
  const allViews = await this.views.getAll();
12741
12928
  const prefix = `file://${resolved}`;
12742
12929
  const viewsByUri = new Map(
12743
- allViews.filter((v) => v.resource.storageUri?.startsWith(prefix + "/") || v.resource.storageUri?.startsWith(prefix + path2.sep)).map((v) => [v.resource.storageUri, v])
12930
+ allViews.filter((v) => v.resource.storageUri?.startsWith(prefix + "/") || v.resource.storageUri?.startsWith(prefix + path3.sep)).map((v) => [v.resource.storageUri, v])
12744
12931
  );
12745
12932
  const entries = [];
12746
12933
  for (const dirent of visible) {
12747
- const entryPath = path2.join(resolved, dirent.name);
12748
- const relPath = path2.relative(projectRoot, entryPath);
12934
+ const entryPath = path3.join(resolved, dirent.name);
12935
+ const relPath = path3.relative(projectRoot, entryPath);
12749
12936
  if (dirent.isDirectory()) {
12750
12937
  let mtime = (/* @__PURE__ */ new Date(0)).toISOString();
12751
12938
  try {
12752
- const stat = await fs3.stat(entryPath);
12939
+ const stat = await fs4.stat(entryPath);
12753
12940
  mtime = stat.mtime.toISOString();
12754
12941
  } catch {
12755
12942
  }
@@ -12759,7 +12946,7 @@ var Browser = class {
12759
12946
  let size = 0;
12760
12947
  let mtime = (/* @__PURE__ */ new Date(0)).toISOString();
12761
12948
  try {
12762
- const stat = await fs3.stat(entryPath);
12949
+ const stat = await fs4.stat(entryPath);
12763
12950
  size = stat.size;
12764
12951
  mtime = stat.mtime.toISOString();
12765
12952
  } catch {
@@ -13430,9 +13617,7 @@ async function exportBackup(options, output) {
13430
13617
  for (const [streamId, events] of streamData) {
13431
13618
  streamSummaries.push({
13432
13619
  stream: streamId,
13433
- eventCount: events.length,
13434
- firstChecksum: events[0].metadata.checksum || "",
13435
- lastChecksum: events[events.length - 1].metadata.checksum || ""
13620
+ eventCount: events.length
13436
13621
  });
13437
13622
  }
13438
13623
  const manifestHeader = {
@@ -13501,26 +13686,11 @@ async function replayEventStream(jsonl, eventBus, resolveBlob, contentStore, log
13501
13686
  annotationsCreated: 0,
13502
13687
  entityTypesAdded: 0
13503
13688
  };
13504
- let hashChainValid = true;
13505
- for (let i = 1; i < storedEvents.length; i++) {
13506
- const prev = storedEvents[i - 1];
13507
- const curr = storedEvents[i];
13508
- if (curr.metadata.prevEventHash && prev.metadata.checksum) {
13509
- if (curr.metadata.prevEventHash !== prev.metadata.checksum) {
13510
- logger?.warn("Hash chain break", {
13511
- index: i,
13512
- expected: prev.metadata.checksum,
13513
- got: curr.metadata.prevEventHash
13514
- });
13515
- hashChainValid = false;
13516
- }
13517
- }
13518
- }
13519
13689
  for (const stored of storedEvents) {
13520
13690
  await replayEvent(stored, eventBus, resolveBlob, contentStore, stats, logger);
13521
13691
  stats.eventsReplayed++;
13522
13692
  }
13523
- return { stats, hashChainValid };
13693
+ return { stats };
13524
13694
  }
13525
13695
  async function replayEvent(event, eventBus, resolveBlob, contentStore, stats, logger) {
13526
13696
  switch (event.type) {
@@ -13749,7 +13919,6 @@ async function importBackup(archive, options) {
13749
13919
  const resolveBlob = buildBlobResolver(entries);
13750
13920
  const systemData = entries.get(".semiont/events/__system__.jsonl");
13751
13921
  let stats = { eventsReplayed: 0, resourcesCreated: 0, annotationsCreated: 0, entityTypesAdded: 0 };
13752
- let hashChainValid = true;
13753
13922
  if (systemData) {
13754
13923
  const result = await replayEventStream(
13755
13924
  systemData.toString("utf8"),
@@ -13759,7 +13928,6 @@ async function importBackup(archive, options) {
13759
13928
  logger
13760
13929
  );
13761
13930
  stats = mergeStats(stats, result.stats);
13762
- if (!result.hashChainValid) hashChainValid = false;
13763
13931
  }
13764
13932
  for (const summary of streamSummaries) {
13765
13933
  if (summary.stream === "__system__") continue;
@@ -13776,10 +13944,9 @@ async function importBackup(archive, options) {
13776
13944
  logger
13777
13945
  );
13778
13946
  stats = mergeStats(stats, result.stats);
13779
- if (!result.hashChainValid) hashChainValid = false;
13780
13947
  }
13781
- logger?.info("Backup import complete", { ...stats, hashChainValid });
13782
- return { manifest: header, stats, hashChainValid };
13948
+ logger?.info("Backup import complete", { ...stats });
13949
+ return { manifest: header, stats };
13783
13950
  }
13784
13951
  function mergeStats(a, b) {
13785
13952
  return {