@semiont/make-meaning 0.5.6 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,9 @@
1
- import { HttpTransport } from '@semiont/api-client';
2
- import { createTomlConfigLoader, accessToken, baseUrl as baseUrl$1, burstBuffer, resourceId, getTargetSelector, getExactText, annotationId } from '@semiont/core';
1
+ import { createTomlConfigLoader, accessToken, baseUrl as baseUrl$1, burstBuffer, errField, resourceId, textExtractionOf, decodeRepresentation, getTargetSelector, getExactText, annotationId, getPrimaryMediaType, getPrimaryRepresentation } from '@semiont/core';
2
+ import { calculateChecksum } from '@semiont/content';
3
3
  import { createEmbeddingProvider, createVectorStore, chunkText } from '@semiont/vectors';
4
+ import { registerVectorIndexSizeProvider, withActorSpan } from '@semiont/observability';
5
+ import { busRequest } from '@semiont/sdk';
6
+ import { HttpTransport, HttpContentTransport } from '@semiont/http-transport';
4
7
  import { createServer } from 'http';
5
8
  import { existsSync, readFileSync } from 'fs';
6
9
  import { homedir } from 'os';
@@ -14,7 +17,11 @@ var __getOwnPropNames = Object.getOwnPropertyNames;
14
17
  var __getProtoOf = Object.getPrototypeOf;
15
18
  var __hasOwnProp = Object.prototype.hasOwnProperty;
16
19
  var __commonJS = (cb, mod) => function __require() {
17
- return mod || (0, cb[__getOwnPropNames(cb)[0]])((mod = { exports: {} }).exports, mod), mod.exports;
20
+ try {
21
+ return mod || (0, cb[__getOwnPropNames(cb)[0]])((mod = { exports: {} }).exports, mod), mod.exports;
22
+ } catch (e) {
23
+ throw mod = 0, e;
24
+ }
18
25
  };
19
26
  var __copyProps = (to, from2, except, desc) => {
20
27
  if (from2 && typeof from2 === "object" || typeof from2 === "function") {
@@ -699,19 +706,19 @@ var require_Observable = __commonJS({
699
706
  var config_1 = require_config();
700
707
  var isFunction_1 = require_isFunction();
701
708
  var errorContext_1 = require_errorContext();
702
- var Observable2 = (function() {
703
- function Observable3(subscribe) {
709
+ var Observable3 = (function() {
710
+ function Observable4(subscribe) {
704
711
  if (subscribe) {
705
712
  this._subscribe = subscribe;
706
713
  }
707
714
  }
708
- Observable3.prototype.lift = function(operator) {
709
- var observable = new Observable3();
715
+ Observable4.prototype.lift = function(operator) {
716
+ var observable = new Observable4();
710
717
  observable.source = this;
711
718
  observable.operator = operator;
712
719
  return observable;
713
720
  };
714
- Observable3.prototype.subscribe = function(observerOrNext, error, complete) {
721
+ Observable4.prototype.subscribe = function(observerOrNext, error, complete) {
715
722
  var _this = this;
716
723
  var subscriber = isSubscriber(observerOrNext) ? observerOrNext : new Subscriber_1.SafeSubscriber(observerOrNext, error, complete);
717
724
  errorContext_1.errorContext(function() {
@@ -720,14 +727,14 @@ var require_Observable = __commonJS({
720
727
  });
721
728
  return subscriber;
722
729
  };
723
- Observable3.prototype._trySubscribe = function(sink) {
730
+ Observable4.prototype._trySubscribe = function(sink) {
724
731
  try {
725
732
  return this._subscribe(sink);
726
733
  } catch (err) {
727
734
  sink.error(err);
728
735
  }
729
736
  };
730
- Observable3.prototype.forEach = function(next, promiseCtor) {
737
+ Observable4.prototype.forEach = function(next, promiseCtor) {
731
738
  var _this = this;
732
739
  promiseCtor = getPromiseCtor(promiseCtor);
733
740
  return new promiseCtor(function(resolve, reject) {
@@ -746,21 +753,21 @@ var require_Observable = __commonJS({
746
753
  _this.subscribe(subscriber);
747
754
  });
748
755
  };
749
- Observable3.prototype._subscribe = function(subscriber) {
756
+ Observable4.prototype._subscribe = function(subscriber) {
750
757
  var _a;
751
758
  return (_a = this.source) === null || _a === void 0 ? void 0 : _a.subscribe(subscriber);
752
759
  };
753
- Observable3.prototype[observable_1.observable] = function() {
760
+ Observable4.prototype[observable_1.observable] = function() {
754
761
  return this;
755
762
  };
756
- Observable3.prototype.pipe = function() {
763
+ Observable4.prototype.pipe = function() {
757
764
  var operations = [];
758
765
  for (var _i = 0; _i < arguments.length; _i++) {
759
766
  operations[_i] = arguments[_i];
760
767
  }
761
768
  return pipe_1.pipeFromArray(operations)(this);
762
769
  };
763
- Observable3.prototype.toPromise = function(promiseCtor) {
770
+ Observable4.prototype.toPromise = function(promiseCtor) {
764
771
  var _this = this;
765
772
  promiseCtor = getPromiseCtor(promiseCtor);
766
773
  return new promiseCtor(function(resolve, reject) {
@@ -774,12 +781,12 @@ var require_Observable = __commonJS({
774
781
  });
775
782
  });
776
783
  };
777
- Observable3.create = function(subscribe) {
778
- return new Observable3(subscribe);
784
+ Observable4.create = function(subscribe) {
785
+ return new Observable4(subscribe);
779
786
  };
780
- return Observable3;
787
+ return Observable4;
781
788
  })();
782
- exports.Observable = Observable2;
789
+ exports.Observable = Observable3;
783
790
  function getPromiseCtor(promiseCtor) {
784
791
  var _a;
785
792
  return (_a = promiseCtor !== null && promiseCtor !== void 0 ? promiseCtor : config_1.config.Promise) !== null && _a !== void 0 ? _a : Promise;
@@ -9716,8 +9723,7 @@ var require_operators = __commonJS({
9716
9723
  });
9717
9724
 
9718
9725
  // src/smelter-main.ts
9719
- var import_rxjs2 = __toESM(require_cjs());
9720
- var import_operators2 = __toESM(require_operators());
9726
+ var import_rxjs3 = __toESM(require_cjs());
9721
9727
 
9722
9728
  // src/smelter-actor-state-unit.ts
9723
9729
  var import_rxjs = __toESM(require_cjs());
@@ -9746,7 +9752,6 @@ function createSmelterActorStateUnit(options) {
9746
9752
  );
9747
9753
  return {
9748
9754
  events$,
9749
- emit: (channel, payload) => bus.emit(channel, payload),
9750
9755
  start: () => {
9751
9756
  if (started) return;
9752
9757
  started = true;
@@ -9758,8 +9763,493 @@ function createSmelterActorStateUnit(options) {
9758
9763
  };
9759
9764
  }
9760
9765
 
9761
- // src/smelter-main.ts
9762
- var import_rxjs3 = __toESM(require_cjs());
9766
+ // src/smelter.ts
9767
+ var import_rxjs2 = __toESM(require_cjs());
9768
+ var import_operators2 = __toESM(require_operators());
9769
+
9770
+ // src/batch-utils.ts
9771
+ function partitionByType(events) {
9772
+ const runs = [];
9773
+ let currentRun = [];
9774
+ for (const event of events) {
9775
+ if (currentRun.length > 0 && currentRun[0].type !== event.type) {
9776
+ runs.push(currentRun);
9777
+ currentRun = [];
9778
+ }
9779
+ currentRun.push(event);
9780
+ }
9781
+ if (currentRun.length > 0) runs.push(currentRun);
9782
+ return runs;
9783
+ }
9784
+
9785
+ // src/smelter.ts
9786
+ function isWorkItem(input) {
9787
+ return input.type.startsWith("smelt:");
9788
+ }
9789
+ var Smelter = class _Smelter {
9790
+ constructor(events$, vectorStore, embeddingProvider, content, bus, chunkingConfig2, timing, logger2) {
9791
+ this.events$ = events$;
9792
+ this.vectorStore = vectorStore;
9793
+ this.embeddingProvider = embeddingProvider;
9794
+ this.content = content;
9795
+ this.bus = bus;
9796
+ this.chunkingConfig = chunkingConfig2;
9797
+ this.timing = timing;
9798
+ this.logger = logger2;
9799
+ }
9800
+ events$;
9801
+ vectorStore;
9802
+ embeddingProvider;
9803
+ content;
9804
+ bus;
9805
+ chunkingConfig;
9806
+ timing;
9807
+ logger;
9808
+ static RECONCILE_PAGE_SIZE = 200;
9809
+ /** Bound on concurrently in-flight reconcile work — a cold rebuild must not fan out unbounded embedding calls. */
9810
+ static RECONCILE_WAVE = 8;
9811
+ eventSubject = new import_rxjs2.Subject();
9812
+ sourceSubscription = null;
9813
+ pipelineSubscription = null;
9814
+ _eventsProcessed = 0;
9815
+ _reconcileState = { phase: "pending" };
9816
+ workDone = 0;
9817
+ workWaiter = null;
9818
+ get eventsProcessed() {
9819
+ return this._eventsProcessed;
9820
+ }
9821
+ get reconcileState() {
9822
+ return this._reconcileState;
9823
+ }
9824
+ initialize() {
9825
+ this.pipelineSubscription = this.eventSubject.pipe(
9826
+ (0, import_operators2.groupBy)((e) => e.resourceId ?? "__unknown__"),
9827
+ (0, import_operators2.mergeMap)(
9828
+ (group) => group.pipe(
9829
+ burstBuffer({
9830
+ burstWindowMs: this.timing.burstWindowMs,
9831
+ maxBatchSize: this.timing.maxBatchSize,
9832
+ idleTimeoutMs: this.timing.idleTimeoutMs
9833
+ }),
9834
+ (0, import_operators2.concatMap)((inputOrBatch) => {
9835
+ if (Array.isArray(inputOrBatch)) {
9836
+ return (0, import_rxjs2.from)(
9837
+ withActorSpan("smelter", "batch", async () => {
9838
+ this._eventsProcessed += await this.processBatch(inputOrBatch);
9839
+ }, { "batch.size": inputOrBatch.length })
9840
+ );
9841
+ }
9842
+ return (0, import_rxjs2.from)(
9843
+ withActorSpan("smelter", inputOrBatch.type, async () => {
9844
+ const ok = await this.safeProcessEvent(inputOrBatch);
9845
+ if (isWorkItem(inputOrBatch)) this.noteWorkDone(1);
9846
+ else if (ok) this._eventsProcessed++;
9847
+ })
9848
+ );
9849
+ })
9850
+ )
9851
+ )
9852
+ ).subscribe({
9853
+ error: (err) => this.logger.error("Smelter pipeline error", { error: errField(err) })
9854
+ });
9855
+ this.sourceSubscription = this.events$.subscribe((event) => {
9856
+ this.logger.debug("Bus event received", { type: event.type, resourceId: event.resourceId });
9857
+ this.eventSubject.next(event);
9858
+ });
9859
+ this.logger.info("Smelter pipeline initialized");
9860
+ }
9861
+ stop() {
9862
+ this.sourceSubscription?.unsubscribe();
9863
+ this.sourceSubscription = null;
9864
+ this.pipelineSubscription?.unsubscribe();
9865
+ this.pipelineSubscription = null;
9866
+ this.eventSubject.complete();
9867
+ this.logger.info("Smelter stopped");
9868
+ }
9869
+ noteWorkDone(count) {
9870
+ this.workDone += count;
9871
+ if (this.workWaiter && this.workDone >= this.workWaiter.target) {
9872
+ this.workWaiter.resolve();
9873
+ this.workWaiter = null;
9874
+ }
9875
+ }
9876
+ /**
9877
+ * Returns the number of WIRE events processed without error (the S9b
9878
+ * oracle) — `smelt:*` work-item runs tick the drain counter instead.
9879
+ */
9880
+ async processBatch(events) {
9881
+ let wireProcessed = 0;
9882
+ for (const run of partitionByType(events)) {
9883
+ const workRun = isWorkItem(run[0]);
9884
+ try {
9885
+ if (run.length === 1) {
9886
+ const ok = await this.safeProcessEvent(run[0]);
9887
+ if (ok && !workRun) wireProcessed++;
9888
+ } else {
9889
+ const processed = await this.applyBatchByType(run);
9890
+ if (!workRun) wireProcessed += processed;
9891
+ }
9892
+ } catch (error) {
9893
+ this.logger.error("Smelter failed to process batch run", {
9894
+ eventType: run[0].type,
9895
+ runSize: run.length,
9896
+ error: errField(error)
9897
+ });
9898
+ } finally {
9899
+ if (workRun) this.noteWorkDone(run.length);
9900
+ }
9901
+ }
9902
+ return wireProcessed;
9903
+ }
9904
+ /**
9905
+ * Batch-optimized processing for consecutive events of the same type.
9906
+ * Returns the number of events processed without error.
9907
+ */
9908
+ async applyBatchByType(events) {
9909
+ switch (events[0].type) {
9910
+ case "yield:created":
9911
+ case "smelt:embed":
9912
+ return this.batchResourceCreated(events);
9913
+ case "mark:added":
9914
+ case "smelt:embed-annotation":
9915
+ return this.batchAnnotationAdded(events);
9916
+ default: {
9917
+ let processed = 0;
9918
+ for (const event of events) {
9919
+ if (await this.safeProcessEvent(event)) processed++;
9920
+ }
9921
+ return processed;
9922
+ }
9923
+ }
9924
+ }
9925
+ /** Returns true if the input was processed without error. */
9926
+ async safeProcessEvent(event) {
9927
+ try {
9928
+ await this.processEvent(event);
9929
+ return true;
9930
+ } catch (err) {
9931
+ this.logger.error("Smelter failed to process event", {
9932
+ type: event.type,
9933
+ resourceId: event.resourceId,
9934
+ error: errField(err)
9935
+ });
9936
+ return false;
9937
+ }
9938
+ }
9939
+ async processEvent(event) {
9940
+ switch (event.type) {
9941
+ case "yield:created":
9942
+ await this.embedResource(event, "Indexed resource");
9943
+ break;
9944
+ case "yield:updated":
9945
+ case "yield:representation-added":
9946
+ await this.embedResource(event, "Re-embedded resource");
9947
+ break;
9948
+ case "mark:archived":
9949
+ await this.handleResourceArchived(event);
9950
+ break;
9951
+ case "mark:added":
9952
+ await this.handleAnnotationAdded(event);
9953
+ break;
9954
+ case "mark:removed":
9955
+ await this.handleAnnotationRemoved(event);
9956
+ break;
9957
+ // Reconcile work items — same handlers, distinct provenance.
9958
+ case "smelt:embed":
9959
+ await this.embedResource(event, "Reconcile-indexed resource");
9960
+ break;
9961
+ case "smelt:purge":
9962
+ await this.handleResourcePurge(event);
9963
+ break;
9964
+ case "smelt:embed-annotation":
9965
+ await this.handleAnnotationAdded(event);
9966
+ break;
9967
+ case "smelt:purge-annotation":
9968
+ await this.handleAnnotationRemoved(event);
9969
+ break;
9970
+ }
9971
+ }
9972
+ async handleResourcePurge(event) {
9973
+ const rid = event.resourceId;
9974
+ if (!rid) return;
9975
+ await this.vectorStore.deleteResourceVectors(resourceId(rid));
9976
+ this.logger.info("Reconcile deleted orphan resource vectors", { resourceId: rid });
9977
+ }
9978
+ /**
9979
+ * Resolve a resource's embeddable text: bytes via the content transport,
9980
+ * gated to media types that decode as text, decoded charset-aware. The
9981
+ * checksum is over the raw bytes actually read — stamped onto the vectors
9982
+ * so reconciliation can compare against the catalog's claim (S12). Returns
9983
+ * null (logged) when the resource doesn't decode as text, is unavailable,
9984
+ * or is empty — callers skip it.
9985
+ */
9986
+ async fetchEmbeddableText(resourceId$1) {
9987
+ try {
9988
+ const { data, contentType } = await this.content.getBinary(resourceId(resourceId$1));
9989
+ if (textExtractionOf(contentType) !== "decode") {
9990
+ this.logger.debug("Skipping resource that does not decode as text", { resourceId: resourceId$1, contentType });
9991
+ return null;
9992
+ }
9993
+ const bytes = Buffer.from(data);
9994
+ const text = decodeRepresentation(bytes, contentType);
9995
+ return text.trim() ? { text, checksum: calculateChecksum(bytes) } : null;
9996
+ } catch (error) {
9997
+ this.logger.warn("Content unavailable for embedding", { resourceId: resourceId$1, error: errField(error) });
9998
+ return null;
9999
+ }
10000
+ }
10001
+ async embedResource(event, logMessage) {
10002
+ const rid = event.resourceId;
10003
+ if (!rid) return;
10004
+ const fetched = await this.fetchEmbeddableText(rid);
10005
+ if (!fetched) return;
10006
+ const chunks = chunkText(fetched.text, this.chunkingConfig);
10007
+ if (chunks.length === 0) return;
10008
+ const embeddings = await this.embeddingProvider.embedBatch(chunks);
10009
+ const embeddingChunks = chunks.map((t, i) => ({
10010
+ chunkIndex: i,
10011
+ text: t,
10012
+ embedding: embeddings[i]
10013
+ }));
10014
+ await this.vectorStore.upsertResourceVectors(resourceId(rid), embeddingChunks, fetched.checksum);
10015
+ this.logger.info(logMessage, { resourceId: rid, chunks: chunks.length });
10016
+ }
10017
+ async handleResourceArchived(event) {
10018
+ const rid = event.resourceId;
10019
+ if (!rid) return;
10020
+ await this.vectorStore.deleteResourceVectors(resourceId(rid));
10021
+ await this.vectorStore.deleteAnnotationVectorsForResource(resourceId(rid));
10022
+ this.logger.info("Deleted vectors for archived resource", { resourceId: rid });
10023
+ }
10024
+ async handleAnnotationAdded(event) {
10025
+ const annotation = event.payload.annotation;
10026
+ if (!annotation?.id) return;
10027
+ const rid = event.resourceId;
10028
+ if (!rid) return;
10029
+ const selector = getTargetSelector(annotation.target);
10030
+ const exactText = getExactText(selector);
10031
+ if (!exactText?.trim()) return;
10032
+ const aid = annotationId(annotation.id);
10033
+ const embedding2 = await this.embeddingProvider.embed(exactText);
10034
+ const payload = {
10035
+ annotationId: aid,
10036
+ resourceId: resourceId(rid),
10037
+ motivation: annotation.motivation ?? "",
10038
+ entityTypes: annotation.entityTypes ?? [],
10039
+ exactText
10040
+ };
10041
+ await this.vectorStore.upsertAnnotationVector(aid, embedding2, payload);
10042
+ this.logger.info("Indexed annotation", { annotationId: String(aid) });
10043
+ }
10044
+ async handleAnnotationRemoved(event) {
10045
+ const annotationId$1 = event.payload.annotationId;
10046
+ if (!annotationId$1) return;
10047
+ const aid = annotationId(annotationId$1);
10048
+ await this.vectorStore.deleteAnnotationVector(aid);
10049
+ this.logger.info("Deleted annotation vector", { annotationId: annotationId$1 });
10050
+ }
10051
+ /**
10052
+ * Batch-embed chunks from multiple yield:created events in a single
10053
+ * embedBatch() call, then index per resource.
10054
+ */
10055
+ async batchResourceCreated(events) {
10056
+ const resourceData = [];
10057
+ const allChunks = [];
10058
+ for (const event of events) {
10059
+ const rid = event.resourceId;
10060
+ if (!rid) continue;
10061
+ const fetched = await this.fetchEmbeddableText(rid);
10062
+ if (!fetched) continue;
10063
+ const chunks = chunkText(fetched.text, this.chunkingConfig);
10064
+ if (chunks.length === 0) continue;
10065
+ resourceData.push({ rid: resourceId(rid), chunks, checksum: fetched.checksum });
10066
+ allChunks.push(...chunks);
10067
+ }
10068
+ if (allChunks.length === 0) return events.length;
10069
+ const allEmbeddings = await this.embeddingProvider.embedBatch(allChunks);
10070
+ let offset = 0;
10071
+ for (const { rid, chunks, checksum } of resourceData) {
10072
+ const embeddingChunks = chunks.map((t, i) => ({
10073
+ chunkIndex: i,
10074
+ text: t,
10075
+ embedding: allEmbeddings[offset + i]
10076
+ }));
10077
+ await this.vectorStore.upsertResourceVectors(rid, embeddingChunks, checksum);
10078
+ this.logger.info("Batch-indexed resource", { resourceId: String(rid), chunks: chunks.length });
10079
+ offset += chunks.length;
10080
+ }
10081
+ return events.length;
10082
+ }
10083
+ /**
10084
+ * Batch-embed exact texts from multiple mark:added events in a single
10085
+ * embedBatch() call, then index per annotation.
10086
+ */
10087
+ async batchAnnotationAdded(events) {
10088
+ const annotationData = [];
10089
+ for (const event of events) {
10090
+ const annotation = event.payload.annotation;
10091
+ if (!annotation?.id) continue;
10092
+ const rid = event.resourceId;
10093
+ if (!rid) continue;
10094
+ const selector = getTargetSelector(annotation.target);
10095
+ const exactText = getExactText(selector);
10096
+ if (!exactText?.trim()) continue;
10097
+ annotationData.push({
10098
+ rid: resourceId(rid),
10099
+ aid: annotationId(annotation.id),
10100
+ exactText,
10101
+ motivation: annotation.motivation ?? "",
10102
+ entityTypes: annotation.entityTypes ?? []
10103
+ });
10104
+ }
10105
+ if (annotationData.length === 0) return events.length;
10106
+ const allEmbeddings = await this.embeddingProvider.embedBatch(
10107
+ annotationData.map((a) => a.exactText)
10108
+ );
10109
+ for (let i = 0; i < annotationData.length; i++) {
10110
+ const { rid, aid, exactText, motivation, entityTypes } = annotationData[i];
10111
+ const payload = {
10112
+ annotationId: aid,
10113
+ resourceId: rid,
10114
+ motivation,
10115
+ entityTypes,
10116
+ exactText
10117
+ };
10118
+ await this.vectorStore.upsertAnnotationVector(aid, allEmbeddings[i], payload);
10119
+ this.logger.info("Batch-indexed annotation", { annotationId: String(aid) });
10120
+ }
10121
+ return events.length;
10122
+ }
10123
+ // ── Reconciliation ───────────────────────────────────────────────────
10124
+ /**
10125
+ * Reconcile the vector store against the KS catalog.
10126
+ *
10127
+ * Lists what IS indexed (via the store's id enumeration) and what SHOULD
10128
+ * be (non-archived resources with embeddable media types, plus their
10129
+ * exact-text annotations, via the `browse:*` RPC channels), then plans the
10130
+ * diff as `smelt:*` work items — embeds for what's missing, purges for
10131
+ * what shouldn't be there — and drains them through the pipeline mailbox.
10132
+ * Work items share the per-resource lanes with live events, so a reconcile
10133
+ * re-embed can never interleave with (or stale-overwrite) live processing
10134
+ * of the same resource (axioms S1/S2). Waves of RECONCILE_WAVE bound how
10135
+ * many embedding calls a cold rebuild has in flight.
10136
+ *
10137
+ * Call after the live subscription is attached so nothing falls in the
10138
+ * gap. The index snapshot is taken BEFORE the catalog listing so a
10139
+ * resource indexed by a live event mid-reconcile is never mistaken for an
10140
+ * orphan; convergence holds because every upsert replaces a resource's
10141
+ * full vector set from current content.
10142
+ */
10143
+ async reconcile() {
10144
+ if (!this.pipelineSubscription) {
10145
+ throw new Error("Smelter.reconcile() requires initialize() \u2014 work items drain through the pipeline");
10146
+ }
10147
+ this._reconcileState = { phase: "running" };
10148
+ try {
10149
+ const [indexedResources, indexedAnnotations] = await Promise.all([
10150
+ this.vectorStore.listResourceChecksums(),
10151
+ this.vectorStore.listAnnotationIds()
10152
+ ]);
10153
+ const resources = await this.listAllResources();
10154
+ this.logger.info("Reconcile started", {
10155
+ indexedResources: indexedResources.size,
10156
+ indexedAnnotations: indexedAnnotations.size,
10157
+ liveResources: resources.length
10158
+ });
10159
+ const embeddable = /* @__PURE__ */ new Map();
10160
+ for (const resource of resources) {
10161
+ const mediaType = getPrimaryMediaType(resource);
10162
+ if (resource["@id"] && mediaType && textExtractionOf(mediaType) === "decode") {
10163
+ embeddable.set(resource["@id"], getPrimaryRepresentation(resource)?.checksum);
10164
+ }
10165
+ }
10166
+ const work = [];
10167
+ for (const rid of indexedResources.keys()) {
10168
+ if (!embeddable.has(rid)) work.push({ type: "smelt:purge", resourceId: rid, payload: {} });
10169
+ }
10170
+ for (const [rid, catalogChecksum] of embeddable) {
10171
+ if (!indexedResources.has(rid)) {
10172
+ work.push({ type: "smelt:embed", resourceId: rid, payload: {} });
10173
+ } else if (catalogChecksum !== void 0 && indexedResources.get(rid) !== catalogChecksum) {
10174
+ work.push({ type: "smelt:embed", resourceId: rid, payload: {} });
10175
+ }
10176
+ }
10177
+ const liveAnnotationIds = /* @__PURE__ */ new Set();
10178
+ for (const resource of resources) {
10179
+ const rid = resource["@id"];
10180
+ if (!rid) continue;
10181
+ const { annotations } = await busRequest(
10182
+ this.bus,
10183
+ "browse:annotations-requested",
10184
+ { resourceId: rid },
10185
+ "browse:annotations-result",
10186
+ "browse:annotations-failed"
10187
+ );
10188
+ for (const annotation of annotations) {
10189
+ const exactText = getExactText(getTargetSelector(annotation.target));
10190
+ if (!annotation.id || !exactText?.trim()) continue;
10191
+ liveAnnotationIds.add(annotation.id);
10192
+ if (!indexedAnnotations.has(annotation.id)) {
10193
+ work.push({ type: "smelt:embed-annotation", resourceId: rid, payload: { annotation } });
10194
+ }
10195
+ }
10196
+ }
10197
+ for (const aid of indexedAnnotations) {
10198
+ if (!liveAnnotationIds.has(aid)) {
10199
+ work.push({ type: "smelt:purge-annotation", resourceId: aid, payload: { annotationId: aid } });
10200
+ }
10201
+ }
10202
+ await this.drain(work);
10203
+ const summary = {
10204
+ resourcesEmbedded: work.filter((w) => w.type === "smelt:embed").length,
10205
+ resourceVectorsDeleted: work.filter((w) => w.type === "smelt:purge").length,
10206
+ annotationsEmbedded: work.filter((w) => w.type === "smelt:embed-annotation").length,
10207
+ annotationVectorsDeleted: work.filter((w) => w.type === "smelt:purge-annotation").length
10208
+ };
10209
+ this._reconcileState = { phase: "done", summary };
10210
+ this.logger.info("Reconcile complete", { ...summary });
10211
+ return summary;
10212
+ } catch (error) {
10213
+ this._reconcileState = {
10214
+ phase: "failed",
10215
+ error: error instanceof Error ? error.message : String(error)
10216
+ };
10217
+ this.logger.error("Reconcile failed", { error: errField(error) });
10218
+ throw error;
10219
+ }
10220
+ }
10221
+ /**
10222
+ * Enqueue planner work through the mailbox in bounded waves and await
10223
+ * completion. The pipeline ticks `noteWorkDone` for every consumed work
10224
+ * item (success or failure — failures are logged like any live event), so
10225
+ * each wave's waiter resolves exactly when its items have been processed.
10226
+ */
10227
+ async drain(work) {
10228
+ for (let i = 0; i < work.length; i += _Smelter.RECONCILE_WAVE) {
10229
+ const wave = work.slice(i, i + _Smelter.RECONCILE_WAVE);
10230
+ const done = new Promise((resolve) => {
10231
+ this.workWaiter = { target: this.workDone + wave.length, resolve };
10232
+ });
10233
+ for (const item of wave) this.eventSubject.next(item);
10234
+ await done;
10235
+ }
10236
+ }
10237
+ /** Page through `browse:resources-requested` until the catalog is exhausted. */
10238
+ async listAllResources() {
10239
+ const all = [];
10240
+ for (; ; ) {
10241
+ const page = await busRequest(
10242
+ this.bus,
10243
+ "browse:resources-requested",
10244
+ { archived: false, offset: all.length, limit: _Smelter.RECONCILE_PAGE_SIZE },
10245
+ "browse:resources-result",
10246
+ "browse:resources-failed"
10247
+ );
10248
+ all.push(...page.resources);
10249
+ if (page.resources.length === 0 || all.length >= page.total) return all;
10250
+ }
10251
+ }
10252
+ };
9763
10253
  var configPath = join(homedir(), ".semiontconfig");
9764
10254
  var tomlReader = {
9765
10255
  readIfExists: (p) => existsSync(p) ? readFileSync(p, "utf-8") : null
@@ -9796,9 +10286,6 @@ var chunkingConfig = {
9796
10286
  };
9797
10287
  var workerSecret = process.env.SEMIONT_WORKER_SECRET ?? "";
9798
10288
  var healthPort = 9091;
9799
- var BURST_WINDOW_MS = 50;
9800
- var MAX_BATCH_SIZE = 100;
9801
- var IDLE_TIMEOUT_MS = 200;
9802
10289
  var logger = createProcessLogger("smelter");
9803
10290
  async function authenticate() {
9804
10291
  if (!workerSecret) {
@@ -9820,250 +10307,70 @@ async function authenticate() {
9820
10307
  const { token } = await response.json();
9821
10308
  return token;
9822
10309
  }
9823
- var authToken = "";
9824
/**
 * Fetch the plain-text body of a resource from `${baseUrl}/api/resources/<id>`.
 *
 * Best-effort by design: a network error, a non-OK status, or a failed body
 * read all yield `null` so the caller can skip the resource. The cause is
 * logged rather than silently discarded.
 *
 * @param {string} resourceId - Identifier interpolated into the request path.
 * @returns {Promise<string|null>} The response text, or `null` on any failure.
 */
async function fetchContent(resourceId) {
  try {
    const response = await fetch(`${baseUrl}/api/resources/${resourceId}`, {
      headers: {
        Authorization: `Bearer ${authToken}`,
        Accept: "text/plain"
      }
    });
    if (!response.ok) {
      logger.debug("Content fetch returned non-OK status", { resourceId, status: response.status });
      return null;
    }
    // Await inside the try block: returning the bare promise would let a
    // failed body read escape the catch and reject in the caller.
    return await response.text();
  } catch (err) {
    logger.error("Content fetch failed", {
      resourceId,
      error: err instanceof Error ? err.message : String(err)
    });
    return null;
  }
}
9838
- var vectorStore;
9839
- var embeddingProvider;
9840
- var eventsProcessed = 0;
9841
/**
 * Route a single domain event to its handler and count it as processed.
 *
 * Event types with no handler fall through without doing any work but are
 * still counted. A handler failure is logged and swallowed so that one bad
 * event does not reject the caller; a failed event is not counted.
 *
 * @param {{type: string, resourceId?: string}} event - The event to handle.
 * @returns {Promise<void>}
 */
async function processEvent(event) {
  try {
    const kind = event.type;
    if (kind === "yield:created") {
      await handleResourceCreated(event);
    } else if (kind === "yield:updated" || kind === "yield:representation-added") {
      await handleResourceReembed(event);
    } else if (kind === "mark:archived") {
      await handleResourceArchived(event);
    } else if (kind === "mark:added") {
      await handleAnnotationAdded(event);
    } else if (kind === "mark:removed") {
      await handleAnnotationRemoved(event);
    }
    eventsProcessed += 1;
  } catch (failure) {
    const detail = failure instanceof Error ? failure.message : String(failure);
    logger.error("Failed to process event", { type: event.type, resourceId: event.resourceId, error: detail });
  }
}
9866
/**
 * Index a newly created resource: fetch its text, chunk it, embed the chunks
 * in one batch, and upsert the resulting vectors.
 *
 * Returns without touching the vector store when the event has no resource
 * id, the fetched text is missing or blank, or chunking yields nothing.
 *
 * @param {{resourceId?: string}} event - The `yield:created` event.
 * @returns {Promise<void>}
 */
async function handleResourceCreated(event) {
  const { resourceId: rawId } = event;
  if (!rawId) {
    return;
  }
  const body = await fetchContent(rawId);
  if (!body?.trim()) {
    return;
  }
  const pieces = chunkText(body, chunkingConfig);
  if (pieces.length === 0) {
    return;
  }
  const vectors = await embeddingProvider.embedBatch(pieces);
  // Pair each chunk with the embedding at the same position.
  const rows = [];
  for (let idx = 0; idx < pieces.length; idx += 1) {
    rows.push({ chunkIndex: idx, text: pieces[idx], embedding: vectors[idx] });
  }
  await vectorStore.upsertResourceVectors(resourceId(rawId), rows);
  logger.info("Indexed resource", { resourceId: rawId, chunks: pieces.length });
}
9882
/**
 * Rebuild the vectors of a resource whose content changed.
 *
 * The new embeddings are computed first; only then are the existing vectors
 * deleted and the fresh ones upserted. Returns early, leaving the stored
 * vectors untouched, when the event has no resource id, the fetched text is
 * missing or blank, or chunking yields nothing.
 *
 * @param {{resourceId?: string}} event - A `yield:updated` or
 *   `yield:representation-added` event.
 * @returns {Promise<void>}
 */
async function handleResourceReembed(event) {
  const { resourceId: rawId } = event;
  if (!rawId) {
    return;
  }
  const body = await fetchContent(rawId);
  if (!body?.trim()) {
    return;
  }
  const pieces = chunkText(body, chunkingConfig);
  if (pieces.length === 0) {
    return;
  }
  const vectors = await embeddingProvider.embedBatch(pieces);
  const rows = [];
  for (let idx = 0; idx < pieces.length; idx += 1) {
    rows.push({ chunkIndex: idx, text: pieces[idx], embedding: vectors[idx] });
  }
  // Delete before upsert so stale chunks beyond the new chunk count disappear.
  await vectorStore.deleteResourceVectors(resourceId(rawId));
  await vectorStore.upsertResourceVectors(resourceId(rawId), rows);
  logger.info("Re-embedded resource", { resourceId: rawId, chunks: pieces.length });
}
9899
/**
 * Drop all vectors belonging to an archived resource.
 *
 * @param {{resourceId?: string}} event - The `mark:archived` event; ignored
 *   when it carries no resource id.
 * @returns {Promise<void>}
 */
async function handleResourceArchived(event) {
  const { resourceId: rawId } = event;
  if (rawId) {
    await vectorStore.deleteResourceVectors(resourceId(rawId));
    logger.info("Deleted vectors for archived resource", { resourceId: rawId });
  }
}
9905
/**
 * Index a newly added annotation by embedding the exact text it targets.
 *
 * Skipped when the annotation has no id, the event has no resource id, or the
 * target's exact text is missing or blank.
 *
 * @param {{resourceId?: string, payload: {annotation?: object}}} event - The
 *   `mark:added` event.
 * @returns {Promise<void>}
 */
async function handleAnnotationAdded(event) {
  const { annotation } = event.payload;
  if (!annotation?.id) {
    return;
  }
  const rawId = event.resourceId;
  if (!rawId) {
    return;
  }
  const exactText = getExactText(getTargetSelector(annotation.target));
  if (!exactText?.trim()) {
    return;
  }
  const aid = annotationId(annotation.id);
  const vector = await embeddingProvider.embed(exactText);
  await vectorStore.upsertAnnotationVector(aid, vector, {
    annotationId: aid,
    resourceId: resourceId(rawId),
    motivation: annotation.motivation ?? "",
    entityTypes: annotation.entityTypes ?? [],
    exactText
  });
  logger.info("Indexed annotation", { annotationId: String(aid) });
}
9925
/**
 * Delete the stored vector of a removed annotation.
 *
 * @param {{payload: {annotationId?: string}}} event - The `mark:removed`
 *   event; ignored when the payload carries no annotation id.
 * @returns {Promise<void>}
 */
async function handleAnnotationRemoved(event) {
  const removedId = event.payload.annotationId;
  if (removedId) {
    await vectorStore.deleteAnnotationVector(annotationId(removedId));
    logger.info("Deleted annotation vector", { annotationId: removedId });
  }
}
9932
/**
 * Dispatch a burst of events to the matching handlers.
 *
 * The burst is split into consecutive runs of the same event type and each run
 * is routed on its own, in arrival order:
 *   - `yield:created` runs go to `batchResourceCreated`,
 *   - `mark:added` runs go to `batchAnnotationAdded`,
 *   - every other event is handed to `processEvent` one at a time.
 *
 * A burst holding a single event type behaves exactly as one call to the
 * matching handler. An empty burst is a no-op.
 *
 * @param {Array<{type: string}>} events - Events in arrival order.
 * @returns {Promise<void>}
 */
async function processBatch(events) {
  // Routing the whole burst on events[0].type would hand a mixed burst (for
  // example a creation followed by annotations on the same resource) to a
  // single handler, so the trailing events would be mishandled or dropped.
  let start = 0;
  while (start < events.length) {
    const type = events[start].type;
    let end = start + 1;
    while (end < events.length && events[end].type === type) {
      end += 1;
    }
    const run = events.slice(start, end);
    start = end;
    if (type === "yield:created") {
      await batchResourceCreated(run);
    } else if (type === "mark:added") {
      await batchAnnotationAdded(run);
    } else {
      for (const event of run) {
        await processEvent(event);
      }
    }
  }
}
9944
/**
 * Index several newly created resources with a single embedding call.
 *
 * Content is fetched and chunked one resource at a time; events without a
 * resource id, with missing or blank text, or with no chunks are skipped. All
 * surviving chunks are embedded together, then each resource's slice of the
 * result is upserted. When nothing survives, the function returns before
 * embedding and `eventsProcessed` is left unchanged; otherwise it grows by the
 * number of events received.
 *
 * @param {Array<{resourceId?: string}>} events - `yield:created` events.
 * @returns {Promise<void>}
 */
async function batchResourceCreated(events) {
  const plan = [];
  const pending = [];
  for (const event of events) {
    const rawId = event.resourceId;
    if (!rawId) {
      continue;
    }
    const body = await fetchContent(rawId);
    if (!body?.trim()) {
      continue;
    }
    const pieces = chunkText(body, chunkingConfig);
    if (pieces.length === 0) {
      continue;
    }
    // Remember where this resource's chunks begin in the combined list.
    plan.push({ id: resourceId(rawId), pieces, start: pending.length });
    for (const piece of pieces) {
      pending.push(piece);
    }
  }
  if (pending.length === 0) {
    return;
  }
  const vectors = await embeddingProvider.embedBatch(pending);
  for (const { id, pieces, start } of plan) {
    const rows = pieces.map((piece, idx) => ({
      chunkIndex: idx,
      text: piece,
      embedding: vectors[start + idx]
    }));
    await vectorStore.upsertResourceVectors(id, rows);
    logger.info("Batch-indexed resource", { resourceId: String(id), chunks: pieces.length });
  }
  eventsProcessed += events.length;
}
9972
/**
 * Index several newly added annotations with a single embedding call.
 *
 * Events are skipped when the annotation has no id, the event has no resource
 * id, or the target's exact text is missing or blank. The exact texts of the
 * remaining annotations are embedded together and upserted one by one. When
 * nothing remains, the function returns before embedding and `eventsProcessed`
 * is left unchanged; otherwise it grows by the number of events received.
 *
 * @param {Array<{resourceId?: string, payload: {annotation?: object}}>} events -
 *   `mark:added` events.
 * @returns {Promise<void>}
 */
async function batchAnnotationAdded(events) {
  const queued = [];
  for (const event of events) {
    const { annotation } = event.payload;
    if (!annotation?.id) {
      continue;
    }
    const rawId = event.resourceId;
    if (!rawId) {
      continue;
    }
    const exactText = getExactText(getTargetSelector(annotation.target));
    if (!exactText?.trim()) {
      continue;
    }
    queued.push({
      rid: resourceId(rawId),
      aid: annotationId(annotation.id),
      exactText,
      motivation: annotation.motivation ?? "",
      entityTypes: annotation.entityTypes ?? []
    });
  }
  if (queued.length === 0) {
    return;
  }
  const texts = queued.map((entry) => entry.exactText);
  const vectors = await embeddingProvider.embedBatch(texts);
  for (const [idx, entry] of queued.entries()) {
    const { rid, aid, exactText, motivation, entityTypes } = entry;
    await vectorStore.upsertAnnotationVector(aid, vectors[idx], {
      annotationId: aid,
      resourceId: rid,
      motivation,
      entityTypes,
      exactText
    });
    logger.info("Batch-indexed annotation", { annotationId: String(aid) });
  }
  eventsProcessed += events.length;
}
10008
10310
  async function main() {
10009
10311
  const { initObservabilityNode } = await import('@semiont/observability/node');
10010
10312
  initObservabilityNode({ serviceName: "semiont-smelter" });
10011
10313
  logger.info("Authenticating", { baseUrl });
10012
- authToken = await authenticate();
10314
+ const tokenSubject = new import_rxjs3.BehaviorSubject(accessToken(await authenticate()));
10013
10315
  logger.info("Authenticated");
10014
- embeddingProvider = await createEmbeddingProvider({
10316
+ const refreshToken = async () => {
10317
+ const token = await authenticate();
10318
+ tokenSubject.next(accessToken(token));
10319
+ return token;
10320
+ };
10321
+ const reauthTimer = setInterval(() => {
10322
+ refreshToken().catch((error) => {
10323
+ logger.error("Proactive re-authentication failed", {
10324
+ error: error instanceof Error ? error.message : String(error)
10325
+ });
10326
+ });
10327
+ }, 12 * 60 * 60 * 1e3);
10328
+ const embeddingProvider = await createEmbeddingProvider({
10015
10329
  type: embeddingType,
10016
10330
  model: embeddingModel,
10017
10331
  baseURL: embeddingBaseURL
10018
10332
  });
10019
10333
  logger.info("Embedding provider ready", { type: embeddingType, model: embeddingModel });
10020
10334
  const dimensions = embeddingProvider.dimensions();
10021
- vectorStore = await createVectorStore({
10335
+ const vectorStore = await createVectorStore({
10022
10336
  type: "qdrant",
10023
10337
  host: qdrantHost,
10024
10338
  port: qdrantPort,
10025
10339
  dimensions
10026
10340
  });
10027
10341
  logger.info("Vector store ready", { host: qdrantHost, port: qdrantPort, dimensions });
10028
- const tokenSubject = new import_rxjs3.BehaviorSubject(accessToken(authToken));
10342
+ registerVectorIndexSizeProvider(() => vectorStore.count());
10029
10343
  const httpTransport = new HttpTransport({
10030
10344
  baseUrl: baseUrl$1(baseUrl),
10031
- token$: tokenSubject
10345
+ token$: tokenSubject,
10346
+ tokenRefresher: refreshToken
10032
10347
  });
10033
10348
  const actorStateUnit = createSmelterActorStateUnit({
10034
10349
  bus: httpTransport.actor
10035
10350
  });
10036
- const eventSubject = new import_rxjs2.Subject();
10037
- const pipelineSubscription = eventSubject.pipe(
10038
- (0, import_operators2.groupBy)((e) => e.resourceId ?? "__unknown__"),
10039
- (0, import_operators2.mergeMap)(
10040
- (group) => group.pipe(
10041
- burstBuffer({
10042
- burstWindowMs: BURST_WINDOW_MS,
10043
- maxBatchSize: MAX_BATCH_SIZE,
10044
- idleTimeoutMs: IDLE_TIMEOUT_MS
10045
- }),
10046
- (0, import_operators2.concatMap)((eventOrBatch) => {
10047
- if (Array.isArray(eventOrBatch)) {
10048
- return (0, import_rxjs2.from)(processBatch(eventOrBatch));
10049
- }
10050
- return (0, import_rxjs2.from)(processEvent(eventOrBatch));
10051
- })
10052
- )
10053
- )
10054
- ).subscribe({
10055
- error: (err) => logger.error("Pipeline error", { error: err instanceof Error ? err.message : String(err) })
10056
- });
10057
- actorStateUnit.events$.subscribe((event) => {
10058
- logger.debug("Bus event received", { type: event.type, resourceId: event.resourceId });
10059
- eventSubject.next(event);
10060
- });
10351
+ const contentTransport = new HttpContentTransport(httpTransport);
10352
+ logger.info("Content transport ready", { mode: "http" });
10353
+ const smelter = new Smelter(
10354
+ actorStateUnit.events$,
10355
+ vectorStore,
10356
+ embeddingProvider,
10357
+ contentTransport,
10358
+ httpTransport,
10359
+ chunkingConfig,
10360
+ { burstWindowMs: 50, maxBatchSize: 100, idleTimeoutMs: 200 },
10361
+ logger
10362
+ );
10363
+ smelter.initialize();
10061
10364
  actorStateUnit.start();
10062
10365
  logger.info("Subscribed to domain events");
10063
10366
  const health = createServer((req, res) => {
10064
10367
  if (req.url === "/health") {
10065
10368
  res.writeHead(200, { "Content-Type": "application/json" });
10066
- res.end(JSON.stringify({ status: "ok", eventsProcessed }));
10369
+ res.end(JSON.stringify({
10370
+ status: "ok",
10371
+ eventsProcessed: smelter.eventsProcessed,
10372
+ reconcile: smelter.reconcileState
10373
+ }));
10067
10374
  } else {
10068
10375
  res.writeHead(404);
10069
10376
  res.end();
@@ -10074,15 +10381,16 @@ async function main() {
10074
10381
  });
10075
10382
  const shutdown = () => {
10076
10383
  logger.info("Shutting down");
10384
+ clearInterval(reauthTimer);
10077
10385
  actorStateUnit.dispose();
10078
10386
  httpTransport.dispose();
10079
- pipelineSubscription.unsubscribe();
10080
- eventSubject.complete();
10387
+ smelter.stop();
10081
10388
  health.close();
10082
10389
  process.exit(0);
10083
10390
  };
10084
10391
  process.on("SIGTERM", shutdown);
10085
10392
  process.on("SIGINT", shutdown);
10393
+ await smelter.reconcile();
10086
10394
  }
10087
10395
  main().catch((error) => {
10088
10396
  logger.error("Fatal", { error: error instanceof Error ? error.message : String(error), stack: error instanceof Error ? error.stack : void 0 });