@semiont/make-meaning 0.5.5 → 0.5.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,9 @@
1
- import { HttpTransport } from '@semiont/api-client';
2
- import { createTomlConfigLoader, accessToken, baseUrl as baseUrl$1, burstBuffer, resourceId, getTargetSelector, getExactText, annotationId } from '@semiont/core';
1
+ import { createTomlConfigLoader, accessToken, baseUrl as baseUrl$1, burstBuffer, errField, resourceId, textExtractionOf, decodeRepresentation, getTargetSelector, getExactText, annotationId, getPrimaryMediaType, getPrimaryRepresentation } from '@semiont/core';
2
+ import { calculateChecksum } from '@semiont/content';
3
3
  import { createEmbeddingProvider, createVectorStore, chunkText } from '@semiont/vectors';
4
+ import { registerVectorIndexSizeProvider, withActorSpan } from '@semiont/observability';
5
+ import { busRequest } from '@semiont/sdk';
6
+ import { HttpTransport, HttpContentTransport } from '@semiont/http-transport';
4
7
  import { createServer } from 'http';
5
8
  import { existsSync, readFileSync } from 'fs';
6
9
  import { homedir } from 'os';
@@ -699,19 +702,19 @@ var require_Observable = __commonJS({
699
702
  var config_1 = require_config();
700
703
  var isFunction_1 = require_isFunction();
701
704
  var errorContext_1 = require_errorContext();
702
- var Observable2 = (function() {
703
- function Observable3(subscribe) {
705
+ var Observable3 = (function() {
706
+ function Observable4(subscribe) {
704
707
  if (subscribe) {
705
708
  this._subscribe = subscribe;
706
709
  }
707
710
  }
708
- Observable3.prototype.lift = function(operator) {
709
- var observable = new Observable3();
711
+ Observable4.prototype.lift = function(operator) {
712
+ var observable = new Observable4();
710
713
  observable.source = this;
711
714
  observable.operator = operator;
712
715
  return observable;
713
716
  };
714
- Observable3.prototype.subscribe = function(observerOrNext, error, complete) {
717
+ Observable4.prototype.subscribe = function(observerOrNext, error, complete) {
715
718
  var _this = this;
716
719
  var subscriber = isSubscriber(observerOrNext) ? observerOrNext : new Subscriber_1.SafeSubscriber(observerOrNext, error, complete);
717
720
  errorContext_1.errorContext(function() {
@@ -720,14 +723,14 @@ var require_Observable = __commonJS({
720
723
  });
721
724
  return subscriber;
722
725
  };
723
- Observable3.prototype._trySubscribe = function(sink) {
726
+ Observable4.prototype._trySubscribe = function(sink) {
724
727
  try {
725
728
  return this._subscribe(sink);
726
729
  } catch (err) {
727
730
  sink.error(err);
728
731
  }
729
732
  };
730
- Observable3.prototype.forEach = function(next, promiseCtor) {
733
+ Observable4.prototype.forEach = function(next, promiseCtor) {
731
734
  var _this = this;
732
735
  promiseCtor = getPromiseCtor(promiseCtor);
733
736
  return new promiseCtor(function(resolve, reject) {
@@ -746,21 +749,21 @@ var require_Observable = __commonJS({
746
749
  _this.subscribe(subscriber);
747
750
  });
748
751
  };
749
- Observable3.prototype._subscribe = function(subscriber) {
752
+ Observable4.prototype._subscribe = function(subscriber) {
750
753
  var _a;
751
754
  return (_a = this.source) === null || _a === void 0 ? void 0 : _a.subscribe(subscriber);
752
755
  };
753
- Observable3.prototype[observable_1.observable] = function() {
756
+ Observable4.prototype[observable_1.observable] = function() {
754
757
  return this;
755
758
  };
756
- Observable3.prototype.pipe = function() {
759
+ Observable4.prototype.pipe = function() {
757
760
  var operations = [];
758
761
  for (var _i = 0; _i < arguments.length; _i++) {
759
762
  operations[_i] = arguments[_i];
760
763
  }
761
764
  return pipe_1.pipeFromArray(operations)(this);
762
765
  };
763
- Observable3.prototype.toPromise = function(promiseCtor) {
766
+ Observable4.prototype.toPromise = function(promiseCtor) {
764
767
  var _this = this;
765
768
  promiseCtor = getPromiseCtor(promiseCtor);
766
769
  return new promiseCtor(function(resolve, reject) {
@@ -774,12 +777,12 @@ var require_Observable = __commonJS({
774
777
  });
775
778
  });
776
779
  };
777
- Observable3.create = function(subscribe) {
778
- return new Observable3(subscribe);
780
+ Observable4.create = function(subscribe) {
781
+ return new Observable4(subscribe);
779
782
  };
780
- return Observable3;
783
+ return Observable4;
781
784
  })();
782
- exports.Observable = Observable2;
785
+ exports.Observable = Observable3;
783
786
  function getPromiseCtor(promiseCtor) {
784
787
  var _a;
785
788
  return (_a = promiseCtor !== null && promiseCtor !== void 0 ? promiseCtor : config_1.config.Promise) !== null && _a !== void 0 ? _a : Promise;
@@ -9716,8 +9719,7 @@ var require_operators = __commonJS({
9716
9719
  });
9717
9720
 
9718
9721
  // src/smelter-main.ts
9719
- var import_rxjs2 = __toESM(require_cjs());
9720
- var import_operators2 = __toESM(require_operators());
9722
+ var import_rxjs3 = __toESM(require_cjs());
9721
9723
 
9722
9724
  // src/smelter-actor-state-unit.ts
9723
9725
  var import_rxjs = __toESM(require_cjs());
@@ -9746,7 +9748,6 @@ function createSmelterActorStateUnit(options) {
9746
9748
  );
9747
9749
  return {
9748
9750
  events$,
9749
- emit: (channel, payload) => bus.emit(channel, payload),
9750
9751
  start: () => {
9751
9752
  if (started) return;
9752
9753
  started = true;
@@ -9758,8 +9759,493 @@ function createSmelterActorStateUnit(options) {
9758
9759
  };
9759
9760
  }
9760
9761
 
9761
- // src/smelter-main.ts
9762
- var import_rxjs3 = __toESM(require_cjs());
9762
+ // src/smelter.ts
9763
+ var import_rxjs2 = __toESM(require_cjs());
9764
+ var import_operators2 = __toESM(require_operators());
9765
+
9766
+ // src/batch-utils.ts
9767
/**
 * Split a sequence of events into consecutive runs that share the same
 * `type`, preserving the original order both across and within runs.
 *
 * Only adjacent events are grouped: `[a, a, b, a]` yields three runs
 * (`[a, a]`, `[b]`, `[a]`), not two.
 *
 * @param {Iterable<{type: string}>} events - events in arrival order
 * @returns {Array<Array<{type: string}>>} the runs; empty when `events` is empty
 */
function partitionByType(events) {
  const groups = [];
  for (const item of events) {
    const tail = groups[groups.length - 1];
    if (tail !== undefined && tail[0].type === item.type) {
      // Same type as the run in progress: extend it.
      tail.push(item);
    } else {
      // First event, or the type changed: open a new run.
      groups.push([item]);
    }
  }
  return groups;
}
9780
+
9781
+ // src/smelter.ts
9782
/**
 * Tell whether a pipeline input is a work item rather than a wire event.
 * Work items are recognised purely by their `smelt:` type prefix.
 *
 * @param {{type: string}} input - a pipeline input
 * @returns {boolean} true when `input.type` begins with `"smelt:"`
 */
function isWorkItem(input) {
  const { type } = input;
  return type.startsWith("smelt:");
}
9785
/**
 * Smelter — turns bus events into vector-store writes.
 *
 * Inputs arrive on `events$` (wire events) or are injected by `reconcile()`
 * (`smelt:*` work items). Both flow through one pipeline that groups inputs
 * by `resourceId`, buffers bursts, and processes each group's output one
 * emission at a time (`concatMap`).
 *
 * Lifecycle: construct → `initialize()` → optionally `reconcile()` → `stop()`.
 */
var Smelter = class _Smelter {
  /**
   * @param events$ observable of wire events to index
   * @param vectorStore store receiving resource/annotation vectors
   * @param embeddingProvider provider exposing `embed` and `embedBatch`
   * @param content content transport exposing `getBinary(resourceId)`
   * @param bus bus used for the `browse:*` request/response channels
   * @param chunkingConfig2 configuration handed to `chunkText`
   * @param timing `{ burstWindowMs, maxBatchSize, idleTimeoutMs }` for `burstBuffer`
   * @param logger2 logger with `debug` / `info` / `warn` / `error`
   */
  constructor(events$, vectorStore, embeddingProvider, content, bus, chunkingConfig2, timing, logger2) {
    this.events$ = events$;
    this.vectorStore = vectorStore;
    this.embeddingProvider = embeddingProvider;
    this.content = content;
    this.bus = bus;
    this.chunkingConfig = chunkingConfig2;
    this.timing = timing;
    this.logger = logger2;
  }
  events$;
  vectorStore;
  embeddingProvider;
  content;
  bus;
  chunkingConfig;
  timing;
  logger;
  /** Page size used when listing the catalog during reconcile. */
  static RECONCILE_PAGE_SIZE = 200;
  /** Bound on concurrently in-flight reconcile work — a cold rebuild must not fan out unbounded embedding calls. */
  static RECONCILE_WAVE = 8;
  /** Mailbox feeding the pipeline; receives both wire events and work items. */
  eventSubject = new import_rxjs2.Subject();
  sourceSubscription = null;
  pipelineSubscription = null;
  _eventsProcessed = 0;
  _reconcileState = { phase: "pending" };
  /** Running count of consumed work items (success or failure). */
  workDone = 0;
  /**
   * Waiter for the reconcile wave currently in flight, or null.
   * Shape: `{ target, resolve, reject }` — resolved by `noteWorkDone` once
   * `workDone` reaches `target`, rejected by `failPendingWork`.
   */
  workWaiter = null;
  /** Number of wire events processed without error. */
  get eventsProcessed() {
    return this._eventsProcessed;
  }
  /** Current reconcile state: `pending`, `running`, `done` (with summary) or `failed` (with error). */
  get reconcileState() {
    return this._reconcileState;
  }
  /**
   * Build the processing pipeline and attach it to `events$`.
   *
   * The pipeline is subscribed before the source so the mailbox has a
   * consumer by the time the first event is forwarded.
   */
  initialize() {
    this.pipelineSubscription = this.eventSubject.pipe(
      // One lane per resource; inputs without a resourceId share a lane.
      (0, import_operators2.groupBy)((e) => e.resourceId ?? "__unknown__"),
      (0, import_operators2.mergeMap)(
        (group) => group.pipe(
          burstBuffer({
            burstWindowMs: this.timing.burstWindowMs,
            maxBatchSize: this.timing.maxBatchSize,
            idleTimeoutMs: this.timing.idleTimeoutMs
          }),
          // concatMap: the next emission of a lane starts only after the
          // previous one's promise has settled.
          (0, import_operators2.concatMap)((inputOrBatch) => {
            if (Array.isArray(inputOrBatch)) {
              return (0, import_rxjs2.from)(
                withActorSpan("smelter", "batch", async () => {
                  this._eventsProcessed += await this.processBatch(inputOrBatch);
                }, { "batch.size": inputOrBatch.length })
              );
            }
            return (0, import_rxjs2.from)(
              withActorSpan("smelter", inputOrBatch.type, async () => {
                const ok = await this.safeProcessEvent(inputOrBatch);
                // Work items tick the drain counter whether or not they
                // succeeded; wire events count only on success.
                if (isWorkItem(inputOrBatch)) this.noteWorkDone(1);
                else if (ok) this._eventsProcessed++;
              })
            );
          })
        )
      )
    ).subscribe({
      error: (err) => {
        this.logger.error("Smelter pipeline error", { error: errField(err) });
        // The pipeline is dead after an error, so nothing will ever tick
        // the counter again — fail the in-flight wave instead of hanging.
        this.failPendingWork(
          new Error("Smelter pipeline terminated while reconcile work was in flight", { cause: err })
        );
      }
    });
    this.sourceSubscription = this.events$.subscribe((event) => {
      this.logger.debug("Bus event received", { type: event.type, resourceId: event.resourceId });
      this.eventSubject.next(event);
    });
    this.logger.info("Smelter pipeline initialized");
  }
  /**
   * Detach from the source, tear down the pipeline and complete the mailbox.
   * A reconcile wave that is still waiting is rejected so `reconcile()`
   * settles (as failed) rather than waiting forever on dropped work.
   */
  stop() {
    this.sourceSubscription?.unsubscribe();
    this.sourceSubscription = null;
    this.pipelineSubscription?.unsubscribe();
    this.pipelineSubscription = null;
    this.eventSubject.complete();
    this.failPendingWork(new Error("Smelter stopped while reconcile work was in flight"));
    this.logger.info("Smelter stopped");
  }
  /**
   * Record `count` consumed work items and release the current wave's
   * waiter once its target has been reached.
   */
  noteWorkDone(count) {
    this.workDone += count;
    if (this.workWaiter && this.workDone >= this.workWaiter.target) {
      this.workWaiter.resolve();
      this.workWaiter = null;
    }
  }
  /**
   * Reject the in-flight wave's waiter, if any, with `reason`.
   * No-op when no wave is waiting.
   */
  failPendingWork(reason) {
    const waiter = this.workWaiter;
    if (!waiter) return;
    this.workWaiter = null;
    waiter.reject(reason);
  }
  /**
   * Returns the number of WIRE events processed without error (the S9b
   * oracle) — `smelt:*` work-item runs tick the drain counter instead.
   */
  async processBatch(events) {
    let wireProcessed = 0;
    for (const run of partitionByType(events)) {
      const workRun = isWorkItem(run[0]);
      try {
        if (run.length === 1) {
          const ok = await this.safeProcessEvent(run[0]);
          if (ok && !workRun) wireProcessed++;
        } else {
          const processed = await this.applyBatchByType(run);
          if (!workRun) wireProcessed += processed;
        }
      } catch (error) {
        // A failed run is logged and skipped; later runs still execute.
        this.logger.error("Smelter failed to process batch run", {
          eventType: run[0].type,
          runSize: run.length,
          error: errField(error)
        });
      } finally {
        // Work items are counted as consumed even when the run failed,
        // otherwise the wave waiting on them would never complete.
        if (workRun) this.noteWorkDone(run.length);
      }
    }
    return wireProcessed;
  }
  /**
   * Batch-optimized processing for consecutive events of the same type.
   * Returns the number of events processed without error.
   */
  async applyBatchByType(events) {
    switch (events[0].type) {
      case "yield:created":
      case "smelt:embed":
        return this.batchResourceCreated(events);
      case "mark:added":
      case "smelt:embed-annotation":
        return this.batchAnnotationAdded(events);
      default: {
        // No batch form for this type: process one by one.
        let processed = 0;
        for (const event of events) {
          if (await this.safeProcessEvent(event)) processed++;
        }
        return processed;
      }
    }
  }
  /** Returns true if the input was processed without error. */
  async safeProcessEvent(event) {
    try {
      await this.processEvent(event);
      return true;
    } catch (err) {
      this.logger.error("Smelter failed to process event", {
        type: event.type,
        resourceId: event.resourceId,
        error: errField(err)
      });
      return false;
    }
  }
  /**
   * Dispatch a single input to its handler. Types not listed here are
   * ignored (the switch has no default branch).
   */
  async processEvent(event) {
    switch (event.type) {
      case "yield:created":
        await this.embedResource(event, "Indexed resource");
        break;
      case "yield:updated":
      case "yield:representation-added":
        await this.embedResource(event, "Re-embedded resource");
        break;
      case "mark:archived":
        await this.handleResourceArchived(event);
        break;
      case "mark:added":
        await this.handleAnnotationAdded(event);
        break;
      case "mark:removed":
        await this.handleAnnotationRemoved(event);
        break;
      // Reconcile work items — same handlers, distinct provenance.
      case "smelt:embed":
        await this.embedResource(event, "Reconcile-indexed resource");
        break;
      case "smelt:purge":
        await this.handleResourcePurge(event);
        break;
      case "smelt:embed-annotation":
        await this.handleAnnotationAdded(event);
        break;
      case "smelt:purge-annotation":
        await this.handleAnnotationRemoved(event);
        break;
    }
  }
  /** Delete the resource vectors of an orphan found by reconcile. */
  async handleResourcePurge(event) {
    const rid = event.resourceId;
    if (!rid) return;
    await this.vectorStore.deleteResourceVectors(resourceId(rid));
    this.logger.info("Reconcile deleted orphan resource vectors", { resourceId: rid });
  }
  /**
   * Resolve a resource's embeddable text: bytes via the content transport,
   * gated to media types that decode as text, decoded charset-aware. The
   * checksum is over the raw bytes actually read — stamped onto the vectors
   * so reconciliation can compare against the catalog's claim (S12).
   *
   * Returns null — and callers skip the resource — when the content does
   * not decode as text (logged at debug), is unavailable (logged at warn),
   * or is empty/whitespace-only (not logged).
   */
  async fetchEmbeddableText(resourceId$1) {
    try {
      const { data, contentType } = await this.content.getBinary(resourceId(resourceId$1));
      if (textExtractionOf(contentType) !== "decode") {
        this.logger.debug("Skipping resource that does not decode as text", { resourceId: resourceId$1, contentType });
        return null;
      }
      const bytes = Buffer.from(data);
      const text = decodeRepresentation(bytes, contentType);
      return text.trim() ? { text, checksum: calculateChecksum(bytes) } : null;
    } catch (error) {
      this.logger.warn("Content unavailable for embedding", { resourceId: resourceId$1, error: errField(error) });
      return null;
    }
  }
  /**
   * Fetch, chunk, embed and upsert one resource.
   * Silently returns when the event has no resourceId, the content is not
   * embeddable, or chunking yields nothing.
   *
   * @param event input carrying `resourceId`
   * @param logMessage info-level message logged after a successful upsert
   */
  async embedResource(event, logMessage) {
    const rid = event.resourceId;
    if (!rid) return;
    const fetched = await this.fetchEmbeddableText(rid);
    if (!fetched) return;
    const chunks = chunkText(fetched.text, this.chunkingConfig);
    if (chunks.length === 0) return;
    const embeddings = await this.embeddingProvider.embedBatch(chunks);
    const embeddingChunks = chunks.map((t, i) => ({
      chunkIndex: i,
      text: t,
      embedding: embeddings[i]
    }));
    await this.vectorStore.upsertResourceVectors(resourceId(rid), embeddingChunks, fetched.checksum);
    this.logger.info(logMessage, { resourceId: rid, chunks: chunks.length });
  }
  /** Remove both the resource vectors and the annotation vectors of an archived resource. */
  async handleResourceArchived(event) {
    const rid = event.resourceId;
    if (!rid) return;
    await this.vectorStore.deleteResourceVectors(resourceId(rid));
    await this.vectorStore.deleteAnnotationVectorsForResource(resourceId(rid));
    this.logger.info("Deleted vectors for archived resource", { resourceId: rid });
  }
  /**
   * Embed and upsert one annotation's exact text.
   * Skips annotations without an id, without a resourceId on the event, or
   * whose selector has no non-blank exact text.
   */
  async handleAnnotationAdded(event) {
    const annotation = event.payload.annotation;
    if (!annotation?.id) return;
    const rid = event.resourceId;
    if (!rid) return;
    const selector = getTargetSelector(annotation.target);
    const exactText = getExactText(selector);
    if (!exactText?.trim()) return;
    const aid = annotationId(annotation.id);
    const embedding2 = await this.embeddingProvider.embed(exactText);
    const payload = {
      annotationId: aid,
      resourceId: resourceId(rid),
      motivation: annotation.motivation ?? "",
      entityTypes: annotation.entityTypes ?? [],
      exactText
    };
    await this.vectorStore.upsertAnnotationVector(aid, embedding2, payload);
    this.logger.info("Indexed annotation", { annotationId: String(aid) });
  }
  /** Delete the vector of the annotation named by `event.payload.annotationId`. */
  async handleAnnotationRemoved(event) {
    const annotationId$1 = event.payload.annotationId;
    if (!annotationId$1) return;
    const aid = annotationId(annotationId$1);
    await this.vectorStore.deleteAnnotationVector(aid);
    this.logger.info("Deleted annotation vector", { annotationId: annotationId$1 });
  }
  /**
   * Batch-embed chunks from multiple yield:created events in a single
   * embedBatch() call, then index per resource.
   *
   * Returns `events.length`: skipped events (no id, no embeddable text, no
   * chunks) are still counted as processed.
   */
  async batchResourceCreated(events) {
    const resourceData = [];
    const allChunks = [];
    for (const event of events) {
      const rid = event.resourceId;
      if (!rid) continue;
      const fetched = await this.fetchEmbeddableText(rid);
      if (!fetched) continue;
      const chunks = chunkText(fetched.text, this.chunkingConfig);
      if (chunks.length === 0) continue;
      resourceData.push({ rid: resourceId(rid), chunks, checksum: fetched.checksum });
      allChunks.push(...chunks);
    }
    if (allChunks.length === 0) return events.length;
    const allEmbeddings = await this.embeddingProvider.embedBatch(allChunks);
    // `offset` walks the flat embedding array in the same order the chunks
    // were appended, so each resource reads back its own slice.
    let offset = 0;
    for (const { rid, chunks, checksum } of resourceData) {
      const embeddingChunks = chunks.map((t, i) => ({
        chunkIndex: i,
        text: t,
        embedding: allEmbeddings[offset + i]
      }));
      await this.vectorStore.upsertResourceVectors(rid, embeddingChunks, checksum);
      this.logger.info("Batch-indexed resource", { resourceId: String(rid), chunks: chunks.length });
      offset += chunks.length;
    }
    return events.length;
  }
  /**
   * Batch-embed exact texts from multiple mark:added events in a single
   * embedBatch() call, then index per annotation.
   *
   * Returns `events.length`: skipped events are still counted as processed.
   */
  async batchAnnotationAdded(events) {
    const annotationData = [];
    for (const event of events) {
      const annotation = event.payload.annotation;
      if (!annotation?.id) continue;
      const rid = event.resourceId;
      if (!rid) continue;
      const selector = getTargetSelector(annotation.target);
      const exactText = getExactText(selector);
      if (!exactText?.trim()) continue;
      annotationData.push({
        rid: resourceId(rid),
        aid: annotationId(annotation.id),
        exactText,
        motivation: annotation.motivation ?? "",
        entityTypes: annotation.entityTypes ?? []
      });
    }
    if (annotationData.length === 0) return events.length;
    const allEmbeddings = await this.embeddingProvider.embedBatch(
      annotationData.map((a) => a.exactText)
    );
    for (let i = 0; i < annotationData.length; i++) {
      const { rid, aid, exactText, motivation, entityTypes } = annotationData[i];
      const payload = {
        annotationId: aid,
        resourceId: rid,
        motivation,
        entityTypes,
        exactText
      };
      await this.vectorStore.upsertAnnotationVector(aid, allEmbeddings[i], payload);
      this.logger.info("Batch-indexed annotation", { annotationId: String(aid) });
    }
    return events.length;
  }
  // ── Reconciliation ───────────────────────────────────────────────────
  /**
   * Reconcile the vector store against the KS catalog.
   *
   * Lists what IS indexed (via the store's id enumeration) and what SHOULD
   * be (non-archived resources with embeddable media types, plus their
   * exact-text annotations, via the `browse:*` RPC channels), then plans the
   * diff as `smelt:*` work items — embeds for what's missing, purges for
   * what shouldn't be there — and drains them through the pipeline mailbox.
   * Work items share the per-resource lanes with live events, so a reconcile
   * re-embed can never interleave with (or stale-overwrite) live processing
   * of the same resource (axioms S1/S2). Waves of RECONCILE_WAVE bound how
   * many embedding calls a cold rebuild has in flight.
   *
   * Call after the live subscription is attached so nothing falls in the
   * gap. The index snapshot is taken BEFORE the catalog listing so a
   * resource indexed by a live event mid-reconcile is never mistaken for an
   * orphan; convergence holds because every upsert replaces a resource's
   * full vector set from current content.
   *
   * The returned summary counts PLANNED work items per kind; individual
   * item failures are logged by the pipeline and do not fail the reconcile.
   *
   * Rejects (and records `phase: "failed"`) when listing fails, or when the
   * smelter is stopped / the pipeline terminates before the work drains.
   */
  async reconcile() {
    if (!this.pipelineSubscription) {
      throw new Error("Smelter.reconcile() requires initialize() \u2014 work items drain through the pipeline");
    }
    this._reconcileState = { phase: "running" };
    try {
      const [indexedResources, indexedAnnotations] = await Promise.all([
        this.vectorStore.listResourceChecksums(),
        this.vectorStore.listAnnotationIds()
      ]);
      const resources = await this.listAllResources();
      this.logger.info("Reconcile started", {
        indexedResources: indexedResources.size,
        indexedAnnotations: indexedAnnotations.size,
        liveResources: resources.length
      });
      // resource id → catalog checksum (possibly undefined) for every
      // resource whose primary media type decodes as text.
      const embeddable = /* @__PURE__ */ new Map();
      for (const resource of resources) {
        const mediaType = getPrimaryMediaType(resource);
        if (resource["@id"] && mediaType && textExtractionOf(mediaType) === "decode") {
          embeddable.set(resource["@id"], getPrimaryRepresentation(resource)?.checksum);
        }
      }
      const work = [];
      // Indexed but no longer embeddable/live → purge.
      for (const rid of indexedResources.keys()) {
        if (!embeddable.has(rid)) work.push({ type: "smelt:purge", resourceId: rid, payload: {} });
      }
      // Missing from the index, or indexed under a different checksum → embed.
      for (const [rid, catalogChecksum] of embeddable) {
        if (!indexedResources.has(rid)) {
          work.push({ type: "smelt:embed", resourceId: rid, payload: {} });
        } else if (catalogChecksum !== void 0 && indexedResources.get(rid) !== catalogChecksum) {
          work.push({ type: "smelt:embed", resourceId: rid, payload: {} });
        }
      }
      const liveAnnotationIds = /* @__PURE__ */ new Set();
      for (const resource of resources) {
        const rid = resource["@id"];
        if (!rid) continue;
        const { annotations } = await busRequest(
          this.bus,
          "browse:annotations-requested",
          { resourceId: rid },
          "browse:annotations-result",
          "browse:annotations-failed"
        );
        for (const annotation of annotations) {
          const exactText = getExactText(getTargetSelector(annotation.target));
          if (!annotation.id || !exactText?.trim()) continue;
          liveAnnotationIds.add(annotation.id);
          if (!indexedAnnotations.has(annotation.id)) {
            work.push({ type: "smelt:embed-annotation", resourceId: rid, payload: { annotation } });
          }
        }
      }
      for (const aid of indexedAnnotations) {
        if (!liveAnnotationIds.has(aid)) {
          // The annotation id doubles as the `resourceId` of the work item;
          // the handler reads `payload.annotationId`.
          work.push({ type: "smelt:purge-annotation", resourceId: aid, payload: { annotationId: aid } });
        }
      }
      await this.drain(work);
      const summary = {
        resourcesEmbedded: work.filter((w) => w.type === "smelt:embed").length,
        resourceVectorsDeleted: work.filter((w) => w.type === "smelt:purge").length,
        annotationsEmbedded: work.filter((w) => w.type === "smelt:embed-annotation").length,
        annotationVectorsDeleted: work.filter((w) => w.type === "smelt:purge-annotation").length
      };
      this._reconcileState = { phase: "done", summary };
      this.logger.info("Reconcile complete", { ...summary });
      return summary;
    } catch (error) {
      this._reconcileState = {
        phase: "failed",
        error: error instanceof Error ? error.message : String(error)
      };
      this.logger.error("Reconcile failed", { error: errField(error) });
      throw error;
    }
  }
  /**
   * Enqueue planner work through the mailbox in bounded waves and await
   * completion. The pipeline ticks `noteWorkDone` for every consumed work
   * item (success or failure — failures are logged like any live event), so
   * each wave's waiter resolves exactly when its items have been processed.
   *
   * Rejects when the pipeline is not running at the start of a wave, or
   * when `stop()` / a pipeline error interrupts a wave in flight — in both
   * cases the remaining items could never be consumed.
   */
  async drain(work) {
    for (let i = 0; i < work.length; i += _Smelter.RECONCILE_WAVE) {
      if (!this.pipelineSubscription || this.pipelineSubscription.closed) {
        throw new Error("Smelter pipeline is not running \u2014 cannot drain reconcile work");
      }
      const wave = work.slice(i, i + _Smelter.RECONCILE_WAVE);
      // The waiter is registered before the items are pushed so no tick
      // can be missed.
      const done = new Promise((resolve, reject) => {
        this.workWaiter = { target: this.workDone + wave.length, resolve, reject };
      });
      for (const item of wave) this.eventSubject.next(item);
      await done;
    }
  }
  /** Page through `browse:resources-requested` until the catalog is exhausted. */
  async listAllResources() {
    const all = [];
    for (; ; ) {
      const page = await busRequest(
        this.bus,
        "browse:resources-requested",
        { archived: false, offset: all.length, limit: _Smelter.RECONCILE_PAGE_SIZE },
        "browse:resources-result",
        "browse:resources-failed"
      );
      all.push(...page.resources);
      // An empty page guards against a `total` that never gets reached.
      if (page.resources.length === 0 || all.length >= page.total) return all;
    }
  }
};
9763
10249
  var configPath = join(homedir(), ".semiontconfig");
9764
10250
  var tomlReader = {
9765
10251
  readIfExists: (p) => existsSync(p) ? readFileSync(p, "utf-8") : null
@@ -9796,9 +10282,6 @@ var chunkingConfig = {
9796
10282
  };
9797
10283
  var workerSecret = process.env.SEMIONT_WORKER_SECRET ?? "";
9798
10284
  var healthPort = 9091;
9799
- var BURST_WINDOW_MS = 50;
9800
- var MAX_BATCH_SIZE = 100;
9801
- var IDLE_TIMEOUT_MS = 200;
9802
10285
  var logger = createProcessLogger("smelter");
9803
10286
  async function authenticate() {
9804
10287
  if (!workerSecret) {
@@ -9820,250 +10303,70 @@ async function authenticate() {
9820
10303
  const { token } = await response.json();
9821
10304
  return token;
9822
10305
  }
9823
- var authToken = "";
9824
- async function fetchContent(resourceId) {
9825
- try {
9826
- const response = await fetch(`${baseUrl}/api/resources/${resourceId}`, {
9827
- headers: {
9828
- Authorization: `Bearer ${authToken}`,
9829
- Accept: "text/plain"
9830
- }
9831
- });
9832
- if (!response.ok) return null;
9833
- return response.text();
9834
- } catch {
9835
- return null;
9836
- }
9837
- }
9838
- var vectorStore;
9839
- var embeddingProvider;
9840
- var eventsProcessed = 0;
9841
- async function processEvent(event) {
9842
- try {
9843
- switch (event.type) {
9844
- case "yield:created":
9845
- await handleResourceCreated(event);
9846
- break;
9847
- case "yield:updated":
9848
- case "yield:representation-added":
9849
- await handleResourceReembed(event);
9850
- break;
9851
- case "mark:archived":
9852
- await handleResourceArchived(event);
9853
- break;
9854
- case "mark:added":
9855
- await handleAnnotationAdded(event);
9856
- break;
9857
- case "mark:removed":
9858
- await handleAnnotationRemoved(event);
9859
- break;
9860
- }
9861
- eventsProcessed++;
9862
- } catch (err) {
9863
- logger.error("Failed to process event", { type: event.type, resourceId: event.resourceId, error: err instanceof Error ? err.message : String(err) });
9864
- }
9865
- }
9866
- async function handleResourceCreated(event) {
9867
- const rid = event.resourceId;
9868
- if (!rid) return;
9869
- const text = await fetchContent(rid);
9870
- if (!text?.trim()) return;
9871
- const chunks = chunkText(text, chunkingConfig);
9872
- if (chunks.length === 0) return;
9873
- const embeddings = await embeddingProvider.embedBatch(chunks);
9874
- const embeddingChunks = chunks.map((t, i) => ({
9875
- chunkIndex: i,
9876
- text: t,
9877
- embedding: embeddings[i]
9878
- }));
9879
- await vectorStore.upsertResourceVectors(resourceId(rid), embeddingChunks);
9880
- logger.info("Indexed resource", { resourceId: rid, chunks: chunks.length });
9881
- }
9882
- async function handleResourceReembed(event) {
9883
- const rid = event.resourceId;
9884
- if (!rid) return;
9885
- const text = await fetchContent(rid);
9886
- if (!text?.trim()) return;
9887
- const chunks = chunkText(text, chunkingConfig);
9888
- if (chunks.length === 0) return;
9889
- const embeddings = await embeddingProvider.embedBatch(chunks);
9890
- const embeddingChunks = chunks.map((t, i) => ({
9891
- chunkIndex: i,
9892
- text: t,
9893
- embedding: embeddings[i]
9894
- }));
9895
- await vectorStore.deleteResourceVectors(resourceId(rid));
9896
- await vectorStore.upsertResourceVectors(resourceId(rid), embeddingChunks);
9897
- logger.info("Re-embedded resource", { resourceId: rid, chunks: chunks.length });
9898
- }
9899
- async function handleResourceArchived(event) {
9900
- const rid = event.resourceId;
9901
- if (!rid) return;
9902
- await vectorStore.deleteResourceVectors(resourceId(rid));
9903
- logger.info("Deleted vectors for archived resource", { resourceId: rid });
9904
- }
9905
- async function handleAnnotationAdded(event) {
9906
- const annotation = event.payload.annotation;
9907
- if (!annotation?.id) return;
9908
- const rid = event.resourceId;
9909
- if (!rid) return;
9910
- const selector = getTargetSelector(annotation.target);
9911
- const exactText = getExactText(selector);
9912
- if (!exactText?.trim()) return;
9913
- const aid = annotationId(annotation.id);
9914
- const embedding2 = await embeddingProvider.embed(exactText);
9915
- const payload = {
9916
- annotationId: aid,
9917
- resourceId: resourceId(rid),
9918
- motivation: annotation.motivation ?? "",
9919
- entityTypes: annotation.entityTypes ?? [],
9920
- exactText
9921
- };
9922
- await vectorStore.upsertAnnotationVector(aid, embedding2, payload);
9923
- logger.info("Indexed annotation", { annotationId: String(aid) });
9924
- }
9925
- async function handleAnnotationRemoved(event) {
9926
- const annotationId$1 = event.payload.annotationId;
9927
- if (!annotationId$1) return;
9928
- const aid = annotationId(annotationId$1);
9929
- await vectorStore.deleteAnnotationVector(aid);
9930
- logger.info("Deleted annotation vector", { annotationId: annotationId$1 });
9931
- }
9932
- async function processBatch(events) {
9933
- const type = events[0].type;
9934
- if (type === "yield:created") {
9935
- await batchResourceCreated(events);
9936
- } else if (type === "mark:added") {
9937
- await batchAnnotationAdded(events);
9938
- } else {
9939
- for (const event of events) {
9940
- await processEvent(event);
9941
- }
9942
- }
9943
- }
9944
- async function batchResourceCreated(events) {
9945
- const resourceData = [];
9946
- const allChunks = [];
9947
- for (const event of events) {
9948
- const rid = event.resourceId;
9949
- if (!rid) continue;
9950
- const text = await fetchContent(rid);
9951
- if (!text?.trim()) continue;
9952
- const chunks = chunkText(text, chunkingConfig);
9953
- if (chunks.length === 0) continue;
9954
- resourceData.push({ rid: resourceId(rid), chunks });
9955
- allChunks.push(...chunks);
9956
- }
9957
- if (allChunks.length === 0) return;
9958
- const allEmbeddings = await embeddingProvider.embedBatch(allChunks);
9959
- let offset = 0;
9960
- for (const { rid, chunks } of resourceData) {
9961
- const embeddingChunks = chunks.map((t, i) => ({
9962
- chunkIndex: i,
9963
- text: t,
9964
- embedding: allEmbeddings[offset + i]
9965
- }));
9966
- await vectorStore.upsertResourceVectors(rid, embeddingChunks);
9967
- logger.info("Batch-indexed resource", { resourceId: String(rid), chunks: chunks.length });
9968
- offset += chunks.length;
9969
- }
9970
- eventsProcessed += events.length;
9971
- }
9972
- async function batchAnnotationAdded(events) {
9973
- const annotationData = [];
9974
- for (const event of events) {
9975
- const annotation = event.payload.annotation;
9976
- if (!annotation?.id) continue;
9977
- const rid = event.resourceId;
9978
- if (!rid) continue;
9979
- const selector = getTargetSelector(annotation.target);
9980
- const exactText = getExactText(selector);
9981
- if (!exactText?.trim()) continue;
9982
- annotationData.push({
9983
- rid: resourceId(rid),
9984
- aid: annotationId(annotation.id),
9985
- exactText,
9986
- motivation: annotation.motivation ?? "",
9987
- entityTypes: annotation.entityTypes ?? []
9988
- });
9989
- }
9990
- if (annotationData.length === 0) return;
9991
- const allEmbeddings = await embeddingProvider.embedBatch(
9992
- annotationData.map((a) => a.exactText)
9993
- );
9994
- for (let i = 0; i < annotationData.length; i++) {
9995
- const { rid, aid, exactText, motivation, entityTypes } = annotationData[i];
9996
- const payload = {
9997
- annotationId: aid,
9998
- resourceId: rid,
9999
- motivation,
10000
- entityTypes,
10001
- exactText
10002
- };
10003
- await vectorStore.upsertAnnotationVector(aid, allEmbeddings[i], payload);
10004
- logger.info("Batch-indexed annotation", { annotationId: String(aid) });
10005
- }
10006
- eventsProcessed += events.length;
10007
- }
10008
10306
  async function main() {
10009
10307
  const { initObservabilityNode } = await import('@semiont/observability/node');
10010
10308
  initObservabilityNode({ serviceName: "semiont-smelter" });
10011
10309
  logger.info("Authenticating", { baseUrl });
10012
- authToken = await authenticate();
10310
+ const tokenSubject = new import_rxjs3.BehaviorSubject(accessToken(await authenticate()));
10013
10311
  logger.info("Authenticated");
10014
- embeddingProvider = await createEmbeddingProvider({
10312
+ const refreshToken = async () => {
10313
+ const token = await authenticate();
10314
+ tokenSubject.next(accessToken(token));
10315
+ return token;
10316
+ };
10317
+ const reauthTimer = setInterval(() => {
10318
+ refreshToken().catch((error) => {
10319
+ logger.error("Proactive re-authentication failed", {
10320
+ error: error instanceof Error ? error.message : String(error)
10321
+ });
10322
+ });
10323
+ }, 12 * 60 * 60 * 1e3);
10324
+ const embeddingProvider = await createEmbeddingProvider({
10015
10325
  type: embeddingType,
10016
10326
  model: embeddingModel,
10017
10327
  baseURL: embeddingBaseURL
10018
10328
  });
10019
10329
  logger.info("Embedding provider ready", { type: embeddingType, model: embeddingModel });
10020
10330
  const dimensions = embeddingProvider.dimensions();
10021
- vectorStore = await createVectorStore({
10331
+ const vectorStore = await createVectorStore({
10022
10332
  type: "qdrant",
10023
10333
  host: qdrantHost,
10024
10334
  port: qdrantPort,
10025
10335
  dimensions
10026
10336
  });
10027
10337
  logger.info("Vector store ready", { host: qdrantHost, port: qdrantPort, dimensions });
10028
- const tokenSubject = new import_rxjs3.BehaviorSubject(accessToken(authToken));
10338
+ registerVectorIndexSizeProvider(() => vectorStore.count());
10029
10339
  const httpTransport = new HttpTransport({
10030
10340
  baseUrl: baseUrl$1(baseUrl),
10031
- token$: tokenSubject
10341
+ token$: tokenSubject,
10342
+ tokenRefresher: refreshToken
10032
10343
  });
10033
10344
  const actorStateUnit = createSmelterActorStateUnit({
10034
10345
  bus: httpTransport.actor
10035
10346
  });
10036
- const eventSubject = new import_rxjs2.Subject();
10037
- const pipelineSubscription = eventSubject.pipe(
10038
- (0, import_operators2.groupBy)((e) => e.resourceId ?? "__unknown__"),
10039
- (0, import_operators2.mergeMap)(
10040
- (group) => group.pipe(
10041
- burstBuffer({
10042
- burstWindowMs: BURST_WINDOW_MS,
10043
- maxBatchSize: MAX_BATCH_SIZE,
10044
- idleTimeoutMs: IDLE_TIMEOUT_MS
10045
- }),
10046
- (0, import_operators2.concatMap)((eventOrBatch) => {
10047
- if (Array.isArray(eventOrBatch)) {
10048
- return (0, import_rxjs2.from)(processBatch(eventOrBatch));
10049
- }
10050
- return (0, import_rxjs2.from)(processEvent(eventOrBatch));
10051
- })
10052
- )
10053
- )
10054
- ).subscribe({
10055
- error: (err) => logger.error("Pipeline error", { error: err instanceof Error ? err.message : String(err) })
10056
- });
10057
- actorStateUnit.events$.subscribe((event) => {
10058
- logger.debug("Bus event received", { type: event.type, resourceId: event.resourceId });
10059
- eventSubject.next(event);
10060
- });
10347
+ const contentTransport = new HttpContentTransport(httpTransport);
10348
+ logger.info("Content transport ready", { mode: "http" });
10349
+ const smelter = new Smelter(
10350
+ actorStateUnit.events$,
10351
+ vectorStore,
10352
+ embeddingProvider,
10353
+ contentTransport,
10354
+ httpTransport,
10355
+ chunkingConfig,
10356
+ { burstWindowMs: 50, maxBatchSize: 100, idleTimeoutMs: 200 },
10357
+ logger
10358
+ );
10359
+ smelter.initialize();
10061
10360
  actorStateUnit.start();
10062
10361
  logger.info("Subscribed to domain events");
10063
10362
  const health = createServer((req, res) => {
10064
10363
  if (req.url === "/health") {
10065
10364
  res.writeHead(200, { "Content-Type": "application/json" });
10066
- res.end(JSON.stringify({ status: "ok", eventsProcessed }));
10365
+ res.end(JSON.stringify({
10366
+ status: "ok",
10367
+ eventsProcessed: smelter.eventsProcessed,
10368
+ reconcile: smelter.reconcileState
10369
+ }));
10067
10370
  } else {
10068
10371
  res.writeHead(404);
10069
10372
  res.end();
@@ -10074,15 +10377,16 @@ async function main() {
10074
10377
  });
10075
10378
  const shutdown = () => {
10076
10379
  logger.info("Shutting down");
10380
+ clearInterval(reauthTimer);
10077
10381
  actorStateUnit.dispose();
10078
10382
  httpTransport.dispose();
10079
- pipelineSubscription.unsubscribe();
10080
- eventSubject.complete();
10383
+ smelter.stop();
10081
10384
  health.close();
10082
10385
  process.exit(0);
10083
10386
  };
10084
10387
  process.on("SIGTERM", shutdown);
10085
10388
  process.on("SIGINT", shutdown);
10389
+ await smelter.reconcile();
10086
10390
  }
10087
10391
  main().catch((error) => {
10088
10392
  logger.error("Fatal", { error: error instanceof Error ? error.message : String(error), stack: error instanceof Error ? error.stack : void 0 });