@semiont/make-meaning 0.4.18 → 0.4.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,6 +1,6 @@
1
1
  import { JobQueue, ReferenceAnnotationWorker, GenerationWorker, HighlightAnnotationWorker, AssessmentAnnotationWorker, CommentAnnotationWorker, TagAnnotationWorker } from '@semiont/jobs';
2
2
  import { SemiontProject } from '@semiont/core/node';
3
- import { GraphServiceConfig, VectorsServiceConfig, EmbeddingServiceConfig, EventBus, Logger, StoredEvent, ResourceId, components, AnnotationId, UserId, CreationMethod, ResourceAnnotations, AnnotationCategory, GraphPath, GraphConnection } from '@semiont/core';
3
+ import { GraphServiceConfig, VectorsServiceConfig, EmbeddingServiceConfig, EventBus, Logger, StoredEvent, ResourceId, AnnotationId, components, UserId, CreationMethod, ResourceAnnotations, AnnotationCategory, GraphPath, GraphConnection } from '@semiont/core';
4
4
  export { AssembledAnnotation, applyBodyOperations, assembleAnnotation } from '@semiont/core';
5
5
  import { EventStore, ViewStorage } from '@semiont/event-sourcing';
6
6
  import { WorkingTreeStore } from '@semiont/content';
@@ -72,6 +72,15 @@ interface MakeMeaningConfig {
72
72
  * BURST_WINDOW_MS = 50 — debounce window before flushing a batch
73
73
  * MAX_BATCH_SIZE = 500 — force flush to bound memory
74
74
  * IDLE_TIMEOUT_MS = 200 — silence before returning to passthrough
75
+ *
76
+ * ## Per-resource serialization
77
+ *
78
+ * `groupBy(resourceId) + concatMap(...)` is the stream-consumer flavor of
79
+ * per-resource serialization — the same invariant enforced by `Smelter`,
80
+ * `Gatherer`, and (in a different shape) `ViewManager`. See
81
+ * `packages/core/src/serialize-per-key.ts` for the shared primitive used
82
+ * by RPC-style services, and `.plans/PerResourceSerializer.md` for the
83
+ * broader design that would unify the two shapes.
75
84
  */
76
85
 
77
86
  declare class GraphDBConsumer {
@@ -147,28 +156,102 @@ declare class GraphDBConsumer {
147
156
  shutdown(): Promise<void>;
148
157
  }
149
158
 
159
+ /**
160
+ * EmbeddingStore
161
+ *
162
+ * Durable file-based cache for pre-computed embedding vectors.
163
+ * Stored under .semiont/embeddings/ — committed to git alongside events,
164
+ * but overwritten in place rather than appended.
165
+ *
166
+ * File layout (same 4-hex Jump Consistent Hash sharding as events):
167
+ *
168
+ * .semiont/embeddings/{ab}/{cd}/{resourceId}.jsonl
169
+ * Line 0: { model, dimensions } ← model header
170
+ * Line N: { chunkIndex, text, embedding[] } ← one chunk per line
171
+ *
172
+ * .semiont/embeddings/{ab}/{cd}/{annotationId}.json
173
+ * { model, dimensions, resourceId, text, embedding[], motivation, entityTypes }
174
+ *
175
+ * rebuildAll() in Smelter reads these files and upserts into Qdrant without
176
+ * calling the embedding provider — unless the stored model doesn't match the
177
+ * configured provider, in which case the file is re-embedded and overwritten.
178
+ */
179
+
180
+ interface StoredChunk {
181
+ chunkIndex: number;
182
+ text: string;
183
+ embedding: number[];
184
+ }
185
+ interface ResourceEmbeddingFile {
186
+ model: string;
187
+ dimensions: number;
188
+ chunks: StoredChunk[];
189
+ }
190
+ interface AnnotationEmbeddingFile {
191
+ model: string;
192
+ dimensions: number;
193
+ resourceId: string;
194
+ text: string;
195
+ embedding: number[];
196
+ motivation: string;
197
+ entityTypes: string[];
198
+ }
199
+ declare class EmbeddingStore {
200
+ private readonly project;
201
+ constructor(project: SemiontProject);
202
+ private resourceFilePath;
203
+ private annotationFilePath;
204
+ writeResourceChunks(resourceId: ResourceId, model: string, dimensions: number, chunks: StoredChunk[]): Promise<void>;
205
+ readResourceEmbeddings(resourceId: ResourceId): Promise<ResourceEmbeddingFile | null>;
206
+ deleteResourceEmbeddings(resourceId: ResourceId): Promise<void>;
207
+ writeAnnotationEmbedding(annotationId: AnnotationId, resourceId: ResourceId, model: string, dimensions: number, text: string, embedding: number[], motivation: string, entityTypes: string[]): Promise<void>;
208
+ readAnnotationEmbedding(annotationId: AnnotationId): Promise<AnnotationEmbeddingFile | null>;
209
+ deleteAnnotationEmbedding(annotationId: AnnotationId): Promise<void>;
210
+ /**
211
+ * Scan embeddings directory and return all resource IDs (from *.jsonl files).
212
+ */
213
+ getAllResourceIds(): Promise<string[]>;
214
+ /**
215
+ * Scan embeddings directory and return all annotation IDs (from *.json files).
216
+ */
217
+ getAllAnnotationIds(): Promise<string[]>;
218
+ private scanIds;
219
+ }
220
+
150
221
  /**
151
222
  * Smelter Actor
152
223
  *
153
- * Takes raw content, refines it into embedding vectors, persists them as events,
154
- * and indexes them into the vector store. Peer to the Graph Consumer.
224
+ * Takes raw content, refines it into embedding vectors, persists them to the
225
+ * EmbeddingStore (.semiont/embeddings/), and indexes them into the VectorStore
226
+ * (Qdrant). Peer to the Graph Consumer.
155
227
  *
156
228
  * Pipeline:
157
229
  * 1. Subscribe to resource and annotation events from the EventStore
158
230
  * 2. Chunk resource text into overlapping passages
159
231
  * 3. Embed each chunk via the configured EmbeddingProvider
160
- * 4. Emit embedding:computed events on the EventBus (persisted by Stower)
232
+ * 4. Write vectors to EmbeddingStore (overwrite-in-place, git-durable)
161
233
  * 5. Index vectors into the VectorStore (Qdrant) for fast similarity search
162
234
  *
163
235
  * Uses the same burst-buffer RxJS pipeline as GraphDBConsumer.
236
+ *
237
+ * ## Per-resource serialization
238
+ *
239
+ * Smelter processes events strictly in order per resourceId via
240
+ * `groupBy(resourceId) + concatMap(...)`. This is the stream-consumer
241
+ * flavor of per-resource serialization — the same invariant enforced by
242
+ * `GraphDBConsumer`, `Gatherer`, and (in a different shape) `ViewManager`.
243
+ * See `packages/core/src/serialize-per-key.ts` for the shared primitive
244
+ * used by RPC-style services, and `.plans/PerResourceSerializer.md` for
245
+ * the broader design that would unify the two shapes.
164
246
  */
165
247
 
166
248
  declare class Smelter {
167
- private eventStore;
168
249
  private eventBus;
169
250
  private vectorStore;
170
251
  private embeddingProvider;
171
252
  private contentStore;
253
+ private embeddingStore;
254
+ private viewStorage;
172
255
  private static readonly SMELTER_RELEVANT_EVENTS;
173
256
  private static readonly BURST_WINDOW_MS;
174
257
  private static readonly MAX_BATCH_SIZE;
@@ -178,35 +261,47 @@ declare class Smelter {
178
261
  private pipelineSubscription;
179
262
  private readonly logger;
180
263
  private readonly chunkingConfig;
181
- constructor(eventStore: EventStore, eventBus: EventBus, vectorStore: VectorStore, embeddingProvider: EmbeddingProvider, contentStore: WorkingTreeStore, logger: Logger, chunkingConfig?: ChunkingConfig);
264
+ constructor(_eventStore: EventStore, eventBus: EventBus, vectorStore: VectorStore, embeddingProvider: EmbeddingProvider, contentStore: WorkingTreeStore, embeddingStore: EmbeddingStore, viewStorage: ViewStorage, logger: Logger, chunkingConfig?: ChunkingConfig);
182
265
  initialize(): Promise<void>;
183
266
  stop(): Promise<void>;
184
267
  /**
185
- * Rebuild the vector store from persisted embedding events in the event log.
186
- * Reads all embedding:computed / embedding:deleted events and replays them.
187
- * Bypasses the live pipeline — reads directly from the event store.
268
+ * Rebuild the vector store from the EmbeddingStore (.semiont/embeddings/).
269
+ *
270
+ * For each stored file, checks whether the model matches the configured
271
+ * provider. On mismatch, re-embeds from the stored text and overwrites the
272
+ * file before upserting into Qdrant. On match, loads the stored vectors
273
+ * directly — no embedding provider calls needed.
188
274
  */
189
275
  rebuildAll(): Promise<void>;
190
276
  private processBatch;
191
277
  /**
192
278
  * Batch-optimized processing for consecutive events of the same type.
193
- * Collects all texts across events, embeds in a single embedBatch() call,
194
- * then distributes results back to their respective resources/annotations.
195
279
  */
196
280
  private applyBatchByType;
197
281
  /**
198
- * Batch-embed chunks from multiple resource.created events in a single
199
- * embedBatch() call, then emit events and index per resource.
282
+ * Batch-embed chunks from multiple yield:created events in a single
283
+ * embedBatch() call, then write to EmbeddingStore and index per resource.
200
284
  */
201
285
  private batchResourceCreated;
202
286
  /**
203
- * Batch-embed exact texts from multiple annotation.added events in a
204
- * single embedBatch() call, then emit events and index per annotation.
287
+ * Batch-embed exact texts from multiple mark:added events in a single
288
+ * embedBatch() call, then write to EmbeddingStore and index per annotation.
205
289
  */
206
290
  private batchAnnotationAdded;
207
291
  private safeProcessEvent;
208
292
  private processEvent;
209
293
  private handleResourceCreated;
294
+ /**
295
+ * Re-embed a resource whose content has changed in-place.
296
+ *
297
+ * Used by yield:updated and yield:representation-added handlers. Reads the
298
+ * current storageUri from the materialized view (which is updated before the
299
+ * EventBus fires), deletes stale Qdrant vectors, and overwrites the
300
+ * EmbeddingStore file with fresh chunks.
301
+ */
302
+ private reembedResource;
303
+ private handleResourceUpdated;
304
+ private handleRepresentationAdded;
210
305
  private handleResourceArchived;
211
306
  private handleAnnotationAdded;
212
307
  private handleAnnotationRemoved;
@@ -277,8 +372,6 @@ declare function createKnowledgeBase(eventStore: EventStore, project: SemiontPro
277
372
  * - job:report-progress → job.progress
278
373
  * - job:complete → job.completed
279
374
  * - job:fail → job.failed
280
- * - embedding:compute → embedding.computed (from Smelter)
281
- * - embedding:delete → embedding.deleted (from Smelter)
282
375
  */
283
376
 
284
377
  type ResourceDescriptor$3 = components['schemas']['ResourceDescriptor'];
@@ -307,8 +400,6 @@ declare class Stower {
307
400
  private handleJobReportProgress;
308
401
  private handleJobComplete;
309
402
  private handleJobFail;
310
- private handleEmbeddingComputed;
311
- private handleEmbeddingDeleted;
312
403
  stop(): Promise<void>;
313
404
  }
314
405
 
@@ -328,6 +419,15 @@ declare class Stower {
328
419
  * - gather:resource-requested — resource-level LLM context assembly
329
420
  *
330
421
  * RxJS pipeline uses groupBy(resourceId) + concatMap for per-resource isolation.
422
+ *
423
+ * ## Per-resource serialization
424
+ *
425
+ * `groupBy(resourceId) + concatMap(...)` is the stream-consumer flavor of
426
+ * per-resource serialization — the same invariant enforced by `Smelter`,
427
+ * `GraphDBConsumer`, and (in a different shape) `ViewManager`. See
428
+ * `packages/core/src/serialize-per-key.ts` for the shared primitive used
429
+ * by RPC-style services, and `.plans/PerResourceSerializer.md` for the
430
+ * broader design that would unify the two shapes.
331
431
  */
332
432
 
333
433
  declare class Gatherer {
@@ -581,8 +681,6 @@ interface BackupManifestHeader {
581
681
  interface BackupStreamSummary {
582
682
  stream: string;
583
683
  eventCount: number;
584
- firstChecksum: string;
585
- lastChecksum: string;
586
684
  }
587
685
  declare const LINKED_DATA_FORMAT: "semiont-linked-data";
588
686
  interface LinkedDataManifest {
@@ -687,7 +785,6 @@ interface BackupImporterOptions {
687
785
  interface BackupImportResult {
688
786
  manifest: BackupManifestHeader;
689
787
  stats: ReplayStats;
690
- hashChainValid: boolean;
691
788
  }
692
789
  /**
693
790
  * Import a backup archive by replaying events through the EventBus.