@steno-ai/engine 0.1.16 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,471 @@
1
+ /**
2
+ * Structured data extractor — bypasses LLM entirely.
3
+ *
4
+ * Handles structured_event, structured_task, structured_email, structured_vault
5
+ * input types by directly creating entities, edges, and facts from known fields.
6
+ * Zero LLM cost, deterministic, high confidence.
7
+ */
8
+
9
+ import type { ExtractionResult, ExtractedFact, ExtractedEntity, ExtractedEdge } from './types.js';
10
+ import type { SourceType, EdgeType } from '../config.js';
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Structured input schemas
14
+ // ---------------------------------------------------------------------------
15
+
16
/**
 * Payload for the `structured_event` input type: an event whose fields are
 * already known, so no LLM extraction is required.
 */
export interface StructuredEvent {
  /** Human-readable event title. */
  title: string;

  /** Start of the event, as an ISO 8601 string. */
  startTime: string;

  /** Optional end of the event (presumably ISO 8601 like `startTime` — confirm with callers). */
  endTime?: string;

  /** Optional free-form location text. */
  location?: string;

  /** Optional free-form description. */
  description?: string;

  /** Names of the organizing organizations or people. */
  organizers?: string[];

  /** Attendee names. */
  attendees?: string[];

  /** Optional link associated with the event. */
  url?: string;

  /** Originating provider, e.g. 'google', 'microsoft', 'partiful'. */
  provider?: string;

  /** Identifier in the originating system (calendar event ID, vault item ID, etc.). */
  externalId?: string;

  /** Whether the event came from a calendar or from the vault. */
  sourceType?: 'calendar' | 'vault';
}
29
+
30
/**
 * Payload for the `structured_task` input type: a task whose fields are
 * already known, so no LLM extraction is required.
 */
export interface StructuredTask {
  /** Human-readable task title. */
  title: string;

  /** Optional free-form description. */
  description?: string;

  /** Optional status label supplied by the source system. */
  status?: string;

  /** Optional priority label supplied by the source system. */
  priority?: string;

  /** Optional category label. */
  category?: string;

  /** Optional due date, as an ISO 8601 string. */
  dueDate?: string;

  /** Optional free-form tags. */
  tags?: string[];

  /** Optional identifier in the originating system. */
  externalId?: string;
}
40
+
41
/**
 * Payload for the `structured_email` input type: an email whose fields are
 * already known, so no LLM extraction is required.
 */
export interface StructuredEmail {
  /** Subject line. */
  subject: string;

  /** Sender, as supplied by the source system. */
  from: string;

  /** Optional list of recipients. */
  to?: string[];

  /** Optional message body (truncated by the producer). */
  body?: string;

  /** Message date, as an ISO 8601 string. */
  date: string;

  /** True when the message is unread. */
  isUnread?: boolean;

  /** Optional thread identifier. */
  threadId?: string;

  /** Originating provider, e.g. 'gmail' or 'outlook'. */
  provider?: string;

  /** Optional identifier in the originating system. */
  externalId?: string;
}
52
+
53
/**
 * Payload for the `structured_vault` input type: a saved vault item whose
 * fields are already known, so no LLM extraction is required.
 */
export interface StructuredVault {
  /** Human-readable title of the saved item. */
  title: string;

  /** Kind of content: 'event', 'article', 'job', 'recipe', etc. */
  contentType: string;

  /** Optional link to the saved item. */
  url?: string;

  /** Optional domain the item was saved from. */
  source?: string;

  /** When the item was saved, as an ISO 8601 string. */
  savedAt: string;

  /** Optional page content (truncated by the producer). */
  content?: string;

  /** Optional free-form metadata attached to the item. */
  metadata?: Record<string, unknown>;

  /** Optional identifier in the originating system. */
  externalId?: string;
}
63
+
64
+ // ---------------------------------------------------------------------------
65
+ // Helpers
66
+ // ---------------------------------------------------------------------------
67
+
68
/**
 * Build the canonical key for an entity name.
 *
 * The name is lower-cased, every character other than `a-z`, `0-9`,
 * whitespace, `.` and `-` is removed, runs of whitespace collapse to a single
 * space, and the result is trimmed.
 *
 * NOTE(review): letters outside ASCII are removed as well, so a name written
 * entirely in another script canonicalizes to an empty string — confirm this
 * matches how canonical names are produced elsewhere before changing it.
 *
 * @param name - Display name of the entity.
 * @returns The canonical form of `name`.
 */
function canonicalize(name: string): string {
  const lowered = name.toLowerCase();
  const allowedOnly = lowered.replace(/[^a-z0-9\s.-]/g, '');
  const singleSpaced = allowedOnly.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
71
+
72
/**
 * Render a date string as a long en-US date, e.g. "Friday, March 15, 2024".
 *
 * `Date.prototype.toLocaleDateString` does not throw for an unparseable
 * input — it returns the literal text "Invalid Date" — so validity is checked
 * explicitly and the raw input is returned instead.
 *
 * The calendar day is computed in the runtime's local time zone.
 *
 * @param iso - Date string, expected to be ISO 8601.
 * @returns The formatted date, or `iso` unchanged when it cannot be parsed.
 */
function formatDate(iso: string): string {
  try {
    const parsed = new Date(iso);
    if (Number.isNaN(parsed.getTime())) return iso;
    return parsed.toLocaleDateString('en-US', {
      weekday: 'long', year: 'numeric', month: 'long', day: 'numeric',
    });
  } catch {
    // Defensive: fall back to the raw value if construction/formatting throws.
    return iso;
  }
}
81
+
82
/**
 * Render the time-of-day part of a date string in 12-hour en-US form,
 * e.g. "2:30 PM".
 *
 * `Date.prototype.toLocaleTimeString` does not throw for an unparseable
 * input — it returns the literal text "Invalid Date" — so validity is checked
 * explicitly and an empty string is returned instead.
 *
 * The time is computed in the runtime's local time zone.
 *
 * @param iso - Date string, expected to be ISO 8601.
 * @returns The formatted time, or `''` when `iso` cannot be parsed.
 */
function formatTime(iso: string): string {
  try {
    const parsed = new Date(iso);
    if (Number.isNaN(parsed.getTime())) return '';
    return parsed.toLocaleTimeString('en-US', {
      hour: 'numeric', minute: '2-digit', hour12: true,
    });
  } catch {
    // Defensive: treat any construction/formatting failure as "no time".
    return '';
  }
}
91
+
92
+ // ---------------------------------------------------------------------------
93
+ // Extractors
94
+ // ---------------------------------------------------------------------------
95
+
96
/**
 * Convert a structured event into entities, edges and one fact without
 * calling an LLM.
 *
 * Produces:
 * - an `event` entity for the title, carrying the raw event fields as properties;
 * - a `location` entity plus a `located_at` edge when `location` is set;
 * - an `organization` entity plus a `hosted_by` edge per organizer;
 * - a `person` entity plus an `attends` edge per attendee;
 * - a single fact summarising the event.
 *
 * Start/end times that are missing or unparseable are left out of the fact
 * text, and `eventDate` is `undefined` in that case, rather than emitting
 * "Invalid Date". Organizer/attendee entries that are blank or not strings
 * are skipped.
 *
 * @param data - The event payload.
 * @returns The extraction result (tier `heuristic`, zero token usage). The
 *   result is empty when `data` is nullish or has no title.
 */
export function extractStructuredEvent(data: StructuredEvent): ExtractionResult {
  if (!data?.title) {
    return { facts: [], entities: [], edges: [], tier: 'heuristic', confidence: 1.0, tokensInput: 0, tokensOutput: 0, model: null };
  }

  const entities: ExtractedEntity[] = [];
  const edges: ExtractedEdge[] = [];

  // A timestamp is usable only if it is a non-empty string that Date can parse.
  const isParseable = (value: unknown): value is string =>
    typeof value === 'string' && value !== '' && !Number.isNaN(Date.parse(value));
  // A name is usable only if it is a string with visible content.
  const isUsableName = (value: unknown): value is string =>
    typeof value === 'string' && value.trim() !== '';

  const startTime = isParseable(data.startTime) ? data.startTime : undefined;
  const endTime = isParseable(data.endTime) ? data.endTime : undefined;

  // Main event entity
  const eventCanonical = canonicalize(data.title);
  entities.push({
    name: data.title,
    entityType: 'event',
    canonicalName: eventCanonical,
    properties: {
      startTime: data.startTime,
      endTime: data.endTime,
      location: data.location,
      url: data.url,
      provider: data.provider,
      externalId: data.externalId,
      sourceType: data.sourceType,
    },
  });

  // Location entity
  if (data.location) {
    const locCanonical = canonicalize(data.location);
    entities.push({
      name: data.location,
      entityType: 'location',
      canonicalName: locCanonical,
      properties: {},
    });
    edges.push({
      sourceName: eventCanonical,
      targetName: locCanonical,
      relation: 'located_at',
      edgeType: 'associative',
      confidence: 1.0,
    });
  }

  // Organizer entities (event --hosted_by--> organizer)
  for (const org of data.organizers ?? []) {
    if (!isUsableName(org)) continue;
    const orgCanonical = canonicalize(org);
    entities.push({
      name: org,
      entityType: 'organization',
      canonicalName: orgCanonical,
      properties: {},
    });
    edges.push({
      sourceName: eventCanonical,
      targetName: orgCanonical,
      relation: 'hosted_by',
      edgeType: 'associative',
      confidence: 1.0,
    });
  }

  // Attendee entities (attendee --attends--> event)
  for (const attendee of data.attendees ?? []) {
    if (!isUsableName(attendee)) continue;
    const attCanonical = canonicalize(attendee);
    entities.push({
      name: attendee,
      entityType: 'person',
      canonicalName: attCanonical,
      properties: {},
    });
    edges.push({
      sourceName: attCanonical,
      targetName: eventCanonical,
      relation: 'attends',
      edgeType: 'associative',
      confidence: 1.0,
    });
  }

  // Build fact content; date/time fragments appear only for parseable values.
  let factContent = `Event: "${data.title}"`;
  if (startTime !== undefined) {
    factContent += ` on ${formatDate(startTime)} at ${formatTime(startTime)}`;
    if (endTime !== undefined) factContent += ` - ${formatTime(endTime)}`;
  }
  if (data.location) factContent += ` at ${data.location}`;
  if (data.organizers?.length) factContent += `. Hosted by ${data.organizers.join(', ')}`;
  if (data.description) factContent += `. ${data.description.slice(0, 300)}`;

  // Event first, then every other entity; the Set drops repeated names
  // (e.g. someone listed both as organizer and attendee).
  const entityCanonicalNames = [
    ...new Set([eventCanonical, ...entities.map((e) => e.canonicalName)]),
  ];

  const fact: ExtractedFact = {
    content: factContent,
    importance: 0.8,
    confidence: 1.0,
    sourceType: (data.sourceType === 'vault' ? 'structured_vault' : 'structured_event') as SourceType,
    modality: 'text',
    tags: ['structured', 'event', ...(data.provider ? [data.provider] : [])],
    originalContent: JSON.stringify(data),
    entityCanonicalNames,
    eventDate: startTime !== undefined ? new Date(startTime) : undefined,
    documentDate: new Date(),
  };

  return {
    facts: [fact],
    entities,
    edges,
    tier: 'heuristic',
    confidence: 1.0,
    tokensInput: 0,
    tokensOutput: 0,
    model: null,
  };
}
205
+
206
/**
 * Convert a structured task into entities, edges and one fact without calling
 * an LLM.
 *
 * Produces a `task` entity for the title, a `topic` entity plus a
 * `categorized_as` edge when `category` is set, and a single fact summarising
 * the task. The fact's importance is 0.9 for "high"/"urgent" priority
 * (matched case-insensitively) and 0.7 otherwise.
 *
 * A due date that cannot be parsed is left out of the fact text, and
 * `eventDate` is `undefined` in that case, rather than emitting "Invalid Date".
 *
 * @param data - The task payload.
 * @returns The extraction result (tier `heuristic`, zero token usage). The
 *   result is empty when `data` is nullish or has no title.
 */
export function extractStructuredTask(data: StructuredTask): ExtractionResult {
  if (!data?.title) {
    return { facts: [], entities: [], edges: [], tier: 'heuristic', confidence: 1.0, tokensInput: 0, tokensOutput: 0, model: null };
  }

  const entities: ExtractedEntity[] = [];
  const edges: ExtractedEdge[] = [];

  // Only treat the due date as a date when Date can actually parse it.
  const dueDate =
    typeof data.dueDate === 'string' && data.dueDate !== '' && !Number.isNaN(Date.parse(data.dueDate))
      ? data.dueDate
      : undefined;

  // Compare priority case-insensitively so "High" / "URGENT" are recognised.
  const normalizedPriority = typeof data.priority === 'string' ? data.priority.trim().toLowerCase() : '';
  const isElevated = normalizedPriority === 'high' || normalizedPriority === 'urgent';

  const taskCanonical = canonicalize(data.title);
  entities.push({
    name: data.title,
    entityType: 'task',
    canonicalName: taskCanonical,
    properties: {
      status: data.status,
      priority: data.priority,
      category: data.category,
      dueDate: data.dueDate,
      externalId: data.externalId,
    },
  });

  // Category entity
  if (data.category) {
    const catCanonical = canonicalize(data.category);
    entities.push({
      name: data.category,
      entityType: 'topic',
      canonicalName: catCanonical,
      properties: {},
    });
    edges.push({
      sourceName: taskCanonical,
      targetName: catCanonical,
      relation: 'categorized_as',
      edgeType: 'hierarchical',
      confidence: 1.0,
    });
  }

  let factContent = `Task: "${data.title}"`;
  if (data.status) factContent += ` (${data.status})`;
  if (data.priority) factContent += `, priority: ${data.priority}`;
  if (dueDate !== undefined) factContent += `, due ${formatDate(dueDate)}`;
  if (data.description) factContent += `. ${data.description.slice(0, 200)}`;

  const fact: ExtractedFact = {
    content: factContent,
    importance: isElevated ? 0.9 : 0.7,
    confidence: 1.0,
    sourceType: 'structured_task' as SourceType,
    modality: 'text',
    tags: ['structured', 'task', ...(data.tags ?? [])],
    originalContent: JSON.stringify(data),
    entityCanonicalNames: [taskCanonical],
    eventDate: dueDate !== undefined ? new Date(dueDate) : undefined,
    documentDate: new Date(),
  };

  return {
    facts: [fact],
    entities,
    edges,
    tier: 'heuristic',
    confidence: 1.0,
    tokensInput: 0,
    tokensOutput: 0,
    model: null,
  };
}
274
+
275
/**
 * Convert a structured email into entities, edges and one fact without
 * calling an LLM.
 *
 * Produces:
 * - a `person` entity for the sender;
 * - a `topic` entity for the subject when it is longer than 5 characters,
 *   with an `authored` edge from the sender;
 * - a `person` entity per recipient;
 * - a single fact summarising the message.
 *
 * The payload is accepted when at least one of `subject` / `from` is present.
 * A missing sender no longer throws: the sender entity and `authored` edge
 * are omitted instead. A missing subject is left out of the fact text rather
 * than rendered as "undefined". A date that cannot be parsed is left out of
 * the text and `eventDate` is `undefined`.
 *
 * @param data - The email payload.
 * @returns The extraction result (tier `heuristic`, zero token usage). The
 *   result is empty when `data` is nullish or has neither subject nor sender.
 */
export function extractStructuredEmail(data: StructuredEmail): ExtractionResult {
  if (!data?.subject && !data?.from) {
    return { facts: [], entities: [], edges: [], tier: 'heuristic', confidence: 1.0, tokensInput: 0, tokensOutput: 0, model: null };
  }

  const entities: ExtractedEntity[] = [];
  const edges: ExtractedEdge[] = [];

  // Treat empty / non-string values as absent.
  const from = typeof data.from === 'string' && data.from !== '' ? data.from : undefined;
  const subject = typeof data.subject === 'string' && data.subject !== '' ? data.subject : undefined;
  const date =
    typeof data.date === 'string' && data.date !== '' && !Number.isNaN(Date.parse(data.date))
      ? data.date
      : undefined;

  // Sender entity
  let senderCanonical: string | undefined;
  if (from !== undefined) {
    senderCanonical = canonicalize(from);
    entities.push({
      name: from,
      entityType: 'person',
      canonicalName: senderCanonical,
      properties: { email: from },
    });
  }

  // Subject as topic entity if substantial
  let subjectCanonical: string | undefined;
  if (subject !== undefined && subject.length > 5) {
    subjectCanonical = canonicalize(subject);
    entities.push({
      name: subject,
      entityType: 'topic',
      canonicalName: subjectCanonical,
      properties: { threadId: data.threadId, provider: data.provider },
    });
    // The authorship edge needs a sender to point from.
    if (senderCanonical !== undefined) {
      edges.push({
        sourceName: senderCanonical,
        targetName: subjectCanonical,
        relation: 'authored',
        edgeType: 'associative',
        confidence: 1.0,
      });
    }
  }

  // Recipients
  for (const to of data.to ?? []) {
    if (typeof to !== 'string' || to.trim() === '') continue;
    const toCanonical = canonicalize(to);
    entities.push({
      name: to,
      entityType: 'person',
      canonicalName: toCanonical,
      properties: { email: to },
    });
  }

  // With both sender and subject present the text is `Email from X: "Y"`.
  let factContent = from !== undefined ? `Email from ${from}` : 'Email';
  if (subject !== undefined) factContent += `: "${subject}"`;
  if (date !== undefined) factContent += ` on ${formatDate(date)}`;
  if (data.body) factContent += `. ${data.body.slice(0, 300)}`;

  // Link the fact to the sender; fall back to the subject topic if there is
  // no sender, and to nothing if neither entity exists.
  const primaryCanonical = senderCanonical ?? subjectCanonical;

  const fact: ExtractedFact = {
    content: factContent,
    importance: data.isUnread ? 0.8 : 0.5,
    confidence: 1.0,
    sourceType: 'structured_email' as SourceType,
    modality: 'text',
    tags: ['structured', 'email', ...(data.provider ? [data.provider] : []), ...(data.isUnread ? ['unread'] : [])],
    originalContent: JSON.stringify(data),
    entityCanonicalNames: primaryCanonical !== undefined ? [primaryCanonical] : [],
    eventDate: date !== undefined ? new Date(date) : undefined,
    documentDate: new Date(),
  };

  return {
    facts: [fact],
    entities,
    edges,
    tier: 'heuristic',
    confidence: 1.0,
    tokensInput: 0,
    tokensOutput: 0,
    model: null,
  };
}
347
+
348
/**
 * Convert a structured vault item into entities, edges and one fact without
 * calling an LLM.
 *
 * Produces:
 * - an entity for the title (`event` when `contentType === 'event'`,
 *   otherwise `topic`);
 * - a `source` entity plus a `saved_from` edge when `source` is set;
 * - an `organization` entity plus a `hosted_by` edge per organizer found in
 *   `metadata.organizer` / `metadata.organizers`;
 * - a single fact summarising the saved item.
 *
 * Organizers are read whenever the metadata provides them, for any content
 * type. A string value is split on commas and " and "; array entries that
 * are blank or not strings are skipped.
 *
 * An unparseable `savedAt` is left out of the fact text and `documentDate`
 * falls back to the current time. An unparseable `metadata.date` yields an
 * `undefined` `eventDate`.
 *
 * @param data - The vault payload.
 * @returns The extraction result (tier `heuristic`, zero token usage). The
 *   result is empty when `data` is nullish or has no title.
 */
export function extractStructuredVault(data: StructuredVault): ExtractionResult {
  if (!data?.title) {
    return { facts: [], entities: [], edges: [], tier: 'heuristic', confidence: 1.0, tokensInput: 0, tokensOutput: 0, model: null };
  }

  const entities: ExtractedEntity[] = [];
  const edges: ExtractedEdge[] = [];

  const vaultCanonical = canonicalize(data.title);
  entities.push({
    name: data.title,
    entityType: data.contentType === 'event' ? 'event' : 'topic',
    canonicalName: vaultCanonical,
    properties: {
      contentType: data.contentType,
      url: data.url,
      source: data.source,
      savedAt: data.savedAt,
      externalId: data.externalId,
      // NOTE(review): metadata is spread last, so a metadata key with the
      // same name as a field above replaces it — confirm this is intended.
      ...(data.metadata ?? {}),
    },
  });

  // Source domain entity
  if (data.source) {
    const sourceCanonical = canonicalize(data.source);
    entities.push({
      name: data.source,
      entityType: 'source',
      canonicalName: sourceCanonical,
      properties: {},
    });
    edges.push({
      sourceName: vaultCanonical,
      targetName: sourceCanonical,
      relation: 'saved_from',
      edgeType: 'associative',
      confidence: 1.0,
    });
  }

  // Organizers from metadata: either one delimited string or an array.
  const rawOrganizers = data.metadata?.organizer || data.metadata?.organizers;
  let orgList: unknown[] = [];
  if (typeof rawOrganizers === 'string') {
    orgList = rawOrganizers.split(/,\s*|(?:\s+and\s+)/);
  } else if (Array.isArray(rawOrganizers)) {
    orgList = rawOrganizers;
  }
  for (const org of orgList) {
    // Metadata is untyped; ignore anything that is not a usable name.
    if (typeof org !== 'string') continue;
    const trimmed = org.trim();
    if (!trimmed) continue;
    const orgCanonical = canonicalize(trimmed);
    entities.push({
      name: trimmed,
      entityType: 'organization',
      canonicalName: orgCanonical,
      properties: {},
    });
    edges.push({
      sourceName: vaultCanonical,
      targetName: orgCanonical,
      relation: 'hosted_by',
      edgeType: 'associative',
      confidence: 1.0,
    });
  }

  // Parse dates once and keep them only if they are real dates.
  const savedAtDate =
    typeof data.savedAt === 'string' && data.savedAt !== '' ? new Date(data.savedAt) : undefined;
  const validSavedAt =
    savedAtDate !== undefined && !Number.isNaN(savedAtDate.getTime()) ? savedAtDate : undefined;

  const rawEventDate = data.metadata?.date;
  const parsedEventDate =
    rawEventDate && (typeof rawEventDate === 'string' || typeof rawEventDate === 'number' || rawEventDate instanceof Date)
      ? new Date(rawEventDate)
      : undefined;
  const validEventDate =
    parsedEventDate !== undefined && !Number.isNaN(parsedEventDate.getTime()) ? parsedEventDate : undefined;

  let factContent = `Saved to vault: "${data.title}" (${data.contentType})`;
  if (data.source) factContent += ` from ${data.source}`;
  if (validSavedAt !== undefined) factContent += ` on ${formatDate(data.savedAt)}`;
  if (data.content) factContent += `. ${data.content.slice(0, 300)}`;

  // Vault item first, then every other entity; the Set drops repeated names.
  const entityCanonicalNames = [
    ...new Set([vaultCanonical, ...entities.map((e) => e.canonicalName)]),
  ];

  const fact: ExtractedFact = {
    content: factContent,
    importance: 0.7,
    confidence: 1.0,
    sourceType: 'structured_vault' as SourceType,
    modality: 'text',
    tags: ['structured', 'vault', data.contentType],
    originalContent: JSON.stringify(data),
    entityCanonicalNames,
    eventDate: validEventDate,
    documentDate: validSavedAt ?? new Date(),
  };

  return {
    facts: [fact],
    entities,
    edges,
    tier: 'heuristic',
    confidence: 1.0,
    tokensInput: 0,
    tokensOutput: 0,
    model: null,
  };
}
442
+
443
+ // ---------------------------------------------------------------------------
444
+ // Router — picks the right extractor based on inputType
445
+ // ---------------------------------------------------------------------------
446
+
447
/** Input type names that are routed to the structured (non-LLM) extractors. */
const STRUCTURED_INPUT_TYPES = new Set(
  [
    'structured_event',
    'structured_task',
    'structured_email',
    'structured_vault',
  ],
);

/**
 * Tell whether an input type is one of the structured input types.
 *
 * @param inputType - Input type name to check.
 * @returns `true` when `inputType` is a structured input type, else `false`.
 */
export function isStructuredInput(inputType: string): boolean {
  const recognised = STRUCTURED_INPUT_TYPES.has(inputType);
  return recognised;
}
457
+
458
/**
 * Route a structured payload to the extractor that matches its input type.
 *
 * The payload is cast to the matching structured shape without validation;
 * each extractor performs its own minimal presence checks.
 *
 * @param inputType - One of the `structured_*` input type names.
 * @param data - The payload for that input type.
 * @returns The result of the matching extractor.
 * @throws Error when `inputType` is not a known structured input type.
 */
export function extractStructured(inputType: string, data: unknown): ExtractionResult {
  if (inputType === 'structured_event') {
    return extractStructuredEvent(data as StructuredEvent);
  }
  if (inputType === 'structured_task') {
    return extractStructuredTask(data as StructuredTask);
  }
  if (inputType === 'structured_email') {
    return extractStructuredEmail(data as StructuredEmail);
  }
  if (inputType === 'structured_vault') {
    return extractStructuredVault(data as StructuredVault);
  }
  throw new Error(`Unknown structured input type: ${inputType}`);
}
@@ -60,7 +60,7 @@ export interface ExtractionInput {
60
60
  scope: Scope;
61
61
  scopeId: string;
62
62
  sessionId?: string;
63
- inputType: 'conversation' | 'document' | 'url' | 'raw_text' | 'image' | 'audio' | 'code' | 'codebase_scan' | 'file_change' | 'architecture_doc';
63
+ inputType: 'conversation' | 'document' | 'url' | 'raw_text' | 'image' | 'audio' | 'code' | 'codebase_scan' | 'file_change' | 'architecture_doc' | 'structured_event' | 'structured_task' | 'structured_email' | 'structured_vault';
64
64
  data: unknown;
65
65
  existingFacts?: Array<{ id: string; lineageId: string; content: string; embedding?: number[] }>;
66
66
  /** Source provider for provenance tracking — where did this data come from? */