@soundbi/sound-connect 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/README.md +111 -0
  2. package/dist/__tests__/ingest.test.d.ts +18 -0
  3. package/dist/__tests__/ingest.test.d.ts.map +1 -0
  4. package/dist/__tests__/ingest.test.js +639 -0
  5. package/dist/__tests__/ingest.test.js.map +1 -0
  6. package/dist/__tests__/isolation.test.d.ts +12 -0
  7. package/dist/__tests__/isolation.test.d.ts.map +1 -0
  8. package/dist/__tests__/isolation.test.js +149 -0
  9. package/dist/__tests__/isolation.test.js.map +1 -0
  10. package/dist/__tests__/retry-queue.test.d.ts +11 -0
  11. package/dist/__tests__/retry-queue.test.d.ts.map +1 -0
  12. package/dist/__tests__/retry-queue.test.js +458 -0
  13. package/dist/__tests__/retry-queue.test.js.map +1 -0
  14. package/dist/auth.d.ts +80 -0
  15. package/dist/auth.d.ts.map +1 -0
  16. package/dist/auth.js +211 -0
  17. package/dist/auth.js.map +1 -0
  18. package/dist/config.d.ts +35 -0
  19. package/dist/config.d.ts.map +1 -0
  20. package/dist/config.js +66 -0
  21. package/dist/config.js.map +1 -0
  22. package/dist/index.d.ts +23 -0
  23. package/dist/index.d.ts.map +1 -0
  24. package/dist/index.js +100 -0
  25. package/dist/index.js.map +1 -0
  26. package/dist/ingest.d.ts +253 -0
  27. package/dist/ingest.d.ts.map +1 -0
  28. package/dist/ingest.js +573 -0
  29. package/dist/ingest.js.map +1 -0
  30. package/dist/proxy.d.ts +79 -0
  31. package/dist/proxy.d.ts.map +1 -0
  32. package/dist/proxy.js +217 -0
  33. package/dist/proxy.js.map +1 -0
  34. package/dist/retry-queue.d.ts +236 -0
  35. package/dist/retry-queue.d.ts.map +1 -0
  36. package/dist/retry-queue.js +461 -0
  37. package/dist/retry-queue.js.map +1 -0
  38. package/dist/tools.d.ts +75 -0
  39. package/dist/tools.d.ts.map +1 -0
  40. package/dist/tools.js +368 -0
  41. package/dist/tools.js.map +1 -0
  42. package/package.json +36 -0
@@ -0,0 +1,253 @@
1
+ /**
2
+ * File ingestion logic for the Sound Connect bridge (STORY-011, STORY-012, STORY-013).
3
+ *
4
+ * STORY-011 implements:
5
+ * - Path validation and confinement (no directory traversal — AC3)
6
+ * - Markdown file reading and normalization (AC1)
7
+ * - Content chunking for large files (AC1)
8
+ * - SHA-256 content hash for idempotency (AC2, ADR-004)
9
+ * - Provenance attachment and POST to /ingest/:slug (AC2)
10
+ * - Summary return: chunks ingested, deduped count (AC4)
11
+ *
12
+ * STORY-012 extends to:
13
+ * - Folder enumeration supporting .md and transcript text files (.txt/.vtt/.srt)
14
+ * - Per-file source_type reflecting the file kind (markdown vs transcript)
15
+ * - Idempotency across folder re-runs (sha256 already present → deduped)
16
+ * - Per-file result table (ingested / deduped / failed)
17
+ *
18
+ * STORY-013 extends to:
19
+ * - When `queueOnFailure` is enabled, network errors and 5xx responses write the
20
+ * chunk to the local retry queue instead of throwing (ADR-011).
21
+ * - 4xx responses still throw immediately (permanent auth/access error — queue won't help).
22
+ * - IngestSummary gains a `chunks_queued` field reporting items sent to the retry queue.
23
+ *
24
+ * ADR-006: v1 handles text/markdown and transcript plaintext only. Binary formats
25
+ * (PDF, .xlsx, .docx) deferred to v1.1.
26
+ * ADR-004: sha256 of normalized content is the idempotency key.
27
+ * ADR-011: Errors are thrown with descriptive messages unless queued for retry (STORY-013); callers surface them
28
+ * as MCP tool errors — never silent failures.
29
+ */
30
+ /**
31
+ * File extensions accepted for ingestion (ADR-006: text/markdown + transcript plaintext).
32
+ * Binary formats (PDF, .xlsx, .docx) are deferred to v1.1.
33
+ */
34
+ export declare const SUPPORTED_EXTENSIONS: Set<string>;
35
+ /**
36
+ * Derive the source_type provenance value from a file extension (STORY-012, ADR-004).
37
+ * Returns 'markdown' for .md files, 'transcript' for .txt/.vtt/.srt.
38
+ */
39
+ export declare function sourceTypeForExt(ext: string): 'markdown' | 'transcript';
40
+ export interface IngestFileOptions {
41
+ /** Absolute or relative path to the markdown or transcript file (.md/.txt/.vtt/.srt) to ingest (AC3: validated/confined). */
42
+ filePath: string;
43
+ /** Backend base URL (e.g. https://my-forge.azurecontainerapps.io). */
44
+ backendUrl: string;
45
+ /** Bound client slug — all ingestion is scoped to this client (ADR-005). */
46
+ clientSlug: string;
47
+ /** Bearer token for the authenticated peer (ADR-003). */
48
+ token: string;
49
+ /** Optional workstream slug to scope the ingested knowledge. */
50
+ workstreamSlug?: string;
51
+ /**
52
+ * Author email for provenance — extracted from the bearer token claim (AC2).
53
+ * If not supplied, the backend will derive it from the token.
54
+ * Passing it explicitly allows the backend to record accurate provenance even
55
+ * when the token's email claim uses a different casing convention.
56
+ */
57
+ authorEmail?: string;
58
+ /**
59
+ * STORY-013 / ADR-011: When true, network errors and 5xx responses write the
60
+ * failed chunk to the local retry queue instead of throwing. The summary will
61
+ * include a non-zero `chunks_queued` count. 4xx responses still throw (permanent).
62
+ *
63
+ * Defaults to false for backward compatibility with existing callers.
64
+ */
65
+ queueOnFailure?: boolean;
66
+ }
67
+ export interface IngestSummary {
68
+ /** Total number of chunks sent to the backend (includes queued). */
69
+ chunks_sent: number;
70
+ /** Number of chunks the backend accepted as new (not deduped). */
71
+ chunks_ingested: number;
72
+ /** Number of chunks the backend reported as duplicates (sha256 already present). */
73
+ chunks_deduped: number;
74
+ /**
75
+ * STORY-013: Number of chunks written to the local retry queue due to
76
+ * transient backend failure (network error or 5xx). Zero when queueOnFailure
77
+ * is not enabled or when all chunks succeeded.
78
+ */
79
+ chunks_queued: number;
80
+ /** Resolved absolute path of the file that was ingested. */
81
+ file: string;
82
+ /** SHA-256 of the full normalized content (hex). */
83
+ content_hash: string;
84
+ }
85
+ /** Status of a single file in a folder ingest run. */
86
+ export type FileIngestStatus = 'ingested' | 'deduped' | 'failed';
87
+ /** Per-file result in a folder ingest (STORY-012 AC4). */
88
+ export interface FileIngestResult {
89
+ /** Absolute path of the file. */
90
+ file: string;
91
+ /** Outcome for this file. */
92
+ status: FileIngestStatus;
93
+ /** For ingested/deduped: number of chunks sent. */
94
+ chunks_sent?: number;
95
+ /** For ingested: number of chunks that were new. */
96
+ chunks_ingested?: number;
97
+ /** For deduped: number of chunks that were duplicates. */
98
+ chunks_deduped?: number;
99
+ /** For failed: the error message. */
100
+ error?: string;
101
+ }
102
+ /** Result of ingest_folder (STORY-012 AC4). */
103
+ export interface FolderIngestResult {
104
+ /** Absolute resolved path of the folder. */
105
+ folder: string;
106
+ /** Total files found matching the glob/extension filter. */
107
+ files_found: number;
108
+ /** Number of files with at least one new chunk. */
109
+ files_ingested: number;
110
+ /** Number of files fully deduped (all chunks already present). */
111
+ files_deduped: number;
112
+ /** Number of files that failed during ingestion. */
113
+ files_failed: number;
114
+ /** Per-file results (STORY-012 AC4). */
115
+ results: FileIngestResult[];
116
+ }
117
+ /** Options for ingest_folder (STORY-012). */
118
+ export interface IngestFolderOptions {
119
+ /** Path to the folder to enumerate. */
120
+ folderPath: string;
121
+ /**
122
+ * Optional glob pattern (extension filter). Defaults to all supported
123
+ * extensions: .md, .txt, .vtt, .srt (ADR-006).
124
+ * Only the extension portion of the pattern is matched — pass e.g. "*.md"
125
+ * or "*.vtt" to restrict to a specific type. Pass undefined for all types.
126
+ */
127
+ glob?: string;
128
+ /** Backend base URL. */
129
+ backendUrl: string;
130
+ /** Bound client slug (ADR-005). */
131
+ clientSlug: string;
132
+ /** Bearer token (ADR-003). */
133
+ token: string;
134
+ /** Optional workstream slug. */
135
+ workstreamSlug?: string;
136
+ /** Author email for provenance. */
137
+ authorEmail?: string;
138
+ }
139
+ /**
140
+ * Resolves and validates a single ingestable file path.
141
+ *
142
+ * Rules enforced:
143
+ * 1. Path must not be empty.
144
+ * 2. File must exist and be a regular file (not a directory or symlink loop).
145
+ * 3. File extension must be in SUPPORTED_EXTENSIONS (ADR-006: .md/.txt/.vtt/.srt).
146
+ * 4. File size must not exceed MAX_FILE_BYTES (ADR-004 body limit).
147
+ *
148
+ * Traversal confinement: `resolve()` normalises `../` sequences so the resolved
149
+ * path is always absolute. We do NOT restrict to a whitelist directory because
150
+ * peers legitimately have notes and transcripts anywhere on their machine. We do
151
+ * block the obvious attack vectors: empty paths, non-files, unsupported extensions.
152
+ *
153
+ * @param rawPath The path supplied by the tool caller.
154
+ * @returns Resolved absolute path.
155
+ * @throws Error with a descriptive message on any validation failure.
156
+ */
157
+ export declare function validateAndResolvePath(rawPath: string): Promise<string>;
158
+ /**
159
+ * Parse the `glob` argument from ingest_folder into a Set of allowed extensions.
160
+ *
161
+ * Accepts patterns like "*.md", "*.vtt", "*.txt", "*.srt".
162
+ * If the pattern matches a known SUPPORTED_EXTENSIONS entry, only that extension
163
+ * is returned. If glob is undefined/empty, all SUPPORTED_EXTENSIONS are returned.
164
+ * If glob specifies an unsupported extension, throws with a clear message.
165
+ *
166
+ * This is intentionally simple — STORY-012 AC1 says "glob?" but the only
167
+ * meaningful variation is filtering by extension, not full glob matching.
168
+ * Full glob support (**, recursive patterns) is deferred to v1.1.
169
+ */
170
+ export declare function parseGlobToExtensions(glob: string | undefined): Set<string>;
171
+ /**
172
+ * Enumerate all ingestable files in a folder (non-recursive, flat listing).
173
+ *
174
+ * Returns absolute paths of files whose extension is in the allowed set.
175
+ * Silently skips subdirectories and files with unsupported extensions.
176
+ * Throws if the folder does not exist or is not a directory.
177
+ *
178
+ * STORY-012 AC1: enumerates .md and transcript text files (.txt/.vtt/.srt).
179
+ * Recursion is not in scope for v1 — flat directory only.
180
+ *
181
+ * @param folderPath Absolute path to the folder to enumerate.
182
+ * @param allowedExts Set of lowercase extensions to include (from parseGlobToExtensions).
183
+ * @returns Sorted list of absolute file paths.
184
+ */
185
+ export declare function enumerateFolder(folderPath: string, allowedExts: Set<string>): Promise<string[]>;
186
+ /**
187
+ * Normalize markdown content before hashing and chunking.
188
+ *
189
+ * Normalization steps:
190
+ * 1. Normalize line endings to LF.
191
+ * 2. Strip trailing whitespace from each line.
192
+ * 3. Collapse runs of 3+ blank lines to 2 blank lines (preserve paragraph spacing).
193
+ * 4. Trim leading/trailing blank lines from the document.
194
+ *
195
+ * This ensures that whitespace-only edits do not produce new sha256 hashes
196
+ * (no spurious re-ingestion) while preserving meaningful structure.
197
+ */
198
+ export declare function normalizeMarkdown(content: string): string;
199
+ /**
200
+ * Split normalized content into overlapping chunks for retrieval-friendly storage.
201
+ *
202
+ * Strategy: paragraph-aware sliding window.
203
+ * - Split on double-newlines (paragraph boundaries) to avoid cutting mid-sentence.
204
+ * - Accumulate paragraphs until the chunk would exceed CHUNK_SIZE_CHARS.
205
+ * - Start the next chunk with the last CHUNK_OVERLAP_CHARS of the previous chunk
206
+ * (approximate — overlap at paragraph boundary nearest to the overlap target).
207
+ *
208
+ * If the entire content fits in one chunk, returns a single-element array.
209
+ */
210
+ export declare function chunkContent(content: string): string[];
211
+ /** Compute the SHA-256 hex digest of a string. */
212
+ export declare function sha256(content: string): string;
213
+ /**
214
+ * Ingest a single local text file (markdown or transcript) into the Sound Connect corpus.
215
+ *
216
+ * Full pipeline:
217
+ * 1. Validate path — supports .md, .txt, .vtt, .srt (STORY-012 AC1, ADR-006).
218
+ * 2. Read file.
219
+ * 3. Normalize content (line endings, trailing whitespace, blank lines).
220
+ * 4. Chunk if large.
221
+ * 5. Hash each chunk (ADR-004 idempotency key).
222
+ * 6. Attach provenance with source_type derived from extension (STORY-012 AC2).
223
+ * 7. POST each chunk to /ingest/:slug.
224
+ * STORY-013: When queueOnFailure=true, network errors and 5xx responses write
225
+ * the chunk to the local retry queue instead of throwing. 4xx errors still throw.
226
+ * 8. Return summary (includes chunks_queued when queueOnFailure is used).
227
+ *
228
+ * @throws Error with a descriptive message on any failure that is not diverted to the retry queue via queueOnFailure (ADR-011).
229
+ */
230
+ export declare function ingestMarkdownFile(opts: IngestFileOptions): Promise<IngestSummary>;
231
+ /**
232
+ * Bulk-ingest a directory of markdown and transcript text files.
233
+ *
234
+ * Pipeline (STORY-012):
235
+ * 1. Resolve and validate the folder path.
236
+ * 2. Parse glob to determine which extensions to include (AC1).
237
+ * 3. Enumerate matching files in the folder (flat, non-recursive).
238
+ * 4. For each file: run through the ingestMarkdownFile pipeline (STORY-011 path).
239
+ * - deduped = all chunks were already present (idempotency, AC3).
240
+ * - ingested = at least one new chunk was accepted.
241
+ * - failed = any error during read/post.
242
+ * 5. Return per-file result table + folder-level totals (AC4).
243
+ *
244
+ * Idempotency (AC3): re-running the folder produces files_ingested=0 and
245
+ * files_deduped=N (one per file) because sha256 keys are already present.
246
+ *
247
+ * ADR-011: per-file failures are captured in the result table and do NOT abort
248
+ * the rest of the run — callers see which files failed and why.
249
+ *
250
+ * @throws Error only if the folder itself cannot be accessed (not per-file errors).
251
+ */
252
+ export declare function ingestFolder(opts: IngestFolderOptions): Promise<FolderIngestResult>;
253
+ //# sourceMappingURL=ingest.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../src/ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AA8BH;;;GAGG;AACH,eAAO,MAAM,oBAAoB,aAA2C,CAAC;AAE7E;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,GAAG,YAAY,CAEvE;AAID,MAAM,WAAW,iBAAiB;IAChC,gFAAgF;IAChF,QAAQ,EAAE,MAAM,CAAC;IACjB,sEAAsE;IACtE,UAAU,EAAE,MAAM,CAAC;IACnB,4EAA4E;IAC5E,UAAU,EAAE,MAAM,CAAC;IACnB,yDAAyD;IACzD,KAAK,EAAE,MAAM,CAAC;IACd,gEAAgE;IAChE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;CAC1B;AAED,MAAM,WAAW,aAAa;IAC5B,oEAAoE;IACpE,WAAW,EAAE,MAAM,CAAC;IACpB,kEAAkE;IAClE,eAAe,EAAE,MAAM,CAAC;IACxB,oFAAoF;IACpF,cAAc,EAAE,MAAM,CAAC;IACvB;;;;OAIG;IACH,aAAa,EAAE,MAAM,CAAC;IACtB,4DAA4D;IAC5D,IAAI,EAAE,MAAM,CAAC;IACb,oDAAoD;IACpD,YAAY,EAAE,MAAM,CAAC;CACtB;AAID,sDAAsD;AACtD,MAAM,MAAM,gBAAgB,GAAG,UAAU,GAAG,SAAS,GAAG,QAAQ,CAAC;AAEjE,0DAA0D;AAC1D,MAAM,WAAW,gBAAgB;IAC/B,iCAAiC;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,6BAA6B;IAC7B,MAAM,EAAE,gBAAgB,CAAC;IACzB,mDAAmD;IACnD,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,oDAAoD;IACpD,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,0DAA0D;IAC1D,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,+CAA+C;AAC/C,MAAM,WAAW,kBAAkB;IACjC,4CAA4C;IAC5C,MAAM,EAAE,MAAM,CAAC;IACf,4DAA4D;IAC5D,WAAW,EAAE,MAAM,CAAC;IACpB,mDAAmD;IACnD,cAAc,EAAE,MAAM,CAAC;IACvB,kEAAkE;IAClE,aAAa,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,YAAY,EAAE,MAAM,CAAC;IACrB,wCAAwC;IACxC,OAAO,EAAE,gBAAgB,EAAE,CAAC;CAC7B;AAED,6CAA6C;AAC7C,MAAM,WAAW,mBAAmB;IAClC,uCAAuC;IACvC,UAAU,EAAE,MAAM,CAAC;IACnB;;;;;OAKG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,wBAAwB;IACxB,UAAU,EAAE,MAAM,CAAC;IACnB,mCAAmC;IACnC,UAAU,EAAE,MAAM,CAAC;IACnB,8BAA8B;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,gCAAgC;IAChC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,mCAAmC;IACnC,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAID;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAsB,sBAAsB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAkD7E;AAID;;;;;;;;;;;GAWG;AACH,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,GAAG,GAAG,CA
AC,MAAM,CAAC,CAsB3E;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAsB,eAAe,CACnC,UAAU,EAAE,MAAM,EAClB,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,GACvB,OAAO,CAAC,MAAM,EAAE,CAAC,CA4BnB;AAID;;;;;;;;;;;GAWG;AACH,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CASzD;AAID;;;;;;;;;;GAUG;AACH,wBAAgB,YAAY,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CA8BtD;AAID,kDAAkD;AAClD,wBAAgB,MAAM,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAE9C;AAsJD;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAsB,kBAAkB,CAAC,IAAI,EAAE,iBAAiB,GAAG,OAAO,CAAC,aAAa,CAAC,CA6FxF;AAID;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,mBAAmB,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAqFzF"}