@alta-foundation/plaud-extractor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163):
  1. package/.env.example +9 -0
  2. package/.github/workflows/ci.yml +33 -0
  3. package/.github/workflows/publish.yml +46 -0
  4. package/CLAUDE.md +53 -0
  5. package/README.md +318 -0
  6. package/dist/PlaudExtractor.d.ts +61 -0
  7. package/dist/PlaudExtractor.d.ts.map +1 -0
  8. package/dist/PlaudExtractor.js +236 -0
  9. package/dist/PlaudExtractor.js.map +1 -0
  10. package/dist/auth/browser-auth.d.ts +10 -0
  11. package/dist/auth/browser-auth.d.ts.map +1 -0
  12. package/dist/auth/browser-auth.js +220 -0
  13. package/dist/auth/browser-auth.js.map +1 -0
  14. package/dist/auth/token-store.d.ts +9 -0
  15. package/dist/auth/token-store.d.ts.map +1 -0
  16. package/dist/auth/token-store.js +74 -0
  17. package/dist/auth/token-store.js.map +1 -0
  18. package/dist/auth/types.d.ts +266 -0
  19. package/dist/auth/types.d.ts.map +1 -0
  20. package/dist/auth/types.js +32 -0
  21. package/dist/auth/types.js.map +1 -0
  22. package/dist/cli/bin.d.ts +3 -0
  23. package/dist/cli/bin.d.ts.map +1 -0
  24. package/dist/cli/bin.js +30 -0
  25. package/dist/cli/bin.js.map +1 -0
  26. package/dist/cli/commands/auth.d.ts +3 -0
  27. package/dist/cli/commands/auth.d.ts.map +1 -0
  28. package/dist/cli/commands/auth.js +22 -0
  29. package/dist/cli/commands/auth.js.map +1 -0
  30. package/dist/cli/commands/backfill.d.ts +3 -0
  31. package/dist/cli/commands/backfill.d.ts.map +1 -0
  32. package/dist/cli/commands/backfill.js +59 -0
  33. package/dist/cli/commands/backfill.js.map +1 -0
  34. package/dist/cli/commands/sync.d.ts +3 -0
  35. package/dist/cli/commands/sync.d.ts.map +1 -0
  36. package/dist/cli/commands/sync.js +55 -0
  37. package/dist/cli/commands/sync.js.map +1 -0
  38. package/dist/cli/commands/verify.d.ts +3 -0
  39. package/dist/cli/commands/verify.d.ts.map +1 -0
  40. package/dist/cli/commands/verify.js +28 -0
  41. package/dist/cli/commands/verify.js.map +1 -0
  42. package/dist/cli/exit-codes.d.ts +8 -0
  43. package/dist/cli/exit-codes.d.ts.map +1 -0
  44. package/dist/cli/exit-codes.js +16 -0
  45. package/dist/cli/exit-codes.js.map +1 -0
  46. package/dist/cli/options.d.ts +31 -0
  47. package/dist/cli/options.d.ts.map +1 -0
  48. package/dist/cli/options.js +11 -0
  49. package/dist/cli/options.js.map +1 -0
  50. package/dist/client/endpoints.d.ts +26 -0
  51. package/dist/client/endpoints.d.ts.map +1 -0
  52. package/dist/client/endpoints.js +54 -0
  53. package/dist/client/endpoints.js.map +1 -0
  54. package/dist/client/http.d.ts +17 -0
  55. package/dist/client/http.d.ts.map +1 -0
  56. package/dist/client/http.js +92 -0
  57. package/dist/client/http.js.map +1 -0
  58. package/dist/client/plaud-client.d.ts +14 -0
  59. package/dist/client/plaud-client.d.ts.map +1 -0
  60. package/dist/client/plaud-client.js +216 -0
  61. package/dist/client/plaud-client.js.map +1 -0
  62. package/dist/client/types.d.ts +154 -0
  63. package/dist/client/types.d.ts.map +1 -0
  64. package/dist/client/types.js +41 -0
  65. package/dist/client/types.js.map +1 -0
  66. package/dist/errors.d.ts +24 -0
  67. package/dist/errors.d.ts.map +1 -0
  68. package/dist/errors.js +51 -0
  69. package/dist/errors.js.map +1 -0
  70. package/dist/index.d.ts +7 -0
  71. package/dist/index.d.ts.map +1 -0
  72. package/dist/index.js +5 -0
  73. package/dist/index.js.map +1 -0
  74. package/dist/logger.d.ts +9 -0
  75. package/dist/logger.d.ts.map +1 -0
  76. package/dist/logger.js +37 -0
  77. package/dist/logger.js.map +1 -0
  78. package/dist/mcp/job-tools.d.ts +3 -0
  79. package/dist/mcp/job-tools.d.ts.map +1 -0
  80. package/dist/mcp/job-tools.js +108 -0
  81. package/dist/mcp/job-tools.js.map +1 -0
  82. package/dist/mcp/read-tools.d.ts +3 -0
  83. package/dist/mcp/read-tools.d.ts.map +1 -0
  84. package/dist/mcp/read-tools.js +173 -0
  85. package/dist/mcp/read-tools.js.map +1 -0
  86. package/dist/mcp/server.d.ts +3 -0
  87. package/dist/mcp/server.d.ts.map +1 -0
  88. package/dist/mcp/server.js +32 -0
  89. package/dist/mcp/server.js.map +1 -0
  90. package/dist/storage/atomic.d.ts +5 -0
  91. package/dist/storage/atomic.d.ts.map +1 -0
  92. package/dist/storage/atomic.js +51 -0
  93. package/dist/storage/atomic.js.map +1 -0
  94. package/dist/storage/checksums.d.ts +15 -0
  95. package/dist/storage/checksums.d.ts.map +1 -0
  96. package/dist/storage/checksums.js +56 -0
  97. package/dist/storage/checksums.js.map +1 -0
  98. package/dist/storage/dataset-writer.d.ts +21 -0
  99. package/dist/storage/dataset-writer.d.ts.map +1 -0
  100. package/dist/storage/dataset-writer.js +52 -0
  101. package/dist/storage/dataset-writer.js.map +1 -0
  102. package/dist/storage/paths.d.ts +9 -0
  103. package/dist/storage/paths.d.ts.map +1 -0
  104. package/dist/storage/paths.js +38 -0
  105. package/dist/storage/paths.js.map +1 -0
  106. package/dist/storage/recording-store.d.ts +24 -0
  107. package/dist/storage/recording-store.d.ts.map +1 -0
  108. package/dist/storage/recording-store.js +161 -0
  109. package/dist/storage/recording-store.js.map +1 -0
  110. package/dist/sync/download-queue.d.ts +21 -0
  111. package/dist/sync/download-queue.d.ts.map +1 -0
  112. package/dist/sync/download-queue.js +82 -0
  113. package/dist/sync/download-queue.js.map +1 -0
  114. package/dist/sync/incremental.d.ts +21 -0
  115. package/dist/sync/incremental.d.ts.map +1 -0
  116. package/dist/sync/incremental.js +96 -0
  117. package/dist/sync/incremental.js.map +1 -0
  118. package/dist/sync/sync-engine.d.ts +6 -0
  119. package/dist/sync/sync-engine.d.ts.map +1 -0
  120. package/dist/sync/sync-engine.js +135 -0
  121. package/dist/sync/sync-engine.js.map +1 -0
  122. package/dist/sync/types.d.ts +130 -0
  123. package/dist/sync/types.d.ts.map +1 -0
  124. package/dist/sync/types.js +17 -0
  125. package/dist/sync/types.js.map +1 -0
  126. package/dist/transcript/formatter.d.ts +4 -0
  127. package/dist/transcript/formatter.d.ts.map +1 -0
  128. package/dist/transcript/formatter.js +88 -0
  129. package/dist/transcript/formatter.js.map +1 -0
  130. package/package.json +41 -0
  131. package/src/PlaudExtractor.ts +275 -0
  132. package/src/auth/browser-auth.ts +248 -0
  133. package/src/auth/token-store.ts +79 -0
  134. package/src/auth/types.ts +41 -0
  135. package/src/cli/bin.ts +30 -0
  136. package/src/cli/commands/auth.ts +27 -0
  137. package/src/cli/commands/backfill.ts +77 -0
  138. package/src/cli/commands/sync.ts +71 -0
  139. package/src/cli/commands/verify.ts +31 -0
  140. package/src/cli/exit-codes.ts +14 -0
  141. package/src/cli/options.ts +10 -0
  142. package/src/client/endpoints.ts +62 -0
  143. package/src/client/http.ts +110 -0
  144. package/src/client/plaud-client.ts +268 -0
  145. package/src/client/types.ts +62 -0
  146. package/src/errors.ts +57 -0
  147. package/src/index.ts +17 -0
  148. package/src/logger.ts +49 -0
  149. package/src/mcp/job-tools.ts +156 -0
  150. package/src/mcp/read-tools.ts +204 -0
  151. package/src/mcp/server.ts +39 -0
  152. package/src/storage/atomic.ts +51 -0
  153. package/src/storage/checksums.ts +76 -0
  154. package/src/storage/dataset-writer.ts +74 -0
  155. package/src/storage/paths.ts +44 -0
  156. package/src/storage/recording-store.ts +182 -0
  157. package/src/sync/download-queue.ts +102 -0
  158. package/src/sync/incremental.ts +111 -0
  159. package/src/sync/sync-engine.ts +183 -0
  160. package/src/sync/types.ts +64 -0
  161. package/src/transcript/formatter.ts +91 -0
  162. package/tsconfig.build.json +8 -0
  163. package/tsconfig.json +19 -0
@@ -0,0 +1,183 @@
1
+ import { AuthError } from '../errors.js'
2
+ import { getLogger } from '../logger.js'
3
+ import { IncrementalTracker } from './incremental.js'
4
+ import { processQueue, retryWithBackoff } from './download-queue.js'
5
+ import { RecordingStore } from '../storage/recording-store.js'
6
+ import { DatasetWriter } from '../storage/dataset-writer.js'
7
+ import type { SyncOptions, SyncResult, BackfillOptions } from './types.js'
8
+ import type { PlaudClient, PlaudRecording } from '../client/types.js'
9
+ import type { HttpClient } from '../client/http.js'
10
+
11
/**
 * Drives a single sync or backfill pass: verifies auth, loads incremental
 * state, lists recordings, downloads the ones that need it, and persists the
 * updated state.
 */
export class SyncEngine {
  /**
   * Execute one pass against `client`.
   *
   * @param client - Source of recordings; must report itself as authenticated.
   * @param opts - Output directory, filters, concurrency and output toggles.
   * @param mode - In `'sync'` mode recordings for which the tracker reports no
   *   download is needed are skipped, and `since` falls back to the tracker's
   *   value. In `'backfill'` mode every listed recording is processed and only
   *   an explicit `opts.since` is used.
   * @returns Counts, duration and per-recording errors for the pass. A dry run
   *   reports zero attempted/succeeded/failed.
   * @throws AuthError when the client is not authenticated.
   */
  async run(
    client: PlaudClient,
    opts: SyncOptions,
    mode: 'sync' | 'backfill' = 'sync',
  ): Promise<SyncResult> {
    const log = getLogger()
    const startedAt = Date.now()

    // 1. Verify auth
    const authed = await client.isAuthenticated()
    if (!authed) {
      throw new AuthError("Not authenticated — run 'alta-plaud auth' first")
    }

    // 2. Load sync state
    const tracker = new IncrementalTracker()
    await tracker.load(opts.outDir)

    // 3. Determine effective --since (explicit option wins; backfill has no implicit lower bound)
    const since = opts.since ?? (mode === 'sync' ? tracker.getSince() : undefined)

    log.info({ mode, since: since?.toISOString(), outDir: opts.outDir }, 'Starting sync')

    // 4. Collect recordings to process
    // NOTE(review): `limit` is also forwarded to listRecordings; if that caps the
    // listing before the needsDownload filter runs, fewer than `limit` items may
    // be processed in sync mode — confirm against the client implementation.
    const toProcess: PlaudRecording[] = []
    const skipped: PlaudRecording[] = []
    let listCount = 0

    for await (const recording of client.listRecordings({ since, limit: opts.limit })) {
      listCount++
      if (mode === 'sync' && !tracker.needsDownload(recording)) {
        skipped.push(recording)
      } else {
        toProcess.push(recording)
      }

      if (opts.limit && toProcess.length >= opts.limit) break
    }

    log.info(
      { total: listCount, toProcess: toProcess.length, skipped: skipped.length },
      'Recordings collected',
    )

    // 5. Dry run — just print plan
    if (opts.dryRun) {
      for (const rec of toProcess) {
        log.info(
          { id: rec.id, title: rec.title, recordedAt: rec.recordedAt },
          '[dry-run] Would download',
        )
      }
      return {
        mode,
        attempted: 0,
        succeeded: 0,
        failed: 0,
        skipped: skipped.length,
        durationMs: Date.now() - startedAt,
        errors: [],
      }
    }

    // 6. Initialize storage
    const store = new RecordingStore(opts.outDir)
    const dataset = opts.includeDataset ? new DatasetWriter(opts.outDir) : null
    if (dataset) await dataset.open()

    const errors: Array<{ recordingId: string; error: Error }> = []
    let succeededCount: number
    let failedCount: number

    try {
      // 7. Run download queue
      // NOTE(review): the retry wrapper re-runs the whole per-recording step,
      // including the dataset append — confirm duplicates are acceptable.
      const httpClient = getHttpClient(client)

      const { succeeded, failed } = await processQueue(
        toProcess,
        async (recording) => {
          await retryWithBackoff(
            () => downloadRecording(recording, client, store, dataset, tracker, httpClient, opts),
            { label: `recording:${recording.id}` },
          )
        },
        opts.concurrency,
      )

      for (const { item, error } of failed) {
        errors.push({ recordingId: item.id, error })
        log.error({ recordingId: item.id, err: error }, 'Failed to download recording')
      }

      // 8. Mark successful sync only if zero failures
      if (errors.length === 0) {
        tracker.markSuccessfulSync()
      }

      // 9. Persist state
      await tracker.persist(opts.outDir)

      succeededCount = succeeded.length
      failedCount = failed.length
    } catch (err) {
      // The dataset was opened above; release it before propagating. A failure
      // while closing is only logged so it cannot mask the original error.
      if (dataset) {
        try {
          await dataset.close()
        } catch (closeErr) {
          log.warn({ err: closeErr }, 'Failed to close dataset after sync error')
        }
      }
      throw err
    }

    if (dataset) await dataset.close()

    const result: SyncResult = {
      mode,
      attempted: toProcess.length,
      succeeded: succeededCount,
      failed: failedCount,
      skipped: skipped.length,
      durationMs: Date.now() - startedAt,
      errors,
      datasetPath: dataset?.path,
    }

    log.info(result, 'Sync complete')
    return result
  }
}
125
+
126
/**
 * Store everything available for one recording and record the outcome.
 *
 * Steps run strictly in order: metadata, transcript, audio, checksums, tracker
 * update. A transcript failure is logged and swallowed; any other failure
 * propagates to the caller.
 *
 * @param recording - Recording to persist.
 * @param client - Used to fetch the transcript and the audio download URL.
 * @param store - Destination for metadata, transcript, audio and checksums.
 * @param dataset - Optional dataset writer; receives the transcript when present.
 * @param tracker - Incremental state updated once all steps have finished.
 * @param httpClient - Transport for the audio download; audio is skipped when absent.
 * @param opts - Supplies the transcript formats and the output directory.
 */
async function downloadRecording(
  recording: PlaudRecording,
  client: PlaudClient,
  store: RecordingStore,
  dataset: DatasetWriter | null,
  tracker: IncrementalTracker,
  httpClient: HttpClient | null,
  opts: SyncOptions,
): Promise<void> {
  const logger = getLogger()
  logger.info({ recordingId: recording.id, title: recording.title }, 'Downloading recording')

  // Transcript step: best-effort, reports whether it completed.
  const saveTranscript = async (): Promise<boolean> => {
    if (!recording.hasTranscript) return false
    try {
      const fetched = await client.getTranscript(recording.id)
      await store.writeTranscript(recording, fetched, opts.formats)
      if (dataset) {
        await dataset.append(opts.outDir, recording, fetched)
      }
      return true
    } catch (err) {
      logger.warn({ recordingId: recording.id, err }, 'Failed to get transcript')
      return false
    }
  }

  // Audio step: needs both a transport and a download URL; errors are not caught here.
  const saveAudio = async (): Promise<boolean> => {
    if (!httpClient) return false
    const url = await client.getAudioDownloadUrl(recording.id)
    if (!url) return false
    return await store.writeAudioFromUrl(recording, url, httpClient)
  }

  await store.writeMetadata(recording)
  const hasTranscript = await saveTranscript()
  const hasAudio = await saveAudio()
  await store.writeChecksums(recording)

  // Only reached when nothing above threw.
  tracker.markComplete(recording.id, recording.recordedAt, {
    hasAudio,
    hasTranscript,
    contentHash: tracker.computeContentHash(recording),
  })
}
175
+
176
/**
 * Obtain the HttpClient used for audio downloads from `client`, when it
 * exposes a callable `getHttpClient` member.
 *
 * @returns The value returned by `client.getHttpClient()`, or `null` when the
 *   member is missing or is not a function.
 */
function getHttpClient(client: PlaudClient): HttpClient | null {
  const hasAccessor =
    'getHttpClient' in client &&
    typeof (client as { getHttpClient?: () => HttpClient }).getHttpClient === 'function'

  // Invoke through the client object so the accessor keeps its `this` binding.
  return hasAccessor
    ? (client as { getHttpClient: () => HttpClient }).getHttpClient()
    : null
}
@@ -0,0 +1,64 @@
1
+ import { z } from 'zod'
2
+ import type { TranscriptFormat } from '../storage/recording-store.js'
3
+
4
/** Builds a fresh string schema carrying zod's `datetime()` check. */
const datetimeString = () => z.string().datetime()

/** Builds a fresh boolean schema that parses to `false` when the key is absent. */
const falseWhenAbsent = () => z.boolean().default(false)

/**
 * Schema for the state kept about a single recording.
 *
 * `recordedAt` is the only required key. The three flags default to `false`;
 * every other key is optional.
 */
export const RecordingStateSchema = z.object({
  recordedAt: datetimeString(),
  contentHash: z.string().optional(),
  downloadedAt: datetimeString().optional(),
  hasAudio: falseWhenAbsent(),
  hasTranscript: falseWhenAbsent(),
  verified: falseWhenAbsent(),
  verifiedAt: datetimeString().optional(),
})

/** Parsed output type of `RecordingStateSchema`. */
export type RecordingState = z.infer<typeof RecordingStateSchema>

/**
 * Schema for the whole sync-state document: a fixed `schemaVersion` of `1`,
 * two optional timestamps, and a string-keyed map of per-recording state.
 */
export const SyncStateSchema = z.object({
  schemaVersion: z.literal(1),
  lastSuccessfulSyncAt: datetimeString().optional(),
  lastAttemptAt: datetimeString().optional(),
  recordings: z.record(z.string(), RecordingStateSchema),
})

/** Parsed output type of `SyncStateSchema`. */
export type SyncState = z.infer<typeof SyncStateSchema>
24
+
25
/** Settings that control a single sync run. */
export interface SyncOptions {
  /** Root directory that all output is written under. */
  outDir: string
  /** Lower date bound: only recordings after this date are synced. */
  since?: Date
  /** Upper bound on how many recordings are processed. */
  limit?: number
  /**
   * Number of downloads to run in parallel.
   * NOTE(review): the original note says "default: 3", but this field is
   * required, so any default must be applied by the caller — confirm.
   */
  concurrency: number
  /** Transcript formats to write for each recording. */
  formats: TranscriptFormat[]
  /** When true, transcripts are also appended to the JSONL dataset. */
  includeDataset: boolean
  /** When true, the plan is printed and nothing is downloaded. */
  dryRun: boolean
}
41
+
42
/**
 * Settings for a backfill run: every `SyncOptions` field except `since`, which
 * is redeclared here with backfill-specific meaning.
 */
export interface BackfillOptions extends Omit<SyncOptions, 'since'> {
  /** Start date for the backfill; when omitted, all recordings are covered. */
  since?: Date
}
46
+
47
/** Summary returned after a sync or backfill pass. */
export interface SyncResult {
  /** Which kind of pass produced this result. */
  mode: 'sync' | 'backfill'
  /** Number of recordings a download was attempted for. */
  attempted: number
  /** Number of recordings that finished without error. */
  succeeded: number
  /** Number of recordings that ended in an error. */
  failed: number
  /** Number of listed recordings that were not processed. */
  skipped: number
  /** Wall-clock duration of the pass in milliseconds. */
  durationMs: number
  /** One entry per failed recording, carrying the error that stopped it. */
  errors: { recordingId: string; error: Error }[]
  /** Path of the dataset file, present only when a dataset was written. */
  datasetPath?: string
}
57
+
58
/** Summary of a verification pass; counts are per category named by each field. */
export interface VerifyResult {
  scanned: number
  ok: number
  failed: number
  repaired: number
  /** Individual findings, each tied to a recording and a file. */
  issues: { recordingId: string; file: string; issue: string }[]
}
@@ -0,0 +1,91 @@
1
+ import type { PlaudTranscript, PlaudRecording } from '../client/types.js'
2
+
3
/**
 * Flatten a transcript into plain text, one paragraph per segment.
 *
 * Segments with a speaker are prefixed with `"<speaker>: "`; paragraphs are
 * separated by a blank line.
 */
export function toPlainText(transcript: PlaudTranscript): string {
  const paragraphs: string[] = []
  for (const segment of transcript.segments) {
    const prefix = segment.speaker ? `${segment.speaker}: ` : ''
    paragraphs.push(`${prefix}${segment.text}`)
  }
  return paragraphs.join('\n\n')
}
11
+
12
/**
 * Render a value as a YAML double-quoted scalar.
 *
 * JSON string syntax is used because its escapes (`\"`, `\\`, `\n`, `\uXXXX`)
 * are all valid inside a YAML double-quoted scalar.
 */
function yamlQuoted(value: unknown): string {
  return JSON.stringify(String(value))
}

/**
 * Render a transcript as a Markdown document with YAML frontmatter.
 *
 * Layout: frontmatter (source, id, recorded_at, optional title/language,
 * duration_seconds, optional tags), an H1 title, a short metadata block, then
 * a "Transcript" section with one paragraph per segment. Timestamps are
 * emitted for every segment when at least one segment has a non-zero start or
 * end time.
 *
 * @param transcript - Segments to render.
 * @param recording - Supplies the frontmatter and header metadata.
 * @returns The complete Markdown text, lines joined with `\n`.
 */
export function toMarkdown(transcript: PlaudTranscript, recording: PlaudRecording): string {
  const lines: string[] = []

  // YAML frontmatter — every string value is quoted and escaped so titles or
  // tags containing quotes, backslashes or newlines cannot break the block.
  lines.push('---')
  lines.push('source: plaud')
  lines.push(`id: ${yamlQuoted(recording.id)}`)
  lines.push(`recorded_at: ${yamlQuoted(recording.recordedAt)}`)
  if (recording.title) lines.push(`title: ${yamlQuoted(recording.title)}`)
  if (recording.language) lines.push(`language: ${yamlQuoted(recording.language)}`)
  lines.push(`duration_seconds: ${recording.duration}`)
  if (recording.tags?.length) lines.push(`tags: [${recording.tags.map(t => yamlQuoted(t)).join(', ')}]`)
  lines.push('---')
  lines.push('')

  // Title — an empty or whitespace-only title falls back to the placeholder,
  // matching the frontmatter, which omits such titles.
  const heading = recording.title?.trim() ? recording.title : 'Untitled Recording'
  lines.push(`# ${heading}`)
  lines.push('')

  // Metadata block
  lines.push(`**Recorded:** ${formatDate(recording.recordedAt)}`)
  lines.push(`**Duration:** ${formatDuration(recording.duration)}`)
  if (recording.language) lines.push(`**Language:** ${recording.language}`)
  lines.push('')
  lines.push('## Transcript')
  lines.push('')

  // Segments
  const hasTimestamps = transcript.segments.some(s => s.startMs > 0 || s.endMs > 0)

  for (const seg of transcript.segments) {
    if (hasTimestamps) {
      const ts = `\`[${msToTimestamp(seg.startMs)}]\``
      const speaker = seg.speaker ? ` **${seg.speaker}**` : ''
      lines.push(`${ts}${speaker}`)
    } else if (seg.speaker) {
      lines.push(`**${seg.speaker}**`)
    }
    lines.push(seg.text)
    lines.push('')
  }

  return lines.join('\n')
}
56
+
57
/**
 * Format a millisecond offset as `m:ss`, or `h:mm:ss` once it reaches an hour.
 *
 * Sub-second precision is dropped. Negative or non-finite input is treated as
 * zero so a malformed segment renders as `0:00` rather than `NaN:NaN`.
 */
function msToTimestamp(ms: number): string {
  const safeMs = Number.isFinite(ms) && ms > 0 ? ms : 0
  const totalSeconds = Math.floor(safeMs / 1000)
  const h = Math.floor(totalSeconds / 3600)
  const m = Math.floor((totalSeconds % 3600) / 60)
  const s = totalSeconds % 60
  const ss = String(s).padStart(2, '0')
  if (h > 0) {
    return `${h}:${String(m).padStart(2, '0')}:${ss}`
  }
  return `${m}:${ss}`
}
67
+
68
/**
 * Format a duration in seconds as `Xh Ym Zs`, omitting leading zero units
 * (`Ym Zs` under an hour, `Zs` under a minute).
 *
 * Fractional seconds are truncated. Negative or non-finite input is treated as
 * zero so a missing duration renders as `0s` rather than `NaNs`.
 */
function formatDuration(seconds: number): string {
  const total = Number.isFinite(seconds) && seconds > 0 ? Math.floor(seconds) : 0
  const h = Math.floor(total / 3600)
  const m = Math.floor((total % 3600) / 60)
  const s = total % 60
  if (h > 0) return `${h}h ${m}m ${s}s`
  if (m > 0) return `${m}m ${s}s`
  return `${s}s`
}
76
+
77
/**
 * Format an ISO timestamp as a long en-US date with time and zone name.
 *
 * The result uses the runtime's local time zone. Input that does not parse to
 * a valid date is returned unchanged.
 */
function formatDate(iso: string): string {
  const parsed = new Date(iso)
  // An unparseable string yields an Invalid Date, which toLocaleString renders
  // as the text "Invalid Date" instead of throwing — so check explicitly.
  if (Number.isNaN(parsed.getTime())) return iso

  try {
    return parsed.toLocaleString('en-US', {
      weekday: 'long',
      year: 'numeric',
      month: 'long',
      day: 'numeric',
      hour: '2-digit',
      minute: '2-digit',
      timeZoneName: 'short',
    })
  } catch {
    // Best-effort: fall back to the raw value if locale formatting fails.
    return iso
  }
}
@@ -0,0 +1,8 @@
1
+ {
2
+ "extends": "./tsconfig.json",
3
+ "compilerOptions": {
4
+ "declaration": true,
5
+ "declarationMap": true,
6
+ "sourceMap": true
7
+ }
8
+ }
package/tsconfig.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "NodeNext",
5
+ "moduleResolution": "NodeNext",
6
+ "lib": ["ES2022"],
7
+ "outDir": "dist",
8
+ "rootDir": "src",
9
+ "strict": true,
10
+ "noUncheckedIndexedAccess": true,
11
+ "forceConsistentCasingInFileNames": true,
12
+ "declaration": true,
13
+ "declarationMap": true,
14
+ "sourceMap": true,
15
+ "skipLibCheck": true
16
+ },
17
+ "include": ["src"],
18
+ "exclude": ["node_modules", "dist"]
19
+ }