anymd 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/tui.tsx DELETED
@@ -1,1105 +0,0 @@
1
- // oxlint-disable react/no-unknown-property
2
- /* eslint-disable react/no-unknown-property, react-hooks/rules-of-hooks, @typescript-eslint/promise-function-async */
3
- /** biome-ignore-all lint/nursery/noUnknownAttribute: OpenTUI uses custom JSX intrinsics */
4
- import { createCliRenderer } from '@opentui/core'
5
- import { createRoot, useKeyboard, useTerminalDimensions } from '@opentui/react'
6
- import { useCallback, useEffect, useReducer, useRef, useState } from 'react'
7
-
8
- import type { AllStepsData, CommandKey, DatasetResult, OcrProgress, StepData } from '~/tui-data'
9
-
10
- import { bootstrapPython } from '~/bootstrap'
11
- import { getPaths } from '~/paths'
12
- import { runPreflight } from '~/preflight'
13
- import {
14
- appendErrorLog,
15
- appendPipelineLog,
16
- buildDataset,
17
- clearErrorLog,
18
- clearPipelineLog,
19
- fetchStepData,
20
- getOcrStats,
21
- readLogTail,
22
- runClassify,
23
- runEnhanceOcr,
24
- spawnCommand,
25
- writeNativeFileList
26
- } from '~/tui-data'
27
-
28
/**
 * Matches ANSI CSI escape sequences: ESC, `[`, any parameter bytes (`0`-`?`),
 * any intermediate bytes (space-`/`) and one final byte (`@`-`~`). That covers
 * SGR colour codes (`…m`) as well as cursor-movement and erase sequences.
 * The ESC character is produced with `String.fromCodePoint` so no raw control
 * character appears in the pattern source. Compiled once instead of per call.
 */
const ANSI_CSI_PATTERN = new RegExp(`${String.fromCodePoint(0x1b)}\\[[0-?]*[ -/]*[@-~]`, 'gu')

/**
 * Removes ANSI CSI escape sequences from a string.
 *
 * @param s - Text that may contain terminal escape sequences.
 * @returns `s` with every CSI sequence removed; other characters are untouched.
 */
const stripAnsi = (s: string): string => s.replace(ANSI_CSI_PATTERN, '')
29
-
30
/** Hex colour passed as `fg` for de-emphasised text throughout this file. */
const DIM = '#888888'

/** Sidebar width constant; its consumer is not visible in this part of the file. */
const SIDEBAR_WIDTH = 38
32
-
33
/**
 * Writes `ESC ] 0 ; <title> BEL` to stdout, the escape sequence used here to
 * set the terminal title.
 *
 * @param title - Text to place between the sequence's prefix and terminator.
 */
const setTerminalTitle = (title: string): void => {
  const prefix = '\u001B]0;'
  const terminator = '\u0007'
  process.stdout.write(prefix + title + terminator)
}
36
-
37
/** Static description of a single pipeline step, as listed in `STEPS`. */
interface StepConfig {
  /** Command key that identifies the step. */
  command: CommandKey
  /** Human-readable title shown for the step. */
  name: string
  /** Position of the step; `STEPS` numbers them 1 through 5. */
  stepNum: number
  /** Noun for the items the step counts (e.g. 'PDFs', 'files', 'entries'). */
  unit: string
}
43
-
44
/** The five pipeline steps, numbered 1-5 via `stepNum`. */
const STEPS: StepConfig[] = [
  {
    command: 'classify',
    name: 'Classify PDFs',
    stepNum: 1,
    unit: 'PDFs'
  },
  {
    command: 'pipeline',
    name: 'Convert to Markdown',
    stepNum: 2,
    unit: 'files'
  },
  {
    command: 'ocr',
    name: 'OCR Scanned PDFs',
    stepNum: 3,
    unit: 'files'
  },
  {
    command: 'enhance',
    name: 'Enhance Markdown',
    stepNum: 4,
    unit: 'files'
  },
  {
    command: 'dataset',
    name: 'Build Dataset',
    stepNum: 5,
    unit: 'entries'
  }
]

/**
 * Command keys in the order the steps are scanned when looking for the next
 * incomplete one: classify, pipeline, ocr, enhance, dataset. Derived from
 * `STEPS`, which lists the steps in that same order.
 */
const STEP_ORDER: CommandKey[] = STEPS.map(step => step.command)
53
-
54
/** Every action understood by `reducer`, discriminated by `type`. */
type Action =
  // Turns the background-OCR flag on or off.
  | { active: boolean; type: 'SET_BACKGROUND_OCR' }
  // A command finished with the given exit code (0 means success).
  | { code: number; type: 'COMMAND_DONE' }
  // Stores a failure count for a step (ignored by the reducer when count <= 0).
  | { command: CommandKey; count: number; type: 'RECORD_FAILURES' }
  // Stores an explicit duration, in seconds, for a step.
  | { command: CommandKey; seconds: number; type: 'RECORD_DURATION' }
  // Replaces the per-step progress snapshot.
  | { data: AllStepsData; type: 'SET_STEPS' }
  // Replaces the preflight error and warning lists.
  | { errors: string[]; type: 'SET_PREFLIGHT'; warnings: string[] }
  // A command started; `label` is carried but not read by the reducer.
  | { label: string; stepCommand: CommandKey; type: 'START_COMMAND' }
  // Appends one line to the live output buffer.
  | { line: string; type: 'APPEND_OUTPUT' }
  // Replaces the log lines.
  | { lines: string[]; type: 'SET_LOG' }
  // Stores the dataset build result.
  | { result: DatasetResult; type: 'SET_DATASET_RESULT' }
  // Replaces the one-line running status.
  | { status: string; type: 'SET_RUNNING_STATUS' }
  // Clears the failed flag, the running command and the background-OCR flag.
  | { type: 'CLEAR_FAILURE' }
  // Marks the whole pipeline as finished.
  | { type: 'SET_ALL_DONE' }
  // Flips the `showLog` flag.
  | { type: 'TOGGLE_LOG' }
69
-
70
/** State managed by `reducer`. */
interface AppState {
  /** True once `SET_ALL_DONE` has been dispatched. */
  allDone: boolean
  /** Background-OCR flag set by `SET_BACKGROUND_OCR`. */
  backgroundOcr: boolean
  /** Result stored by `SET_DATASET_RESULT`, or null before that. */
  datasetResult: DatasetResult | null
  /** True when the last `COMMAND_DONE` carried a non-zero exit code. */
  failed: boolean
  /** Log lines stored by `SET_LOG` (ANSI-stripped). */
  logLines: string[]
  /** `Date.now()` timestamp captured when the initial state is created. */
  pipelineStartedAt: number
  /** Preflight errors stored by `SET_PREFLIGHT`. */
  preflightErrors: string[]
  /** Preflight warnings stored by `SET_PREFLIGHT`. */
  preflightWarnings: string[]
  /** Command set by `START_COMMAND`; kept after a failure, cleared on success. */
  runningCommand: CommandKey | null
  /** Live output buffer filled by `APPEND_OUTPUT`. */
  runningLines: string[]
  /** One-line status stored by `SET_RUNNING_STATUS`. */
  runningStatus: string
  /** Log-view flag flipped by `TOGGLE_LOG`. */
  showLog: boolean
  /** Seconds each step took, keyed by command. */
  stepDurations: Partial<Record<CommandKey, number>>
  /** Failure counts per step, keyed by command. */
  stepFailures: Partial<Record<CommandKey, number>>
  /** Latest per-step progress snapshot, or null before the first `SET_STEPS`. */
  stepsData: AllStepsData | null
  /** `Date.now()` timestamp of the last `START_COMMAND`; 0 initially. */
  stepStartedAt: number
}
88
-
89
/** Size limit the reducer uses when trimming the live output buffer. */
const MAX_OUTPUT_LINES = 15

/**
 * Returns an already-resolved promise. Used in this file as a stand-in when a
 * spawned process stream is not a `ReadableStream`.
 */
// oxlint-disable-next-line promise/prefer-await-to-then
const noop = (): Promise<void> => Promise.resolve() // eslint-disable-line @typescript-eslint/promise-function-async
92
-
93
/**
 * Starting state for `reducer`: nothing running, nothing failed, empty
 * buffers. `pipelineStartedAt` is stamped with `Date.now()` when this module
 * is evaluated; `stepStartedAt` stays 0 until a command starts.
 */
const initialState: AppState = {
  // Overall flags
  allDone: false,
  backgroundOcr: false,
  datasetResult: null,
  failed: false,
  // Log and timing
  logLines: [],
  pipelineStartedAt: Date.now(),
  // Preflight results
  preflightErrors: [],
  preflightWarnings: [],
  // Currently running command
  runningCommand: null,
  runningLines: [],
  runningStatus: '',
  showLog: false,
  // Per-step bookkeeping
  stepDurations: {},
  stepFailures: {},
  stepsData: null,
  stepStartedAt: 0
}
111
-
112
/**
 * State transition function for the TUI.
 *
 * @param state - Current state; never mutated.
 * @param action - Action to apply.
 * @returns The next state, or `state` itself when nothing changes.
 */
const reducer = (state: AppState, action: Action): AppState => {
  switch (action.type) {
    case 'APPEND_OUTPUT': {
      // Append first, then trim, so the buffer never holds more than
      // MAX_OUTPUT_LINES entries (trimming before appending allowed one extra).
      const appended = [...state.runningLines, stripAnsi(action.line)]
      return { ...state, runningLines: appended.slice(-MAX_OUTPUT_LINES) }
    }
    case 'CLEAR_FAILURE':
      return { ...state, backgroundOcr: false, failed: false, runningCommand: null }
    case 'COMMAND_DONE': {
      // A duration is only recorded for a successful command that has a start time.
      const durations =
        action.code === 0 && state.runningCommand && state.stepStartedAt > 0
          ? { ...state.stepDurations, [state.runningCommand]: Math.floor((Date.now() - state.stepStartedAt) / 1000) }
          : state.stepDurations
      return {
        ...state,
        failed: action.code !== 0,
        // On failure the command stays set so it remains identifiable afterwards.
        runningCommand: action.code === 0 ? null : state.runningCommand,
        stepDurations: durations
      }
    }
    case 'RECORD_DURATION':
      return { ...state, stepDurations: { ...state.stepDurations, [action.command]: action.seconds } }
    case 'RECORD_FAILURES':
      // Zero or negative counts are ignored.
      if (action.count <= 0) return state
      return { ...state, stepFailures: { ...state.stepFailures, [action.command]: action.count } }
    case 'SET_ALL_DONE':
      return { ...state, allDone: true, backgroundOcr: false, runningCommand: null }
    case 'SET_BACKGROUND_OCR':
      return { ...state, backgroundOcr: action.active }
    case 'SET_DATASET_RESULT':
      return { ...state, datasetResult: action.result }
    case 'SET_LOG':
      return { ...state, logLines: action.lines.map(stripAnsi) }
    case 'SET_PREFLIGHT':
      return { ...state, preflightErrors: action.errors, preflightWarnings: action.warnings }
    case 'SET_RUNNING_STATUS':
      return { ...state, runningStatus: action.status }
    case 'SET_STEPS':
      return { ...state, stepsData: action.data }
    case 'START_COMMAND':
      // Starting a command resets the live output and restarts the step timer.
      return {
        ...state,
        failed: false,
        runningCommand: action.stepCommand,
        runningLines: [],
        runningStatus: '',
        stepStartedAt: Date.now()
      }
    case 'TOGGLE_LOG':
      return { ...state, showLog: !state.showLog }
    default:
      return state
  }
}
164
-
165
/**
 * Reports whether a step counts as finished.
 *
 * @param step - Command key of the step to inspect.
 * @param data - Progress snapshot for all steps.
 * @returns False while the step has a truthy `requires`; otherwise whether
 *   `done` has reached `total`.
 */
const isStepDone = (step: CommandKey, data: AllStepsData): boolean => {
  const { done, requires, total } = data[step]
  return !requires && done >= total
}
170
-
171
/**
 * Finds the first step in `STEP_ORDER` that `isStepDone` reports as unfinished.
 *
 * @param data - Progress snapshot for all steps.
 * @returns The command key of that step, or null when every step is done.
 */
const findFirstIncomplete = (data: AllStepsData): CommandKey | null =>
  STEP_ORDER.find(key => !isStepDone(key, data)) ?? null
176
-
177
/**
 * Formats a duration in seconds as a compact string.
 *
 * - one hour or more: `<h>h<mm>m`
 * - one minute or more: `<m>m<ss>s`
 * - otherwise: `<s>s`
 *
 * Fractions are truncated. Negative or non-finite input is treated as 0 so the
 * result is never something like `-5s` or `NaNs`.
 *
 * @param seconds - Duration in seconds.
 * @returns The formatted duration.
 */
const formatDuration = (seconds: number): string => {
  const whole = Number.isFinite(seconds) ? Math.max(0, Math.floor(seconds)) : 0
  const h = Math.floor(whole / 3600)
  const m = Math.floor((whole % 3600) / 60)
  const s = whole % 60
  const twoDigits = (n: number): string => n.toString().padStart(2, '0')
  if (h > 0) return `${h}h${twoDigits(m)}m`
  if (m > 0) return `${m}m${twoDigits(s)}s`
  return `${s}s`
}
185
-
186
/** Braille glyphs cycled through by `SpinnerDots`, one per animation frame. */
const SPINNER_FRAMES = [
  '\u28CB',
  '\u28D9',
  '\u28F9',
  '\u28F8',
  '\u28FC',
  '\u28F4',
  '\u28E6',
  '\u28E7',
  '\u28C7',
  '\u28CF'
]

/**
 * Animated spinner: advances to the next glyph in `SPINNER_FRAMES` every 80 ms
 * and wraps around at the end. The interval is cleared on unmount.
 *
 * @param fg - Text colour; 'yellow' when omitted.
 */
const SpinnerDots = ({ fg }: { fg?: string }) => {
  const [frameIndex, setFrameIndex] = useState(0)

  useEffect(() => {
    const advance = (): void => {
      setFrameIndex(current => (current + 1) % SPINNER_FRAMES.length)
    }
    const handle = setInterval(advance, 80)
    return () => clearInterval(handle)
  }, [])

  const glyph = SPINNER_FRAMES[frameIndex]
  return <text fg={fg ?? 'yellow'}>{glyph} </text>
}
200
-
201
/** Glyph for a filled progress-bar cell. */
const PROGRESS_FULL = '█'
/** Glyph for an empty progress-bar cell. */
const PROGRESS_EMPTY = '░'

/**
 * Text progress bar made of `width` cells.
 *
 * The filled-cell count is clamped to `[0, width]`. Without the clamp a
 * `value` above 100 produced a negative empty-cell count, and
 * `String.prototype.repeat` throws a RangeError for negative counts.
 *
 * @param value - Percentage, nominally 0-100; values outside are clamped.
 * @param width - Number of cells; defaults to 12. Negative widths render nothing.
 */
const ProgressBarSimple = ({ value, width = 12 }: { value: number; width?: number }) => {
  const cells = Math.max(0, width)
  const wanted = Math.round((value / 100) * cells)
  // NaN (e.g. from a non-numeric value) falls back to an empty bar.
  const filled = Number.isFinite(wanted) ? Math.min(cells, Math.max(0, wanted)) : 0
  const empty = cells - filled
  return (
    <text>
      <span fg='green'>{PROGRESS_FULL.repeat(filled)}</span>
      <span fg='#444444'>{PROGRESS_EMPTY.repeat(empty)}</span>
    </text>
  )
}
214
-
215
/**
 * Shows the time elapsed since `startedAt`, refreshed once per second.
 * Renders nothing while `startedAt` is zero or negative.
 *
 * @param startedAt - Start timestamp in milliseconds (compared with `Date.now()`).
 */
const ElapsedTimer = ({ startedAt }: { startedAt: number }) => {
  const [elapsedSeconds, setElapsedSeconds] = useState(0)

  useEffect(() => {
    if (startedAt <= 0) return
    const refresh = (): void => {
      const elapsedMs = Date.now() - startedAt
      // Never show a negative duration.
      setElapsedSeconds(Math.max(0, Math.floor(elapsedMs / 1000)))
    }
    // Update immediately, then once per second until unmounted or restarted.
    refresh()
    const handle = setInterval(refresh, 1000)
    return () => clearInterval(handle)
  }, [startedAt])

  return startedAt <= 0 ? null : <text fg={DIM}> {formatDuration(elapsedSeconds)}</text>
}
231
-
232
/**
 * Lists the last three entries of `files`, each with a check mark, the name
 * truncated to 18 characters, the page count and the formatted duration.
 *
 * @param files - File records; only `name`, `pages` and `duration` are shown.
 */
const RecentFiles = ({ files }: { files: { duration: number; name: string; pages: number; per_page: number }[] }) => {
  const latest = files.slice(-3)
  const rows = latest.map((file, index) => (
    // eslint-disable-next-line react/no-array-index-key
    <text key={index}>
      <span fg='green'>✓</span>
      <span fg='cyan'> {file.name.slice(0, 18)}</span>
      <span> {file.pages.toString().padStart(2)}p</span>
      <span fg='yellow'> {formatDuration(file.duration)}</span>
    </text>
  ))
  return (
    <box flexDirection='column'>
      <text fg={DIM}>
        <b>── recent ──</b>
      </text>
      {rows}
    </box>
  )
}
251
-
252
/**
 * Live OCR details: the current file and page with a per-file elapsed timer,
 * the ETA and average per file, the error count, and recent files if any.
 *
 * A `current_file` of '-' is treated as "no file in progress": the file and
 * page rows are hidden and the timer is reset to 0.
 *
 * @param progress - Latest OCR progress record.
 */
const OcrLiveInfo = ({ progress }: { progress: OcrProgress }) => {
  const [elapsed, setElapsed] = useState(0)

  useEffect(() => {
    const started = progress.current_file_started
    if (progress.current_file === '-' || !started || started <= 0) {
      setElapsed(0)
      return
    }
    const update = (): void => {
      // `started` is compared against Date.now() in seconds. Clamp at 0, as the
      // other timers in this file do, so a start time ahead of the local clock
      // never yields a negative duration.
      setElapsed(Math.max(0, Date.now() / 1000 - started))
    }
    update()
    const timer = setInterval(update, 1000)
    return () => clearInterval(timer)
  }, [progress.current_file, progress.current_file_started])

  return (
    <box flexDirection='column' paddingLeft={2}>
      {progress.current_file === '-' ? null : (
        <text>
          <span fg={DIM}>Now: </span>
          <b fg='cyan'>{progress.current_file.slice(0, 20)}</b>
        </text>
      )}
      {progress.current_file === '-' ? null : (
        <text>
          <span fg={DIM}>p</span>
          <span>{progress.current_page}</span>
          {progress.current_pages_total ? <span fg={DIM}>/{progress.current_pages_total}</span> : null}
          <span fg={DIM}> </span>
          <span fg='yellow'>{formatDuration(elapsed)}</span>
        </text>
      )}
      <text>
        <span fg={DIM}>ETA </span>
        <span fg='yellow'>{progress.eta}</span>
        <span fg={DIM}> avg </span>
        <span>{progress.avg_per_file}</span>
      </text>
      <text>
        <span fg={DIM}>err </span>
        <span fg={progress.errors > 0 ? 'red' : 'green'}>{progress.errors}</span>
      </text>
      {progress.recent_files?.length ? <RecentFiles files={progress.recent_files} /> : null}
    </box>
  )
}
300
-
301
/**
 * Picks the status glyph for a step. Failure wins over completion.
 *
 * @param isDone - Whether the step has completed.
 * @param isFailed - Whether the step has failed.
 * @returns '✗' when failed, '✓' when done, '·' otherwise.
 */
const getStepIcon = (isDone: boolean, isFailed: boolean): string => (isFailed ? '✗' : isDone ? '✓' : '·')
306
-
307
/**
 * Picks the colour name for a step. Precedence: failed, running, done, idle.
 *
 * @param isRunning - Whether the step is currently running.
 * @param isDone - Whether the step has completed.
 * @param isFailed - Whether the step has failed.
 * @returns 'red', 'yellow', 'green' or 'gray' respectively.
 */
const getStepColor = (isRunning: boolean, isDone: boolean, isFailed: boolean): string => {
  switch (true) {
    case isFailed:
      return 'red'
    case isRunning:
      return 'yellow'
    case isDone:
      return 'green'
    default:
      return 'gray'
  }
}
313
-
314
/**
 * Leading icon for a sidebar step: a yellow spinner while the step is running,
 * otherwise the bold glyph from `getStepIcon` in the given colour.
 *
 * @param color - Colour for the static glyph.
 * @param isDone - Whether the step has completed.
 * @param isFailed - Whether the step has failed.
 * @param isRunning - Whether the step is currently running.
 */
const StepIcon = ({
  color,
  isDone,
  isFailed,
  isRunning
}: {
  color: string
  isDone: boolean
  isFailed: boolean
  isRunning: boolean
}) =>
  isRunning ? (
    <SpinnerDots fg='yellow' />
  ) : (
    <text fg={color}>
      <b>{getStepIcon(isDone, isFailed)} </b>
    </text>
  )
332
-
333
/**
 * One step entry in the sidebar: icon, name and timing on the first row, a
 * progress bar with done/total counts below, then optional detail lines, the
 * live status, an unmet-requirement warning and live OCR information.
 *
 * The step's `isFailed` flag is forwarded to `StepIcon`. It was previously
 * hard-coded to `false`, so a failed step was coloured red but never showed
 * the ✗ glyph.
 *
 * @param completedDuration - Seconds the step took; shown once it is done and not running.
 * @param failedCount - Number of failed items; shown in red when positive.
 * @param isFailed - Whether the step has failed.
 * @param isRunning - Whether the step is currently running.
 * @param ocrProgress - Live OCR progress; only rendered for the running 'ocr' step.
 * @param runningStatus - One-line status shown while running (hidden when empty).
 * @param step - Static step configuration.
 * @param stepData - Progress data for the step; missing data counts as 0/0.
 * @param stepStartedAt - Start timestamp handed to `ElapsedTimer`.
 */
const SidebarStep = ({
  completedDuration,
  failedCount,
  isFailed,
  isRunning,
  ocrProgress,
  runningStatus,
  step,
  stepData,
  stepStartedAt
}: {
  completedDuration?: number
  failedCount?: number
  isFailed: boolean
  isRunning: boolean
  ocrProgress?: null | OcrProgress
  runningStatus: string
  step: StepConfig
  stepData: StepData | undefined
  stepStartedAt: number
  // eslint-disable-next-line complexity, max-statements
}) => {
  const done = stepData?.done ?? 0
  const total = stepData?.total ?? 0
  // A step with nothing counted yet (total 0) is not shown as done here.
  const isDone = total > 0 && done >= total
  const pct = total > 0 ? Math.round((done / total) * 100) : 0
  const color = getStepColor(isRunning, isDone, isFailed)
  const isActive = isRunning || isDone || isFailed
  const showRequires = Boolean(stepData?.requires) && !isDone && !isRunning
  const showOcr = isRunning && step.command === 'ocr'

  return (
    <box flexDirection='column'>
      <box>
        <box width={2}>
          <StepIcon color={color} isDone={isDone} isFailed={isFailed} isRunning={isRunning} />
        </box>
        <text fg={isActive ? color : DIM}>{step.name}</text>
        {isRunning ? <ElapsedTimer startedAt={stepStartedAt} /> : null}
        {isDone && !isRunning && completedDuration !== undefined ? (
          <text fg={DIM}> {formatDuration(completedDuration)}</text>
        ) : null}
      </box>
      <box paddingLeft={2}>
        <ProgressBarSimple value={pct} />
        <text>
          <span fg='green'> {done}</span>
          <span fg={DIM}>/{total > 0 ? total : '?'}</span>
          {failedCount && failedCount > 0 ? <span fg='red'> {failedCount}✗</span> : null}
        </text>
      </box>
      {stepData?.details?.length ? (
        <box flexDirection='column' paddingLeft={2}>
          {stepData.details.map((d, i) => (
            // eslint-disable-next-line react/no-array-index-key
            <text fg={DIM} key={i}>
              {d}
            </text>
          ))}
        </box>
      ) : null}
      {isRunning && runningStatus !== '' ? (
        <box paddingLeft={2}>
          <text fg='cyan'>{runningStatus}</text>
        </box>
      ) : null}
      {showRequires ? (
        <box paddingLeft={2}>
          <text fg='yellow'>⚠ {stepData?.requires}</text>
        </box>
      ) : null}
      {showOcr && ocrProgress ? <OcrLiveInfo progress={ocrProgress} /> : null}
    </box>
  )
}
408
-
409
/**
 * Case-insensitive test for output lines that look like errors. Matches the
 * words `error`, `failed`, `exception` or `traceback` at a word start, or the
 * ✖ (U+2716) glyph anywhere.
 *
 * The glyph sits outside the `\b` group on purpose: ✖ is not a word
 * character, so `\b✖` could only match when a word character came directly
 * before it, never at the start of a line or after whitespace. The earlier
 * case variants (`ERROR`, `Error:`, `Failed:`, `FAILED`) are already covered
 * by the `i` flag.
 */
const ERROR_PATTERN = /(?:\b(?:error|failed|exception|traceback)|\u2716)/iu
410
-
411
/**
 * Bordered banner listing preflight errors and warnings. Renders nothing when
 * both lists are empty. The border is red when there is at least one error,
 * yellow otherwise, and a closing hint is added when errors are present.
 *
 * @param errors - Error messages, shown in red.
 * @param warnings - Warning messages, shown in yellow.
 */
const PreflightBanner = ({ errors, warnings }: { errors: string[]; warnings: string[] }) => {
  const hasErrors = errors.length > 0
  const hasWarnings = warnings.length > 0
  if (!hasErrors && !hasWarnings) return null

  const errorSection = hasErrors ? (
    <>
      <text fg='red'>
        <b>✗ Missing required tools:</b>
      </text>
      {errors.map((message, index) => (
        // eslint-disable-next-line react/no-array-index-key
        <text fg='red' key={`e${index}`}>
          {' '}• {message}
        </text>
      ))}
    </>
  ) : null

  const warningSection = hasWarnings ? (
    <>
      <text fg='yellow'>
        <b>⚠ Warnings:</b>
      </text>
      {warnings.map((message, index) => (
        // eslint-disable-next-line react/no-array-index-key
        <text fg='yellow' key={`w${index}`}>
          {' '}• {message}
        </text>
      ))}
    </>
  ) : null

  return (
    <box
      border
      borderColor={hasErrors ? 'red' : 'yellow'}
      borderStyle='rounded'
      flexDirection='column'
      marginTop={1}
      paddingLeft={1}
      paddingRight={1}>
      {errorSection}
      {warningSection}
      {hasErrors ? <text fg={DIM}>Fix the errors above and restart.</text> : null}
    </box>
  )
}
452
-
453
/**
 * Title bar: the app name on the left; on the right a status label and the
 * total elapsed time since `pipelineStartedAt`, refreshed every second.
 *
 * Status label, in order of precedence: complete, failed, then the running
 * step as `<n>/5 <name>` (or the combined convert+OCR label while background
 * OCR runs alongside the 'pipeline' step). It is empty when nothing is running
 * or no step data is available.
 *
 * @param allDone - Whether the whole pipeline has finished.
 * @param backgroundOcr - Whether OCR is running in the background.
 * @param failed - Whether the last command failed.
 * @param pipelineStartedAt - Start timestamp in milliseconds; the timer is hidden when <= 0.
 * @param runningCommand - Command currently running, if any.
 * @param stepsData - Latest step data; only its presence is checked here.
 */
// eslint-disable-next-line max-statements
const TitleBarTop = ({
  allDone,
  backgroundOcr,
  failed,
  pipelineStartedAt,
  runningCommand,
  stepsData
}: {
  allDone: boolean
  backgroundOcr: boolean
  failed: boolean
  pipelineStartedAt: number
  runningCommand: CommandKey | null
  stepsData: AllStepsData | null
}) => {
  const [elapsed, setElapsed] = useState(0)

  useEffect(() => {
    if (pipelineStartedAt <= 0) return
    const refresh = (): void => {
      const elapsedMs = Date.now() - pipelineStartedAt
      setElapsed(Math.max(0, Math.floor(elapsedMs / 1000)))
    }
    refresh()
    const handle = setInterval(refresh, 1000)
    return () => clearInterval(handle)
  }, [pipelineStartedAt])

  const describeStatus = (): string => {
    if (allDone) return '✓ Complete'
    if (failed) return '✗ Failed'
    if (!runningCommand || !stepsData) return ''
    const current = STEPS.find(candidate => candidate.command === runningCommand)
    if (!current || current.stepNum <= 0) return ''
    if (backgroundOcr && runningCommand === 'pipeline') return '2+3/5 Convert+OCR'
    return `${current.stepNum}/5 ${current.name}`
  }
  const statusText = describeStatus()

  let statusColor = 'cyan'
  if (allDone) statusColor = 'green'
  else if (failed) statusColor = 'red'

  return (
    <box justifyContent='space-between' paddingLeft={1} paddingRight={1}>
      <text fg='#e0a040'>
        <b>anymd</b>
      </text>
      <box gap={2}>
        {statusText === '' ? null : (
          <text fg={statusColor}>
            <b>{statusText}</b>
          </text>
        )}
        {pipelineStartedAt > 0 ? <text fg={DIM}>{formatDuration(elapsed)}</text> : null}
      </box>
    </box>
  )
}
517
-
518
/**
 * Formats a character count with one decimal and a `K` or `M` suffix.
 *
 * The switch to `M` happens at 999 950 rather than 1 000 000, because from
 * that point one-decimal rounding of the `K` form would print `1000.0K`.
 *
 * @param chars - Character count.
 * @returns e.g. `999`, `1.5K`, `999.9K`, `1.0M`, `2.5M`.
 */
const formatChars = (chars: number): string => {
  if (chars >= 999_950) return `${(chars / 1_000_000).toFixed(1)}M`
  if (chars >= 1000) return `${(chars / 1000).toFixed(1)}K`
  return String(chars)
}
523
-
524
/**
 * Completion summary for the sidebar. Renders nothing until `allDone` is true.
 * Shows a completion headline, the dataset figures when a result is available
 * (skipped and deduped rows only when positive), and one row per step that has
 * a recorded duration.
 *
 * @param allDone - Whether the whole pipeline has finished.
 * @param datasetResult - Dataset build result, or null if there is none.
 * @param stepDurations - Seconds per step, keyed by command.
 */
const SidebarSummary = ({
  allDone,
  datasetResult,
  stepDurations
}: {
  allDone: boolean
  datasetResult: DatasetResult | null
  stepDurations: Partial<Record<CommandKey, number>>
}) => {
  if (!allDone) return null

  const datasetSection = datasetResult ? (
    <box flexDirection='column' paddingLeft={1}>
      <text>
        <span fg={DIM}>entries </span>
        <b fg='green'>{datasetResult.entries.toLocaleString()}</b>
      </text>
      <text>
        <span fg={DIM}>chars </span>
        <b>{formatChars(datasetResult.totalChars)}</b>
      </text>
      {datasetResult.skipped > 0 ? (
        <text>
          <span fg={DIM}>skipped </span>
          <span fg='yellow'>{datasetResult.skipped}</span>
        </text>
      ) : null}
      {datasetResult.duplicates > 0 ? (
        <text>
          <span fg={DIM}>deduped </span>
          <span fg='yellow'>{datasetResult.duplicates}</span>
        </text>
      ) : null}
    </box>
  ) : null

  // Steps without a recorded duration contribute no row.
  const durationRows = STEPS.flatMap(step => {
    const seconds = stepDurations[step.command]
    if (seconds === undefined) return []
    return [
      <text key={step.command}>
        <span fg={DIM}>{step.name.slice(0, 14).padEnd(14)} </span>
        <span fg='cyan'>{formatDuration(seconds)}</span>
      </text>
    ]
  })

  return (
    <box flexDirection='column' paddingLeft={1} paddingTop={1}>
      <text fg='green'>
        <b>✓ All 5 steps complete</b>
      </text>
      {datasetSection}
      <box flexDirection='column' paddingLeft={1}>
        {durationRows}
      </box>
    </box>
  )
}
578
-
579
/**
 * Keybinding help box, 40 columns by 12 rows, offset with margins so it sits
 * in the middle of an area of the given size (margins never go below 0).
 *
 * @param height - Height of the area to centre within.
 * @param width - Width of the area to centre within.
 */
const HelpDialog = ({ height, width }: { height: number; width: number }) => {
  const boxW = 40
  const boxH = 12
  const centre = (available: number, size: number): number => Math.max(0, Math.floor((available - size) / 2))
  const left = centre(width, boxW)
  const top = centre(height, boxH)

  // Key and description pairs, in display order.
  const bindings: [string, string][] = [
    ['Q', 'Quit'],
    ['L', 'Toggle log / output view'],
    ['R', 'Retry failed step'],
    ['S', 'Skip failed step'],
    ['?', 'Toggle this help']
  ]

  return (
    <box
      border
      borderColor='#e0a040'
      borderStyle='rounded'
      flexDirection='column'
      height={boxH}
      marginLeft={left}
      marginTop={top}
      paddingLeft={2}
      paddingRight={2}
      width={boxW}>
      <text fg='#e0a040'>
        <b>Keybinds</b>
      </text>
      <text> </text>
      {bindings.map(([keyName, description]) => (
        <text key={keyName}>
          <b fg='cyan'>{keyName}</b>
          <span fg={DIM}>{' '}{description}</span>
        </text>
      ))}
      <text> </text>
      <text fg={DIM}>Press ? or Esc to close</text>
    </box>
  )
}
626
-
627
/**
 * Builds the terminal title for the given state.
 *
 * Precedence: complete, failed, idle (no running command or unknown step),
 * then the running step with its completion percentage. While background OCR
 * runs alongside the 'pipeline' step, a combined "Steps 2+3/5" label is used.
 *
 * @param s - Current application state.
 * @returns The title string.
 */
const computeTerminalTitle = (s: AppState): string => {
  const { allDone, backgroundOcr, failed, runningCommand, stepsData } = s
  if (allDone) return 'Doc Pipeline \u2014 \u2713 Complete'
  if (failed) return 'Doc Pipeline \u2014 \u2717 Failed'
  if (!runningCommand) return 'Doc Pipeline'

  const step = STEPS.find(candidate => candidate.command === runningCommand)
  if (!step) return 'Doc Pipeline'

  // Percentage is 0 when there is no data for the step or its total is 0.
  const progress = stepsData?.[runningCommand]
  let pct = 0
  if (progress && progress.total > 0) pct = Math.round((progress.done / progress.total) * 100)

  const isParallel = backgroundOcr && runningCommand === 'pipeline'
  const label = isParallel
    ? `Steps 2+3/5 \u2014 Convert + OCR ${pct}%`
    : `Step ${step.stepNum}/5 \u2014 ${step.name} ${pct}%`
  return `Doc Pipeline \u2014 ${label}`
}
641
-
642
/**
 * Reads a byte stream to its end, decodes it as text and calls `onLine` for
 * every non-blank line (split on '\n'). A trailing line without a final
 * newline is delivered once the stream ends.
 *
 * The decoder is flushed after the last chunk, so bytes of an incomplete
 * multi-byte sequence at the very end are surfaced as a replacement character
 * instead of being silently dropped.
 *
 * @param stream - Stream of bytes to read; its reader lock is always released.
 * @param onLine - Callback invoked with each non-blank line, without the '\n'.
 * @returns A promise that resolves once the stream has been fully consumed.
 */
// eslint-disable-next-line max-statements
const readStream = async (stream: ReadableStream<Uint8Array>, onLine: (line: string) => void): Promise<void> => {
  const reader = stream.getReader()
  const decoder = new TextDecoder()
  let buffer = ''
  try {
    for (;;) {
      // biome-ignore lint/performance/noAwaitInLoops: streaming reads
      const { done, value } = await reader.read() // eslint-disable-line no-await-in-loop
      if (done) break
      // `stream: true` keeps a partial multi-byte character for the next chunk.
      buffer += decoder.decode(value, { stream: true })
      const parts = buffer.split('\n')
      // The last piece has no newline yet; keep it until more data arrives.
      buffer = parts.pop() ?? ''
      for (const part of parts) if (part.trim() !== '') onLine(part)
    }
    // Final flush: emit whatever the decoder is still holding.
    buffer += decoder.decode()
    if (buffer.trim() !== '') onLine(buffer)
  } finally {
    reader.releaseLock()
  }
}
662
-
663
- // eslint-disable-next-line max-statements, complexity
664
- const App = () => {
665
- const [state, dispatch] = useReducer(reducer, initialState)
666
- const procRef = useRef<null | ReturnType<typeof Bun.spawn>>(null)
667
- const ocrProcRef = useRef<null | ReturnType<typeof Bun.spawn>>(null)
668
- const busyRef = useRef(false)
669
- const errorLogClearedRef = useRef(false)
670
- const [showHelp, setShowHelp] = useState(false)
671
- const { height, width } = useTerminalDimensions()
672
-
673
- useEffect(() => {
674
- setTerminalTitle(computeTerminalTitle(state))
675
- }, [state.allDone, state.failed, state.runningCommand, state.backgroundOcr, state.stepsData])
676
-
677
- const refreshSteps = useCallback(async (): Promise<AllStepsData> => {
678
- const data = await fetchStepData()
679
- dispatch({ data, type: 'SET_STEPS' })
680
- return data
681
- }, [])
682
-
683
- const refreshLog = useCallback(async () => {
684
- const lines = await readLogTail(200)
685
- dispatch({ lines, type: 'SET_LOG' })
686
- }, [])
687
-
688
- const executeCommand = useCallback(
689
- // eslint-disable-next-line complexity, max-statements
690
- async (key: CommandKey): Promise<number> => {
691
- if (busyRef.current) return -1
692
- busyRef.current = true
693
-
694
- try {
695
- if (key === 'classify') {
696
- dispatch({ label: 'Classifying PDFs...', stepCommand: key, type: 'START_COMMAND' })
697
- await runClassify(prog => {
698
- const statusLine = `${prog.done}/${prog.total} \u2014 native=${prog.native} scanned=${prog.scanned} mixed=${prog.mixed}`
699
- dispatch({ status: statusLine, type: 'SET_RUNNING_STATUS' })
700
- dispatch({ line: `${prog.file} \u2192 ${prog.category}`, type: 'APPEND_OUTPUT' })
701
- if (prog.category === 'error') appendErrorLog('classify', `Failed: ${prog.file}`)
702
- })
703
- dispatch({ code: 0, type: 'COMMAND_DONE' })
704
- return 0
705
- }
706
-
707
- if (key === 'dataset') {
708
- dispatch({ label: 'Building dataset...', stepCommand: key, type: 'START_COMMAND' })
709
- const result: DatasetResult = await buildDataset({
710
- onFileResult: prog => {
711
- const icon = prog.status === 'added' ? '\u2713' : prog.status === 'duplicate' ? '\u2261' : '\u2192'
712
- const charStr = prog.chars >= 1000 ? `${(prog.chars / 1000).toFixed(1)}K` : `${prog.chars}`
713
- dispatch({ line: `${icon} ${prog.file} \u2192 ${prog.status} (${charStr} chars)`, type: 'APPEND_OUTPUT' })
714
- },
715
- onReadProgress: (readDone, total) => {
716
- dispatch({ status: `Reading ${readDone}/${total} files...`, type: 'SET_RUNNING_STATUS' })
717
- }
718
- })
719
- dispatch({ result, type: 'SET_DATASET_RESULT' })
720
- const dupStr = result.duplicates > 0 ? `, ${result.duplicates} duplicates removed` : ''
721
- dispatch({
722
- line: `Dataset: ${result.entries} entries, ${result.skipped} skipped${dupStr}, ${result.totalChars.toLocaleString()} chars`,
723
- type: 'APPEND_OUTPUT'
724
- })
725
- dispatch({ code: 0, type: 'COMMAND_DONE' })
726
- return 0
727
- }
728
-
729
- if (key === 'ocr') {
730
- const stats = await getOcrStats()
731
- if (stats.total === 0) {
732
- dispatch({ label: 'OCR', stepCommand: key, type: 'START_COMMAND' })
733
- dispatch({ line: 'No classification found. Classify PDFs first.', type: 'APPEND_OUTPUT' })
734
- dispatch({ code: 1, type: 'COMMAND_DONE' })
735
- return 1
736
- }
737
- if (stats.remaining === 0) {
738
- dispatch({ label: 'OCR', stepCommand: key, type: 'START_COMMAND' })
739
- dispatch({ line: "All files already OCR'd.", type: 'APPEND_OUTPUT' })
740
- dispatch({ code: 0, type: 'COMMAND_DONE' })
741
- return 0
742
- }
743
- }
744
-
745
- if (key === 'enhance') {
746
- dispatch({ label: 'Enhancing OCR markdown...', stepCommand: key, type: 'START_COMMAND' })
747
- const result = await runEnhanceOcr(prog => {
748
- const statusLine = `${prog.done}/${prog.total} files enhanced`
749
- dispatch({ status: statusLine, type: 'SET_RUNNING_STATUS' })
750
- const icon = prog.status === 'enhanced' ? '\u2713' : prog.status === 'skipped' ? '\u2192' : '\u2717'
751
- dispatch({ line: `${icon} ${prog.file} \u2192 ${prog.status}`, type: 'APPEND_OUTPUT' })
752
- if (prog.status === 'failed') appendErrorLog('enhance', `Failed: ${prog.file}`)
753
- })
754
- dispatch({ command: 'enhance', count: result.failed, type: 'RECORD_FAILURES' })
755
- const enhanceLine =
756
- result.skipped > 0
757
- ? `Enhanced: ${result.enhanced}, Skipped: ${result.skipped}, Failed: ${result.failed}`
758
- : `Enhanced: ${result.enhanced}, Failed: ${result.failed}`
759
- dispatch({ line: enhanceLine, type: 'APPEND_OUTPUT' })
760
- dispatch({ code: 0, type: 'COMMAND_DONE' })
761
- return 0
762
- }
763
-
764
- if (key === 'pipeline') {
765
- await clearPipelineLog()
766
- await writeNativeFileList()
767
- }
768
-
769
- const spawned = spawnCommand(key)
770
- if (!spawned) return -1
771
- dispatch({ label: spawned.label, stepCommand: key, type: 'START_COMMAND' })
772
- procRef.current = spawned.proc
773
-
774
- const onLine = (line: string): void => {
775
- dispatch({ line, type: 'APPEND_OUTPUT' })
776
- if (key === 'pipeline') appendPipelineLog(line)
777
- if (ERROR_PATTERN.test(line)) appendErrorLog(key, line)
778
- }
779
-
780
- const { stderr, stdout } = spawned.proc
781
- const stdoutPromise = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
782
- const stderrPromise = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()
783
-
784
- await Promise.all([stdoutPromise, stderrPromise])
785
- const code = await spawned.proc.exited
786
- dispatch({ code, type: 'COMMAND_DONE' })
787
- procRef.current = null
788
- return code
789
- } catch (execError) {
790
- const msg = String(execError)
791
- dispatch({ line: `Error: ${msg}`, type: 'APPEND_OUTPUT' })
792
- appendErrorLog(key, msg)
793
- dispatch({ code: 1, type: 'COMMAND_DONE' })
794
- return 1
795
- } finally {
796
- busyRef.current = false // eslint-disable-line require-atomic-updates
797
- }
798
- },
799
- []
800
- )
801
-
802
- // eslint-disable-next-line max-statements
803
- const runParallelConvertOcr = useCallback(async (): Promise<number> => {
804
- busyRef.current = true
805
- const parallelStart = Date.now()
806
- try {
807
- const ocrStats = await getOcrStats()
808
- let ocrSpawned: ReturnType<typeof spawnCommand> = null
809
- if (ocrStats.remaining > 0) {
810
- ocrSpawned = spawnCommand('ocr')
811
- if (ocrSpawned) {
812
- ocrProcRef.current = ocrSpawned.proc
813
- dispatch({ active: true, type: 'SET_BACKGROUND_OCR' })
814
- const { stderr: ocrErr, stdout: ocrOut } = ocrSpawned.proc
815
- const ocrErrorLine = (line: string): void => {
816
- if (ERROR_PATTERN.test(line)) appendErrorLog('ocr', line)
817
- }
818
- if (ocrOut instanceof ReadableStream) readStream(ocrOut, ocrErrorLine)
819
- if (ocrErr instanceof ReadableStream) readStream(ocrErr, ocrErrorLine)
820
- }
821
- }
822
-
823
- await clearPipelineLog()
824
- await writeNativeFileList()
825
- const pipelineSpawned = spawnCommand('pipeline')
826
- if (!pipelineSpawned) return -1
827
-
828
- dispatch({ label: pipelineSpawned.label, stepCommand: 'pipeline', type: 'START_COMMAND' })
829
- procRef.current = pipelineSpawned.proc
830
-
831
- const onLine = (line: string): void => {
832
- dispatch({ line, type: 'APPEND_OUTPUT' })
833
- appendPipelineLog(line)
834
- if (ERROR_PATTERN.test(line)) appendErrorLog('pipeline', line)
835
- }
836
- const { stderr, stdout } = pipelineSpawned.proc
837
- const stdoutP = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
838
- const stderrP = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()
839
-
840
- // biome-ignore lint/performance/noAwaitInLoops: awaiting parallel streams
841
- await Promise.all([stdoutP, stderrP]) // eslint-disable-line no-await-in-loop
842
- const pipeCode = await pipelineSpawned.proc.exited
843
- procRef.current = null
844
- dispatch({ code: pipeCode, type: 'COMMAND_DONE' })
845
-
846
- if (pipeCode !== 0) {
847
- if (ocrProcRef.current) {
848
- ocrProcRef.current.kill()
849
- ocrProcRef.current = null
850
- }
851
- dispatch({ active: false, type: 'SET_BACKGROUND_OCR' })
852
- return pipeCode
853
- }
854
-
855
- if (ocrProcRef.current) {
856
- dispatch({ label: 'Chandra OCR', stepCommand: 'ocr', type: 'START_COMMAND' })
857
- procRef.current = ocrProcRef.current
858
- const ocrCode = await ocrProcRef.current.exited
859
- procRef.current = null
860
- ocrProcRef.current = null // eslint-disable-line require-atomic-updates
861
- dispatch({ active: false, type: 'SET_BACKGROUND_OCR' })
862
- dispatch({ command: 'ocr', seconds: Math.floor((Date.now() - parallelStart) / 1000), type: 'RECORD_DURATION' })
863
- dispatch({ code: ocrCode, type: 'COMMAND_DONE' })
864
- return ocrCode
865
- }
866
-
867
- dispatch({ active: false, type: 'SET_BACKGROUND_OCR' })
868
- return 0
869
- } finally {
870
- busyRef.current = false // eslint-disable-line require-atomic-updates
871
- }
872
- }, [])
873
-
874
/**
 * Advances the pipeline by one step and re-schedules itself while steps keep
 * succeeding.
 *
 * Sequence: clear the error log once (guarded by `errorLogClearedRef`),
 * bootstrap the Python environment, run preflight checks, refresh step data,
 * then run the first incomplete step. When `pipeline` is next, `classify` is
 * done and `ocr` is still pending, conversion and OCR run through
 * `runParallelConvertOcr` instead of a single `executeCommand`.
 *
 * Returns early (without scheduling another pass) when a command is already
 * running, when bootstrap or preflight reports errors, when every step is
 * done, or when the executed step exits with a non-zero code.
 */
// eslint-disable-next-line max-statements
const autoRun = useCallback(async () => {
  if (busyRef.current) return

  if (!errorLogClearedRef.current) {
    // Flip the flag before awaiting so a re-entrant call cannot clear twice.
    errorLogClearedRef.current = true
    await clearErrorLog()
  }

  dispatch({ line: 'Checking Python environment...', type: 'APPEND_OUTPUT' })
  const ok = await bootstrapPython({
    onDone: () => {
      dispatch({ line: 'Python environment ready.', type: 'APPEND_OUTPUT' })
    },
    onStep: (msg: string) => {
      dispatch({ line: msg, type: 'APPEND_OUTPUT' })
    }
  })
  if (!ok) {
    dispatch({
      errors: ['Python bootstrap failed. Install uv and try again.'],
      type: 'SET_PREFLIGHT',
      warnings: []
    })
    return
  }

  const preflight = await runPreflight()
  dispatch({ errors: preflight.errors, type: 'SET_PREFLIGHT', warnings: preflight.warnings })
  if (preflight.errors.length > 0) return

  const data = await refreshSteps()
  const nextStep = findFirstIncomplete(data)

  if (!nextStep) {
    dispatch({ type: 'SET_ALL_DONE' })
    // Terminal bell.
    process.stdout.write('\u0007')
    // `osascript` only exists on macOS. Spawning a missing executable throws,
    // which would reject this callback after the pipeline already finished,
    // so the desktop notification is platform-gated and best-effort.
    if (process.platform === 'darwin') {
      try {
        Bun.spawn(['osascript', '-e', 'display notification "Pipeline complete" with title "Doc Pipeline"'])
      } catch {
        // The notification is cosmetic; completion was already recorded above.
      }
    }
    return
  }

  let code: number
  const classifyDone = isStepDone('classify', data)
  const pipelineNeeded = !isStepDone('pipeline', data)
  const ocrNeeded = !isStepDone('ocr', data)

  if (nextStep === 'pipeline' && classifyDone && pipelineNeeded && ocrNeeded) code = await runParallelConvertOcr()
  else code = await executeCommand(nextStep)

  if (code === 0) {
    await refreshSteps()
    // Short pause before the next pass.
    setTimeout(() => {
      autoRun()
    }, 500)
  }
}, [refreshSteps, executeCommand, runParallelConvertOcr])
930
-
931
// Start the pipeline on mount, and again if `autoRun` gets a new identity.
useEffect(() => {
  autoRun()
}, [autoRun])

// Every 2 seconds refresh the step data, plus the log tail while the log
// view is open. The timer is torn down on unmount or when a dependency changes.
useEffect(() => {
  const poll = (): void => {
    refreshSteps()
    if (state.showLog) refreshLog()
  }
  const timer = setInterval(poll, 2000)
  return () => {
    clearInterval(timer)
  }
}, [refreshSteps, refreshLog, state.showLog])
942
-
943
/**
 * Skips the step recorded in `state.runningCommand` and runs the next step
 * after it in `STEP_ORDER` that is not done yet.
 *
 * The failure flag is always cleared first. If no command is recorded the
 * callback stops there. If no later step is pending the pipeline is marked
 * all-done. When the chosen step exits with code 0, step data is refreshed
 * and control is handed back to `autoRun`.
 */
// eslint-disable-next-line max-statements
const skipCurrentStep = useCallback(async () => {
  const skipped = state.runningCommand
  dispatch({ type: 'CLEAR_FAILURE' })
  if (!skipped) return

  const data = await refreshSteps()
  const laterSteps = STEP_ORDER.slice(STEP_ORDER.indexOf(skipped) + 1)
  const nextKey: CommandKey | undefined = laterSteps.find(k => !isStepDone(k, data))

  if (!nextKey) {
    dispatch({ type: 'SET_ALL_DONE' })
    return
  }

  const exitCode = await executeCommand(nextKey)
  if (exitCode !== 0) return

  await refreshSteps()
  autoRun()
}, [state.runningCommand, refreshSteps, executeCommand, autoRun])
970
-
971
// Global key bindings:
//   ?          toggle the help dialog (escape also closes it)
//   q / escape kill any running child processes and exit
//   l          refresh and toggle the log view
//   r / s      retry or skip, only while a step is in the failed state
// eslint-disable-next-line complexity, max-statements
useKeyboard(key => {
  const pressed = key.name

  // While the help dialog is open it swallows every key.
  if (showHelp) {
    if (pressed === '?' || pressed === 'escape') setShowHelp(false)
    return
  }

  switch (pressed) {
    case '?': {
      setShowHelp(true)
      return
    }
    case 'q':
    case 'escape': {
      if (procRef.current) {
        procRef.current.kill()
        procRef.current = null
      }
      if (ocrProcRef.current) {
        ocrProcRef.current.kill()
        ocrProcRef.current = null
      }
      // Reset the terminal title before exiting.
      setTerminalTitle('')
      process.exit(0)
      return
    }
    case 'l': {
      refreshLog()
      dispatch({ type: 'TOGGLE_LOG' })
      return
    }
    default:
      break
  }

  if (!state.failed) return

  if (pressed === 'r') {
    dispatch({ type: 'CLEAR_FAILURE' })
    autoRun()
    return
  }
  if (pressed === 's') skipCurrentStep()
})
1012
-
1013
// Output viewport height: terminal height minus 3 rows, never below 5.
const logHeight = Math.max(5, height - 3)
// The right-hand pane shows either the log tail or the live command output.
const displayLines = state.showLog ? state.logLines : state.runningLines
const hasPreflightNotices = state.preflightErrors.length > 0 || state.preflightWarnings.length > 0

// Renders one sidebar row for a pipeline step.
const renderStep = (step: StepConfig) => {
  const isFg = state.runningCommand === step.command
  const isBgOcr = step.command === 'ocr' && state.backgroundOcr
  const isActive = isFg || isBgOcr
  const sd = state.stepsData?.[step.command]
  // Prefer the failure count from fetched step data, else the locally tracked one.
  const failures = sd?.failed ?? state.stepFailures[step.command]
  return (
    <SidebarStep
      completedDuration={state.stepDurations[step.command]}
      failedCount={failures}
      isFailed={state.failed ? isFg : false}
      isRunning={!state.failed && isActive}
      key={step.command}
      ocrProgress={step.command === 'ocr' ? state.stepsData?.ocr.progress : undefined}
      runningStatus={isFg ? state.runningStatus : ''}
      step={step}
      stepData={sd}
      stepStartedAt={isActive ? state.stepStartedAt : 0}
    />
  )
}

return (
  <box flexDirection='column' height={height}>
    <TitleBarTop
      allDone={state.allDone}
      backgroundOcr={state.backgroundOcr}
      failed={state.failed}
      pipelineStartedAt={state.pipelineStartedAt}
      runningCommand={state.runningCommand}
      stepsData={state.stepsData}
    />
    <box flexGrow={1}>
      {/* Sidebar: per-step status followed by the run summary. */}
      <box flexDirection='column' paddingLeft={1} width={SIDEBAR_WIDTH}>
        {state.stepsData ? (
          <box flexDirection='column'>{STEPS.map(step => renderStep(step))}</box>
        ) : (
          <text fg={DIM}>Loading...</text>
        )}
        <SidebarSummary
          allDone={state.allDone}
          datasetResult={state.datasetResult}
          stepDurations={state.stepDurations}
        />
      </box>
      {/* Main pane: optional preflight banner above the scrolling output. */}
      <box flexDirection='column' flexGrow={1} paddingLeft={2}>
        {hasPreflightNotices ? (
          <PreflightBanner errors={state.preflightErrors} warnings={state.preflightWarnings} />
        ) : null}
        <scrollbox focused height={logHeight} paddingLeft={1} stickyScroll>
          {displayLines.length > 0 ? (
            displayLines.map((line, i) => (
              // Lines matching ERROR_PATTERN are shown in red.
              // eslint-disable-next-line react/no-array-index-key
              <text fg={ERROR_PATTERN.test(line) ? 'red' : DIM} key={i}>
                {line}
              </text>
            ))
          ) : (
            <text fg={DIM}>Waiting for output...</text>
          )}
        </scrollbox>
      </box>
    </box>
    {showHelp ? <HelpDialog height={height} width={width} /> : null}
  </box>
)
1085
- }
1086
-
1087
/**
 * Creates the CLI renderer and mounts the `App` component into it.
 *
 * The renderer is created with `exitOnCtrlC: false`.
 */
const start = async (): Promise<void> => {
  const cliRenderer = await createCliRenderer({ exitOnCtrlC: false })
  const root = createRoot(cliRenderer)
  root.render(<App />)
}
1091
-
1092
// True when this file is the script being executed (argv[1] ends with
// 'tui.tsx'), as opposed to being imported by another module.
const isDirectRun = Boolean(process.argv[1]?.endsWith('tui.tsx'))

if (isDirectRun) {
  const nodePath = await import('node:path')
  const pathsModule = await import('~/paths')
  try {
    // If getPaths() throws, fall back to initialising default locations.
    getPaths()
  } catch {
    pathsModule.initPaths(nodePath.resolve('data'), nodePath.resolve('output'))
  }
  // oxlint-disable-next-line unicorn/prefer-top-level-await
  start()
}
1104
-
1105
- export { start }