anymd 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -27
- package/cli.tsx +2 -2
- package/package.json +2 -8
- package/runner.ts +308 -0
- package/scripts/pdf-to-md.py +38 -9
- package/src/bootstrap.ts +1 -1
- package/src/tui-data.ts +55 -29
- package/tui.tsx +0 -1105
package/tui.tsx
DELETED
|
@@ -1,1105 +0,0 @@
|
|
|
1
|
-
// oxlint-disable react/no-unknown-property
|
|
2
|
-
/* eslint-disable react/no-unknown-property, react-hooks/rules-of-hooks, @typescript-eslint/promise-function-async */
|
|
3
|
-
/** biome-ignore-all lint/nursery/noUnknownAttribute: OpenTUI uses custom JSX intrinsics */
|
|
4
|
-
import { createCliRenderer } from '@opentui/core'
|
|
5
|
-
import { createRoot, useKeyboard, useTerminalDimensions } from '@opentui/react'
|
|
6
|
-
import { useCallback, useEffect, useReducer, useRef, useState } from 'react'
|
|
7
|
-
|
|
8
|
-
import type { AllStepsData, CommandKey, DatasetResult, OcrProgress, StepData } from '~/tui-data'
|
|
9
|
-
|
|
10
|
-
import { bootstrapPython } from '~/bootstrap'
|
|
11
|
-
import { getPaths } from '~/paths'
|
|
12
|
-
import { runPreflight } from '~/preflight'
|
|
13
|
-
import {
|
|
14
|
-
appendErrorLog,
|
|
15
|
-
appendPipelineLog,
|
|
16
|
-
buildDataset,
|
|
17
|
-
clearErrorLog,
|
|
18
|
-
clearPipelineLog,
|
|
19
|
-
fetchStepData,
|
|
20
|
-
getOcrStats,
|
|
21
|
-
readLogTail,
|
|
22
|
-
runClassify,
|
|
23
|
-
runEnhanceOcr,
|
|
24
|
-
spawnCommand,
|
|
25
|
-
writeNativeFileList
|
|
26
|
-
} from '~/tui-data'
|
|
27
|
-
|
|
28
|
-
/**
 * Matches ANSI SGR (colour/style) escape sequences of the form `ESC [ <digits and semicolons> m`.
 * Built once at module load; the escape character is injected via `String.fromCodePoint`
 * so the pattern source contains no literal control character.
 */
const ANSI_SGR_PATTERN = new RegExp(`${String.fromCodePoint(0x1b)}\\[[0-9;]*m`, 'gu')

/**
 * Removes every ANSI SGR escape sequence from `s`.
 *
 * @param s - Text that may contain terminal colour codes.
 * @returns `s` with all `ESC[...m` sequences deleted; other escape sequences are left untouched.
 */
const stripAnsi = (s: string): string => s.replace(ANSI_SGR_PATTERN, '')
|
|
29
|
-
|
|
30
|
-
// Foreground colour (mid grey) passed as `fg` to de-emphasised labels and secondary text.
const DIM = '#888888'
|
|
31
|
-
// Sidebar width; presumably in terminal columns — its usage is outside this excerpt, confirm at the call site.
const SIDEBAR_WIDTH = 38
|
|
32
|
-
|
|
33
|
-
/**
 * Writes an OSC 0 escape sequence (`ESC ] 0 ; <title> BEL`) to stdout, which asks the
 * terminal emulator to change its window title.
 *
 * @param title - Text to place in the title sequence; written as-is.
 */
const setTerminalTitle = (title: string): void => {
  const sequence = '\u001B]0;' + title + '\u0007'
  process.stdout.write(sequence)
}
|
|
36
|
-
|
|
37
|
-
/** Static description of one pipeline step as listed in `STEPS`. */
interface StepConfig {
  /** Command key that identifies the step (matched against the running command). */
  command: CommandKey
  /** Human-readable step name shown in the UI and in the terminal title. */
  name: string
  /** 1-based position of the step, used in "n/5" style labels. */
  stepNum: number
  /** Noun for the items the step counts (e.g. 'PDFs', 'files', 'entries'). */
  unit: string
}
|
|
43
|
-
|
|
44
|
-
/**
 * The five pipeline steps, listed in execution order.
 * `stepNum` is the 1-based position used in "n/5" labels.
 */
const STEPS: StepConfig[] = [
  { command: 'classify', name: 'Classify PDFs', stepNum: 1, unit: 'PDFs' },
  { command: 'pipeline', name: 'Convert to Markdown', stepNum: 2, unit: 'files' },
  { command: 'ocr', name: 'OCR Scanned PDFs', stepNum: 3, unit: 'files' },
  { command: 'enhance', name: 'Enhance Markdown', stepNum: 4, unit: 'files' },
  { command: 'dataset', name: 'Build Dataset', stepNum: 5, unit: 'entries' }
]

/**
 * Command keys in execution order.
 * Derived from `STEPS` so the two lists cannot drift apart when a step is added or reordered.
 */
const STEP_ORDER: CommandKey[] = STEPS.map(step => step.command)
|
|
53
|
-
|
|
54
|
-
/**
 * Every action handled by `reducer`, discriminated by the `type` field.
 */
type Action =
  // Marks whether an OCR process is running in the background.
  | { active: boolean; type: 'SET_BACKGROUND_OCR' }
  // A command finished; `code` 0 means success, anything else marks the run as failed.
  | { code: number; type: 'COMMAND_DONE' }
  // Stores a per-command failure count (ignored by the reducer when `count` <= 0).
  | { command: CommandKey; count: number; type: 'RECORD_FAILURES' }
  // Stores an explicit duration, in seconds, for a command.
  | { command: CommandKey; seconds: number; type: 'RECORD_DURATION' }
  // Replaces the per-step progress snapshot.
  | { data: AllStepsData; type: 'SET_STEPS' }
  // Stores the preflight errors and warnings.
  | { errors: string[]; type: 'SET_PREFLIGHT'; warnings: string[] }
  // A command is starting; the reducer reads `stepCommand` (it does not read `label`).
  | { label: string; stepCommand: CommandKey; type: 'START_COMMAND' }
  // Appends one line to the live output window.
  | { line: string; type: 'APPEND_OUTPUT' }
  // Replaces the log view contents.
  | { lines: string[]; type: 'SET_LOG' }
  // Stores the result of the dataset build.
  | { result: DatasetResult; type: 'SET_DATASET_RESULT' }
  // Replaces the one-line status of the running command.
  | { status: string; type: 'SET_RUNNING_STATUS' }
  // Clears the failed flag, the running command and the background-OCR flag.
  | { type: 'CLEAR_FAILURE' }
  // Marks the whole pipeline as complete.
  | { type: 'SET_ALL_DONE' }
  // Flips the log-view flag.
  | { type: 'TOGGLE_LOG' }
|
|
69
|
-
|
|
70
|
-
/** State managed by `reducer`. */
interface AppState {
  /** True once `SET_ALL_DONE` has been dispatched. */
  allDone: boolean
  /** Set by `SET_BACKGROUND_OCR`; cleared by `CLEAR_FAILURE` and `SET_ALL_DONE`. */
  backgroundOcr: boolean
  /** Result stored by `SET_DATASET_RESULT`, or null before that. */
  datasetResult: DatasetResult | null
  /** True when the last `COMMAND_DONE` carried a non-zero code; reset by `START_COMMAND` and `CLEAR_FAILURE`. */
  failed: boolean
  /** ANSI-stripped log lines stored by `SET_LOG`. */
  logLines: string[]
  /** `Date.now()` timestamp (ms) taken when the initial state is created. */
  pipelineStartedAt: number
  /** Errors stored by `SET_PREFLIGHT`. */
  preflightErrors: string[]
  /** Warnings stored by `SET_PREFLIGHT`. */
  preflightWarnings: string[]
  /** Command set by `START_COMMAND`; cleared on success, kept when the command fails. */
  runningCommand: CommandKey | null
  /** Rolling window of ANSI-stripped output lines for the current command; emptied by `START_COMMAND`. */
  runningLines: string[]
  /** One-line status stored by `SET_RUNNING_STATUS`; emptied by `START_COMMAND`. */
  runningStatus: string
  /** Flag flipped by `TOGGLE_LOG`. */
  showLog: boolean
  /** Duration in whole seconds per command, recorded on successful completion or via `RECORD_DURATION`. */
  stepDurations: Partial<Record<CommandKey, number>>
  /** Failure count per command, recorded via `RECORD_FAILURES` (positive counts only). */
  stepFailures: Partial<Record<CommandKey, number>>
  /** Latest progress snapshot stored by `SET_STEPS`, or null before the first one. */
  stepsData: AllStepsData | null
  /** `Date.now()` timestamp (ms) taken by `START_COMMAND`; 0 until a command has started. */
  stepStartedAt: number
}
|
|
88
|
-
|
|
89
|
-
// Bounds the rolling window of command output kept in `runningLines` (see the `APPEND_OUTPUT` reducer case).
const MAX_OUTPUT_LINES = 15
|
|
90
|
-
// Returns an already-resolved promise; used in place of a stream reader when a spawned
// process's stdout/stderr is not a ReadableStream.
// oxlint-disable-next-line promise/prefer-await-to-then
const noop = (): Promise<void> => Promise.resolve() // eslint-disable-line @typescript-eslint/promise-function-async
|
|
92
|
-
|
|
93
|
-
/**
 * State the reducer starts from: nothing running, nothing recorded.
 * `pipelineStartedAt` is captured when this object is created (module evaluation time).
 */
const initialState: AppState = {
  allDone: false,
  backgroundOcr: false,
  datasetResult: null,
  failed: false,
  logLines: [],
  pipelineStartedAt: Date.now(),
  preflightErrors: [],
  preflightWarnings: [],
  runningCommand: null,
  runningLines: [],
  runningStatus: '',
  showLog: false,
  stepDurations: {},
  stepFailures: {},
  stepsData: null,
  // 0 means "no command started yet"; COMMAND_DONE only records a duration when this is > 0.
  stepStartedAt: 0
}
|
|
111
|
-
|
|
112
|
-
/**
 * State transition function for the TUI.
 *
 * @param state - Current state; never mutated.
 * @param action - Action to apply (see `Action`).
 * @returns The next state, or `state` itself when the action changes nothing.
 */
const reducer = (state: AppState, action: Action): AppState => {
  switch (action.type) {
    case 'APPEND_OUTPUT': {
      // Keep the newest MAX_OUTPUT_LINES - 1 lines so the window, including the new line,
      // never exceeds MAX_OUTPUT_LINES. (`slice(-0)` would keep everything, hence the guard.)
      const kept = MAX_OUTPUT_LINES > 1 ? state.runningLines.slice(-(MAX_OUTPUT_LINES - 1)) : []
      return { ...state, runningLines: [...kept, stripAnsi(action.line)] }
    }
    case 'CLEAR_FAILURE':
      return { ...state, backgroundOcr: false, failed: false, runningCommand: null }
    case 'COMMAND_DONE': {
      // A duration is recorded only for a successful command whose start time is known.
      const durations =
        action.code === 0 && state.runningCommand && state.stepStartedAt > 0
          ? { ...state.stepDurations, [state.runningCommand]: Math.floor((Date.now() - state.stepStartedAt) / 1000) }
          : state.stepDurations
      return {
        ...state,
        failed: action.code !== 0,
        // On failure the command stays set so the UI still knows which step failed.
        runningCommand: action.code === 0 ? null : state.runningCommand,
        stepDurations: durations
      }
    }
    case 'RECORD_DURATION':
      return { ...state, stepDurations: { ...state.stepDurations, [action.command]: action.seconds } }
    case 'RECORD_FAILURES':
      // Zero or negative counts are not stored.
      if (action.count <= 0) return state
      return { ...state, stepFailures: { ...state.stepFailures, [action.command]: action.count } }
    case 'SET_ALL_DONE':
      return { ...state, allDone: true, backgroundOcr: false, runningCommand: null }
    case 'SET_BACKGROUND_OCR':
      return { ...state, backgroundOcr: action.active }
    case 'SET_DATASET_RESULT':
      return { ...state, datasetResult: action.result }
    case 'SET_LOG':
      return { ...state, logLines: action.lines.map(stripAnsi) }
    case 'SET_PREFLIGHT':
      return { ...state, preflightErrors: action.errors, preflightWarnings: action.warnings }
    case 'SET_RUNNING_STATUS':
      return { ...state, runningStatus: action.status }
    case 'SET_STEPS':
      return { ...state, stepsData: action.data }
    case 'START_COMMAND':
      // Starting a command resets the failure flag, the output window and the status line.
      return {
        ...state,
        failed: false,
        runningCommand: action.stepCommand,
        runningLines: [],
        runningStatus: '',
        stepStartedAt: Date.now()
      }
    case 'TOGGLE_LOG':
      return { ...state, showLog: !state.showLog }
    default:
      return state
  }
}
|
|
164
|
-
|
|
165
|
-
/**
 * Tells whether a step has nothing left to do.
 *
 * @param step - Command key of the step to inspect.
 * @param data - Progress snapshot for all steps.
 * @returns false when the step reports an unmet `requires`; otherwise whether `done` has reached `total`.
 */
const isStepDone = (step: CommandKey, data: AllStepsData): boolean => {
  const { done, requires, total }: StepData = data[step]
  return requires ? false : done >= total
}
|
|
170
|
-
|
|
171
|
-
/**
 * Finds the earliest step, in `STEP_ORDER`, that `isStepDone` does not consider finished.
 *
 * @param data - Progress snapshot for all steps.
 * @returns The command key of that step, or null when every step is done.
 */
const findFirstIncomplete = (data: AllStepsData): CommandKey | null => {
  const pending = STEP_ORDER.find(command => !isStepDone(command, data))
  return pending ?? null
}
|
|
176
|
-
|
|
177
|
-
/**
 * Formats a duration as a compact label: `"42s"`, `"3m07s"` or `"1h05m"`.
 *
 * @param seconds - Duration in seconds; fractions are truncated. Negative or non-finite
 *   values (e.g. from clock skew between a timestamp and `Date.now()`) are treated as 0
 *   instead of rendering as "-5s" or "NaNs".
 * @returns The formatted label. Seconds are omitted once the duration reaches an hour.
 */
const formatDuration = (seconds: number): string => {
  const whole = Number.isFinite(seconds) ? Math.max(0, Math.floor(seconds)) : 0
  const h = Math.floor(whole / 3600)
  const m = Math.floor((whole % 3600) / 60)
  const s = whole % 60
  if (h > 0) return `${h}h${m.toString().padStart(2, '0')}m`
  if (m > 0) return `${m}m${s.toString().padStart(2, '0')}s`
  return `${s}s`
}
|
|
185
|
-
|
|
186
|
-
// Animation frames for SpinnerDots: ten code points from the Unicode Braille Patterns block, shown in order.
const SPINNER_FRAMES = ['\u28CB', '\u28D9', '\u28F9', '\u28F8', '\u28FC', '\u28F4', '\u28E6', '\u28E7', '\u28C7', '\u28CF']

/**
 * Animated spinner glyph followed by a space.
 *
 * @param fg - Foreground colour; defaults to 'yellow' when omitted.
 */
const SpinnerDots = ({ fg }: { fg?: string }) => {
  const [frame, setFrame] = useState(0)

  useEffect(() => {
    // Advance one frame every 80 ms, wrapping at the end of SPINNER_FRAMES.
    const interval = setInterval(() => {
      setFrame(f => (f + 1) % SPINNER_FRAMES.length)
    }, 80)
    // Stop the timer when the component unmounts.
    return () => clearInterval(interval)
  }, [])

  return <text fg={fg ?? 'yellow'}>{SPINNER_FRAMES[frame]} </text>
}
|
|
200
|
-
|
|
201
|
-
// Glyph for a filled progress cell.
const PROGRESS_FULL = '█'
// Glyph for an empty progress cell.
const PROGRESS_EMPTY = '░'

/**
 * Fixed-width text progress bar: green filled cells followed by dark-grey empty cells.
 *
 * @param value - Percentage to display. Clamped to 0–100 (non-finite values count as 0),
 *   because an out-of-range value would otherwise pass a negative count to
 *   `String.prototype.repeat`, which throws a RangeError.
 * @param width - Total number of cells; defaults to 12. Negative widths are treated as 0.
 */
const ProgressBarSimple = ({ value, width = 12 }: { value: number; width?: number }) => {
  const cells = Math.max(0, width)
  const pct = Number.isFinite(value) ? Math.min(100, Math.max(0, value)) : 0
  const filled = Math.round((pct / 100) * cells)
  const empty = cells - filled
  return (
    <text>
      <span fg='green'>{PROGRESS_FULL.repeat(filled)}</span>
      <span fg='#444444'>{PROGRESS_EMPTY.repeat(empty)}</span>
    </text>
  )
}
|
|
214
|
-
|
|
215
|
-
/**
 * Shows the time elapsed since `startedAt`, refreshed once per second.
 *
 * @param startedAt - `Date.now()`-style timestamp in milliseconds; when <= 0 nothing is rendered.
 */
const ElapsedTimer = ({ startedAt }: { startedAt: number }) => {
  const [secs, setSecs] = useState(0)

  useEffect(() => {
    if (startedAt <= 0) return
    const tick = (): void => {
      // Clamp at 0 so a start time slightly in the future never shows a negative duration.
      setSecs(Math.max(0, Math.floor((Date.now() - startedAt) / 1000)))
    }
    // Update immediately, then once per second until unmount or `startedAt` changes.
    tick()
    const timer = setInterval(tick, 1000)
    return () => clearInterval(timer)
  }, [startedAt])

  if (startedAt <= 0) return null
  return <text fg={DIM}> {formatDuration(secs)}</text>
}
|
|
231
|
-
|
|
232
|
-
/**
 * Lists the last three entries of `files` under a "recent" heading.
 * Each row shows a check mark, the name truncated to 18 characters, the page count
 * and the formatted duration. `per_page` is accepted but not displayed.
 *
 * @param files - Processed-file records; only the final three are rendered.
 */
const RecentFiles = ({ files }: { files: { duration: number; name: string; pages: number; per_page: number }[] }) => {
  const display = files.slice(-3)
  return (
    <box flexDirection='column'>
      <text fg={DIM}>
        <b>── recent ──</b>
      </text>
      {display.map((f, i) => (
        // eslint-disable-next-line react/no-array-index-key
        <text key={i}>
          <span fg='green'>✓</span>
          <span fg='cyan'> {f.name.slice(0, 18)}</span>
          <span> {f.pages.toString().padStart(2)}p</span>
          <span fg='yellow'> {formatDuration(f.duration)}</span>
        </text>
      ))}
    </box>
  )
}
|
|
251
|
-
|
|
252
|
-
/**
 * Live OCR details: the current file and page, time spent on that file, ETA, average
 * per file, error count and the most recent files.
 *
 * A `current_file` of '-' means no file is in progress; the file and page rows are hidden.
 *
 * @param progress - Latest OCR progress snapshot. `current_file_started` is compared with
 *   `Date.now() / 1000`, i.e. it is treated as a timestamp in seconds.
 */
const OcrLiveInfo = ({ progress }: { progress: OcrProgress }) => {
  const [elapsed, setElapsed] = useState(0)

  useEffect(() => {
    const started = progress.current_file_started
    if (progress.current_file === '-' || !started || started <= 0) {
      setElapsed(0)
      return
    }
    const update = (): void => {
      // Clamp at 0, as the other timers in this file do, so a start timestamp slightly
      // ahead of the local clock never produces a negative duration.
      setElapsed(Math.max(0, Date.now() / 1000 - started))
    }
    update()
    const timer = setInterval(update, 1000)
    return () => clearInterval(timer)
  }, [progress.current_file, progress.current_file_started])

  return (
    <box flexDirection='column' paddingLeft={2}>
      {progress.current_file === '-' ? null : (
        <text>
          <span fg={DIM}>Now: </span>
          <b fg='cyan'>{progress.current_file.slice(0, 20)}</b>
        </text>
      )}
      {progress.current_file === '-' ? null : (
        <text>
          <span fg={DIM}>p</span>
          <span>{progress.current_page}</span>
          {progress.current_pages_total ? <span fg={DIM}>/{progress.current_pages_total}</span> : null}
          <span fg={DIM}> </span>
          <span fg='yellow'>{formatDuration(elapsed)}</span>
        </text>
      )}
      <text>
        <span fg={DIM}>ETA </span>
        <span fg='yellow'>{progress.eta}</span>
        <span fg={DIM}> avg </span>
        <span>{progress.avg_per_file}</span>
      </text>
      <text>
        <span fg={DIM}>err </span>
        <span fg={progress.errors > 0 ? 'red' : 'green'}>{progress.errors}</span>
      </text>
      {progress.recent_files?.length ? <RecentFiles files={progress.recent_files} /> : null}
    </box>
  )
}
|
|
300
|
-
|
|
301
|
-
/**
 * Chooses the status glyph for a step that is not currently running.
 *
 * @param isDone - Whether the step has completed.
 * @param isFailed - Whether the step has failed; takes precedence over `isDone`.
 * @returns '✗' for failed, '✓' for done, '·' otherwise.
 */
const getStepIcon = (isDone: boolean, isFailed: boolean): string => {
  let glyph = '·'
  if (isDone) glyph = '✓'
  if (isFailed) glyph = '✗'
  return glyph
}
|
|
306
|
-
|
|
307
|
-
/**
 * Chooses the display colour for a step from its status flags.
 * Precedence: failed ('red') > running ('yellow') > done ('green') > none ('gray').
 *
 * @param isRunning - Whether the step is currently running.
 * @param isDone - Whether the step has completed.
 * @param isFailed - Whether the step has failed.
 * @returns The colour name for the highest-precedence flag that is set.
 */
const getStepColor = (isRunning: boolean, isDone: boolean, isFailed: boolean): string => {
  // Ordered by precedence; the first flag that is set decides the colour.
  const byPrecedence: [boolean, string][] = [
    [isFailed, 'red'],
    [isRunning, 'yellow'],
    [isDone, 'green']
  ]
  for (const [flag, colour] of byPrecedence) if (flag) return colour
  return 'gray'
}
|
|
313
|
-
|
|
314
|
-
/**
 * Status indicator for a step: a yellow spinner while running, otherwise the bold
 * glyph chosen by `getStepIcon` drawn in `color`.
 *
 * @param color - Foreground colour for the static glyph (not used for the spinner).
 * @param isDone - Forwarded to `getStepIcon`.
 * @param isFailed - Forwarded to `getStepIcon`.
 * @param isRunning - When true the spinner is shown and the other flags are ignored.
 */
const StepIcon = ({
  color,
  isDone,
  isFailed,
  isRunning
}: {
  color: string
  isDone: boolean
  isFailed: boolean
  isRunning: boolean
}) => {
  if (isRunning) return <SpinnerDots fg='yellow' />
  return (
    <text fg={color}>
      <b>{getStepIcon(isDone, isFailed)} </b>
    </text>
  )
}
|
|
332
|
-
|
|
333
|
-
/**
 * One step entry in the sidebar: status icon, name, timing, progress bar with counts,
 * optional detail lines, running status, unmet-requirement warning and live OCR info.
 *
 * @param completedDuration - Recorded duration in seconds; shown once the step is done and idle.
 * @param failedCount - Number of failed items; shown in red when positive.
 * @param isFailed - Whether the step failed; affects the colour and the icon.
 * @param isRunning - Whether the step is currently running.
 * @param ocrProgress - Live OCR snapshot; only used while the 'ocr' step is running.
 * @param runningStatus - One-line status shown while running (hidden when empty).
 * @param step - Static step description.
 * @param stepData - Progress for this step; a missing value counts as 0 done of 0 total.
 * @param stepStartedAt - Start timestamp (ms) fed to the elapsed timer while running.
 */
const SidebarStep = ({
  completedDuration,
  failedCount,
  isFailed,
  isRunning,
  ocrProgress,
  runningStatus,
  step,
  stepData,
  stepStartedAt
}: {
  completedDuration?: number
  failedCount?: number
  isFailed: boolean
  isRunning: boolean
  ocrProgress?: null | OcrProgress
  runningStatus: string
  step: StepConfig
  stepData: StepData | undefined
  stepStartedAt: number
  // eslint-disable-next-line complexity, max-statements
}) => {
  const done = stepData?.done ?? 0
  const total = stepData?.total ?? 0
  // A step with an unknown or zero total is never shown as done.
  const isDone = total > 0 && done >= total
  // Cap at 100: `done` may exceed `total`, and the bar cannot draw more than its full width.
  const pct = total > 0 ? Math.min(100, Math.round((done / total) * 100)) : 0
  const color = getStepColor(isRunning, isDone, isFailed)
  const isActive = isRunning || isDone || isFailed
  const showRequires = Boolean(stepData?.requires) && !isDone && !isRunning
  const showOcr = isRunning && step.command === 'ocr'

  return (
    <box flexDirection='column'>
      <box>
        <box width={2}>
          <StepIcon color={color} isDone={isDone} isFailed={isFailed} isRunning={isRunning} />
        </box>
        <text fg={isActive ? color : DIM}>{step.name}</text>
        {isRunning ? <ElapsedTimer startedAt={stepStartedAt} /> : null}
        {isDone && !isRunning && completedDuration !== undefined ? (
          <text fg={DIM}> {formatDuration(completedDuration)}</text>
        ) : null}
      </box>
      <box paddingLeft={2}>
        <ProgressBarSimple value={pct} />
        <text>
          <span fg='green'> {done}</span>
          <span fg={DIM}>/{total > 0 ? total : '?'}</span>
          {failedCount && failedCount > 0 ? <span fg='red'> {failedCount}✗</span> : null}
        </text>
      </box>
      {stepData?.details?.length ? (
        <box flexDirection='column' paddingLeft={2}>
          {stepData.details.map((d, i) => (
            // eslint-disable-next-line react/no-array-index-key
            <text fg={DIM} key={i}>
              {d}
            </text>
          ))}
        </box>
      ) : null}
      {isRunning && runningStatus !== '' ? (
        <box paddingLeft={2}>
          <text fg='cyan'>{runningStatus}</text>
        </box>
      ) : null}
      {showRequires ? (
        <box paddingLeft={2}>
          <text fg='yellow'>⚠ {stepData?.requires}</text>
        </box>
      ) : null}
      {showOcr && ocrProgress ? <OcrLiveInfo progress={ocrProgress} /> : null}
    </box>
  )
}
|
|
408
|
-
|
|
409
|
-
// Detects output lines worth copying to the error log. Case-insensitive.
// The keywords need a word boundary in front (so "terror" does not match "error"); the
// heavy-cross glyph U+2716 is matched anywhere, because `\b` before a non-word character
// would only succeed when a word character precedes it — never at the start of a line.
const ERROR_PATTERN = /(?:\b(?:error|failed|exception|traceback)|\u2716)/iu
|
|
410
|
-
|
|
411
|
-
/**
 * Bordered banner listing preflight errors and warnings as bullet points.
 * Renders nothing when both lists are empty. The border is red when there is at least
 * one error and yellow otherwise; a hint to fix and restart is added when errors exist.
 *
 * @param errors - Messages listed under "Missing required tools".
 * @param warnings - Messages listed under "Warnings".
 */
const PreflightBanner = ({ errors, warnings }: { errors: string[]; warnings: string[] }) => {
  if (errors.length === 0 && warnings.length === 0) return null
  return (
    <box
      border
      borderColor={errors.length > 0 ? 'red' : 'yellow'}
      borderStyle='rounded'
      flexDirection='column'
      marginTop={1}
      paddingLeft={1}
      paddingRight={1}>
      {errors.length > 0 ? (
        <>
          <text fg='red'>
            <b>✗ Missing required tools:</b>
          </text>
          {errors.map((err, i) => (
            // eslint-disable-next-line react/no-array-index-key
            <text fg='red' key={`e${i}`}>
              {' '}• {err}
            </text>
          ))}
        </>
      ) : null}
      {warnings.length > 0 ? (
        <>
          <text fg='yellow'>
            <b>⚠ Warnings:</b>
          </text>
          {warnings.map((warn, i) => (
            // eslint-disable-next-line react/no-array-index-key
            <text fg='yellow' key={`w${i}`}>
              {' '}• {warn}
            </text>
          ))}
        </>
      ) : null}
      {errors.length > 0 ? <text fg={DIM}>Fix the errors above and restart.</text> : null}
    </box>
  )
}
|
|
452
|
-
|
|
453
|
-
/**
 * Title bar: the app name on the left; on the right a status label and the time elapsed
 * since `pipelineStartedAt`, refreshed once per second.
 *
 * Status label precedence: complete > failed > "n/5 <step name>" for the running command
 * (or "2+3/5 Convert+OCR" while OCR runs in the background of the 'pipeline' step).
 * No label is shown when nothing is running or no step data has been loaded yet.
 */
// eslint-disable-next-line max-statements
const TitleBarTop = ({
  allDone,
  backgroundOcr,
  failed,
  pipelineStartedAt,
  runningCommand,
  stepsData
}: {
  allDone: boolean
  backgroundOcr: boolean
  failed: boolean
  pipelineStartedAt: number
  runningCommand: CommandKey | null
  stepsData: AllStepsData | null
}) => {
  const [elapsed, setElapsed] = useState(0)

  useEffect(() => {
    if (pipelineStartedAt <= 0) return
    const tick = (): void => {
      // Whole seconds since the pipeline started, never negative.
      setElapsed(Math.max(0, Math.floor((Date.now() - pipelineStartedAt) / 1000)))
    }
    tick()
    const timer = setInterval(tick, 1000)
    return () => clearInterval(timer)
  }, [pipelineStartedAt])

  let statusText = ''
  if (allDone) statusText = '✓ Complete'
  else if (failed) statusText = '✗ Failed'
  else if (runningCommand && stepsData) {
    // Look up the running step's number and name; currentIdx stays 0 if it is not in STEPS.
    let currentIdx = 0
    let currentName = ''
    for (const s of STEPS)
      if (s.command === runningCommand) {
        currentIdx = s.stepNum
        currentName = s.name
        break
      }
    if (currentIdx > 0) {
      const parallelLabel = backgroundOcr && runningCommand === 'pipeline' ? '2+3/5 Convert+OCR' : null
      statusText = parallelLabel ?? `${currentIdx}/5 ${currentName}`
    }
  }

  const statusColor = allDone ? 'green' : failed ? 'red' : 'cyan'

  return (
    <box justifyContent='space-between' paddingLeft={1} paddingRight={1}>
      <text fg='#e0a040'>
        <b>anymd</b>
      </text>
      <box gap={2}>
        {statusText === '' ? null : (
          <text fg={statusColor}>
            <b>{statusText}</b>
          </text>
        )}
        {pipelineStartedAt > 0 ? <text fg={DIM}>{formatDuration(elapsed)}</text> : null}
      </box>
    </box>
  )
}
|
|
517
|
-
|
|
518
|
-
/**
 * Formats a character count compactly: `"999"`, `"12.3K"` or `"1.5M"` (one decimal place).
 *
 * @param chars - Number of characters.
 * @returns The count as-is below 1000, otherwise scaled to K or M. Counts that would round
 *   up to "1000.0K" (999,950 and above) are promoted to the M form instead.
 */
const formatChars = (chars: number): string => {
  if (chars >= 1_000_000) return `${(chars / 1_000_000).toFixed(1)}M`
  if (chars >= 1000) {
    const thousands = (chars / 1000).toFixed(1)
    // toFixed rounds, so values just under one million can reach "1000.0".
    return Number(thousands) >= 1000 ? `${(chars / 1_000_000).toFixed(1)}M` : `${thousands}K`
  }
  return String(chars)
}
|
|
523
|
-
|
|
524
|
-
/**
 * Completion summary shown in the sidebar. Renders nothing until `allDone` is true.
 * Shows the dataset statistics when a result is available (the skipped and deduped rows
 * only when their counts are positive), followed by one row per step that has a recorded
 * duration.
 *
 * @param allDone - Whether the whole pipeline has finished.
 * @param datasetResult - Dataset build result, or null when none is available.
 * @param stepDurations - Duration in seconds per command; steps without an entry are omitted.
 */
const SidebarSummary = ({
  allDone,
  datasetResult,
  stepDurations
}: {
  allDone: boolean
  datasetResult: DatasetResult | null
  stepDurations: Partial<Record<CommandKey, number>>
}) => {
  if (!allDone) return null
  return (
    <box flexDirection='column' paddingLeft={1} paddingTop={1}>
      <text fg='green'>
        <b>✓ All 5 steps complete</b>
      </text>
      {datasetResult ? (
        <box flexDirection='column' paddingLeft={1}>
          <text>
            <span fg={DIM}>entries </span>
            <b fg='green'>{datasetResult.entries.toLocaleString()}</b>
          </text>
          <text>
            <span fg={DIM}>chars </span>
            <b>{formatChars(datasetResult.totalChars)}</b>
          </text>
          {datasetResult.skipped > 0 ? (
            <text>
              <span fg={DIM}>skipped </span>
              <span fg='yellow'>{datasetResult.skipped}</span>
            </text>
          ) : null}
          {datasetResult.duplicates > 0 ? (
            <text>
              <span fg={DIM}>deduped </span>
              <span fg='yellow'>{datasetResult.duplicates}</span>
            </text>
          ) : null}
        </box>
      ) : null}
      <box flexDirection='column' paddingLeft={1}>
        {STEPS.map(step => {
          const dur = stepDurations[step.command]
          // Steps with no recorded duration get no row.
          if (dur === undefined) return null
          return (
            <text key={step.command}>
              <span fg={DIM}>{step.name.slice(0, 14).padEnd(14)} </span>
              <span fg='cyan'>{formatDuration(dur)}</span>
            </text>
          )
        })}
      </box>
    </box>
  )
}
|
|
578
|
-
|
|
579
|
-
/**
 * Keybinding help box (40 columns by 12 rows) positioned in the middle of the given area
 * by means of left and top margins, which are clamped at 0 when the area is smaller than the box.
 *
 * @param height - Available height used to compute the top margin.
 * @param width - Available width used to compute the left margin.
 */
const HelpDialog = ({ height, width }: { height: number; width: number }) => {
  const boxW = 40
  const boxH = 12
  const left = Math.max(0, Math.floor((width - boxW) / 2))
  const top = Math.max(0, Math.floor((height - boxH) / 2))

  return (
    <box
      border
      borderColor='#e0a040'
      borderStyle='rounded'
      flexDirection='column'
      height={boxH}
      marginLeft={left}
      marginTop={top}
      paddingLeft={2}
      paddingRight={2}
      width={boxW}>
      <text fg='#e0a040'>
        <b>Keybinds</b>
      </text>
      <text> </text>
      <text>
        <b fg='cyan'>Q</b>
        <span fg={DIM}>{' '}Quit</span>
      </text>
      <text>
        <b fg='cyan'>L</b>
        <span fg={DIM}>{' '}Toggle log / output view</span>
      </text>
      <text>
        <b fg='cyan'>R</b>
        <span fg={DIM}>{' '}Retry failed step</span>
      </text>
      <text>
        <b fg='cyan'>S</b>
        <span fg={DIM}>{' '}Skip failed step</span>
      </text>
      <text>
        <b fg='cyan'>?</b>
        <span fg={DIM}>{' '}Toggle this help</span>
      </text>
      <text> </text>
      <text fg={DIM}>Press ? or Esc to close</text>
    </box>
  )
}
|
|
626
|
-
|
|
627
|
-
/**
 * Builds the terminal window title for the given state.
 *
 * Precedence: complete > failed > running step with its percentage > plain title.
 * While OCR runs in the background of the 'pipeline' step, a combined
 * "Steps 2+3/5" label is used instead of the single-step label.
 *
 * @param s - Current application state.
 * @returns The title text (without any escape sequences).
 */
const computeTerminalTitle = (s: AppState): string => {
  const { allDone, backgroundOcr, failed, runningCommand, stepsData } = s
  if (allDone) return 'Doc Pipeline \u2014 \u2713 Complete'
  if (failed) return 'Doc Pipeline \u2014 \u2717 Failed'

  const activeStep = runningCommand ? STEPS.find(candidate => candidate.command === runningCommand) : undefined
  if (!runningCommand || !activeStep) return 'Doc Pipeline'

  // Percentage of the running step; 0 when there is no data or the total is unknown.
  const progress = stepsData?.[runningCommand]
  let pct = 0
  if (progress && progress.total > 0) pct = Math.round((progress.done / progress.total) * 100)

  const convertAndOcr = backgroundOcr && runningCommand === 'pipeline'
  const label = convertAndOcr
    ? `Steps 2+3/5 \u2014 Convert + OCR ${pct}%`
    : `Step ${activeStep.stepNum}/5 \u2014 ${activeStep.name} ${pct}%`
  return `Doc Pipeline \u2014 ${label}`
}
|
|
641
|
-
|
|
642
|
-
/**
 * Reads a byte stream to its end, decoding it as UTF-8 and calling `onLine` for every
 * newline-terminated line, plus any trailing text once the stream closes.
 * Lines that are empty or whitespace-only are skipped.
 *
 * @param stream - Stream to consume; its reader lock is released when reading ends or fails.
 * @param onLine - Callback invoked with each line (without the trailing '\n').
 * @returns A promise that resolves once the stream has been fully consumed.
 */
// eslint-disable-next-line max-statements
const readStream = async (stream: ReadableStream<Uint8Array>, onLine: (line: string) => void): Promise<void> => {
  const reader = stream.getReader()
  const decoder = new TextDecoder()
  let buffer = ''
  try {
    for (;;) {
      // biome-ignore lint/performance/noAwaitInLoops: streaming reads
      const { done, value } = await reader.read() // eslint-disable-line no-await-in-loop
      if (done) break
      // `stream: true` lets a multi-byte character be split across chunks.
      buffer += decoder.decode(value, { stream: true })
      const parts = buffer.split('\n')
      // The last element is an incomplete line; keep it for the next chunk.
      buffer = parts.pop() ?? ''
      for (const part of parts) if (part.trim() !== '') onLine(part)
    }
    // Flush the decoder: bytes of an unfinished multi-byte sequence it is still holding
    // are emitted (as U+FFFD) instead of being silently dropped.
    buffer += decoder.decode()
    if (buffer.trim() !== '') onLine(buffer)
  } finally {
    reader.releaseLock()
  }
}
|
|
662
|
-
|
|
663
|
-
// eslint-disable-next-line max-statements, complexity
|
|
664
|
-
const App = () => {
|
|
665
|
-
const [state, dispatch] = useReducer(reducer, initialState)
|
|
666
|
-
const procRef = useRef<null | ReturnType<typeof Bun.spawn>>(null)
|
|
667
|
-
const ocrProcRef = useRef<null | ReturnType<typeof Bun.spawn>>(null)
|
|
668
|
-
const busyRef = useRef(false)
|
|
669
|
-
const errorLogClearedRef = useRef(false)
|
|
670
|
-
const [showHelp, setShowHelp] = useState(false)
|
|
671
|
-
const { height, width } = useTerminalDimensions()
|
|
672
|
-
|
|
673
|
-
useEffect(() => {
|
|
674
|
-
setTerminalTitle(computeTerminalTitle(state))
|
|
675
|
-
}, [state.allDone, state.failed, state.runningCommand, state.backgroundOcr, state.stepsData])
|
|
676
|
-
|
|
677
|
-
const refreshSteps = useCallback(async (): Promise<AllStepsData> => {
|
|
678
|
-
const data = await fetchStepData()
|
|
679
|
-
dispatch({ data, type: 'SET_STEPS' })
|
|
680
|
-
return data
|
|
681
|
-
}, [])
|
|
682
|
-
|
|
683
|
-
const refreshLog = useCallback(async () => {
|
|
684
|
-
const lines = await readLogTail(200)
|
|
685
|
-
dispatch({ lines, type: 'SET_LOG' })
|
|
686
|
-
}, [])
|
|
687
|
-
|
|
688
|
-
const executeCommand = useCallback(
|
|
689
|
-
// eslint-disable-next-line complexity, max-statements
|
|
690
|
-
async (key: CommandKey): Promise<number> => {
|
|
691
|
-
if (busyRef.current) return -1
|
|
692
|
-
busyRef.current = true
|
|
693
|
-
|
|
694
|
-
try {
|
|
695
|
-
if (key === 'classify') {
|
|
696
|
-
dispatch({ label: 'Classifying PDFs...', stepCommand: key, type: 'START_COMMAND' })
|
|
697
|
-
await runClassify(prog => {
|
|
698
|
-
const statusLine = `${prog.done}/${prog.total} \u2014 native=${prog.native} scanned=${prog.scanned} mixed=${prog.mixed}`
|
|
699
|
-
dispatch({ status: statusLine, type: 'SET_RUNNING_STATUS' })
|
|
700
|
-
dispatch({ line: `${prog.file} \u2192 ${prog.category}`, type: 'APPEND_OUTPUT' })
|
|
701
|
-
if (prog.category === 'error') appendErrorLog('classify', `Failed: ${prog.file}`)
|
|
702
|
-
})
|
|
703
|
-
dispatch({ code: 0, type: 'COMMAND_DONE' })
|
|
704
|
-
return 0
|
|
705
|
-
}
|
|
706
|
-
|
|
707
|
-
if (key === 'dataset') {
|
|
708
|
-
dispatch({ label: 'Building dataset...', stepCommand: key, type: 'START_COMMAND' })
|
|
709
|
-
const result: DatasetResult = await buildDataset({
|
|
710
|
-
onFileResult: prog => {
|
|
711
|
-
const icon = prog.status === 'added' ? '\u2713' : prog.status === 'duplicate' ? '\u2261' : '\u2192'
|
|
712
|
-
const charStr = prog.chars >= 1000 ? `${(prog.chars / 1000).toFixed(1)}K` : `${prog.chars}`
|
|
713
|
-
dispatch({ line: `${icon} ${prog.file} \u2192 ${prog.status} (${charStr} chars)`, type: 'APPEND_OUTPUT' })
|
|
714
|
-
},
|
|
715
|
-
onReadProgress: (readDone, total) => {
|
|
716
|
-
dispatch({ status: `Reading ${readDone}/${total} files...`, type: 'SET_RUNNING_STATUS' })
|
|
717
|
-
}
|
|
718
|
-
})
|
|
719
|
-
dispatch({ result, type: 'SET_DATASET_RESULT' })
|
|
720
|
-
const dupStr = result.duplicates > 0 ? `, ${result.duplicates} duplicates removed` : ''
|
|
721
|
-
dispatch({
|
|
722
|
-
line: `Dataset: ${result.entries} entries, ${result.skipped} skipped${dupStr}, ${result.totalChars.toLocaleString()} chars`,
|
|
723
|
-
type: 'APPEND_OUTPUT'
|
|
724
|
-
})
|
|
725
|
-
dispatch({ code: 0, type: 'COMMAND_DONE' })
|
|
726
|
-
return 0
|
|
727
|
-
}
|
|
728
|
-
|
|
729
|
-
if (key === 'ocr') {
|
|
730
|
-
const stats = await getOcrStats()
|
|
731
|
-
if (stats.total === 0) {
|
|
732
|
-
dispatch({ label: 'OCR', stepCommand: key, type: 'START_COMMAND' })
|
|
733
|
-
dispatch({ line: 'No classification found. Classify PDFs first.', type: 'APPEND_OUTPUT' })
|
|
734
|
-
dispatch({ code: 1, type: 'COMMAND_DONE' })
|
|
735
|
-
return 1
|
|
736
|
-
}
|
|
737
|
-
if (stats.remaining === 0) {
|
|
738
|
-
dispatch({ label: 'OCR', stepCommand: key, type: 'START_COMMAND' })
|
|
739
|
-
dispatch({ line: "All files already OCR'd.", type: 'APPEND_OUTPUT' })
|
|
740
|
-
dispatch({ code: 0, type: 'COMMAND_DONE' })
|
|
741
|
-
return 0
|
|
742
|
-
}
|
|
743
|
-
}
|
|
744
|
-
|
|
745
|
-
if (key === 'enhance') {
|
|
746
|
-
dispatch({ label: 'Enhancing OCR markdown...', stepCommand: key, type: 'START_COMMAND' })
|
|
747
|
-
const result = await runEnhanceOcr(prog => {
|
|
748
|
-
const statusLine = `${prog.done}/${prog.total} files enhanced`
|
|
749
|
-
dispatch({ status: statusLine, type: 'SET_RUNNING_STATUS' })
|
|
750
|
-
const icon = prog.status === 'enhanced' ? '\u2713' : prog.status === 'skipped' ? '\u2192' : '\u2717'
|
|
751
|
-
dispatch({ line: `${icon} ${prog.file} \u2192 ${prog.status}`, type: 'APPEND_OUTPUT' })
|
|
752
|
-
if (prog.status === 'failed') appendErrorLog('enhance', `Failed: ${prog.file}`)
|
|
753
|
-
})
|
|
754
|
-
dispatch({ command: 'enhance', count: result.failed, type: 'RECORD_FAILURES' })
|
|
755
|
-
const enhanceLine =
|
|
756
|
-
result.skipped > 0
|
|
757
|
-
? `Enhanced: ${result.enhanced}, Skipped: ${result.skipped}, Failed: ${result.failed}`
|
|
758
|
-
: `Enhanced: ${result.enhanced}, Failed: ${result.failed}`
|
|
759
|
-
dispatch({ line: enhanceLine, type: 'APPEND_OUTPUT' })
|
|
760
|
-
dispatch({ code: 0, type: 'COMMAND_DONE' })
|
|
761
|
-
return 0
|
|
762
|
-
}
|
|
763
|
-
|
|
764
|
-
if (key === 'pipeline') {
|
|
765
|
-
await clearPipelineLog()
|
|
766
|
-
await writeNativeFileList()
|
|
767
|
-
}
|
|
768
|
-
|
|
769
|
-
const spawned = spawnCommand(key)
|
|
770
|
-
if (!spawned) return -1
|
|
771
|
-
dispatch({ label: spawned.label, stepCommand: key, type: 'START_COMMAND' })
|
|
772
|
-
procRef.current = spawned.proc
|
|
773
|
-
|
|
774
|
-
const onLine = (line: string): void => {
|
|
775
|
-
dispatch({ line, type: 'APPEND_OUTPUT' })
|
|
776
|
-
if (key === 'pipeline') appendPipelineLog(line)
|
|
777
|
-
if (ERROR_PATTERN.test(line)) appendErrorLog(key, line)
|
|
778
|
-
}
|
|
779
|
-
|
|
780
|
-
const { stderr, stdout } = spawned.proc
|
|
781
|
-
const stdoutPromise = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
|
|
782
|
-
const stderrPromise = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()
|
|
783
|
-
|
|
784
|
-
await Promise.all([stdoutPromise, stderrPromise])
|
|
785
|
-
const code = await spawned.proc.exited
|
|
786
|
-
dispatch({ code, type: 'COMMAND_DONE' })
|
|
787
|
-
procRef.current = null
|
|
788
|
-
return code
|
|
789
|
-
} catch (execError) {
|
|
790
|
-
const msg = String(execError)
|
|
791
|
-
dispatch({ line: `Error: ${msg}`, type: 'APPEND_OUTPUT' })
|
|
792
|
-
appendErrorLog(key, msg)
|
|
793
|
-
dispatch({ code: 1, type: 'COMMAND_DONE' })
|
|
794
|
-
return 1
|
|
795
|
-
} finally {
|
|
796
|
-
busyRef.current = false // eslint-disable-line require-atomic-updates
|
|
797
|
-
}
|
|
798
|
-
},
|
|
799
|
-
[]
|
|
800
|
-
)
|
|
801
|
-
|
|
802
|
-
/**
 * Runs the `pipeline` command in the foreground while an `ocr` process (spawned
 * first, only when `getOcrStats()` reports remaining work) runs in the background.
 * Once the pipeline exits successfully, the still-running OCR process is promoted
 * to the foreground command and awaited.
 *
 * @returns the pipeline exit code when it is non-zero, otherwise the OCR exit code
 *   (or 0 when no OCR process was started); -1 when the pipeline could not be
 *   spawned; 1 when an unexpected error was thrown.
 */
// eslint-disable-next-line max-statements
const runParallelConvertOcr = useCallback(async (): Promise<number> => {
  busyRef.current = true
  const parallelStart = Date.now()

  // Kills the background OCR process (if any), clears its ref and resets the UI flag.
  const stopBackgroundOcr = (): void => {
    if (ocrProcRef.current) {
      ocrProcRef.current.kill()
      ocrProcRef.current = null
    }
    dispatch({ active: false, type: 'SET_BACKGROUND_OCR' })
  }

  try {
    const ocrStats = await getOcrStats()
    if (ocrStats.remaining > 0) {
      const ocrSpawned = spawnCommand('ocr')
      if (ocrSpawned) {
        ocrProcRef.current = ocrSpawned.proc
        dispatch({ active: true, type: 'SET_BACKGROUND_OCR' })
        const { stderr: ocrErr, stdout: ocrOut } = ocrSpawned.proc
        // Background OCR output is not shown in the output pane; only lines that
        // match ERROR_PATTERN are recorded in the error log.
        const ocrErrorLine = (line: string): void => {
          if (ERROR_PATTERN.test(line)) appendErrorLog('ocr', line)
        }
        // The streams are drained without being awaited, so a read failure must be
        // caught here or it would surface as an unhandled rejection.
        const drainOcr = async (stream: ReadableStream): Promise<void> => {
          try {
            await readStream(stream, ocrErrorLine)
          } catch (streamError) {
            appendErrorLog('ocr', String(streamError))
          }
        }
        if (ocrOut instanceof ReadableStream) drainOcr(ocrOut)
        if (ocrErr instanceof ReadableStream) drainOcr(ocrErr)
      }
    }

    await clearPipelineLog()
    await writeNativeFileList()
    const pipelineSpawned = spawnCommand('pipeline')
    if (!pipelineSpawned) {
      // Do not leave the OCR process running unattended when the pipeline never started.
      stopBackgroundOcr()
      return -1
    }

    dispatch({ label: pipelineSpawned.label, stepCommand: 'pipeline', type: 'START_COMMAND' })
    procRef.current = pipelineSpawned.proc

    const onLine = (line: string): void => {
      dispatch({ line, type: 'APPEND_OUTPUT' })
      appendPipelineLog(line)
      if (ERROR_PATTERN.test(line)) appendErrorLog('pipeline', line)
    }
    const { stderr, stdout } = pipelineSpawned.proc
    const stdoutP = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
    const stderrP = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()

    // Drain both streams before reading the exit code.
    await Promise.all([stdoutP, stderrP])
    const pipeCode = await pipelineSpawned.proc.exited
    procRef.current = null
    dispatch({ code: pipeCode, type: 'COMMAND_DONE' })

    if (pipeCode !== 0) {
      stopBackgroundOcr()
      return pipeCode
    }

    const ocrProc = ocrProcRef.current
    if (ocrProc) {
      // Pipeline finished first: promote the background OCR process to the foreground command.
      dispatch({ label: 'Chandra OCR', stepCommand: 'ocr', type: 'START_COMMAND' })
      procRef.current = ocrProc
      const ocrCode = await ocrProc.exited
      procRef.current = null
      ocrProcRef.current = null // eslint-disable-line require-atomic-updates
      dispatch({ active: false, type: 'SET_BACKGROUND_OCR' })
      // OCR started alongside the pipeline, so its duration is measured from parallelStart.
      dispatch({ command: 'ocr', seconds: Math.floor((Date.now() - parallelStart) / 1000), type: 'RECORD_DURATION' })
      dispatch({ code: ocrCode, type: 'COMMAND_DONE' })
      return ocrCode
    }

    dispatch({ active: false, type: 'SET_BACKGROUND_OCR' })
    return 0
  } catch (parallelError) {
    // Mirror executeCommand: surface the error, mark the command failed and return
    // a non-zero code instead of rejecting, after stopping any child still running.
    const msg = String(parallelError)
    dispatch({ line: `Error: ${msg}`, type: 'APPEND_OUTPUT' })
    appendErrorLog('pipeline', msg)
    if (procRef.current) {
      procRef.current.kill()
      procRef.current = null
    }
    stopBackgroundOcr()
    dispatch({ code: 1, type: 'COMMAND_DONE' })
    return 1
  } finally {
    busyRef.current = false // eslint-disable-line require-atomic-updates
  }
}, [])
|
|
873
|
-
|
|
874
|
-
/**
 * Drives the pipeline automatically: bootstraps Python, runs preflight checks,
 * finds the first incomplete step and executes it, then re-schedules itself
 * after a successful step until nothing is left to do.
 *
 * Returns early (without running anything) when a command is already busy, when
 * the Python bootstrap fails, or when preflight reports errors. A step that exits
 * non-zero stops the chain; no further run is scheduled from here.
 */
// eslint-disable-next-line max-statements
const autoRun = useCallback(async () => {
  if (busyRef.current) return

  // The error log is cleared once, on the first run only.
  if (!errorLogClearedRef.current) {
    errorLogClearedRef.current = true
    await clearErrorLog()
  }

  dispatch({ line: 'Checking Python environment...', type: 'APPEND_OUTPUT' })
  const ok = await bootstrapPython({
    onDone: () => {
      dispatch({ line: 'Python environment ready.', type: 'APPEND_OUTPUT' })
    },
    onStep: (msg: string) => {
      dispatch({ line: msg, type: 'APPEND_OUTPUT' })
    }
  })
  if (!ok) {
    dispatch({
      errors: ['Python bootstrap failed. Install uv and try again.'],
      type: 'SET_PREFLIGHT',
      warnings: []
    })
    return
  }

  const preflight = await runPreflight()
  dispatch({ errors: preflight.errors, type: 'SET_PREFLIGHT', warnings: preflight.warnings })
  if (preflight.errors.length > 0) return

  const data = await refreshSteps()
  const nextStep = findFirstIncomplete(data)

  if (!nextStep) {
    dispatch({ type: 'SET_ALL_DONE' })
    // Terminal bell.
    process.stdout.write('\u0007')
    // osascript only exists on macOS, and Bun.spawn throws when the executable
    // cannot be found. The desktop notification is decorative, so it is limited
    // to darwin and any failure to launch it is deliberately ignored.
    if (process.platform === 'darwin') {
      try {
        Bun.spawn(['osascript', '-e', 'display notification "Pipeline complete" with title "Doc Pipeline"'])
      } catch {
        // Best-effort notification; completion was already signalled above.
      }
    }
    return
  }

  const classifyDone = isStepDone('classify', data)
  const pipelineNeeded = !isStepDone('pipeline', data)
  const ocrNeeded = !isStepDone('ocr', data)

  // Convert and OCR run side by side only when classification is finished and both are pending.
  const runInParallel = nextStep === 'pipeline' && classifyDone && pipelineNeeded && ocrNeeded
  const code = runInParallel ? await runParallelConvertOcr() : await executeCommand(nextStep)

  if (code === 0) {
    await refreshSteps()
    // Continue with the next incomplete step after a short pause.
    setTimeout(() => {
      autoRun()
    }, 500)
  }
}, [refreshSteps, executeCommand, runParallelConvertOcr])
|
|
930
|
-
|
|
931
|
-
// Start the automatic run on mount (and again whenever autoRun's identity changes).
useEffect(() => {
  autoRun()
}, [autoRun])

// Poll step data every 2 seconds; the log tail is refreshed too while the log view is open.
useEffect(() => {
  const poll = (): void => {
    refreshSteps()
    if (state.showLog) refreshLog()
  }
  const timer = setInterval(poll, 2000)
  return () => {
    clearInterval(timer)
  }
}, [refreshSteps, refreshLog, state.showLog])
|
|
942
|
-
|
|
943
|
-
/**
 * Clears the failure flag and moves on from the command recorded in
 * `state.runningCommand`: the first not-yet-done step after it in STEP_ORDER is
 * executed, and on success the automatic run resumes. When no later step is
 * pending, the run is marked as all done.
 */
// eslint-disable-next-line max-statements
const skipCurrentStep = useCallback(async () => {
  const skipped = state.runningCommand
  dispatch({ type: 'CLEAR_FAILURE' })
  if (!skipped) return

  const laterSteps = STEP_ORDER.slice(STEP_ORDER.indexOf(skipped) + 1)
  const data = await refreshSteps()
  const nextKey: CommandKey | null = laterSteps.find(candidate => !isStepDone(candidate, data)) ?? null

  if (!nextKey) {
    dispatch({ type: 'SET_ALL_DONE' })
    return
  }

  const exitCode = await executeCommand(nextKey)
  if (exitCode !== 0) return

  await refreshSteps()
  autoRun()
}, [state.runningCommand, refreshSteps, executeCommand, autoRun])
|
|
970
|
-
|
|
971
|
-
// Global key bindings:
//   ?          toggle the help dialog (escape also closes it)
//   q / escape kill any running child processes and exit
//   l          toggle the log view
//   r / s      retry / skip, only while a step has failed
// eslint-disable-next-line complexity, max-statements
useKeyboard(key => {
  const pressed = key.name

  // While the help dialog is open it swallows every key; only ? and escape close it.
  if (showHelp) {
    if (pressed === '?' || pressed === 'escape') setShowHelp(false)
    return
  }

  switch (pressed) {
    case '?': {
      setShowHelp(true)
      return
    }
    case 'q':
    case 'escape': {
      if (procRef.current) {
        procRef.current.kill()
        procRef.current = null
      }
      if (ocrProcRef.current) {
        ocrProcRef.current.kill()
        ocrProcRef.current = null
      }
      setTerminalTitle('')
      process.exit(0)
      return
    }
    case 'l': {
      refreshLog()
      dispatch({ type: 'TOGGLE_LOG' })
      return
    }
    case 'r': {
      if (!state.failed) return
      dispatch({ type: 'CLEAR_FAILURE' })
      autoRun()
      return
    }
    case 's': {
      if (state.failed) skipCurrentStep()
      return
    }
  }
})
|
|
1012
|
-
|
|
1013
|
-
// Layout: title bar on top, then a fixed-width sidebar (steps + summary) next to the
// output pane (preflight banner + scrolling lines); the help dialog is rendered last.
const logHeight = Math.max(5, height - 3)
const displayLines = state.showLog ? state.logLines : state.runningLines
const hasPreflightNotes = state.preflightErrors.length > 0 || state.preflightWarnings.length > 0

// Sidebar step list, or a placeholder until step data has been loaded.
const stepsPanel = state.stepsData ? (
  <box flexDirection='column'>
    {STEPS.map(step => {
      const { command } = step
      const isForeground = state.runningCommand === command
      const isBackgroundOcr = command === 'ocr' && state.backgroundOcr
      const isActive = isForeground || isBackgroundOcr
      const stepData = state.stepsData?.[command]
      // Prefer the failure count from step data; fall back to the count recorded in state.
      const failedCount = stepData?.failed ?? state.stepFailures[command]
      return (
        <SidebarStep
          completedDuration={state.stepDurations[command]}
          failedCount={failedCount}
          isFailed={state.failed ? isForeground : false}
          isRunning={!state.failed && isActive}
          key={command}
          ocrProgress={command === 'ocr' ? state.stepsData?.ocr.progress : undefined}
          runningStatus={isForeground ? state.runningStatus : ''}
          step={step}
          stepData={stepData}
          stepStartedAt={isActive ? state.stepStartedAt : 0}
        />
      )
    })}
  </box>
) : (
  <text fg={DIM}>Loading...</text>
)

// Output pane content: one text row per line, red when the line matches ERROR_PATTERN.
const outputRows =
  displayLines.length > 0 ? (
    displayLines.map((line, i) => {
      const rowColor = ERROR_PATTERN.test(line) ? 'red' : DIM
      return (
        // eslint-disable-next-line react/no-array-index-key
        <text fg={rowColor} key={i}>
          {line}
        </text>
      )
    })
  ) : (
    <text fg={DIM}>Waiting for output...</text>
  )

return (
  <box flexDirection='column' height={height}>
    <TitleBarTop
      allDone={state.allDone}
      backgroundOcr={state.backgroundOcr}
      failed={state.failed}
      pipelineStartedAt={state.pipelineStartedAt}
      runningCommand={state.runningCommand}
      stepsData={state.stepsData}
    />
    <box flexGrow={1}>
      <box flexDirection='column' paddingLeft={1} width={SIDEBAR_WIDTH}>
        {stepsPanel}
        <SidebarSummary
          allDone={state.allDone}
          datasetResult={state.datasetResult}
          stepDurations={state.stepDurations}
        />
      </box>
      <box flexDirection='column' flexGrow={1} paddingLeft={2}>
        {hasPreflightNotes ? (
          <PreflightBanner errors={state.preflightErrors} warnings={state.preflightWarnings} />
        ) : null}
        <scrollbox focused height={logHeight} paddingLeft={1} stickyScroll>
          {outputRows}
        </scrollbox>
      </box>
    </box>
    {showHelp ? <HelpDialog height={height} width={width} /> : null}
  </box>
)
|
|
1085
|
-
}
|
|
1086
|
-
|
|
1087
|
-
/**
 * Creates the OpenTUI CLI renderer (with exit-on-Ctrl-C disabled) and mounts
 * the <App /> component into it.
 */
const start = async () => {
  const cliRenderer = await createCliRenderer({ exitOnCtrlC: false })
  const root = createRoot(cliRenderer)
  root.render(<App />)
}
|
|
1091
|
-
|
|
1092
|
-
// When this file itself is the script being run (argv[1] ends with 'tui.tsx'),
// make sure paths are initialised and launch the TUI immediately.
const isDirectRun = process.argv[1]?.endsWith('tui.tsx') ?? false
if (isDirectRun) {
  const nodePath = await import('node:path')
  const pathsModule = await import('~/paths')
  try {
    // Used only as a probe: if getPaths() throws, fall back to ./data and ./output.
    getPaths()
  } catch {
    pathsModule.initPaths(nodePath.resolve('data'), nodePath.resolve('output'))
  }
  // oxlint-disable-next-line unicorn/prefer-top-level-await
  start()
}
|
|
1104
|
-
|
|
1105
|
-
export { start }
|