@arbidocs/cli 0.3.33 → 0.3.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arbidocs/cli",
3
- "version": "0.3.33",
3
+ "version": "0.3.35",
4
4
  "description": "CLI tool for interacting with ARBI — login, manage workspaces, upload documents, query the RAG assistant",
5
5
  "main": "dist/index.js",
6
6
  "bin": {
@@ -21,9 +21,9 @@
21
21
  "preuninstall": "node scripts/preuninstall.js"
22
22
  },
23
23
  "dependencies": {
24
- "@arbidocs/sdk": "0.3.33",
25
- "@arbidocs/client": "0.3.33",
26
- "@arbidocs/tui": "0.3.33",
24
+ "@arbidocs/sdk": "0.3.35",
25
+ "@arbidocs/client": "0.3.35",
26
+ "@arbidocs/tui": "0.3.35",
27
27
  "@inquirer/prompts": "^8.2.0",
28
28
  "chalk": "^5.6.2",
29
29
  "commander": "^13.1.0"
@@ -0,0 +1,275 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * bench-upload.mjs
4
+ *
5
+ * Benchmark arbi upload speed: legacy multipart vs direct-to-MinIO (--s3).
6
+ *
7
+ * Generates fresh unique PDFs into a temp dir, creates a disposable workspace
8
+ * per run (so server-side dedup never kicks in), invokes `arbi upload --watch`
9
+ * with and without `--s3`, and measures wall-clock time from spawn to close.
10
+ * Watch mode is used by default because the upload endpoint only acks —
11
+ * actual time-to-completion includes server-side processing.
12
+ *
13
+ * Requires:
14
+ * - arbi is logged in (`arbi login`) with access to a project
15
+ * - a live backend reachable from the CLI (defaults to the configured one)
16
+ *
17
+ * Usage:
18
+ * node packages/arbi-cli/scripts/bench-upload.mjs
19
+ * node packages/arbi-cli/scripts/bench-upload.mjs --count=10 --runs=2
20
+ * node packages/arbi-cli/scripts/bench-upload.mjs --no-watch # upload-only
21
+ * node packages/arbi-cli/scripts/bench-upload.mjs --bin=arbi # use global
22
+ */
23
+
24
+ import { spawn } from 'node:child_process'
25
+ import { writeFileSync, mkdtempSync, rmSync } from 'node:fs'
26
+ import { tmpdir } from 'node:os'
27
+ import path from 'node:path'
28
+ import { fileURLToPath } from 'node:url'
29
+ import { randomUUID } from 'node:crypto'
30
+
31
+ // ─── arg parsing ─────────────────────────────────────────────────────────
32
// Parse `--key=value` and bare `--flag` arguments into a lookup table.
// Bare flags are stored as the string 'true'; anything that does not start
// with `--` is ignored. A null-prototype object is used so that keys such as
// `--constructor` cannot collide with inherited Object.prototype members.
const args = Object.create(null)
for (const raw of process.argv.slice(2)) {
  const m = raw.match(/^--([^=]+)(?:=(.*))?$/)
  if (!m) continue
  args[m[1]] = m[2] ?? 'true'
}

/**
 * Read a numeric option that must be a positive integer.
 *
 * Exits with status 2 on an invalid value: a NaN, zero or negative count
 * would otherwise silently produce zero iterations and NaN statistics.
 *
 * @param {string} name - option name without the leading `--`
 * @param {number} fallback - value used when the option is absent
 * @returns {number} the parsed value, or `fallback`
 */
function positiveIntArg(name, fallback) {
  const raw = args[name]
  if (raw === undefined) return fallback
  const value = Number(raw)
  if (!Number.isInteger(value) || value < 1) {
    console.error(`bench: --${name} must be a positive integer (got "${raw}")`)
    process.exit(2)
  }
  return value
}

const COUNT = positiveIntArg('count', 20)
const RUNS = positiveIntArg('runs', 3)
// Watch mode is on unless `--watch=false` or `--no-watch` is given.
const WATCH = args.watch !== 'false' && args['no-watch'] !== 'true'

const here = path.dirname(fileURLToPath(import.meta.url))
const DEFAULT_BIN = path.resolve(here, '..', 'dist', 'index.js')
const ARBI_BIN = args.bin ?? DEFAULT_BIN
// [command, leading argv]: JavaScript entry points (.js/.mjs/.cjs) are run
// through `node`; anything else is executed directly.
const SPAWN_ARGV = /\.[cm]?js$/.test(ARBI_BIN) ? ['node', [ARBI_BIN]] : [ARBI_BIN, []]

console.log(`bench: ${COUNT} files × ${RUNS} runs per mode, watch=${WATCH}, bin=${ARBI_BIN}`)
48
+
49
+ // ─── PDF generation ─────────────────────────────────────────────────────
50
+ // Multi-page PDF with section headings and visible paragraph breaks.
51
+ //
52
+ // Why so much structure? marker-pdf's layout model can reflow consecutive
53
+ // short lines into a single paragraph block, and arbi-app's chunker then
54
+ // emits one chunk per resulting paragraph. The backend marks any doc with
55
+ // ≤ EMPTY_CONTENT_CHUNK_THRESHOLD (=2) chunks as `status=empty`. Single-page
56
+ // flat-text PDFs sit right on that boundary and intermittently get marked
57
+ // empty, hanging --watch. By emitting multiple pages, headings, and visible
58
+ // vertical gaps between paragraphs, we guarantee marker produces well over
59
+ // 2 blocks regardless of how aggressively it reflows.
60
/**
 * Build a small, self-contained, multi-page PDF whose heading text embeds
 * the first 8 characters of `uuid`, so every generated file has unique bytes.
 *
 * The document has 3 pages; each page has one heading followed by
 * 6 paragraphs of 3 lines, with a larger vertical gap after each paragraph.
 *
 * @param {string} uuid - unique tag; only its first 8 characters are used
 * @returns {Buffer} the complete PDF file contents
 */
function makePdf(uuid) {
  const PAGES = 3
  const PARAGRAPHS_PER_PAGE = 6
  const LINES_PER_PARAGRAPH = 3

  // Coerce so a non-string id (e.g. a number) does not throw on .slice().
  const tag = String(uuid).slice(0, 8)

  // PDF literal strings are delimited by parentheses, with backslash as the
  // escape character; all three must be escaped or the content stream is
  // malformed.
  const pdfString = (text) => `(${text.replace(/[\\()]/g, '\\$&')})`

  // Build the PDF text stream for a single page.
  function pageOps(pageIdx) {
    const items = [] // { text, gapAfter }
    items.push({
      text: `Section ${pageIdx + 1}: Benchmark ${tag}`,
      gapAfter: 32, // big gap after heading
    })
    for (let para = 0; para < PARAGRAPHS_PER_PAGE; para++) {
      for (let line = 0; line < LINES_PER_PARAGRAPH; line++) {
        items.push({
          text: `Paragraph ${para + 1} line ${line + 1}: lorem ipsum dolor sit amet consectetur adipiscing elit`,
          // Larger gap after the last line of each paragraph so paragraphs
          // are visually separated.
          gapAfter: line === LINES_PER_PARAGRAPH - 1 ? 28 : 16,
        })
      }
    }

    // `Td` moves are relative, so each line is positioned by the gap that
    // follows the previous item.
    let ops = 'BT\n/F1 11 Tf\n50 760 Td\n'
    ops += `${pdfString(items[0].text)} Tj\n`
    for (let i = 1; i < items.length; i++) {
      ops += `0 -${items[i - 1].gapAfter} Td\n${pdfString(items[i].text)} Tj\n`
    }
    ops += 'ET'
    return ops
  }

  // Object table layout:
  //   1: Catalog
  //   2: Pages
  //   3, 5, 7, ... : Page objects
  //   4, 6, 8, ... : Page contents streams
  //   3 + 2*PAGES  : Font
  const pageIds = Array.from({ length: PAGES }, (_, i) => 3 + i * 2)
  const fontId = 3 + PAGES * 2

  const objects = []
  objects.push('<< /Type /Catalog /Pages 2 0 R >>')
  objects.push(
    `<< /Type /Pages /Kids [${pageIds.map((id) => `${id} 0 R`).join(' ')}] /Count ${PAGES} >>`
  )
  for (let i = 0; i < PAGES; i++) {
    const contentsId = pageIds[i] + 1
    objects.push(
      `<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] ` +
        `/Contents ${contentsId} 0 R /Resources << /Font << /F1 ${fontId} 0 R >> >> >>`
    )
    const ops = pageOps(i)
    // /Length is a byte count, not a character count.
    objects.push(`<< /Length ${Buffer.byteLength(ops)} >>\nstream\n${ops}\nendstream`)
  }
  objects.push('<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>')

  // Serialize objects into a body and record byte offsets for the xref.
  const header = '%PDF-1.4\n'
  const offsets = []
  let body = ''
  let cursor = Buffer.byteLength(header)
  for (let i = 0; i < objects.length; i++) {
    offsets.push(cursor)
    const obj = `${i + 1} 0 obj\n${objects[i]}\nendobj\n`
    body += obj
    cursor += Buffer.byteLength(obj)
  }

  // The xref table starts right after the last object; entries are
  // 10-digit zero-padded byte offsets.
  const xrefOff = cursor
  const pad10 = (n) => String(n).padStart(10, '0')
  let xref = `xref\n0 ${objects.length + 1}\n0000000000 65535 f \n`
  for (const off of offsets) xref += `${pad10(off)} 00000 n \n`
  xref += `trailer\n<< /Size ${objects.length + 1} /Root 1 0 R >>\nstartxref\n${xrefOff}\n%%EOF\n`

  return Buffer.from(header + body + xref, 'utf8')
}
137
+
138
+ // ─── process helpers ─────────────────────────────────────────────────────
139
/**
 * Spawn the CLI described by SPAWN_ARGV with `cliArgs` appended and collect
 * its output.
 *
 * @param {string[]} cliArgs - arguments passed to the CLI
 * @param {{ timeoutMs?: number }} [options] - `timeoutMs` (default 15 min)
 *   is how long to wait before sending SIGTERM and rejecting
 * @returns {Promise<{ code: number, stdout: string, stderr: string }>}
 *   resolves when the child closes; `code` is -1 when the child reported no
 *   exit code (e.g. it was killed by a signal)
 * @throws rejects on spawn failure or timeout
 */
function runCli(cliArgs, { timeoutMs = 15 * 60 * 1000 } = {}) {
  return new Promise((resolve, reject) => {
    const [cmd, prefix] = SPAWN_ARGV
    const child = spawn(cmd, [...prefix, ...cliArgs], {
      stdio: ['ignore', 'pipe', 'pipe'],
    })

    // Let the streams decode UTF-8 themselves: converting each Buffer chunk
    // separately corrupts multi-byte characters split across chunks.
    child.stdout.setEncoding('utf8')
    child.stderr.setEncoding('utf8')

    let stdout = ''
    let stderr = ''
    const timer = setTimeout(() => {
      child.kill('SIGTERM')
      reject(new Error(`timed out after ${timeoutMs}ms: arbi ${cliArgs.join(' ')}`))
    }, timeoutMs)

    child.stdout.on('data', (chunk) => (stdout += chunk))
    child.stderr.on('data', (chunk) => (stderr += chunk))
    child.on('error', (err) => {
      // Clear the timer here too, so a failed spawn cannot leave a pending
      // timeout keeping the event loop alive.
      clearTimeout(timer)
      reject(err)
    })
    child.on('close', (code) => {
      clearTimeout(timer)
      // After a timeout this resolve is a no-op: the promise already rejected.
      resolve({ code: code ?? -1, stdout, stderr })
    })
  })
}
160
+
161
/**
 * Remove ANSI CSI escape sequences from `s`.
 *
 * Covers colour/style codes (`ESC[...m`) as well as cursor-movement and
 * erase sequences (e.g. `ESC[2K`, `ESC[1G`, `ESC[?25l`).
 *
 * @param {string} s - text that may contain escape sequences
 * @returns {string} `s` with every CSI sequence removed
 */
function stripAnsi(s) {
  // CSI grammar: ESC '[', parameter bytes 0x30-0x3F, intermediate bytes
  // 0x20-0x2F, then one final byte 0x40-0x7E.
  return s.replace(/\x1b\[[0-?]*[ -\/]*[@-~]/g, '')
}
164
+
165
/**
 * Create a uniquely named workspace via `workspace create` and return its id.
 *
 * The id is taken from the first parenthesised group following `Created:`
 * in the command's stdout (escape codes removed first).
 *
 * @param {string} label - embedded in the workspace name (`bench-<label>-...`)
 * @returns {Promise<string>} the workspace id
 * @throws {Error} if the command exits non-zero or its output cannot be parsed
 */
async function createWorkspace(label) {
  // Timestamp plus a random suffix keeps names unique across quick reruns.
  const name = `bench-${label}-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
  const r = await runCli(['workspace', 'create', name])
  if (r.code !== 0) {
    throw new Error(`workspace create failed (${r.code}): ${stripAnsi(r.stderr || r.stdout)}`)
  }
  const out = stripAnsi(r.stdout)
  const m = out.match(/Created:[^(]*\(([^)]+)\)/)
  // Report the cleaned text, matching the failure branch above, so the
  // message is not littered with raw escape codes.
  if (!m) throw new Error(`could not parse workspace id from: ${out}`)
  return m[1]
}
175
+
176
/**
 * Best-effort removal of a workspace via `workspace delete`.
 *
 * Never throws. A failed cleanup is reported on stderr so that leaked
 * workspaces do not go unnoticed.
 *
 * @param {string} id - workspace id to delete
 * @returns {Promise<void>}
 */
async function deleteWorkspace(id) {
  try {
    const r = await runCli(['workspace', 'delete', id])
    if (r.code !== 0) {
      console.error(`\nwarning: could not delete workspace ${id} (exit code ${r.code})`)
    }
  } catch (err) {
    // Still best-effort: surface the problem without failing the caller.
    console.error(`\nwarning: could not delete workspace ${id}: ${err?.message ?? err}`)
  }
}
183
+
184
/**
 * Write `count` freshly generated PDFs into `dir`.
 *
 * Each file gets a random name (`bench-<index>-<uuid>.pdf`) and is filled
 * with the output of makePdf() for a second, independent random UUID.
 *
 * @param {string} dir - directory to write into
 * @param {number} count - number of files to create
 * @returns {string[]} paths of the written files, in creation order
 */
function writeUniqueFiles(dir, count) {
  const written = []
  let index = 0
  while (index < count) {
    const target = path.join(dir, `bench-${index}-${randomUUID()}.pdf`)
    writeFileSync(target, makePdf(randomUUID()))
    written.push(target)
    index += 1
  }
  return written
}
193
+
194
/**
 * Run one timed upload in the given mode.
 *
 * Generates COUNT files in a fresh temp directory, creates a workspace,
 * runs `upload` (adding `--s3` when `mode` is 's3') and measures wall-clock
 * time around the CLI call. The workspace and temp directory are removed
 * afterwards whether or not the upload succeeded.
 *
 * @param {string} mode - 's3' adds the `--s3` flag; any other value does not
 * @returns {Promise<number>} elapsed time of the upload command, in ms
 * @throws {Error} if workspace creation fails or the upload exits non-zero
 */
async function runBench(mode) {
  const scratchDir = mkdtempSync(path.join(tmpdir(), `arbi-bench-${mode}-`))
  let workspaceId
  try {
    const pdfPaths = writeUniqueFiles(scratchDir, COUNT)
    workspaceId = await createWorkspace(mode)

    const modeFlags = mode === 's3' ? ['--s3'] : []
    const watchFlag = WATCH ? '--watch' : '--no-watch'
    const cliArgs = ['upload', ...modeFlags, watchFlag, '-w', workspaceId, ...pdfPaths]

    // Only the CLI invocation itself is timed; setup and cleanup are excluded.
    const startedAt = process.hrtime.bigint()
    const result = await runCli(cliArgs)
    const finishedAt = process.hrtime.bigint()
    const elapsedMs = Number(finishedAt - startedAt) / 1e6

    if (result.code === 0) return elapsedMs

    // On failure, replay the CLI's output (without escape codes) for diagnosis.
    process.stderr.write(stripAnsi(result.stdout))
    process.stderr.write(stripAnsi(result.stderr))
    throw new Error(`upload failed with code ${result.code}`)
  } finally {
    if (workspaceId) await deleteWorkspace(workspaceId)
    rmSync(scratchDir, { recursive: true, force: true })
  }
}
222
+
223
+ // ─── stats & formatting ─────────────────────────────────────────────────
224
/**
 * Summarize a list of numbers.
 *
 * `stddev` is the population standard deviation (divides by N, not N - 1).
 * For an empty list every field is NaN, so "no data" is reported
 * consistently instead of as a mix of NaN and +/-Infinity.
 *
 * @param {number[]} values - samples to summarize
 * @returns {{ mean: number, stddev: number, min: number, max: number }}
 */
function stats(values) {
  if (values.length === 0) {
    return { mean: NaN, stddev: NaN, min: NaN, max: NaN }
  }

  // Single pass for sum/min/max; avoids spreading the array into
  // Math.min/Math.max arguments.
  let sum = 0
  let min = Infinity
  let max = -Infinity
  for (const v of values) {
    sum += v
    if (v < min) min = v
    if (v > max) max = v
  }

  const mean = sum / values.length
  const variance = values.reduce((acc, v) => acc + (v - mean) ** 2, 0) / values.length
  return {
    mean,
    stddev: Math.sqrt(variance),
    min,
    max,
  }
}
234
+
235
/**
 * Format a duration given in milliseconds as seconds with two decimals.
 *
 * @param {number} ms - duration in milliseconds
 * @returns {string} e.g. `1.23s`
 */
function fmt(ms) {
  const seconds = ms / 1000
  return seconds.toFixed(2) + 's'
}
238
+
239
+ // ─── main ────────────────────────────────────────────────────────────────
240
const modes = ['legacy', 's3']
const results = { legacy: [], s3: [] }

try {
  // Interleave modes (L S L S ...) instead of batching (LLL SSS) so that
  // backend warmup / cache state affects both modes equally.
  for (let i = 0; i < RUNS; i++) {
    for (const mode of modes) {
      process.stdout.write(` run ${i + 1}/${RUNS} [${mode.padEnd(6)}]... `)
      const ms = await runBench(mode)
      results[mode].push(ms)
      console.log(fmt(ms))
    }
  }
} catch (e) {
  console.error('\nbench failed:', e.message || e)
  process.exit(1)
}

// Without at least one sample per mode the summary below would only print
// NaN / Infinity values and divide by a NaN mean, so stop with a clear
// message instead.
if (modes.some((mode) => results[mode].length === 0)) {
  console.error('\nbench: no runs completed (check --runs); nothing to report')
  process.exit(1)
}

console.log('\n─── results ───')
// Compute each mode's summary once and reuse it for the comparison line.
const summary = {}
for (const mode of modes) {
  const s = stats(results[mode])
  summary[mode] = s
  console.log(
    `${mode.padEnd(8)} mean=${fmt(s.mean).padEnd(9)} stddev=${fmt(s.stddev).padEnd(9)} ` +
      `min=${fmt(s.min).padEnd(9)} max=${fmt(s.max)}`
  )
}

// Positive delta means --s3 finished sooner on average than legacy.
const legacyMean = summary.legacy.mean
const s3Mean = summary.s3.mean
const delta = legacyMean - s3Mean
const pct = (delta / legacyMean) * 100
console.log(
  `\n--s3 vs legacy: ${delta >= 0 ? `${pct.toFixed(1)}% faster` : `${(-pct).toFixed(1)}% slower`} ` +
    `(${fmt(Math.abs(delta))} delta)`
)