npm - @arbidocs/cli - Versions diffs - 0.3.33 → 0.3.34 - Mend

@arbidocs/cli 0.3.33 → 0.3.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/CHANGELOG.md +38 -0
package/dist/index.js +408 -130
package/dist/index.js.map +1 -1
package/package.json +4 -4
package/scripts/bench-upload.mjs +275 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@arbidocs/cli",
-  "version": "0.3.33",
+  "version": "0.3.34",
   "description": "CLI tool for interacting with ARBI — login, manage workspaces, upload documents, query the RAG assistant",
   "main": "dist/index.js",
   "bin": {
@@ -21,9 +21,9 @@
     "preuninstall": "node scripts/preuninstall.js"
   },
   "dependencies": {
-    "@arbidocs/sdk": "0.3.33",
-    "@arbidocs/client": "0.3.33",
-    "@arbidocs/tui": "0.3.33",
+    "@arbidocs/sdk": "0.3.34",
+    "@arbidocs/client": "0.3.34",
+    "@arbidocs/tui": "0.3.34",
     "@inquirer/prompts": "^8.2.0",
     "chalk": "^5.6.2",
     "commander": "^13.1.0"

package/scripts/bench-upload.mjs ADDED Viewed

@@ -0,0 +1,275 @@
+#!/usr/bin/env node
+/**
+ * bench-upload.mjs
+ *
+ * Benchmark arbi upload speed: legacy multipart vs direct-to-MinIO (--s3).
+ *
+ * Generates fresh unique PDFs into a temp dir, creates a disposable workspace
+ * per run (so server-side dedup never kicks in), invokes `arbi upload --watch`
+ * with and without `--s3`, and measures wall-clock time from spawn to close.
+ * Watch mode is used by default because the upload endpoint only acks —
+ * actual time-to-completion includes server-side processing.
+ *
+ * Requires:
+ *   - arbi is logged in (`arbi login`) with access to a project
+ *   - a live backend reachable from the CLI (defaults to the configured one)
+ *
+ * Usage:
+ *   node packages/arbi-cli/scripts/bench-upload.mjs
+ *   node packages/arbi-cli/scripts/bench-upload.mjs --count=10 --runs=2
+ *   node packages/arbi-cli/scripts/bench-upload.mjs --no-watch      # upload-only
+ *   node packages/arbi-cli/scripts/bench-upload.mjs --bin=arbi      # use global
+ */
+import { spawn } from 'node:child_process'
+import { writeFileSync, mkdtempSync, rmSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import path from 'node:path'
+import { fileURLToPath } from 'node:url'
+import { randomUUID } from 'node:crypto'
+// ─── arg parsing ─────────────────────────────────────────────────────────
+const args = {}
+for (const raw of process.argv.slice(2)) {
+  const m = raw.match(/^--([^=]+)(?:=(.*))?$/)
+  if (!m) continue
+  args[m[1]] = m[2] ?? 'true'
+}
+const COUNT = Number(args.count ?? 20)
+const RUNS = Number(args.runs ?? 3)
+const WATCH = args.watch !== 'false' && args['no-watch'] !== 'true'
+const here = path.dirname(fileURLToPath(import.meta.url))
+const DEFAULT_BIN = path.resolve(here, '..', 'dist', 'index.js')
+const ARBI_BIN = args.bin ?? DEFAULT_BIN
+const SPAWN_ARGV = ARBI_BIN.endsWith('.js') ? ['node', [ARBI_BIN]] : [ARBI_BIN, []]
+console.log(`bench: ${COUNT} files × ${RUNS} runs per mode, watch=${WATCH}, bin=${ARBI_BIN}`)
+// ─── PDF generation ─────────────────────────────────────────────────────
+// Multi-page PDF with section headings and visible paragraph breaks.
+//
+// Why so much structure? marker-pdf's layout model can reflow consecutive
+// short lines into a single paragraph block, and arbi-app's chunker then
+// emits one chunk per resulting paragraph. The backend marks any doc with
+// ≤ EMPTY_CONTENT_CHUNK_THRESHOLD (=2) chunks as `status=empty`. Single-page
+// flat-text PDFs sit right on that boundary and intermittently get marked
+// empty, hanging --watch. By emitting multiple pages, headings, and visible
+// vertical gaps between paragraphs, we guarantee marker produces well over
+// 2 blocks regardless of how aggressively it reflows.
+function makePdf(uuid) {
+  const PAGES = 3
+  const PARAGRAPHS_PER_PAGE = 6
+  const LINES_PER_PARAGRAPH = 3
+  // Build the PDF text stream for a single page.
+  function pageOps(pageIdx) {
+    const items = [] // { text, gapAfter }
+    items.push({
+      text: `Section ${pageIdx + 1}: Benchmark ${uuid.slice(0, 8)}`,
+      gapAfter: 32, // big gap after heading
+    })
+    for (let p = 0; p < PARAGRAPHS_PER_PAGE; p++) {
+      for (let l = 0; l < LINES_PER_PARAGRAPH; l++) {
+        items.push({
+          text: `Paragraph ${p + 1} line ${l + 1}: lorem ipsum dolor sit amet consectetur adipiscing elit`,
+          // Larger gap after the last line of each paragraph creates a
+          // visible blank-line break that marker treats as a block boundary.
+          gapAfter: l === LINES_PER_PARAGRAPH - 1 ? 28 : 16,
+        })
+      }
+    }
+    let ops = 'BT\n/F1 11 Tf\n50 760 Td\n'
+    ops += `(${items[0].text}) Tj\n`
+    for (let i = 1; i < items.length; i++) {
+      ops += `0 -${items[i - 1].gapAfter} Td\n(${items[i].text}) Tj\n`
+    }
+    ops += 'ET'
+    return ops
+  }
+  // Lay out the object table:
+  //   1: Catalog
+  //   2: Pages
+  //   3, 5, 7, ...           : Page objects
+  //   4, 6, 8, ...           : Page contents streams
+  //   3 + 2*PAGES            : Font
+  const pageIds = Array.from({ length: PAGES }, (_, i) => 3 + i * 2)
+  const fontId = 3 + PAGES * 2
+  const objects = []
+  objects.push('<< /Type /Catalog /Pages 2 0 R >>')
+  objects.push(
+    `<< /Type /Pages /Kids [${pageIds.map((id) => `${id} 0 R`).join(' ')}] /Count ${PAGES} >>`
+  )
+  for (let i = 0; i < PAGES; i++) {
+    const contentsId = pageIds[i] + 1
+    objects.push(
+      `<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] ` +
+        `/Contents ${contentsId} 0 R /Resources << /Font << /F1 ${fontId} 0 R >> >> >>`
+    )
+    const ops = pageOps(i)
+    objects.push(`<< /Length ${Buffer.byteLength(ops)} >>\nstream\n${ops}\nendstream`)
+  }
+  objects.push('<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>')
+  // Serialize objects into a body and record byte offsets for the xref.
+  const header = '%PDF-1.4\n'
+  const offsets = []
+  let body = ''
+  let cursor = Buffer.byteLength(header)
+  for (let i = 0; i < objects.length; i++) {
+    offsets.push(cursor)
+    const obj = `${i + 1} 0 obj\n${objects[i]}\nendobj\n`
+    body += obj
+    cursor += Buffer.byteLength(obj)
+  }
+  const xrefOff = cursor
+  const p = (n) => String(n).padStart(10, '0')
+  let xref = `xref\n0 ${objects.length + 1}\n0000000000 65535 f \n`
+  for (const off of offsets) xref += `${p(off)} 00000 n \n`
+  xref += `trailer\n<< /Size ${objects.length + 1} /Root 1 0 R >>\nstartxref\n${xrefOff}\n%%EOF\n`
+  return Buffer.from(header + body + xref, 'utf8')
+}
+// ─── process helpers ─────────────────────────────────────────────────────
+function runCli(cliArgs, { timeoutMs = 15 * 60 * 1000 } = {}) {
+  return new Promise((resolve, reject) => {
+    const [cmd, prefix] = SPAWN_ARGV
+    const child = spawn(cmd, [...prefix, ...cliArgs], {
+      stdio: ['ignore', 'pipe', 'pipe'],
+    })
+    let stdout = ''
+    let stderr = ''
+    const timer = setTimeout(() => {
+      child.kill('SIGTERM')
+      reject(new Error(`timed out after ${timeoutMs}ms: arbi ${cliArgs.join(' ')}`))
+    }, timeoutMs)
+    child.stdout.on('data', (b) => (stdout += b.toString()))
+    child.stderr.on('data', (b) => (stderr += b.toString()))
+    child.on('error', reject)
+    child.on('close', (code) => {
+      clearTimeout(timer)
+      resolve({ code: code ?? -1, stdout, stderr })
+    })
+  })
+}
+function stripAnsi(s) {
+  return s.replace(/\x1b\[[0-9;]*m/g, '')
+}
+async function createWorkspace(label) {
+  const name = `bench-${label}-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
+  const r = await runCli(['workspace', 'create', name])
+  if (r.code !== 0) {
+    throw new Error(`workspace create failed (${r.code}): ${stripAnsi(r.stderr || r.stdout)}`)
+  }
+  const m = stripAnsi(r.stdout).match(/Created:[^(]*\(([^)]+)\)/)
+  if (!m) throw new Error(`could not parse workspace id from: ${r.stdout}`)
+  return m[1]
+}
+async function deleteWorkspace(id) {
+  try {
+    await runCli(['workspace', 'delete', id])
+  } catch {
+    /* best-effort cleanup */
+  }
+}
+function writeUniqueFiles(dir, count) {
+  const files = []
+  for (let i = 0; i < count; i++) {
+    const p = path.join(dir, `bench-${i}-${randomUUID()}.pdf`)
+    writeFileSync(p, makePdf(randomUUID()))
+    files.push(p)
+  }
+  return files
+}
+async function runBench(mode) {
+  const tmp = mkdtempSync(path.join(tmpdir(), `arbi-bench-${mode}-`))
+  let wsId
+  try {
+    const files = writeUniqueFiles(tmp, COUNT)
+    wsId = await createWorkspace(mode)
+    const cliArgs = ['upload']
+    if (mode === 's3') cliArgs.push('--s3')
+    cliArgs.push(WATCH ? '--watch' : '--no-watch')
+    cliArgs.push('-w', wsId, ...files)
+    const start = process.hrtime.bigint()
+    const r = await runCli(cliArgs)
+    const end = process.hrtime.bigint()
+    const elapsedMs = Number(end - start) / 1e6
+    if (r.code !== 0) {
+      process.stderr.write(stripAnsi(r.stdout))
+      process.stderr.write(stripAnsi(r.stderr))
+      throw new Error(`upload failed with code ${r.code}`)
+    }
+    return elapsedMs
+  } finally {
+    if (wsId) await deleteWorkspace(wsId)
+    rmSync(tmp, { recursive: true, force: true })
+  }
+}
+// ─── stats & formatting ─────────────────────────────────────────────────
+function stats(values) {
+  const mean = values.reduce((a, b) => a + b, 0) / values.length
+  const variance = values.reduce((a, b) => a + (b - mean) ** 2, 0) / values.length
+  return {
+    mean,
+    stddev: Math.sqrt(variance),
+    min: Math.min(...values),
+    max: Math.max(...values),
+  }
+}
+function fmt(ms) {
+  return `${(ms / 1000).toFixed(2)}s`
+}
+// ─── main ────────────────────────────────────────────────────────────────
+const modes = ['legacy', 's3']
+const results = { legacy: [], s3: [] }
+try {
+  // Interleave modes (L S L S ...) instead of batching (LLL SSS) so that
+  // backend warmup / cache state affects both modes equally.
+  for (let i = 0; i < RUNS; i++) {
+    for (const mode of modes) {
+      process.stdout.write(`  run ${i + 1}/${RUNS} [${mode.padEnd(6)}]... `)
+      const ms = await runBench(mode)
+      results[mode].push(ms)
+      console.log(fmt(ms))
+    }
+  }
+} catch (e) {
+  console.error('\nbench failed:', e.message || e)
+  process.exit(1)
+}
+console.log('\n─── results ───')
+for (const mode of modes) {
+  const s = stats(results[mode])
+  console.log(
+    `${mode.padEnd(8)} mean=${fmt(s.mean).padEnd(9)} stddev=${fmt(s.stddev).padEnd(9)} ` +
+      `min=${fmt(s.min).padEnd(9)} max=${fmt(s.max)}`
+  )
+}
+const legacyMean = stats(results.legacy).mean
+const s3Mean = stats(results.s3).mean
+const delta = legacyMean - s3Mean
+const pct = (delta / legacyMean) * 100
+console.log(
+  `\n--s3 vs legacy: ${delta >= 0 ? `${pct.toFixed(1)}% faster` : `${(-pct).toFixed(1)}% slower`} ` +
+    `(${fmt(Math.abs(delta))} delta)`
+)