@fs/mycroft 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -0
- package/completions/mycroft.bash +11 -1
- package/completions/mycroft.fish +15 -2
- package/completions/mycroft.zsh +14 -1
- package/dist/batch-embedder-6IIWAZPW.js +14 -0
- package/dist/batch-embedder-6IIWAZPW.js.map +1 -0
- package/dist/batch-embedder-7DGZAQKL.js +14 -0
- package/dist/batch-embedder-7DGZAQKL.js.map +1 -0
- package/dist/batch-embedder-C2E6OHBQ.js +14 -0
- package/dist/batch-embedder-C2E6OHBQ.js.map +1 -0
- package/dist/batch-embedder-IZDBS3IL.js +13 -0
- package/dist/batch-embedder-IZDBS3IL.js.map +1 -0
- package/dist/batch-embedder-LYCZDYI4.js +15 -0
- package/dist/batch-embedder-LYCZDYI4.js.map +1 -0
- package/dist/batch-embedder-RHKD2OJD.js +14 -0
- package/dist/batch-embedder-RHKD2OJD.js.map +1 -0
- package/dist/batch-embedder-VQZUI7R6.js +14 -0
- package/dist/batch-embedder-VQZUI7R6.js.map +1 -0
- package/dist/batch-embedder-ZJZLNLOK.js +14 -0
- package/dist/batch-embedder-ZJZLNLOK.js.map +1 -0
- package/dist/batch-summarizer-7MCT4HJB.js +14 -0
- package/dist/batch-summarizer-7MCT4HJB.js.map +1 -0
- package/dist/batch-summarizer-BMIBVFAE.js +14 -0
- package/dist/batch-summarizer-BMIBVFAE.js.map +1 -0
- package/dist/batch-summarizer-CM3NO7TK.js +14 -0
- package/dist/batch-summarizer-CM3NO7TK.js.map +1 -0
- package/dist/chunk-35EO53CC.js +8058 -0
- package/dist/chunk-35EO53CC.js.map +1 -0
- package/dist/chunk-57ZGGKEF.js +8060 -0
- package/dist/chunk-57ZGGKEF.js.map +1 -0
- package/dist/chunk-6DLQHHCC.js +249 -0
- package/dist/chunk-6DLQHHCC.js.map +1 -0
- package/dist/chunk-7CO4PMU5.js +92 -0
- package/dist/chunk-7CO4PMU5.js.map +1 -0
- package/dist/chunk-7DUQNGEK.js +253 -0
- package/dist/chunk-7DUQNGEK.js.map +1 -0
- package/dist/chunk-7IPX4MKA.js +4637 -0
- package/dist/chunk-7IPX4MKA.js.map +1 -0
- package/dist/chunk-7NLMBXXY.js +6438 -0
- package/dist/chunk-7NLMBXXY.js.map +1 -0
- package/dist/chunk-BR2PM6D3.js +11047 -0
- package/dist/chunk-BR2PM6D3.js.map +1 -0
- package/dist/chunk-KGG7WEYE.js +162 -0
- package/dist/chunk-KGG7WEYE.js.map +1 -0
- package/dist/chunk-LV52FEMB.js +169 -0
- package/dist/chunk-LV52FEMB.js.map +1 -0
- package/dist/chunk-QRDUQX63.js +256 -0
- package/dist/chunk-QRDUQX63.js.map +1 -0
- package/dist/chunk-R3FOJK5A.js +2088 -0
- package/dist/chunk-R3FOJK5A.js.map +1 -0
- package/dist/chunk-T6X7DRBN.js +275 -0
- package/dist/chunk-T6X7DRBN.js.map +1 -0
- package/dist/chunk-VBEGUDHG.js +103 -0
- package/dist/chunk-VBEGUDHG.js.map +1 -0
- package/dist/chunk-XXO66RCF.js +94 -0
- package/dist/chunk-XXO66RCF.js.map +1 -0
- package/dist/cli.js +769 -317
- package/dist/cli.js.map +1 -1
- package/dist/fileFromPath-FLANAQWT.js +128 -0
- package/dist/fileFromPath-FLANAQWT.js.map +1 -0
- package/dist/main-36PRDAPE.js +1857 -0
- package/dist/main-36PRDAPE.js.map +1 -0
- package/dist/main-B3QJZGLU.js +1859 -0
- package/dist/main-B3QJZGLU.js.map +1 -0
- package/package.json +14 -2
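The dist entries above bundle two new services built on the OpenAI Batch API: a batch embedder and a batch summarizer. Both serialize their work as a JSONL input file, one self-contained request per line, keyed by a custom_id that maps each result back to its original chunk or chapter. A minimal sketch of one embedding request line as built by the bundled buildJsonl helper shown in the hunks below; the model name here is illustrative, since the real one is resolved at runtime via getModels():

// Hypothetical request line; only the shape is taken from the bundled code below.
const line = {
  custom_id: "0",
  method: "POST",
  url: "/v1/embeddings",
  body: { model: "text-embedding-3-small", input: "first chunk of book text" }
};
console.log(JSON.stringify(line)); // one JSONL line of the batch input file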
@@ -0,0 +1 @@
{"version":3,"sources":["../src/services/batch-summarizer.ts","../src/shared/summary.ts"],"sourcesContent":["import OpenAI from \"openai\";\nimport type { Chapter, ChapterSummary } from \"../shared/types.js\";\nimport { SUMMARY_MAX_TOKENS, SUMMARY_TARGET_WORDS, getModels, logInfo, logWarn } from \"./constants.js\";\nimport { CHARS_PER_TOKEN, SUMMARY_PROMPT, parseStructuredSummary, splitIntoSections } from \"../shared/summary.js\";\n\nconst estimateTokens = (text: string): number => Math.ceil(text.length / CHARS_PER_TOKEN);\n\ntype BatchRequestLine = {\n custom_id: string;\n method: \"POST\";\n url: \"/v1/chat/completions\";\n body: {\n model: string;\n messages: { role: \"user\"; content: string }[];\n };\n};\n\nexport type SummaryBatchChapter = {\n chapterIndex: number;\n title: string;\n needsTwoPass: boolean;\n sectionCount: number;\n};\n\nconst buildJsonl = (chapters: Chapter[], model: string): { jsonl: string; metadata: SummaryBatchChapter[] } => {\n const lines: string[] = [];\n const metadata: SummaryBatchChapter[] = [];\n\n for (let i = 0; i < chapters.length; i++) {\n const chapter = chapters[i]!;\n const tokens = estimateTokens(chapter.content);\n\n if (tokens <= SUMMARY_MAX_TOKENS) {\n // Single-pass: one request for the structured summary\n const line: BatchRequestLine = {\n custom_id: `summary-${i}`,\n method: \"POST\",\n url: \"/v1/chat/completions\",\n body: {\n model,\n messages: [{ role: \"user\", content: SUMMARY_PROMPT(chapter.title, i + 1, chapter.content, SUMMARY_TARGET_WORDS) }],\n },\n };\n lines.push(JSON.stringify(line));\n metadata.push({ chapterIndex: i, title: chapter.title, needsTwoPass: false, sectionCount: 1 });\n } else {\n // Two-pass: first submit section summary requests, then a merge request\n const sections = splitIntoSections(chapter.content, SUMMARY_MAX_TOKENS);\n\n for (let s = 0; s < sections.length; s++) {\n const line: BatchRequestLine = {\n custom_id: `section-${i}-${s}`,\n method: \"POST\",\n url: \"/v1/chat/completions\",\n body: {\n model,\n messages: [{\n role: \"user\",\n content: `Summarize this section from chapter \"${chapter.title}\" (Part ${s + 1}). Focus on key events, characters, and revelations. 
Keep it concise (100-150 words):\\n\\n${sections[s]}`,\n }],\n },\n };\n lines.push(JSON.stringify(line));\n }\n\n metadata.push({ chapterIndex: i, title: chapter.title, needsTwoPass: true, sectionCount: sections.length });\n }\n }\n\n return { jsonl: lines.join(\"\\n\"), metadata };\n};\n\nexport type BatchSubmitResult = {\n batchId: string;\n inputFileId: string;\n metadata: SummaryBatchChapter[];\n};\n\nexport const submitBatchSummaries = async (chapters: Chapter[]): Promise<BatchSubmitResult> => {\n const models = await getModels();\n const client = new OpenAI();\n\n logInfo(`[BatchSummarizer] Preparing batch request for ${chapters.length} chapters`);\n\n const { jsonl, metadata } = buildJsonl(chapters, models.summary);\n const blob = new Blob([jsonl], { type: \"application/jsonl\" });\n const file = await client.files.create({\n file: new File([blob], \"summaries.jsonl\", { type: \"application/jsonl\" }),\n purpose: \"batch\",\n });\n logInfo(`[BatchSummarizer] Uploaded input file ${file.id}`);\n\n const batch = await client.batches.create({\n input_file_id: file.id,\n endpoint: \"/v1/chat/completions\",\n completion_window: \"24h\",\n });\n logInfo(`[BatchSummarizer] Created batch ${batch.id} — status: ${batch.status}`);\n\n return { batchId: batch.id, inputFileId: file.id, metadata };\n};\n\nexport const downloadBatchSummaryResults = async (\n outputFileId: string,\n chapters: Chapter[],\n metadata: SummaryBatchChapter[],\n): Promise<{ summaries: ChapterSummary[]; needsMergePass: { chapterIndex: number; title: string; sectionSummaries: string[] }[] }> => {\n const client = new OpenAI();\n\n logInfo(`[BatchSummarizer] Downloading results from ${outputFileId}`);\n const response = await client.files.content(outputFileId);\n const text = await response.text();\n const lines = text.trim().split(\"\\n\");\n\n const results = new Map<string, string>();\n for (const line of lines) {\n let result: any;\n try {\n result = JSON.parse(line);\n } catch {\n logWarn(`[BatchSummarizer] Skipping malformed JSONL line`);\n continue;\n }\n if (result.response?.status_code === 200) {\n const content = result.response.body?.choices?.[0]?.message?.content;\n if (content) {\n results.set(result.custom_id, content);\n }\n } else {\n logWarn(`[BatchSummarizer] Request ${result.custom_id} failed: ${JSON.stringify(result.response?.body?.error ?? 
result.error)}`);\n }\n }\n\n const summaries: ChapterSummary[] = [];\n const needsMergePass: { chapterIndex: number; title: string; sectionSummaries: string[] }[] = [];\n\n for (const meta of metadata) {\n if (!meta.needsTwoPass) {\n // Single-pass chapter: parse the structured summary directly\n const content = results.get(`summary-${meta.chapterIndex}`);\n if (content) {\n const summary = parseStructuredSummary(content, meta.chapterIndex, meta.title);\n if (summary) summaries.push(summary);\n }\n } else {\n // Two-pass chapter: collect section summaries, need a merge pass\n const sectionSummaries: string[] = [];\n let allPresent = true;\n for (let s = 0; s < meta.sectionCount; s++) {\n const content = results.get(`section-${meta.chapterIndex}-${s}`);\n if (content) {\n sectionSummaries.push(content);\n } else {\n allPresent = false;\n }\n }\n if (allPresent && sectionSummaries.length > 0) {\n needsMergePass.push({ chapterIndex: meta.chapterIndex, title: meta.title, sectionSummaries });\n } else {\n logWarn(`[BatchSummarizer] Missing section results for chapter ${meta.chapterIndex + 1} \"${meta.title}\"`);\n }\n }\n }\n\n logInfo(`[BatchSummarizer] Parsed ${summaries.length} direct summaries, ${needsMergePass.length} chapters need merge pass`);\n return { summaries, needsMergePass };\n};\n\n/**\n * Submit a second batch for the merge pass: combine section summaries into structured summaries.\n * Returns a new batch ID for the merge requests.\n */\nexport const submitMergePass = async (\n mergeChapters: { chapterIndex: number; title: string; sectionSummaries: string[] }[],\n): Promise<BatchSubmitResult> => {\n const models = await getModels();\n const client = new OpenAI();\n\n const lines: string[] = [];\n const metadata: SummaryBatchChapter[] = [];\n\n for (const ch of mergeChapters) {\n const combined = ch.sectionSummaries.join(\"\\n\\n\");\n const line: BatchRequestLine = {\n custom_id: `summary-${ch.chapterIndex}`,\n method: \"POST\",\n url: \"/v1/chat/completions\",\n body: {\n model: models.summary,\n messages: [{ role: \"user\", content: SUMMARY_PROMPT(ch.title, ch.chapterIndex + 1, combined, SUMMARY_TARGET_WORDS) }],\n },\n };\n lines.push(JSON.stringify(line));\n metadata.push({ chapterIndex: ch.chapterIndex, title: ch.title, needsTwoPass: false, sectionCount: 1 });\n }\n\n const jsonl = lines.join(\"\\n\");\n const blob = new Blob([jsonl], { type: \"application/jsonl\" });\n const file = await client.files.create({\n file: new File([blob], \"summaries-merge.jsonl\", { type: \"application/jsonl\" }),\n purpose: \"batch\",\n });\n\n logInfo(`[BatchSummarizer] Uploaded merge input file ${file.id} (${mergeChapters.length} chapters)`);\n\n const batch = await client.batches.create({\n input_file_id: file.id,\n endpoint: \"/v1/chat/completions\",\n completion_window: \"24h\",\n });\n logInfo(`[BatchSummarizer] Created merge batch ${batch.id} — status: ${batch.status}`);\n\n return { batchId: batch.id, inputFileId: file.id, metadata };\n};\n\nexport const downloadMergeResults = async (\n outputFileId: string,\n mergeChapters: { chapterIndex: number; title: string }[],\n): Promise<ChapterSummary[]> => {\n const client = new OpenAI();\n\n logInfo(`[BatchSummarizer] Downloading merge results from ${outputFileId}`);\n const response = await client.files.content(outputFileId);\n const text = await response.text();\n const lines = text.trim().split(\"\\n\");\n\n const summaries: ChapterSummary[] = [];\n for (const line of lines) {\n let result: any;\n try {\n result = JSON.parse(line);\n } 
catch {\n logWarn(`[BatchSummarizer] Skipping malformed JSONL line in merge results`);\n continue;\n }\n if (result.response?.status_code === 200) {\n const content = result.response.body?.choices?.[0]?.message?.content;\n if (content) {\n // Extract chapter index from custom_id: \"summary-{idx}\"\n const idx = Number(result.custom_id.replace(\"summary-\", \"\"));\n const meta = mergeChapters.find((ch) => ch.chapterIndex === idx);\n if (meta) {\n const summary = parseStructuredSummary(content, idx, meta.title);\n if (summary) summaries.push(summary);\n }\n }\n } else {\n logWarn(`[BatchSummarizer] Merge request ${result.custom_id} failed: ${JSON.stringify(result.response?.body?.error ?? result.error)}`);\n }\n }\n\n logInfo(`[BatchSummarizer] Parsed ${summaries.length} merged summaries`);\n return summaries;\n};\n","import type { ChapterSummary } from \"./types.js\";\nimport { logWarn } from \"../commands/io.js\";\n\nexport const CHARS_PER_TOKEN = 4;\n\nexport type SummaryJSON = {\n characters: string[];\n events: string;\n setting: string;\n revelations: string;\n};\n\nexport const SUMMARY_PROMPT = (title: string, chapterNum: number, content: string, targetWords: number) => `You are analyzing a chapter from a book (fiction or nonfiction). Extract key information to help readers understand the chapter's content.\n\nChapter Title: ${title}\nChapter Number: ${chapterNum}\n\n---\n${content}\n---\n\nExtract the following information and respond ONLY with valid JSON (no markdown, no code blocks):\n\n{\n \"characters\": [\"Name - brief description (role, traits, first appearance)\", ...],\n \"events\": \"What happens in this chapter? (2-3 sentences)\",\n \"setting\": \"Where does this chapter take place?\",\n \"revelations\": \"Any important information revealed? (secrets, backstory, foreshadowing)\"\n}\n\nKeep the total response around ${targetWords} words.`;\n\nexport const parseStructuredSummary = (text: string, chapterIndex: number, title: string): ChapterSummary | null => {\n try {\n let jsonText = text.trim();\n if (jsonText.startsWith(\"```json\")) {\n jsonText = jsonText.slice(7, -3).trim();\n } else if (jsonText.startsWith(\"```\")) {\n jsonText = jsonText.slice(3, -3).trim();\n }\n\n const parsed: SummaryJSON = JSON.parse(jsonText);\n\n const fullSummary = `Chapter ${chapterIndex + 1}: ${title}\n\nCharacters: ${parsed.characters.join(\", \")}\n\nEvents: ${parsed.events}\n\nSetting: ${parsed.setting}\n\nRevelations: ${parsed.revelations}`;\n\n return {\n chapterIndex,\n chapterTitle: title,\n characters: parsed.characters,\n events: parsed.events,\n setting: parsed.setting,\n revelations: parsed.revelations,\n fullSummary,\n };\n } catch (error) {\n logWarn(`[Summary] Failed to parse summary JSON for \"${title}\": ${error instanceof Error ? error.message : String(error)}`);\n return null;\n }\n};\n\nexport const splitIntoSections = (text: string, maxTokens: number): string[] => {\n const estimatedTokens = Math.ceil(text.length / CHARS_PER_TOKEN);\n\n if (estimatedTokens <= maxTokens) {\n return [text];\n }\n\n const numSections = Math.ceil(estimatedTokens / maxTokens);\n const charsPerSection = Math.floor(text.length / numSections);\n const sections: string[] = [];\n\n for (let i = 0; i < numSections; i++) {\n const start = i * charsPerSection;\n const end = i === numSections - 1 ? 
text.length : (i + 1) * charsPerSection;\n sections.push(text.slice(start, end));\n }\n\n return sections;\n};\n"],"mappings":";;;;;;;;;AAAA,OAAO,YAAY;;;ACGZ,IAAM,kBAAkB;AASxB,IAAM,iBAAiB,CAAC,OAAe,YAAoB,SAAiB,gBAAwB;AAAA;AAAA,iBAE1F,KAAK;AAAA,kBACJ,UAAU;AAAA;AAAA;AAAA,EAG1B,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,iCAYwB,WAAW;AAErC,IAAM,yBAAyB,CAAC,MAAc,cAAsB,UAAyC;AAClH,MAAI;AACF,QAAI,WAAW,KAAK,KAAK;AACzB,QAAI,SAAS,WAAW,SAAS,GAAG;AAClC,iBAAW,SAAS,MAAM,GAAG,EAAE,EAAE,KAAK;AAAA,IACxC,WAAW,SAAS,WAAW,KAAK,GAAG;AACrC,iBAAW,SAAS,MAAM,GAAG,EAAE,EAAE,KAAK;AAAA,IACxC;AAEA,UAAM,SAAsB,KAAK,MAAM,QAAQ;AAE/C,UAAM,cAAc,WAAW,eAAe,CAAC,KAAK,KAAK;AAAA;AAAA,cAE/C,OAAO,WAAW,KAAK,IAAI,CAAC;AAAA;AAAA,UAEhC,OAAO,MAAM;AAAA;AAAA,WAEZ,OAAO,OAAO;AAAA;AAAA,eAEV,OAAO,WAAW;AAE7B,WAAO;AAAA,MACL;AAAA,MACA,cAAc;AAAA,MACd,YAAY,OAAO;AAAA,MACnB,QAAQ,OAAO;AAAA,MACf,SAAS,OAAO;AAAA,MAChB,aAAa,OAAO;AAAA,MACpB;AAAA,IACF;AAAA,EACF,SAAS,OAAO;AACd,YAAQ,+CAA+C,KAAK,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC,EAAE;AAC1H,WAAO;AAAA,EACT;AACF;AAEO,IAAM,oBAAoB,CAAC,MAAc,cAAgC;AAC9E,QAAM,kBAAkB,KAAK,KAAK,KAAK,SAAS,eAAe;AAE/D,MAAI,mBAAmB,WAAW;AAChC,WAAO,CAAC,IAAI;AAAA,EACd;AAEA,QAAM,cAAc,KAAK,KAAK,kBAAkB,SAAS;AACzD,QAAM,kBAAkB,KAAK,MAAM,KAAK,SAAS,WAAW;AAC5D,QAAM,WAAqB,CAAC;AAE5B,WAAS,IAAI,GAAG,IAAI,aAAa,KAAK;AACpC,UAAM,QAAQ,IAAI;AAClB,UAAM,MAAM,MAAM,cAAc,IAAI,KAAK,UAAU,IAAI,KAAK;AAC5D,aAAS,KAAK,KAAK,MAAM,OAAO,GAAG,CAAC;AAAA,EACtC;AAEA,SAAO;AACT;;;ADjFA,IAAM,iBAAiB,CAAC,SAAyB,KAAK,KAAK,KAAK,SAAS,eAAe;AAmBxF,IAAM,aAAa,CAAC,UAAqB,UAAsE;AAC7G,QAAM,QAAkB,CAAC;AACzB,QAAM,WAAkC,CAAC;AAEzC,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,UAAM,UAAU,SAAS,CAAC;AAC1B,UAAM,SAAS,eAAe,QAAQ,OAAO;AAE7C,QAAI,UAAU,oBAAoB;AAEhC,YAAM,OAAyB;AAAA,QAC7B,WAAW,WAAW,CAAC;AAAA,QACvB,QAAQ;AAAA,QACR,KAAK;AAAA,QACL,MAAM;AAAA,UACJ;AAAA,UACA,UAAU,CAAC,EAAE,MAAM,QAAQ,SAAS,eAAe,QAAQ,OAAO,IAAI,GAAG,QAAQ,SAAS,oBAAoB,EAAE,CAAC;AAAA,QACnH;AAAA,MACF;AACA,YAAM,KAAK,KAAK,UAAU,IAAI,CAAC;AAC/B,eAAS,KAAK,EAAE,cAAc,GAAG,OAAO,QAAQ,OAAO,cAAc,OAAO,cAAc,EAAE,CAAC;AAAA,IAC/F,OAAO;AAEL,YAAM,WAAW,kBAAkB,QAAQ,SAAS,kBAAkB;AAEtE,eAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,cAAM,OAAyB;AAAA,UAC7B,WAAW,WAAW,CAAC,IAAI,CAAC;AAAA,UAC5B,QAAQ;AAAA,UACR,KAAK;AAAA,UACH,MAAM;AAAA,YACJ;AAAA,YACA,UAAU,CAAC;AAAA,cACT,MAAM;AAAA,cACN,SAAS,wCAAwC,QAAQ,KAAK,WAAW,IAAI,CAAC;AAAA;AAAA,EAA4F,SAAS,CAAC,CAAC;AAAA,YACvL,CAAC;AAAA,UACH;AAAA,QACJ;AACA,cAAM,KAAK,KAAK,UAAU,IAAI,CAAC;AAAA,MACjC;AAEA,eAAS,KAAK,EAAE,cAAc,GAAG,OAAO,QAAQ,OAAO,cAAc,MAAM,cAAc,SAAS,OAAO,CAAC;AAAA,IAC5G;AAAA,EACF;AAEA,SAAO,EAAE,OAAO,MAAM,KAAK,IAAI,GAAG,SAAS;AAC7C;AAQO,IAAM,uBAAuB,OAAO,aAAoD;AAC7F,QAAM,SAAS,MAAM,UAAU;AAC/B,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,iDAAiD,SAAS,MAAM,WAAW;AAEnF,QAAM,EAAE,OAAO,SAAS,IAAI,WAAW,UAAU,OAAO,OAAO;AAC/D,QAAM,OAAO,IAAI,KAAK,CAAC,KAAK,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAC5D,QAAM,OAAO,MAAM,OAAO,MAAM,OAAO;AAAA,IACrC,MAAM,IAAI,KAAK,CAAC,IAAI,GAAG,mBAAmB,EAAE,MAAM,oBAAoB,CAAC;AAAA,IACvE,SAAS;AAAA,EACX,CAAC;AACD,UAAQ,yCAAyC,KAAK,EAAE,EAAE;AAE1D,QAAM,QAAQ,MAAM,OAAO,QAAQ,OAAO;AAAA,IACxC,eAAe,KAAK;AAAA,IACpB,UAAU;AAAA,IACV,mBAAmB;AAAA,EACrB,CAAC;AACD,UAAQ,mCAAmC,MAAM,EAAE,mBAAc,MAAM,MAAM,EAAE;AAE/E,SAAO,EAAE,SAAS,MAAM,IAAI,aAAa,KAAK,IAAI,SAAS;AAC7D;AAEO,IAAM,8BAA8B,OACzC,cACA,UACA,aACoI;AACpI,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,8CAA8C,YAAY,EAAE;AACpE,QAAM,WAAW,MAAM,OAAO,MAAM,QAAQ,YAAY;AACxD,QAAM,OAAO,MAAM,SAAS,KAAK;AACjC,QAAM,QAAQ,KAAK,KAAK,EAAE,MAAM,IAAI;AAEpC,QAAM,UAAU,oBAAI,IAAoB;AACxC,aAAW,QAAQ,OAAO;AACxB,QAAI;AACJ,QAAI;AACF,eAAS,KAAK,MAAM,IAAI;AAAA,IAC1B,QAAQ;AACN,cAAQ,iDAAiD;AACzD;AAAA,IACF;AACA,QAAI,OAAO,UAAU,gBAAgB,KAAK;AACxC
,YAAM,UAAU,OAAO,SAAS,MAAM,UAAU,CAAC,GAAG,SAAS;AAC7D,UAAI,SAAS;AACX,gBAAQ,IAAI,OAAO,WAAW,OAAO;AAAA,MACvC;AAAA,IACF,OAAO;AACL,cAAQ,6BAA6B,OAAO,SAAS,YAAY,KAAK,UAAU,OAAO,UAAU,MAAM,SAAS,OAAO,KAAK,CAAC,EAAE;AAAA,IACjI;AAAA,EACF;AAEA,QAAM,YAA8B,CAAC;AACrC,QAAM,iBAAwF,CAAC;AAE/F,aAAW,QAAQ,UAAU;AAC3B,QAAI,CAAC,KAAK,cAAc;AAEtB,YAAM,UAAU,QAAQ,IAAI,WAAW,KAAK,YAAY,EAAE;AAC1D,UAAI,SAAS;AACX,cAAM,UAAU,uBAAuB,SAAS,KAAK,cAAc,KAAK,KAAK;AAC7E,YAAI,QAAS,WAAU,KAAK,OAAO;AAAA,MACrC;AAAA,IACF,OAAO;AAEL,YAAM,mBAA6B,CAAC;AACpC,UAAI,aAAa;AACjB,eAAS,IAAI,GAAG,IAAI,KAAK,cAAc,KAAK;AAC1C,cAAM,UAAU,QAAQ,IAAI,WAAW,KAAK,YAAY,IAAI,CAAC,EAAE;AAC/D,YAAI,SAAS;AACX,2BAAiB,KAAK,OAAO;AAAA,QAC/B,OAAO;AACL,uBAAa;AAAA,QACf;AAAA,MACF;AACA,UAAI,cAAc,iBAAiB,SAAS,GAAG;AAC7C,uBAAe,KAAK,EAAE,cAAc,KAAK,cAAc,OAAO,KAAK,OAAO,iBAAiB,CAAC;AAAA,MAC9F,OAAO;AACL,gBAAQ,yDAAyD,KAAK,eAAe,CAAC,KAAK,KAAK,KAAK,GAAG;AAAA,MAC1G;AAAA,IACF;AAAA,EACF;AAEA,UAAQ,4BAA4B,UAAU,MAAM,sBAAsB,eAAe,MAAM,2BAA2B;AAC1H,SAAO,EAAE,WAAW,eAAe;AACrC;AAMO,IAAM,kBAAkB,OAC7B,kBAC+B;AAC/B,QAAM,SAAS,MAAM,UAAU;AAC/B,QAAM,SAAS,IAAI,OAAO;AAE1B,QAAM,QAAkB,CAAC;AACzB,QAAM,WAAkC,CAAC;AAEzC,aAAW,MAAM,eAAe;AAC9B,UAAM,WAAW,GAAG,iBAAiB,KAAK,MAAM;AAChD,UAAM,OAAyB;AAAA,MAC7B,WAAW,WAAW,GAAG,YAAY;AAAA,MACrC,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,MAAM;AAAA,QACJ,OAAO,OAAO;AAAA,QACd,UAAU,CAAC,EAAE,MAAM,QAAQ,SAAS,eAAe,GAAG,OAAO,GAAG,eAAe,GAAG,UAAU,oBAAoB,EAAE,CAAC;AAAA,MACrH;AAAA,IACF;AACA,UAAM,KAAK,KAAK,UAAU,IAAI,CAAC;AAC/B,aAAS,KAAK,EAAE,cAAc,GAAG,cAAc,OAAO,GAAG,OAAO,cAAc,OAAO,cAAc,EAAE,CAAC;AAAA,EACxG;AAEA,QAAM,QAAQ,MAAM,KAAK,IAAI;AAC7B,QAAM,OAAO,IAAI,KAAK,CAAC,KAAK,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAC5D,QAAM,OAAO,MAAM,OAAO,MAAM,OAAO;AAAA,IACrC,MAAM,IAAI,KAAK,CAAC,IAAI,GAAG,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAAA,IAC7E,SAAS;AAAA,EACX,CAAC;AAED,UAAQ,+CAA+C,KAAK,EAAE,KAAK,cAAc,MAAM,YAAY;AAEnG,QAAM,QAAQ,MAAM,OAAO,QAAQ,OAAO;AAAA,IACxC,eAAe,KAAK;AAAA,IACpB,UAAU;AAAA,IACV,mBAAmB;AAAA,EACrB,CAAC;AACD,UAAQ,yCAAyC,MAAM,EAAE,mBAAc,MAAM,MAAM,EAAE;AAErF,SAAO,EAAE,SAAS,MAAM,IAAI,aAAa,KAAK,IAAI,SAAS;AAC7D;AAEO,IAAM,uBAAuB,OAClC,cACA,kBAC8B;AAC9B,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,oDAAoD,YAAY,EAAE;AAC1E,QAAM,WAAW,MAAM,OAAO,MAAM,QAAQ,YAAY;AACxD,QAAM,OAAO,MAAM,SAAS,KAAK;AACjC,QAAM,QAAQ,KAAK,KAAK,EAAE,MAAM,IAAI;AAEpC,QAAM,YAA8B,CAAC;AACrC,aAAW,QAAQ,OAAO;AACxB,QAAI;AACJ,QAAI;AACF,eAAS,KAAK,MAAM,IAAI;AAAA,IAC1B,QAAQ;AACN,cAAQ,kEAAkE;AAC1E;AAAA,IACF;AACA,QAAI,OAAO,UAAU,gBAAgB,KAAK;AACxC,YAAM,UAAU,OAAO,SAAS,MAAM,UAAU,CAAC,GAAG,SAAS;AAC7D,UAAI,SAAS;AAEX,cAAM,MAAM,OAAO,OAAO,UAAU,QAAQ,YAAY,EAAE,CAAC;AAC3D,cAAM,OAAO,cAAc,KAAK,CAAC,OAAO,GAAG,iBAAiB,GAAG;AAC/D,YAAI,MAAM;AACR,gBAAM,UAAU,uBAAuB,SAAS,KAAK,KAAK,KAAK;AAC/D,cAAI,QAAS,WAAU,KAAK,OAAO;AAAA,QACrC;AAAA,MACF;AAAA,IACF,OAAO;AACL,cAAQ,mCAAmC,OAAO,SAAS,YAAY,KAAK,UAAU,OAAO,UAAU,MAAM,SAAS,OAAO,KAAK,CAAC,EAAE;AAAA,IACvI;AAAA,EACF;AAEA,UAAQ,4BAA4B,UAAU,MAAM,mBAAmB;AACvE,SAAO;AACT;","names":[]}
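The sourcesContent embedded in the map above documents the new two-pass summarizer: chapters under the token budget get a single structured-summary request, while longer chapters are split into sections whose summaries are merged in a second batch. A minimal driver sketch under stated assumptions: the hashed bundle is imported directly (the name is build-specific), the Chapter shape is reduced to the fields used here, and waitForBatch is a hypothetical polling helper with an illustrative one-minute interval.

import OpenAI from "openai";
import {
  submitBatchSummaries,
  downloadBatchSummaryResults,
  submitMergePass,
  downloadMergeResults
} from "./dist/batch-summarizer-7MCT4HJB.js"; // hashed name is build-specific

type Chapter = { title: string; content: string };

// Hypothetical helper: poll a batch until it completes, resolve with its output file id.
const waitForBatch = async (batchId: string): Promise<string> => {
  const client = new OpenAI();
  for (;;) {
    const batch = await client.batches.retrieve(batchId);
    if (batch.status === "completed" && batch.output_file_id) return batch.output_file_id;
    if (["failed", "expired", "cancelled"].includes(batch.status)) {
      throw new Error(`Batch ${batchId} ended as ${batch.status}`);
    }
    await new Promise((resolve) => setTimeout(resolve, 60_000));
  }
};

const summarizeBook = async (chapters: Chapter[]) => {
  // Pass 1: one request per short chapter, per-section requests for long ones.
  const first = await submitBatchSummaries(chapters);
  const { summaries, needsMergePass } = await downloadBatchSummaryResults(
    await waitForBatch(first.batchId),
    chapters,
    first.metadata
  );
  // Pass 2: merge section summaries of split chapters into structured summaries.
  if (needsMergePass.length > 0) {
    const merge = await submitMergePass(needsMergePass);
    summaries.push(...(await downloadMergeResults(await waitForBatch(merge.batchId), needsMergePass)));
  }
  return summaries;
};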
@@ -0,0 +1,103 @@
+import {
+  getModels,
+  logInfo,
+  logWarn
+} from "./chunk-LV52FEMB.js";
+
+// src/services/batch-embedder.ts
+import OpenAI from "openai";
+var buildJsonl = (chunks, model) => chunks.map(
+  (chunk, i) => ({
+    custom_id: String(i),
+    method: "POST",
+    url: "/v1/embeddings",
+    body: { model, input: chunk.content }
+  })
+).map((line) => JSON.stringify(line)).join("\n");
+var submitBatchEmbeddings = async (chunks) => {
+  const models = await getModels();
+  const client = new OpenAI();
+  logInfo(`[BatchEmbedder] Preparing batch request for ${chunks.length} chunks`);
+  const jsonl = buildJsonl(chunks, models.embedding);
+  const blob = new Blob([jsonl], { type: "application/jsonl" });
+  const file = await client.files.create({
+    file: new File([blob], "embeddings.jsonl", { type: "application/jsonl" }),
+    purpose: "batch"
+  });
+  logInfo(`[BatchEmbedder] Uploaded input file ${file.id}`);
+  const batch = await client.batches.create({
+    input_file_id: file.id,
+    endpoint: "/v1/embeddings",
+    completion_window: "24h"
+  });
+  logInfo(`[BatchEmbedder] Created batch ${batch.id} \u2014 status: ${batch.status}`);
+  return { batchId: batch.id, inputFileId: file.id };
+};
+var checkBatchStatus = async (batchId) => {
+  const client = new OpenAI();
+  const batch = await client.batches.retrieve(batchId);
+  return {
+    status: batch.status,
+    completed: batch.request_counts?.completed ?? 0,
+    failed: batch.request_counts?.failed ?? 0,
+    total: batch.request_counts?.total ?? 0,
+    outputFileId: batch.output_file_id ?? null,
+    errorFileId: batch.error_file_id ?? null
+  };
+};
+var downloadBatchResults = async (outputFileId, chunks) => {
+  const client = new OpenAI();
+  logInfo(`[BatchEmbedder] Downloading results from ${outputFileId}`);
+  const response = await client.files.content(outputFileId);
+  const text = await response.text();
+  const lines = text.trim().split("\n");
+  const vectors = /* @__PURE__ */ new Map();
+  for (const line of lines) {
+    let result;
+    try {
+      result = JSON.parse(line);
+    } catch {
+      logWarn(`[BatchEmbedder] Skipping malformed JSONL line`);
+      continue;
+    }
+    const idx = Number(result.custom_id);
+    if (result.response?.status_code === 200) {
+      const embedding = result.response.body?.data?.[0]?.embedding;
+      if (embedding) {
+        vectors.set(idx, embedding);
+      }
+    } else {
+      logWarn(
+        `[BatchEmbedder] Request ${idx} failed: ${JSON.stringify(result.response?.body?.error ?? result.error)}`
+      );
+    }
+  }
+  const embedded = chunks.map((chunk, i) => {
+    const vector = vectors.get(i) ?? [];
+    if (vector.length === 0) {
+      logWarn(`[BatchEmbedder] Chunk ${i} has empty embedding \u2014 skipping vector insertion`);
+    }
+    return { ...chunk, vector };
+  });
+  const missing = embedded.filter((e) => e.vector.length === 0).length;
+  if (missing > 0) {
+    logWarn(`[BatchEmbedder] ${missing} chunk(s) have empty embeddings due to batch errors`);
+  }
+  logInfo(`[BatchEmbedder] Successfully processed ${embedded.length} chunks via batch API`);
+  return embedded;
+};
+var cleanupBatchFiles = async (inputFileId, outputFileId) => {
+  const client = new OpenAI();
+  await client.files.del(inputFileId).catch(() => void 0);
+  if (outputFileId) {
+    await client.files.del(outputFileId).catch(() => void 0);
+  }
+};
+
+export {
+  submitBatchEmbeddings,
+  checkBatchStatus,
+  downloadBatchResults,
+  cleanupBatchFiles
+};
+//# sourceMappingURL=chunk-VBEGUDHG.js.map
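Taken together, chunk-VBEGUDHG.js exposes a submit, poll, download, cleanup lifecycle for embeddings. A driver sketch under the same caveats as the summarizer example above (direct import of a build-specific chunk, BookChunk reduced to the field the embedder reads, illustrative poll interval):

import {
  submitBatchEmbeddings,
  checkBatchStatus,
  downloadBatchResults,
  cleanupBatchFiles
} from "./dist/chunk-VBEGUDHG.js"; // hashed name is build-specific

type BookChunk = { content: string };

const embedBook = async (chunks: BookChunk[]) => {
  const { batchId, inputFileId } = await submitBatchEmbeddings(chunks);

  // Poll until the batch leaves its in-flight states; the 24h completion
  // window was set at submission, the one-minute interval is illustrative.
  let status = await checkBatchStatus(batchId);
  while (!["completed", "failed", "expired", "cancelled"].includes(status.status)) {
    await new Promise((resolve) => setTimeout(resolve, 60_000));
    status = await checkBatchStatus(batchId);
  }
  if (status.status !== "completed" || !status.outputFileId) {
    throw new Error(`Batch ${batchId} ended as ${status.status}`);
  }

  // Failed requests come back as empty vectors and are logged, not thrown.
  const embedded = await downloadBatchResults(status.outputFileId, chunks);
  await cleanupBatchFiles(inputFileId, status.outputFileId);
  return embedded;
};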
@@ -0,0 +1 @@
{"version":3,"sources":["../src/services/batch-embedder.ts"],"sourcesContent":["import OpenAI from \"openai\";\nimport type { BookChunk } from \"../shared/types.js\";\nimport type { EmbeddedChunk } from \"./embedder.js\";\nimport { getModels, logInfo, logWarn } from \"./constants.js\";\n\ntype BatchRequestLine = {\n custom_id: string;\n method: \"POST\";\n url: \"/v1/embeddings\";\n body: { model: string; input: string };\n};\n\nconst buildJsonl = (chunks: BookChunk[], model: string): string =>\n chunks\n .map(\n (chunk, i): BatchRequestLine => ({\n custom_id: String(i),\n method: \"POST\",\n url: \"/v1/embeddings\",\n body: { model, input: chunk.content },\n })\n )\n .map((line) => JSON.stringify(line))\n .join(\"\\n\");\n\nexport type BatchSubmitResult = {\n batchId: string;\n inputFileId: string;\n};\n\nexport const submitBatchEmbeddings = async (chunks: BookChunk[]): Promise<BatchSubmitResult> => {\n const models = await getModels();\n const client = new OpenAI();\n\n logInfo(`[BatchEmbedder] Preparing batch request for ${chunks.length} chunks`);\n\n const jsonl = buildJsonl(chunks, models.embedding);\n const blob = new Blob([jsonl], { type: \"application/jsonl\" });\n const file = await client.files.create({\n file: new File([blob], \"embeddings.jsonl\", { type: \"application/jsonl\" }),\n purpose: \"batch\",\n });\n logInfo(`[BatchEmbedder] Uploaded input file ${file.id}`);\n\n const batch = await client.batches.create({\n input_file_id: file.id,\n endpoint: \"/v1/embeddings\",\n completion_window: \"24h\",\n });\n logInfo(`[BatchEmbedder] Created batch ${batch.id} — status: ${batch.status}`);\n\n return { batchId: batch.id, inputFileId: file.id };\n};\n\nexport type BatchStatus = {\n status: string;\n completed: number;\n failed: number;\n total: number;\n outputFileId: string | null;\n errorFileId: string | null;\n};\n\nexport const checkBatchStatus = async (batchId: string): Promise<BatchStatus> => {\n const client = new OpenAI();\n const batch = await client.batches.retrieve(batchId);\n return {\n status: batch.status,\n completed: batch.request_counts?.completed ?? 0,\n failed: batch.request_counts?.failed ?? 0,\n total: batch.request_counts?.total ?? 0,\n outputFileId: batch.output_file_id ?? null,\n errorFileId: batch.error_file_id ?? null,\n };\n};\n\nexport const downloadBatchResults = async (\n outputFileId: string,\n chunks: BookChunk[],\n): Promise<EmbeddedChunk[]> => {\n const client = new OpenAI();\n\n logInfo(`[BatchEmbedder] Downloading results from ${outputFileId}`);\n const response = await client.files.content(outputFileId);\n const text = await response.text();\n const lines = text.trim().split(\"\\n\");\n\n const vectors = new Map<number, number[]>();\n for (const line of lines) {\n let result: any;\n try {\n result = JSON.parse(line);\n } catch {\n logWarn(`[BatchEmbedder] Skipping malformed JSONL line`);\n continue;\n }\n const idx = Number(result.custom_id);\n if (result.response?.status_code === 200) {\n const embedding = result.response.body?.data?.[0]?.embedding;\n if (embedding) {\n vectors.set(idx, embedding);\n }\n } else {\n logWarn(\n `[BatchEmbedder] Request ${idx} failed: ${JSON.stringify(result.response?.body?.error ?? result.error)}`\n );\n }\n }\n\n const embedded: EmbeddedChunk[] = chunks.map((chunk, i) => {\n const vector = vectors.get(i) ?? 
[];\n if (vector.length === 0) {\n logWarn(`[BatchEmbedder] Chunk ${i} has empty embedding — skipping vector insertion`);\n }\n return { ...chunk, vector };\n });\n\n const missing = embedded.filter((e) => e.vector.length === 0).length;\n if (missing > 0) {\n logWarn(`[BatchEmbedder] ${missing} chunk(s) have empty embeddings due to batch errors`);\n }\n\n logInfo(`[BatchEmbedder] Successfully processed ${embedded.length} chunks via batch API`);\n return embedded;\n};\n\nexport const cleanupBatchFiles = async (inputFileId: string, outputFileId?: string | null) => {\n const client = new OpenAI();\n await client.files.del(inputFileId).catch(() => undefined);\n if (outputFileId) {\n await client.files.del(outputFileId).catch(() => undefined);\n }\n};\n"],"mappings":";;;;;;;AAAA,OAAO,YAAY;AAYnB,IAAM,aAAa,CAAC,QAAqB,UACvC,OACG;AAAA,EACC,CAAC,OAAO,OAAyB;AAAA,IAC/B,WAAW,OAAO,CAAC;AAAA,IACnB,QAAQ;AAAA,IACR,KAAK;AAAA,IACL,MAAM,EAAE,OAAO,OAAO,MAAM,QAAQ;AAAA,EACtC;AACF,EACC,IAAI,CAAC,SAAS,KAAK,UAAU,IAAI,CAAC,EAClC,KAAK,IAAI;AAOP,IAAM,wBAAwB,OAAO,WAAoD;AAC9F,QAAM,SAAS,MAAM,UAAU;AAC/B,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,+CAA+C,OAAO,MAAM,SAAS;AAE7E,QAAM,QAAQ,WAAW,QAAQ,OAAO,SAAS;AACjD,QAAM,OAAO,IAAI,KAAK,CAAC,KAAK,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAC5D,QAAM,OAAO,MAAM,OAAO,MAAM,OAAO;AAAA,IACrC,MAAM,IAAI,KAAK,CAAC,IAAI,GAAG,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAAA,IACxE,SAAS;AAAA,EACX,CAAC;AACD,UAAQ,uCAAuC,KAAK,EAAE,EAAE;AAExD,QAAM,QAAQ,MAAM,OAAO,QAAQ,OAAO;AAAA,IACxC,eAAe,KAAK;AAAA,IACpB,UAAU;AAAA,IACV,mBAAmB;AAAA,EACrB,CAAC;AACD,UAAQ,iCAAiC,MAAM,EAAE,mBAAc,MAAM,MAAM,EAAE;AAE7E,SAAO,EAAE,SAAS,MAAM,IAAI,aAAa,KAAK,GAAG;AACnD;AAWO,IAAM,mBAAmB,OAAO,YAA0C;AAC/E,QAAM,SAAS,IAAI,OAAO;AAC1B,QAAM,QAAQ,MAAM,OAAO,QAAQ,SAAS,OAAO;AACnD,SAAO;AAAA,IACL,QAAQ,MAAM;AAAA,IACd,WAAW,MAAM,gBAAgB,aAAa;AAAA,IAC9C,QAAQ,MAAM,gBAAgB,UAAU;AAAA,IACxC,OAAO,MAAM,gBAAgB,SAAS;AAAA,IACtC,cAAc,MAAM,kBAAkB;AAAA,IACtC,aAAa,MAAM,iBAAiB;AAAA,EACtC;AACF;AAEO,IAAM,uBAAuB,OAClC,cACA,WAC6B;AAC7B,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,4CAA4C,YAAY,EAAE;AAClE,QAAM,WAAW,MAAM,OAAO,MAAM,QAAQ,YAAY;AACxD,QAAM,OAAO,MAAM,SAAS,KAAK;AACjC,QAAM,QAAQ,KAAK,KAAK,EAAE,MAAM,IAAI;AAEpC,QAAM,UAAU,oBAAI,IAAsB;AAC1C,aAAW,QAAQ,OAAO;AACxB,QAAI;AACJ,QAAI;AACF,eAAS,KAAK,MAAM,IAAI;AAAA,IAC1B,QAAQ;AACN,cAAQ,+CAA+C;AACvD;AAAA,IACF;AACA,UAAM,MAAM,OAAO,OAAO,SAAS;AACnC,QAAI,OAAO,UAAU,gBAAgB,KAAK;AACxC,YAAM,YAAY,OAAO,SAAS,MAAM,OAAO,CAAC,GAAG;AACnD,UAAI,WAAW;AACb,gBAAQ,IAAI,KAAK,SAAS;AAAA,MAC5B;AAAA,IACF,OAAO;AACL;AAAA,QACE,2BAA2B,GAAG,YAAY,KAAK,UAAU,OAAO,UAAU,MAAM,SAAS,OAAO,KAAK,CAAC;AAAA,MACxG;AAAA,IACF;AAAA,EACF;AAEA,QAAM,WAA4B,OAAO,IAAI,CAAC,OAAO,MAAM;AACzD,UAAM,SAAS,QAAQ,IAAI,CAAC,KAAK,CAAC;AAClC,QAAI,OAAO,WAAW,GAAG;AACvB,cAAQ,yBAAyB,CAAC,uDAAkD;AAAA,IACtF;AACA,WAAO,EAAE,GAAG,OAAO,OAAO;AAAA,EAC5B,CAAC;AAED,QAAM,UAAU,SAAS,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,CAAC,EAAE;AAC9D,MAAI,UAAU,GAAG;AACf,YAAQ,mBAAmB,OAAO,qDAAqD;AAAA,EACzF;AAEA,UAAQ,0CAA0C,SAAS,MAAM,uBAAuB;AACxF,SAAO;AACT;AAEO,IAAM,oBAAoB,OAAO,aAAqB,iBAAiC;AAC5F,QAAM,SAAS,IAAI,OAAO;AAC1B,QAAM,OAAO,MAAM,IAAI,WAAW,EAAE,MAAM,MAAM,MAAS;AACzD,MAAI,cAAc;AAChB,UAAM,OAAO,MAAM,IAAI,YAAY,EAAE,MAAM,MAAM,MAAS;AAAA,EAC5D;AACF;","names":[]}
@@ -0,0 +1,94 @@
+import {
+  getModels,
+  logInfo,
+  logWarn
+} from "./chunk-KGG7WEYE.js";
+
+// src/services/batch-embedder.ts
+import OpenAI from "openai";
+var buildJsonl = (chunks, model) => chunks.map(
+  (chunk, i) => ({
+    custom_id: String(i),
+    method: "POST",
+    url: "/v1/embeddings",
+    body: { model, input: chunk.content }
+  })
+).map((line) => JSON.stringify(line)).join("\n");
+var submitBatchEmbeddings = async (chunks) => {
+  const models = await getModels();
+  const client = new OpenAI();
+  logInfo(`[BatchEmbedder] Preparing batch request for ${chunks.length} chunks`);
+  const jsonl = buildJsonl(chunks, models.embedding);
+  const blob = new Blob([jsonl], { type: "application/jsonl" });
+  const file = await client.files.create({
+    file: new File([blob], "embeddings.jsonl", { type: "application/jsonl" }),
+    purpose: "batch"
+  });
+  logInfo(`[BatchEmbedder] Uploaded input file ${file.id}`);
+  const batch = await client.batches.create({
+    input_file_id: file.id,
+    endpoint: "/v1/embeddings",
+    completion_window: "24h"
+  });
+  logInfo(`[BatchEmbedder] Created batch ${batch.id} \u2014 status: ${batch.status}`);
+  return { batchId: batch.id, inputFileId: file.id };
+};
+var checkBatchStatus = async (batchId) => {
+  const client = new OpenAI();
+  const batch = await client.batches.retrieve(batchId);
+  return {
+    status: batch.status,
+    completed: batch.request_counts?.completed ?? 0,
+    failed: batch.request_counts?.failed ?? 0,
+    total: batch.request_counts?.total ?? 0,
+    outputFileId: batch.output_file_id ?? null,
+    errorFileId: batch.error_file_id ?? null
+  };
+};
+var downloadBatchResults = async (outputFileId, chunks) => {
+  const client = new OpenAI();
+  logInfo(`[BatchEmbedder] Downloading results from ${outputFileId}`);
+  const response = await client.files.content(outputFileId);
+  const text = await response.text();
+  const lines = text.trim().split("\n");
+  const vectors = /* @__PURE__ */ new Map();
+  for (const line of lines) {
+    const result = JSON.parse(line);
+    const idx = Number(result.custom_id);
+    if (result.response?.status_code === 200) {
+      const embedding = result.response.body?.data?.[0]?.embedding;
+      if (embedding) {
+        vectors.set(idx, embedding);
+      }
+    } else {
+      logWarn(
+        `[BatchEmbedder] Request ${idx} failed: ${JSON.stringify(result.response?.body?.error ?? result.error)}`
+      );
+    }
+  }
+  const embedded = chunks.map((chunk, i) => ({
+    ...chunk,
+    vector: vectors.get(i) ?? []
+  }));
+  const missing = embedded.filter((e) => e.vector.length === 0).length;
+  if (missing > 0) {
+    logWarn(`[BatchEmbedder] ${missing} chunk(s) have empty embeddings due to batch errors`);
+  }
+  logInfo(`[BatchEmbedder] Successfully processed ${embedded.length} chunks via batch API`);
+  return embedded;
+};
+var cleanupBatchFiles = async (inputFileId, outputFileId) => {
+  const client = new OpenAI();
+  await client.files.del(inputFileId).catch(() => void 0);
+  if (outputFileId) {
+    await client.files.del(outputFileId).catch(() => void 0);
+  }
+};
+
+export {
+  submitBatchEmbeddings,
+  checkBatchStatus,
+  downloadBatchResults,
+  cleanupBatchFiles
+};
+//# sourceMappingURL=chunk-XXO66RCF.js.map
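Note the one behavioral difference between this chunk and chunk-VBEGUDHG.js above: here downloadBatchResults parses each JSONL line with a bare JSON.parse and attaches empty vectors silently, while the other build guards the parse with try/catch, skips malformed lines, and warns per chunk that comes back without an embedding. The aggregate "chunk(s) have empty embeddings" warning is common to both.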
@@ -0,0 +1 @@
{"version":3,"sources":["../src/services/batch-embedder.ts"],"sourcesContent":["import OpenAI from \"openai\";\nimport type { BookChunk } from \"../shared/types.js\";\nimport type { EmbeddedChunk } from \"./embedder.js\";\nimport { getModels, logInfo, logWarn } from \"./constants.js\";\n\ntype BatchRequestLine = {\n custom_id: string;\n method: \"POST\";\n url: \"/v1/embeddings\";\n body: { model: string; input: string };\n};\n\nconst buildJsonl = (chunks: BookChunk[], model: string): string =>\n chunks\n .map(\n (chunk, i): BatchRequestLine => ({\n custom_id: String(i),\n method: \"POST\",\n url: \"/v1/embeddings\",\n body: { model, input: chunk.content },\n })\n )\n .map((line) => JSON.stringify(line))\n .join(\"\\n\");\n\nexport type BatchSubmitResult = {\n batchId: string;\n inputFileId: string;\n};\n\nexport const submitBatchEmbeddings = async (chunks: BookChunk[]): Promise<BatchSubmitResult> => {\n const models = await getModels();\n const client = new OpenAI();\n\n logInfo(`[BatchEmbedder] Preparing batch request for ${chunks.length} chunks`);\n\n const jsonl = buildJsonl(chunks, models.embedding);\n const blob = new Blob([jsonl], { type: \"application/jsonl\" });\n const file = await client.files.create({\n file: new File([blob], \"embeddings.jsonl\", { type: \"application/jsonl\" }),\n purpose: \"batch\",\n });\n logInfo(`[BatchEmbedder] Uploaded input file ${file.id}`);\n\n const batch = await client.batches.create({\n input_file_id: file.id,\n endpoint: \"/v1/embeddings\",\n completion_window: \"24h\",\n });\n logInfo(`[BatchEmbedder] Created batch ${batch.id} — status: ${batch.status}`);\n\n return { batchId: batch.id, inputFileId: file.id };\n};\n\nexport type BatchStatus = {\n status: string;\n completed: number;\n failed: number;\n total: number;\n outputFileId: string | null;\n errorFileId: string | null;\n};\n\nexport const checkBatchStatus = async (batchId: string): Promise<BatchStatus> => {\n const client = new OpenAI();\n const batch = await client.batches.retrieve(batchId);\n return {\n status: batch.status,\n completed: batch.request_counts?.completed ?? 0,\n failed: batch.request_counts?.failed ?? 0,\n total: batch.request_counts?.total ?? 0,\n outputFileId: batch.output_file_id ?? null,\n errorFileId: batch.error_file_id ?? null,\n };\n};\n\nexport const downloadBatchResults = async (\n outputFileId: string,\n chunks: BookChunk[],\n): Promise<EmbeddedChunk[]> => {\n const client = new OpenAI();\n\n logInfo(`[BatchEmbedder] Downloading results from ${outputFileId}`);\n const response = await client.files.content(outputFileId);\n const text = await response.text();\n const lines = text.trim().split(\"\\n\");\n\n const vectors = new Map<number, number[]>();\n for (const line of lines) {\n const result = JSON.parse(line);\n const idx = Number(result.custom_id);\n if (result.response?.status_code === 200) {\n const embedding = result.response.body?.data?.[0]?.embedding;\n if (embedding) {\n vectors.set(idx, embedding);\n }\n } else {\n logWarn(\n `[BatchEmbedder] Request ${idx} failed: ${JSON.stringify(result.response?.body?.error ?? result.error)}`\n );\n }\n }\n\n const embedded: EmbeddedChunk[] = chunks.map((chunk, i) => ({\n ...chunk,\n vector: vectors.get(i) ?? 
[],\n }));\n\n const missing = embedded.filter((e) => e.vector.length === 0).length;\n if (missing > 0) {\n logWarn(`[BatchEmbedder] ${missing} chunk(s) have empty embeddings due to batch errors`);\n }\n\n logInfo(`[BatchEmbedder] Successfully processed ${embedded.length} chunks via batch API`);\n return embedded;\n};\n\nexport const cleanupBatchFiles = async (inputFileId: string, outputFileId?: string | null) => {\n const client = new OpenAI();\n await client.files.del(inputFileId).catch(() => undefined);\n if (outputFileId) {\n await client.files.del(outputFileId).catch(() => undefined);\n }\n};\n"],"mappings":";;;;;;;AAAA,OAAO,YAAY;AAYnB,IAAM,aAAa,CAAC,QAAqB,UACvC,OACG;AAAA,EACC,CAAC,OAAO,OAAyB;AAAA,IAC/B,WAAW,OAAO,CAAC;AAAA,IACnB,QAAQ;AAAA,IACR,KAAK;AAAA,IACL,MAAM,EAAE,OAAO,OAAO,MAAM,QAAQ;AAAA,EACtC;AACF,EACC,IAAI,CAAC,SAAS,KAAK,UAAU,IAAI,CAAC,EAClC,KAAK,IAAI;AAOP,IAAM,wBAAwB,OAAO,WAAoD;AAC9F,QAAM,SAAS,MAAM,UAAU;AAC/B,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,+CAA+C,OAAO,MAAM,SAAS;AAE7E,QAAM,QAAQ,WAAW,QAAQ,OAAO,SAAS;AACjD,QAAM,OAAO,IAAI,KAAK,CAAC,KAAK,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAC5D,QAAM,OAAO,MAAM,OAAO,MAAM,OAAO;AAAA,IACrC,MAAM,IAAI,KAAK,CAAC,IAAI,GAAG,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAAA,IACxE,SAAS;AAAA,EACX,CAAC;AACD,UAAQ,uCAAuC,KAAK,EAAE,EAAE;AAExD,QAAM,QAAQ,MAAM,OAAO,QAAQ,OAAO;AAAA,IACxC,eAAe,KAAK;AAAA,IACpB,UAAU;AAAA,IACV,mBAAmB;AAAA,EACrB,CAAC;AACD,UAAQ,iCAAiC,MAAM,EAAE,mBAAc,MAAM,MAAM,EAAE;AAE7E,SAAO,EAAE,SAAS,MAAM,IAAI,aAAa,KAAK,GAAG;AACnD;AAWO,IAAM,mBAAmB,OAAO,YAA0C;AAC/E,QAAM,SAAS,IAAI,OAAO;AAC1B,QAAM,QAAQ,MAAM,OAAO,QAAQ,SAAS,OAAO;AACnD,SAAO;AAAA,IACL,QAAQ,MAAM;AAAA,IACd,WAAW,MAAM,gBAAgB,aAAa;AAAA,IAC9C,QAAQ,MAAM,gBAAgB,UAAU;AAAA,IACxC,OAAO,MAAM,gBAAgB,SAAS;AAAA,IACtC,cAAc,MAAM,kBAAkB;AAAA,IACtC,aAAa,MAAM,iBAAiB;AAAA,EACtC;AACF;AAEO,IAAM,uBAAuB,OAClC,cACA,WAC6B;AAC7B,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,4CAA4C,YAAY,EAAE;AAClE,QAAM,WAAW,MAAM,OAAO,MAAM,QAAQ,YAAY;AACxD,QAAM,OAAO,MAAM,SAAS,KAAK;AACjC,QAAM,QAAQ,KAAK,KAAK,EAAE,MAAM,IAAI;AAEpC,QAAM,UAAU,oBAAI,IAAsB;AAC1C,aAAW,QAAQ,OAAO;AACxB,UAAM,SAAS,KAAK,MAAM,IAAI;AAC9B,UAAM,MAAM,OAAO,OAAO,SAAS;AACnC,QAAI,OAAO,UAAU,gBAAgB,KAAK;AACxC,YAAM,YAAY,OAAO,SAAS,MAAM,OAAO,CAAC,GAAG;AACnD,UAAI,WAAW;AACb,gBAAQ,IAAI,KAAK,SAAS;AAAA,MAC5B;AAAA,IACF,OAAO;AACL;AAAA,QACE,2BAA2B,GAAG,YAAY,KAAK,UAAU,OAAO,UAAU,MAAM,SAAS,OAAO,KAAK,CAAC;AAAA,MACxG;AAAA,IACF;AAAA,EACF;AAEA,QAAM,WAA4B,OAAO,IAAI,CAAC,OAAO,OAAO;AAAA,IAC1D,GAAG;AAAA,IACH,QAAQ,QAAQ,IAAI,CAAC,KAAK,CAAC;AAAA,EAC7B,EAAE;AAEF,QAAM,UAAU,SAAS,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,CAAC,EAAE;AAC9D,MAAI,UAAU,GAAG;AACf,YAAQ,mBAAmB,OAAO,qDAAqD;AAAA,EACzF;AAEA,UAAQ,0CAA0C,SAAS,MAAM,uBAAuB;AACxF,SAAO;AACT;AAEO,IAAM,oBAAoB,OAAO,aAAqB,iBAAiC;AAC5F,QAAM,SAAS,IAAI,OAAO;AAC1B,QAAM,OAAO,MAAM,IAAI,WAAW,EAAE,MAAM,MAAM,MAAS;AACzD,MAAI,cAAc;AAChB,UAAM,OAAO,MAAM,IAAI,YAAY,EAAE,MAAM,MAAM,MAAS;AAAA,EAC5D;AACF;","names":[]}