veryfront 0.1.99 → 0.1.100

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
- {"version":3,"file":"command-help.d.ts","sourceRoot":"","sources":["../../../../src/cli/commands/knowledge/command-help.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAEvD,eAAO,MAAM,aAAa,EAAE,WAuC3B,CAAC"}
+ {"version":3,"file":"command-help.d.ts","sourceRoot":"","sources":["../../../../src/cli/commands/knowledge/command-help.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAEvD,eAAO,MAAM,aAAa,EAAE,WAwC3B,CAAC"}
@@ -34,7 +34,8 @@ export const knowledgeHelp = {
      "Primary subcommand: ingest",
      "`uploads/...` means a remote project upload; use `./uploads/...` or `/workspace/uploads/...` to force a local file",
      "`ingest` orchestrates upload resolution, parsing, and project file writes",
-     "Requires python3; non-text formats also require the supported parser packages unless you run inside the Veryfront sandbox",
-     "The Veryfront sandbox image includes `kreuzberg`, and knowledge ingest falls back to the built-in parser when `kreuzberg` is unavailable or extraction fails",
+     "Requires python3; install `docling` locally to match the sandbox parsing path for PDF, Office, and HTML sources",
+     "Supported PDF, Office, and HTML sources are parsed through `docling` when it is available",
+     "The Veryfront sandbox image includes `docling`, and knowledge ingest falls back to the built-in parser when `docling` is unavailable or extraction fails",
    ],
  };
@@ -17,6 +17,12 @@ export interface KnowledgeParserResult {
    stats: Record<string, unknown>;
    warnings: string[];
  }
+ export interface KnowledgeParserInput {
+   filePath: string;
+   description?: string;
+   slug?: string;
+   sourceReference?: string;
+ }
  type KnowledgeSource = {
    kind: "local";
    input: string;
@@ -128,6 +134,11 @@ export declare function runKnowledgeParser(input: {
    sourceReference?: string;
    env?: Record<string, string>;
  }): Promise<KnowledgeParserResult>;
+ export declare function runKnowledgeParsers(input: {
+   files: KnowledgeParserInput[];
+   outputDir: string;
+   env?: Record<string, string>;
+ }): Promise<KnowledgeParserResult[]>;
  export declare function collectKnowledgeSources(options: Pick<KnowledgeIngestOptions, "sources" | "path" | "all" | "recursive">, deps: {
    client: ApiClient;
    projectSlug: string;
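
For orientation, a minimal TypeScript sketch of calling the new batch entry point declared above; the import path, file paths, and slugs here are hypothetical, and only the call shape follows the declaration:

    // Sketch only: import path and inputs are hypothetical.
    import { runKnowledgeParsers } from "./command.js";

    const results = await runKnowledgeParsers({
      files: [
        { filePath: "/workspace/uploads/report.pdf", slug: "report" },
        { filePath: "/workspace/uploads/notes.md", description: "Meeting notes" },
      ],
      outputDir: "/workspace/knowledge",
    });
    // Resolves to one KnowledgeParserResult per input file;
    // an empty `files` array resolves to [].

The single-file runKnowledgeParser keeps its old signature and, as the implementation hunks below show, now delegates to this batch variant.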
@@ -1 +1 @@
- {"version":3,"file":"command.d.ts","sourceRoot":"","sources":["../../../../src/cli/commands/knowledge/command.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAKxB,OAAO,EAAE,KAAK,SAAS,EAA0C,MAAM,wBAAwB,CAAC;AAChG,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AAIxD,OAAO,EAAuB,KAAK,MAAM,EAAgB,MAAM,6BAA6B,CAAC;AAG7F,OAAO,EAEL,KAAK,+BAA+B,EAEpC,KAAK,yBAAyB,EAC9B,KAAK,gCAAgC,EACtC,MAAM,aAAa,CAAC;AAErB,MAAM,WAAW,qBAAqB;IACpC,OAAO,EAAE,IAAI,CAAC;IACd,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,mBAAmB,EAAE,MAAM,CAAC;IAC5B,sBAAsB,EAAE,MAAM,CAAC;IAC/B,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC/B,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,KAAK,eAAe,GAChB;IAAE,IAAI,EAAE,OAAO,CAAC;IAAC,KAAK,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GACnD;IAAE,IAAI,EAAE,QAAQ,CAAC;IAAC,KAAK,EAAE,MAAM,CAAC;IAAC,UAAU,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,CAAC;AAE7E,MAAM,WAAW,yBAAyB;IACxC,OAAO,EAAE,eAAe,EAAE,CAAC;IAC3B,OAAO,EAAE,gCAAgC,EAAE,CAAC;CAC7C;AAED,KAAK,cAAc,GAAG;IAAE,UAAU,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAIhF,QAAA,MAAM,yBAAyB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAmD7B,CAAC;AAEH,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,yBAAyB,CAAC,CAAC;AAoD/E,wBAAgB,wBAAwB,CACtC,IAAI,EAAE,UAAU,GACf,CAAC,CAAC,mBAAmB,CAAC,OAAO,EAAE,sBAAsB,CAAC,CAexD;AAED,wBAAgB,2BAA2B,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAMrE;AAED,wBAAgB,0BAA0B,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAGpE;AAED,wBAAgB,2BAA2B,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,CAKtE;AAcD,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAGxD;AAyBD,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAE1D;AAMD,wBAAgB,iCAAiC,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAE3E;AAwID,wBAAgB,kBAAkB,CAAC,MAAM,EAAE,eAAe,EAAE,KAAK,EAAE,MAAM,GAAG,MAAM,CAkCjF;AACD,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,MAAM,EAAE,CAQtE;AAED,wBAAgB,yBAAyB,CACvC,UAAU,EAAE,MAAM,EAClB,SAAS,EAAE,MAAM,EACjB,aAAa,EAAE,MAAM,GACpB,MAAM,CAQR;AAED,wBAAgB,2BAA2B,CAAC,KAAK,EAAE;IACjD,MAAM,EAAE,MAAM,CAAC;IACf,eAAe,EAAE,MAAM,CAAC;IACxB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,IAAI,CAAC,qBAAqB,EAAE,MAAM,GAAG,OAAO,GAAG,UAAU,GAAG,aAAa,GAAG,SAAS,CAAC,CAAC;CAChG,GAAG,yBAAyB,CAY5B;AAED,wBAAsB,kBAAkB,CAAC,KAAK,EAAE;IAC9C,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC9B,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAuDjC;AAED,wBAAsB,uBAAuB,CAC3C,OAAO,EAAE,IAAI,CAAC,sBAAsB,EAAE,SAAS,GAAG,MAAM,GAAG,KAAK,GAAG,WAAW,CAAC,EAC/E,IAAI,EAAE;IACJ,MAAM,EAAE,SAAS,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,CAAC,WAAW,EAAE,MAAM,EAAE,KAAK,OAAO,CAAC,cAAc,EAAE,CAAC,CAAC;CACvE,GACA,OAAO,CAAC,yBAAyB,CAAC,CAuJpC;AAED,wBAAsB,qBAAqB,CACzC,OAAO,EAAE,eAAe,EAAE,EAC1B,OAAO,EAAE,sBAAsB,EAC/B,IAAI,EAAE;IACJ,MAAM,EAAE,SAAS,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,OAAO,kBAAkB,CAAC;IACrC,mBAAmB,EAAE,CAAC,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,KAAK,OAAO,CAAC;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC1F,WAAW,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC7B,GACA,OAAO,CAAC;IACT,QAAQ,EAAE,yBAAyB,EAAE,CAAC;IACtC,MAAM,EAAE,+BAA+B,EAAE,CAAC;CAC3C,CAAC,CAsGD;AAED,wBAAsB,gBAAgB,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,CAwItE"}
+ {"version":3,"file":"command.d.ts","sourceRoot":"","sources":["../../../../src/cli/commands/knowledge/command.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAKxB,OAAO,EAAE,KAAK,SAAS,EAA0C,MAAM,wBAAwB,CAAC;AAChG,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AAIxD,OAAO,EAAuB,KAAK,MAAM,EAAgB,MAAM,6BAA6B,CAAC;AAG7F,OAAO,EAEL,KAAK,+BAA+B,EAEpC,KAAK,yBAAyB,EAC9B,KAAK,gCAAgC,EACtC,MAAM,aAAa,CAAC;AAErB,MAAM,WAAW,qBAAqB;IACpC,OAAO,EAAE,IAAI,CAAC;IACd,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,mBAAmB,EAAE,MAAM,CAAC;IAC5B,sBAAsB,EAAE,MAAM,CAAC;IAC/B,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC/B,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,MAAM,WAAW,oBAAoB;IACnC,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,eAAe,CAAC,EAAE,MAAM,CAAC;CAC1B;AACD,KAAK,eAAe,GAChB;IAAE,IAAI,EAAE,OAAO,CAAC;IAAC,KAAK,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GACnD;IAAE,IAAI,EAAE,QAAQ,CAAC;IAAC,KAAK,EAAE,MAAM,CAAC;IAAC,UAAU,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,CAAC;AAE7E,MAAM,WAAW,yBAAyB;IACxC,OAAO,EAAE,eAAe,EAAE,CAAC;IAC3B,OAAO,EAAE,gCAAgC,EAAE,CAAC;CAC7C;AAED,KAAK,cAAc,GAAG;IAAE,UAAU,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAIhF,QAAA,MAAM,yBAAyB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAmD7B,CAAC;AAEH,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,yBAAyB,CAAC,CAAC;AAoD/E,wBAAgB,wBAAwB,CACtC,IAAI,EAAE,UAAU,GACf,CAAC,CAAC,mBAAmB,CAAC,OAAO,EAAE,sBAAsB,CAAC,CAexD;AAED,wBAAgB,2BAA2B,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAMrE;AAED,wBAAgB,0BAA0B,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAGpE;AAED,wBAAgB,2BAA2B,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,CAKtE;AAcD,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAGxD;AAyBD,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAE1D;AAMD,wBAAgB,iCAAiC,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAE3E;AAwID,wBAAgB,kBAAkB,CAAC,MAAM,EAAE,eAAe,EAAE,KAAK,EAAE,MAAM,GAAG,MAAM,CAkCjF;AACD,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,MAAM,EAAE,CAQtE;AAED,wBAAgB,yBAAyB,CACvC,UAAU,EAAE,MAAM,EAClB,SAAS,EAAE,MAAM,EACjB,aAAa,EAAE,MAAM,GACpB,MAAM,CAQR;AAED,wBAAgB,2BAA2B,CAAC,KAAK,EAAE;IACjD,MAAM,EAAE,MAAM,CAAC;IACf,eAAe,EAAE,MAAM,CAAC;IACxB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,IAAI,CAAC,qBAAqB,EAAE,MAAM,GAAG,OAAO,GAAG,UAAU,GAAG,aAAa,GAAG,SAAS,CAAC,CAAC;CAChG,GAAG,yBAAyB,CAY5B;AAED,wBAAsB,kBAAkB,CAAC,KAAK,EAAE;IAC9C,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC9B,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAiBjC;AAED,wBAAsB,mBAAmB,CAAC,KAAK,EAAE;IAC/C,KAAK,EAAE,oBAAoB,EAAE,CAAC;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC9B,GAAG,OAAO,CAAC,qBAAqB,EAAE,CAAC,CA8DnC;AAED,wBAAsB,uBAAuB,CAC3C,OAAO,EAAE,IAAI,CAAC,sBAAsB,EAAE,SAAS,GAAG,MAAM,GAAG,KAAK,GAAG,WAAW,CAAC,EAC/E,IAAI,EAAE;IACJ,MAAM,EAAE,SAAS,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,CAAC,WAAW,EAAE,MAAM,EAAE,KAAK,OAAO,CAAC,cAAc,EAAE,CAAC,CAAC;CACvE,GACA,OAAO,CAAC,yBAAyB,CAAC,CAuJpC;AAED,wBAAsB,qBAAqB,CACzC,OAAO,EAAE,eAAe,EAAE,EAC1B,OAAO,EAAE,sBAAsB,EAC/B,IAAI,EAAE;IACJ,MAAM,EAAE,SAAS,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,OAAO,kBAAkB,CAAC;IACrC,mBAAmB,EAAE,CAAC,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,KAAK,OAAO,CAAC;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC1F,WAAW,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC7B,GACA,OAAO,CAA
C;IACT,QAAQ,EAAE,yBAAyB,EAAE,CAAC;IACtC,MAAM,EAAE,+BAA+B,EAAE,CAAC;CAC3C,CAAC,CAsGD;AAED,wBAAsB,gBAAgB,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,CAwItE"}
@@ -348,6 +348,25 @@ export function createKnowledgeIngestResult(input) {
    };
  }
  export async function runKnowledgeParser(input) {
+   const [result] = await runKnowledgeParsers({
+     files: [{
+       filePath: input.filePath,
+       description: input.description,
+       slug: input.slug,
+       sourceReference: input.sourceReference,
+     }],
+     outputDir: input.outputDir,
+     env: input.env,
+   });
+   if (!result) {
+     throw new Error("knowledge ingest parser returned no results");
+   }
+   return result;
+ }
+ export async function runKnowledgeParsers(input) {
+   if (!input.files.length) {
+     return [];
+   }
    const tempDir = await dntShim.Deno.makeTempDir({ prefix: "veryfront-knowledge-parser-" });
    const inputJsonPath = `${tempDir}/input.json`;
    const outputJsonPath = `${tempDir}/output.json`;
@@ -355,11 +374,13 @@ export async function runKnowledgeParser(input) {
    try {
      try {
        await dntShim.Deno.writeTextFile(inputJsonPath, JSON.stringify({
-         file_path: input.filePath,
+         files: input.files.map((file) => ({
+           file_path: file.filePath,
+           description: file.description,
+           slug: file.slug,
+           source_reference: file.sourceReference,
+         })),
          output_dir: input.outputDir,
-         description: input.description,
-         slug: input.slug,
-         source_reference: input.sourceReference,
        }));
        await dntShim.Deno.writeTextFile(scriptPath, knowledgeIngestPythonSource);
        let result;
@@ -382,7 +403,8 @@ export async function runKnowledgeParser(input) {
        throw new Error(stderr || "parser exited unsuccessfully");
      }
      const raw = await dntShim.Deno.readTextFile(outputJsonPath);
-     return JSON.parse(raw);
+     const parsed = JSON.parse(raw);
+     return Array.isArray(parsed) ? parsed : [parsed];
    }
    catch (error) {
      if (error instanceof Error && error.message.startsWith("knowledge ingest parser failed")) {
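
The wrapper's input.json contract changes accordingly: instead of one file per invocation it now writes a files array, and it reads back either a single result object (legacy) or an array. A sketch of the new payload as a TypeScript literal, with hypothetical values:

    const batchPayload = {
      files: [
        {
          file_path: "/workspace/uploads/report.pdf", // hypothetical path
          description: "Quarterly report",
          slug: "report",
          source_reference: "uploads/report.pdf",
        },
      ],
      output_dir: "/workspace/knowledge",
    };

The Array.isArray normalization above keeps the old single-object output working while the script migrates to the batch shape.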
@@ -1 +1 @@
- {"version":3,"file":"parser-source.d.ts","sourceRoot":"","sources":["../../../../src/cli/commands/knowledge/parser-source.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,2BAA2B,QA2jBvC,CAAC"}
+ {"version":3,"file":"parser-source.d.ts","sourceRoot":"","sources":["../../../../src/cli/commands/knowledge/parser-source.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,2BAA2B,QAgoBvC,CAAC"}
@@ -2,8 +2,10 @@ export const knowledgeIngestPythonSource = String.raw `#!/usr/bin/env python3
  import argparse
  import csv
  import json
+ import os
  import re
  import subprocess
+ import tempfile
  from datetime import date
  from pathlib import Path
  from typing import Any, Optional
@@ -14,6 +16,26 @@ def yaml_quote(value: Any) -> str:


  CODE_FENCE = chr(96) * 3
+ DEFAULT_DOCLING_TIMEOUT_SECONDS = 900.0
+
+
+ def read_timeout_seconds(env_name: str, default_seconds: float) -> float:
+     raw_value = os.environ.get(env_name)
+     if raw_value is None or raw_value.strip() == "":
+         return default_seconds
+
+     try:
+         timeout_seconds = float(raw_value)
+     except ValueError:
+         return default_seconds
+
+     return timeout_seconds if timeout_seconds > 0 else default_seconds
+
+
+ DOCLING_TIMEOUT_SECONDS = read_timeout_seconds(
+     "VERYFRONT_KNOWLEDGE_DOCLING_TIMEOUT_SECONDS",
+     DEFAULT_DOCLING_TIMEOUT_SECONDS,
+ )
  TEXT_FILE_EXTENSIONS = {
      ".c",
      ".cc",
@@ -119,100 +141,68 @@ def build_frontmatter(source: str, source_type: str, description: str) -> str:
      ])


- def metadata_int(metadata: dict[str, Any], *keys: str) -> Optional[int]:
-     for key in keys:
-         value = metadata.get(key)
-         if isinstance(value, int) and not isinstance(value, bool):
-             return value
-     return None
-
-
- def metadata_string_list(metadata: dict[str, Any], *keys: str) -> Optional[list[str]]:
-     for key in keys:
-         value = metadata.get(key)
-         if isinstance(value, list) and all(isinstance(item, str) for item in value):
-             return value
-     return None
-
-
- def build_kreuzberg_stats(source_type: str, content: str, metadata: dict[str, Any]):
-     stats: dict[str, Any] = {
+ def build_docling_stats(content: str):
+     return {
          "characters": len(content),
          "lines": len(content.splitlines()) if content else 0,
-         "engine": "kreuzberg",
+         "engine": "docling",
      }

-     if isinstance(metadata.get("mime_type"), str):
-         stats["mime_type"] = metadata["mime_type"]

-     if source_type == "pdf":
-         stats["pages"] = metadata_int(metadata, "page_count") or 0
-         stats["tables"] = metadata_int(metadata, "table_count") or 0
-     elif source_type in {"xlsx", "xls"}:
-         stats["sheets"] = metadata_int(metadata, "sheet_count") or 0
-         stats["rows"] = metadata_int(metadata, "row_count") or 0
-         stats["sheet_names"] = metadata_string_list(metadata, "sheet_names") or []
-     elif source_type == "docx":
-         stats["paragraphs"] = metadata_int(metadata, "paragraph_count") or 0
-         stats["tables"] = metadata_int(metadata, "table_count") or 0
-     elif source_type == "pptx":
-         stats["slides"] = metadata_int(metadata, "slide_count", "page_count") or 0
-         stats["tables"] = metadata_int(metadata, "table_count") or 0
-     elif source_type == "html":
-         stats["tables"] = metadata_int(metadata, "table_count") or 0
-
-     return stats
-
-
- def parse_with_kreuzberg(path: str, source_type: str):
-     warnings: list[str] = []
-     completed = subprocess.run(
-         [
-             "kreuzberg",
-             "extract",
-             path,
-             "--format",
-             "json",
-             "--output-format",
-             "markdown",
-         ],
-         capture_output=True,
-         text=True,
-         check=False,
-     )
-
-     if completed.returncode != 0:
-         detail = completed.stderr.strip() or completed.stdout.strip() or f"exit code {completed.returncode}"
-         raise RuntimeError(f"kreuzberg extract failed: {detail}")
+ def run_docling_markdown(path: str):
+     with tempfile.TemporaryDirectory(prefix="veryfront-docling-") as output_dir:
+         try:
+             completed = subprocess.run(
+                 [
+                     "docling",
+                     path,
+                     "--to",
+                     "md",
+                     "--image-export-mode",
+                     "placeholder",
+                     "--output",
+                     output_dir,
+                 ],
+                 capture_output=True,
+                 text=True,
+                 check=False,
+                 timeout=DOCLING_TIMEOUT_SECONDS,
+             )
+         except subprocess.TimeoutExpired as error:
+             raise RuntimeError(
+                 f"docling conversion timed out after {DOCLING_TIMEOUT_SECONDS:g}s"
+             ) from error

-     try:
-         payload = json.loads(completed.stdout)
-     except json.JSONDecodeError as error:
-         raise RuntimeError(f"kreuzberg extract returned invalid JSON: {error}") from error
+         if completed.returncode != 0:
+             detail = completed.stderr.strip() or completed.stdout.strip() or f"exit code {completed.returncode}"
+             raise RuntimeError(f"docling conversion failed: {detail}")

-     content = payload.get("content", "")
-     if not isinstance(content, str):
-         raise RuntimeError("kreuzberg extract did not return string content")
+         markdown_files = sorted(Path(output_dir).rglob("*.md"))
+         if not markdown_files:
+             raise RuntimeError("docling conversion did not produce a markdown file")

-     metadata = payload.get("metadata") if isinstance(payload.get("metadata"), dict) else {}
-     normalized_content = clean_text(content)
-     stats = build_kreuzberg_stats(source_type, normalized_content, metadata)
+         return markdown_files[0].read_text(encoding="utf-8")

+
+ def parse_with_docling(path: str):
+     warnings: list[str] = []
+     normalized_content = clean_text(run_docling_markdown(path))
+     stats = build_docling_stats(normalized_content)
      return normalized_content or "_No extractable text found in document._", stats, warnings


- def prefer_kreuzberg(source_type: str, fallback_parser):
+ def prefer_docling(fallback_parser):
      def parser(path: str):
          try:
-             return parse_with_kreuzberg(path, source_type)
+             return parse_with_docling(path)
          except FileNotFoundError as error:
-             if getattr(error, "filename", "") == "kreuzberg":
+             if getattr(error, "filename", "") == "docling":
                  return fallback_parser(path)
              raise
          except RuntimeError as error:
              content, stats, warnings = fallback_parser(path)
              warnings.append(
-                 "kreuzberg extraction failed; fell back to the built-in parser: "
+                 "docling conversion failed; fell back to the built-in parser: "
                  + str(error)
              )
              return content, stats, warnings
@@ -220,6 +210,10 @@ def prefer_kreuzberg(source_type: str, fallback_parser):
      return parser


+ def build_parser(fallback_parser, prefers_docling: bool):
+     return prefer_docling(fallback_parser) if prefers_docling else fallback_parser
+
+
  def parse_csv_like(path: str, delimiter: str = ","):
      warnings: list[str] = []
      with open(path, newline="", encoding="utf-8-sig") as file:
@@ -451,36 +445,41 @@ def parse_json(path: str):
      return f"{CODE_FENCE}json\n{rendered}\n{CODE_FENCE}", stats, warnings


- def select_parser(path: Path):
+ def select_parser_definition(path: Path):
      ext = path.suffix.lower()
      name = path.name.lower()
      if ext == ".pdf":
-         return "pdf", prefer_kreuzberg("pdf", parse_pdf)
+         return "pdf", parse_pdf, True
      if ext in {".csv", ".tsv"}:
          delimiter = "\t" if ext == ".tsv" else ","
-         return ext.lstrip("."), lambda file_path: parse_csv_like(file_path, delimiter)
+         return ext.lstrip("."), lambda file_path: parse_csv_like(file_path, delimiter), False
      if ext in {".xlsx", ".xls"}:
          source_type = ext.lstrip(".")
-         return source_type, prefer_kreuzberg(source_type, parse_excel)
+         return source_type, parse_excel, True
      if ext == ".docx":
-         return "docx", prefer_kreuzberg("docx", parse_docx)
+         return "docx", parse_docx, True
      if ext == ".pptx":
-         return "pptx", prefer_kreuzberg("pptx", parse_pptx)
+         return "pptx", parse_pptx, True
      if ext in {".html", ".htm"}:
-         return "html", prefer_kreuzberg("html", parse_html)
+         return "html", parse_html, True
      if ext in {".txt", ".md", ".mdx"}:
-         return ext.lstrip("."), parse_text
+         return ext.lstrip("."), parse_text, False
      if ext == ".json":
-         return "json", parse_json
+         return "json", parse_json, False
      if ext in TEXT_FILE_EXTENSIONS:
-         return ext.lstrip("."), parse_text
+         return ext.lstrip("."), parse_text, False
      if not ext and name in TEXT_FILE_NAMES:
-         return "text", parse_text
+         return "text", parse_text, False
      raise ValueError(f"Unsupported file type: {ext}")


+ def select_parser(path: Path):
+     source_type, fallback_parser, prefers_docling = select_parser_definition(path)
+     return source_type, build_parser(fallback_parser, prefers_docling)
+
+
  def build_summary(source_type: str, stats: dict[str, Any]) -> str:
-     if stats.get("engine") == "kreuzberg":
+     if stats.get("engine") == "docling":
          return f"Converted {source_type.upper()} to markdown ({stats.get('characters', 0)} chars)."
      if source_type in {"csv", "tsv"}:
          return f"Parsed {stats.get('rows', 0)} rows across {stats.get('columns', 0)} columns."
@@ -540,6 +539,69 @@ def ingest_document_to_knowledge(file_path: str, output_dir: Optional[str] = Non
      }


+ def ingest_documents_to_knowledge(documents: list[dict[str, Any]], output_dir: Optional[str] = None):
+     output_root = Path(output_dir or "/workspace/knowledge")
+     output_root.mkdir(parents=True, exist_ok=True)
+
+     prepared_documents: list[dict[str, Any]] = []
+     for index, document in enumerate(documents):
+         file_path = document["file_path"]
+         path = Path(file_path)
+         if not path.exists():
+             raise FileNotFoundError(f"File not found: {file_path}")
+
+         slug = document.get("slug") or slugify(path.stem)
+         source_type, fallback_parser, prefers_docling = select_parser_definition(path)
+         prepared_documents.append({
+             "index": index,
+             "path": path,
+             "slug": slug,
+             "description": document.get("description"),
+             "source_reference": document.get("source_reference"),
+             "source_type": source_type,
+             "fallback_parser": fallback_parser,
+             "prefers_docling": prefers_docling,
+         })
+
+     results = []
+     for document in prepared_documents:
+         parser = build_parser(
+             document["fallback_parser"],
+             document["prefers_docling"],
+         )
+         content, stats, warnings = parser(str(document["path"]))
+
+         content = clean_text(content)
+         resolved_description = document["description"] or f"Parsed from {document['path'].name}"
+         title = titleize_filename(document["path"])
+         frontmatter = build_frontmatter(
+             document["source_reference"] or document["path"].name,
+             document["source_type"],
+             resolved_description,
+         )
+         markdown = f"{frontmatter}\n\n# {title}\n\n{content}\n"
+
+         output_path = output_root / f"{document['slug']}.md"
+         output_path.write_text(markdown, encoding="utf-8")
+
+         results.append({
+             "success": True,
+             "source_path": str(document["path"]),
+             "source_filename": document["path"].name,
+             "source_type": document["source_type"],
+             "slug": document["slug"],
+             "sandbox_output_path": str(output_path),
+             "suggested_project_path": f"knowledge/{document['slug']}.md",
+             "description": resolved_description,
+             "title": title,
+             "summary": build_summary(document["source_type"], stats),
+             "stats": stats,
+             "warnings": warnings,
+         })
+
+     return results
+
+
  def main():
      parser = argparse.ArgumentParser(description="Convert a local document into knowledge-base markdown")
      parser.add_argument("--input-json", required=True)
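
Each record that ingest_documents_to_knowledge returns maps onto the KnowledgeParserResult consumed on the TypeScript side. A sketch of the shape, with field meanings read off the hunk above (the type name is illustrative):

    type KnowledgeIngestRecord = {
      success: true;
      source_path: string;            // str(document["path"])
      source_filename: string;        // document["path"].name
      source_type: string;            // e.g. "pdf", "docx", "html"
      slug: string;                   // explicit slug, else slugify(path.stem)
      sandbox_output_path: string;    // <output_dir>/<slug>.md
      suggested_project_path: string; // knowledge/<slug>.md
      description: string;            // explicit, else "Parsed from <name>"
      title: string;
      summary: string;
      stats: Record<string, unknown>;
      warnings: string[];
    };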
@@ -548,20 +610,27 @@ def main():

      try:
          payload = json.loads(Path(args.input_json).read_text(encoding="utf-8"))
-         result = ingest_document_to_knowledge(
-             file_path=payload["file_path"],
-             output_dir=payload.get("output_dir"),
-             description=payload.get("description"),
-             slug=payload.get("slug"),
-             source_reference=payload.get("source_reference"),
-         )
+         files_payload = payload.get("files")
+         if isinstance(files_payload, list):
+             result = ingest_documents_to_knowledge(
+                 documents=files_payload,
+                 output_dir=payload.get("output_dir"),
+             )
+         else:
+             result = ingest_document_to_knowledge(
+                 file_path=payload["file_path"],
+                 output_dir=payload.get("output_dir"),
+                 description=payload.get("description"),
+                 slug=payload.get("slug"),
+                 source_reference=payload.get("source_reference"),
+             )
      except ModuleNotFoundError as error:
          missing_package = error.name or "required package"
          raise SystemExit(
              "Missing Python package '"
              + missing_package
              + "'. Install knowledge parser dependencies with: "
-             + "pip install pandas openpyxl xlrd pdfplumber python-docx python-pptx beautifulsoup4 lxml"
+             + "pip install docling pandas openpyxl xlrd pdfplumber python-docx python-pptx beautifulsoup4 lxml"
          )

      Path(args.output_json).write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")
package/esm/deno.js CHANGED
@@ -1,6 +1,6 @@
  export default {
    "name": "veryfront",
-   "version": "0.1.99",
+   "version": "0.1.100",
    "license": "Apache-2.0",
    "nodeModulesDir": "auto",
    "exclude": [
@@ -237,7 +237,7 @@ export default {
    "rehype-stringify": "npm:rehype-stringify@10.0.1",
    "esbuild": "npm:esbuild@0.27.4",
    "esbuild/mod.js": "npm:esbuild@0.27.4",
-   "es-module-lexer": "npm:es-module-lexer@1.5.0",
+   "es-module-lexer": "npm:es-module-lexer@2.0.0",
    "gray-matter": "npm:gray-matter@4.0.3",
    "zod": "npm:zod@3.25.76",
    "mime-types": "npm:mime-types@2.1.35",
@@ -1,4 +1,4 @@
- export declare const VERSION = "0.1.99";
+ export declare const VERSION = "0.1.100";
  export declare function normalizeVeryfrontVersion(version: string | undefined): string | undefined;
  export declare function resolveRuntimeVersion(options?: {
    veryfrontVersion?: string;
@@ -1 +1 @@
- {"version":3,"file":"version.d.ts","sourceRoot":"","sources":["../../../src/src/utils/version.ts"],"names":[],"mappings":"AAKA,eAAO,MAAM,OAAO,WAAW,CAAC;AAEhC,wBAAgB,yBAAyB,CAAC,OAAO,EAAE,MAAM,GAAG,SAAS,GAAG,MAAM,GAAG,SAAS,CAGzF;AAUD,wBAAgB,qBAAqB,CAAC,OAAO,GAAE;IAC7C,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,eAAe,CAAC,EAAE,MAAM,CAAC;CACrB,GAAG,MAAM,CAKd;AAED,eAAO,MAAM,eAAe,QAK1B,CAAC;AAEH,eAAO,MAAM,iBAAiB,EAAE,MAAmB,CAAC;AAEpD,MAAM,WAAW,YAAY;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,wBAAgB,kBAAkB,CAAC,gBAAgB,CAAC,EAAE,MAAM,GAAG,YAAY,CAM1E"}
+ {"version":3,"file":"version.d.ts","sourceRoot":"","sources":["../../../src/src/utils/version.ts"],"names":[],"mappings":"AAKA,eAAO,MAAM,OAAO,YAAY,CAAC;AAEjC,wBAAgB,yBAAyB,CAAC,OAAO,EAAE,MAAM,GAAG,SAAS,GAAG,MAAM,GAAG,SAAS,CAGzF;AAUD,wBAAgB,qBAAqB,CAAC,OAAO,GAAE;IAC7C,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,eAAe,CAAC,EAAE,MAAM,CAAC;CACrB,GAAG,MAAM,CAKd;AAED,eAAO,MAAM,eAAe,QAK1B,CAAC;AAEH,eAAO,MAAM,iBAAiB,EAAE,MAAmB,CAAC;AAEpD,MAAM,WAAW,YAAY;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,wBAAgB,kBAAkB,CAAC,gBAAgB,CAAC,EAAE,MAAM,GAAG,YAAY,CAM1E"}
@@ -2,7 +2,7 @@ import denoConfig from "../../deno.js";
  import { getEnv } from "../platform/compat/process.js";
  // Keep in sync with deno.json version.
  // scripts/release.ts updates this constant during releases.
- export const VERSION = "0.1.99";
+ export const VERSION = "0.1.100";
  export function normalizeVeryfrontVersion(version) {
    if (!version)
      return undefined;
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "veryfront",
-   "version": "0.1.99",
+   "version": "0.1.100",
    "description": "The simplest way to build AI-powered apps",
    "keywords": [
      "react",
@@ -148,7 +148,7 @@
    "ai": "6.0.134",
    "class-variance-authority": "0.7.1",
    "clsx": "2.1.1",
-   "es-module-lexer": "1.5.0",
+   "es-module-lexer": "2.0.0",
    "esbuild": "0.27.4",
    "github-slugger": "2.0.0",
    "gray-matter": "4.0.3",
@@ -36,7 +36,8 @@ export const knowledgeHelp: CommandHelp = {
      "Primary subcommand: ingest",
      "`uploads/...` means a remote project upload; use `./uploads/...` or `/workspace/uploads/...` to force a local file",
      "`ingest` orchestrates upload resolution, parsing, and project file writes",
-     "Requires python3; non-text formats also require the supported parser packages unless you run inside the Veryfront sandbox",
-     "The Veryfront sandbox image includes `kreuzberg`, and knowledge ingest falls back to the built-in parser when `kreuzberg` is unavailable or extraction fails",
+     "Requires python3; install `docling` locally to match the sandbox parsing path for PDF, Office, and HTML sources",
+     "Supported PDF, Office, and HTML sources are parsed through `docling` when it is available",
+     "The Veryfront sandbox image includes `docling`, and knowledge ingest falls back to the built-in parser when `docling` is unavailable or extraction fails",
    ],
  };
@@ -35,6 +35,12 @@ export interface KnowledgeParserResult {
    warnings: string[];
  }

+ export interface KnowledgeParserInput {
+   filePath: string;
+   description?: string;
+   slug?: string;
+   sourceReference?: string;
+ }
  type KnowledgeSource =
    | { kind: "local"; input: string; localPath: string }
    | { kind: "upload"; input: string; uploadPath: string; localPath: string };
@@ -465,6 +471,33 @@ export async function runKnowledgeParser(input: {
    sourceReference?: string;
    env?: Record<string, string>;
  }): Promise<KnowledgeParserResult> {
+   const [result] = await runKnowledgeParsers({
+     files: [{
+       filePath: input.filePath,
+       description: input.description,
+       slug: input.slug,
+       sourceReference: input.sourceReference,
+     }],
+     outputDir: input.outputDir,
+     env: input.env,
+   });
+
+   if (!result) {
+     throw new Error("knowledge ingest parser returned no results");
+   }
+
+   return result;
+ }
+
+ export async function runKnowledgeParsers(input: {
+   files: KnowledgeParserInput[];
+   outputDir: string;
+   env?: Record<string, string>;
+ }): Promise<KnowledgeParserResult[]> {
+   if (!input.files.length) {
+     return [];
+   }
+
    const tempDir = await dntShim.Deno.makeTempDir({ prefix: "veryfront-knowledge-parser-" });
    const inputJsonPath = `${tempDir}/input.json`;
    const outputJsonPath = `${tempDir}/output.json`;
@@ -475,11 +508,13 @@ export async function runKnowledgeParser(input: {
        await dntShim.Deno.writeTextFile(
          inputJsonPath,
          JSON.stringify({
-           file_path: input.filePath,
+           files: input.files.map((file) => ({
+             file_path: file.filePath,
+             description: file.description,
+             slug: file.slug,
+             source_reference: file.sourceReference,
+           })),
            output_dir: input.outputDir,
-           description: input.description,
-           slug: input.slug,
-           source_reference: input.sourceReference,
          }),
        );
        await dntShim.Deno.writeTextFile(scriptPath, knowledgeIngestPythonSource);
@@ -507,7 +542,8 @@ export async function runKnowledgeParser(input: {
      }

      const raw = await dntShim.Deno.readTextFile(outputJsonPath);
-     return JSON.parse(raw) as KnowledgeParserResult;
+     const parsed = JSON.parse(raw) as KnowledgeParserResult | KnowledgeParserResult[];
+     return Array.isArray(parsed) ? parsed : [parsed];
    } catch (error) {
      if (error instanceof Error && error.message.startsWith("knowledge ingest parser failed")) {
        throw error;
@@ -2,8 +2,10 @@ export const knowledgeIngestPythonSource = String.raw`#!/usr/bin/env python3
  import argparse
  import csv
  import json
+ import os
  import re
  import subprocess
+ import tempfile
  from datetime import date
  from pathlib import Path
  from typing import Any, Optional
@@ -14,6 +16,26 @@ def yaml_quote(value: Any) -> str:


  CODE_FENCE = chr(96) * 3
+ DEFAULT_DOCLING_TIMEOUT_SECONDS = 900.0
+
+
+ def read_timeout_seconds(env_name: str, default_seconds: float) -> float:
+     raw_value = os.environ.get(env_name)
+     if raw_value is None or raw_value.strip() == "":
+         return default_seconds
+
+     try:
+         timeout_seconds = float(raw_value)
+     except ValueError:
+         return default_seconds
+
+     return timeout_seconds if timeout_seconds > 0 else default_seconds
+
+
+ DOCLING_TIMEOUT_SECONDS = read_timeout_seconds(
+     "VERYFRONT_KNOWLEDGE_DOCLING_TIMEOUT_SECONDS",
+     DEFAULT_DOCLING_TIMEOUT_SECONDS,
+ )
  TEXT_FILE_EXTENSIONS = {
      ".c",
      ".cc",
@@ -119,100 +141,68 @@ def build_frontmatter(source: str, source_type: str, description: str) -> str:
      ])


- def metadata_int(metadata: dict[str, Any], *keys: str) -> Optional[int]:
-     for key in keys:
-         value = metadata.get(key)
-         if isinstance(value, int) and not isinstance(value, bool):
-             return value
-     return None
-
-
- def metadata_string_list(metadata: dict[str, Any], *keys: str) -> Optional[list[str]]:
-     for key in keys:
-         value = metadata.get(key)
-         if isinstance(value, list) and all(isinstance(item, str) for item in value):
-             return value
-     return None
-
-
- def build_kreuzberg_stats(source_type: str, content: str, metadata: dict[str, Any]):
-     stats: dict[str, Any] = {
+ def build_docling_stats(content: str):
+     return {
          "characters": len(content),
          "lines": len(content.splitlines()) if content else 0,
-         "engine": "kreuzberg",
+         "engine": "docling",
      }

-     if isinstance(metadata.get("mime_type"), str):
-         stats["mime_type"] = metadata["mime_type"]

-     if source_type == "pdf":
-         stats["pages"] = metadata_int(metadata, "page_count") or 0
-         stats["tables"] = metadata_int(metadata, "table_count") or 0
-     elif source_type in {"xlsx", "xls"}:
-         stats["sheets"] = metadata_int(metadata, "sheet_count") or 0
-         stats["rows"] = metadata_int(metadata, "row_count") or 0
-         stats["sheet_names"] = metadata_string_list(metadata, "sheet_names") or []
-     elif source_type == "docx":
-         stats["paragraphs"] = metadata_int(metadata, "paragraph_count") or 0
-         stats["tables"] = metadata_int(metadata, "table_count") or 0
-     elif source_type == "pptx":
-         stats["slides"] = metadata_int(metadata, "slide_count", "page_count") or 0
-         stats["tables"] = metadata_int(metadata, "table_count") or 0
-     elif source_type == "html":
-         stats["tables"] = metadata_int(metadata, "table_count") or 0
-
-     return stats
-
-
- def parse_with_kreuzberg(path: str, source_type: str):
-     warnings: list[str] = []
-     completed = subprocess.run(
-         [
-             "kreuzberg",
-             "extract",
-             path,
-             "--format",
-             "json",
-             "--output-format",
-             "markdown",
-         ],
-         capture_output=True,
-         text=True,
-         check=False,
-     )
-
-     if completed.returncode != 0:
-         detail = completed.stderr.strip() or completed.stdout.strip() or f"exit code {completed.returncode}"
-         raise RuntimeError(f"kreuzberg extract failed: {detail}")
+ def run_docling_markdown(path: str):
+     with tempfile.TemporaryDirectory(prefix="veryfront-docling-") as output_dir:
+         try:
+             completed = subprocess.run(
+                 [
+                     "docling",
+                     path,
+                     "--to",
+                     "md",
+                     "--image-export-mode",
+                     "placeholder",
+                     "--output",
+                     output_dir,
+                 ],
+                 capture_output=True,
+                 text=True,
+                 check=False,
+                 timeout=DOCLING_TIMEOUT_SECONDS,
+             )
+         except subprocess.TimeoutExpired as error:
+             raise RuntimeError(
+                 f"docling conversion timed out after {DOCLING_TIMEOUT_SECONDS:g}s"
+             ) from error

-     try:
-         payload = json.loads(completed.stdout)
-     except json.JSONDecodeError as error:
-         raise RuntimeError(f"kreuzberg extract returned invalid JSON: {error}") from error
+         if completed.returncode != 0:
+             detail = completed.stderr.strip() or completed.stdout.strip() or f"exit code {completed.returncode}"
+             raise RuntimeError(f"docling conversion failed: {detail}")

-     content = payload.get("content", "")
-     if not isinstance(content, str):
-         raise RuntimeError("kreuzberg extract did not return string content")
+         markdown_files = sorted(Path(output_dir).rglob("*.md"))
+         if not markdown_files:
+             raise RuntimeError("docling conversion did not produce a markdown file")

-     metadata = payload.get("metadata") if isinstance(payload.get("metadata"), dict) else {}
-     normalized_content = clean_text(content)
-     stats = build_kreuzberg_stats(source_type, normalized_content, metadata)
+         return markdown_files[0].read_text(encoding="utf-8")

+
+ def parse_with_docling(path: str):
+     warnings: list[str] = []
+     normalized_content = clean_text(run_docling_markdown(path))
+     stats = build_docling_stats(normalized_content)
      return normalized_content or "_No extractable text found in document._", stats, warnings


- def prefer_kreuzberg(source_type: str, fallback_parser):
+ def prefer_docling(fallback_parser):
      def parser(path: str):
          try:
-             return parse_with_kreuzberg(path, source_type)
+             return parse_with_docling(path)
          except FileNotFoundError as error:
-             if getattr(error, "filename", "") == "kreuzberg":
+             if getattr(error, "filename", "") == "docling":
                  return fallback_parser(path)
              raise
          except RuntimeError as error:
              content, stats, warnings = fallback_parser(path)
              warnings.append(
-                 "kreuzberg extraction failed; fell back to the built-in parser: "
+                 "docling conversion failed; fell back to the built-in parser: "
                  + str(error)
              )
              return content, stats, warnings
@@ -220,6 +210,10 @@ def prefer_kreuzberg(source_type: str, fallback_parser):
      return parser


+ def build_parser(fallback_parser, prefers_docling: bool):
+     return prefer_docling(fallback_parser) if prefers_docling else fallback_parser
+
+
  def parse_csv_like(path: str, delimiter: str = ","):
      warnings: list[str] = []
      with open(path, newline="", encoding="utf-8-sig") as file:
@@ -451,36 +445,41 @@ def parse_json(path: str):
      return f"{CODE_FENCE}json\n{rendered}\n{CODE_FENCE}", stats, warnings


- def select_parser(path: Path):
+ def select_parser_definition(path: Path):
      ext = path.suffix.lower()
      name = path.name.lower()
      if ext == ".pdf":
-         return "pdf", prefer_kreuzberg("pdf", parse_pdf)
+         return "pdf", parse_pdf, True
      if ext in {".csv", ".tsv"}:
          delimiter = "\t" if ext == ".tsv" else ","
-         return ext.lstrip("."), lambda file_path: parse_csv_like(file_path, delimiter)
+         return ext.lstrip("."), lambda file_path: parse_csv_like(file_path, delimiter), False
      if ext in {".xlsx", ".xls"}:
          source_type = ext.lstrip(".")
-         return source_type, prefer_kreuzberg(source_type, parse_excel)
+         return source_type, parse_excel, True
      if ext == ".docx":
-         return "docx", prefer_kreuzberg("docx", parse_docx)
+         return "docx", parse_docx, True
      if ext == ".pptx":
-         return "pptx", prefer_kreuzberg("pptx", parse_pptx)
+         return "pptx", parse_pptx, True
      if ext in {".html", ".htm"}:
-         return "html", prefer_kreuzberg("html", parse_html)
+         return "html", parse_html, True
      if ext in {".txt", ".md", ".mdx"}:
-         return ext.lstrip("."), parse_text
+         return ext.lstrip("."), parse_text, False
      if ext == ".json":
-         return "json", parse_json
+         return "json", parse_json, False
      if ext in TEXT_FILE_EXTENSIONS:
-         return ext.lstrip("."), parse_text
+         return ext.lstrip("."), parse_text, False
      if not ext and name in TEXT_FILE_NAMES:
-         return "text", parse_text
+         return "text", parse_text, False
      raise ValueError(f"Unsupported file type: {ext}")


+ def select_parser(path: Path):
+     source_type, fallback_parser, prefers_docling = select_parser_definition(path)
+     return source_type, build_parser(fallback_parser, prefers_docling)
+
+
  def build_summary(source_type: str, stats: dict[str, Any]) -> str:
-     if stats.get("engine") == "kreuzberg":
+     if stats.get("engine") == "docling":
          return f"Converted {source_type.upper()} to markdown ({stats.get('characters', 0)} chars)."
      if source_type in {"csv", "tsv"}:
          return f"Parsed {stats.get('rows', 0)} rows across {stats.get('columns', 0)} columns."
@@ -540,6 +539,69 @@ def ingest_document_to_knowledge(file_path: str, output_dir: Optional[str] = Non
      }


+ def ingest_documents_to_knowledge(documents: list[dict[str, Any]], output_dir: Optional[str] = None):
+     output_root = Path(output_dir or "/workspace/knowledge")
+     output_root.mkdir(parents=True, exist_ok=True)
+
+     prepared_documents: list[dict[str, Any]] = []
+     for index, document in enumerate(documents):
+         file_path = document["file_path"]
+         path = Path(file_path)
+         if not path.exists():
+             raise FileNotFoundError(f"File not found: {file_path}")
+
+         slug = document.get("slug") or slugify(path.stem)
+         source_type, fallback_parser, prefers_docling = select_parser_definition(path)
+         prepared_documents.append({
+             "index": index,
+             "path": path,
+             "slug": slug,
+             "description": document.get("description"),
+             "source_reference": document.get("source_reference"),
+             "source_type": source_type,
+             "fallback_parser": fallback_parser,
+             "prefers_docling": prefers_docling,
+         })
+
+     results = []
+     for document in prepared_documents:
+         parser = build_parser(
+             document["fallback_parser"],
+             document["prefers_docling"],
+         )
+         content, stats, warnings = parser(str(document["path"]))
+
+         content = clean_text(content)
+         resolved_description = document["description"] or f"Parsed from {document['path'].name}"
+         title = titleize_filename(document["path"])
+         frontmatter = build_frontmatter(
+             document["source_reference"] or document["path"].name,
+             document["source_type"],
+             resolved_description,
+         )
+         markdown = f"{frontmatter}\n\n# {title}\n\n{content}\n"
+
+         output_path = output_root / f"{document['slug']}.md"
+         output_path.write_text(markdown, encoding="utf-8")
+
+         results.append({
+             "success": True,
+             "source_path": str(document["path"]),
+             "source_filename": document["path"].name,
+             "source_type": document["source_type"],
+             "slug": document["slug"],
+             "sandbox_output_path": str(output_path),
+             "suggested_project_path": f"knowledge/{document['slug']}.md",
+             "description": resolved_description,
+             "title": title,
+             "summary": build_summary(document["source_type"], stats),
+             "stats": stats,
+             "warnings": warnings,
+         })
+
+     return results
+
+
  def main():
      parser = argparse.ArgumentParser(description="Convert a local document into knowledge-base markdown")
      parser.add_argument("--input-json", required=True)
@@ -548,20 +610,27 @@ def main():

      try:
          payload = json.loads(Path(args.input_json).read_text(encoding="utf-8"))
-         result = ingest_document_to_knowledge(
-             file_path=payload["file_path"],
-             output_dir=payload.get("output_dir"),
-             description=payload.get("description"),
-             slug=payload.get("slug"),
-             source_reference=payload.get("source_reference"),
-         )
+         files_payload = payload.get("files")
+         if isinstance(files_payload, list):
+             result = ingest_documents_to_knowledge(
+                 documents=files_payload,
+                 output_dir=payload.get("output_dir"),
+             )
+         else:
+             result = ingest_document_to_knowledge(
+                 file_path=payload["file_path"],
+                 output_dir=payload.get("output_dir"),
+                 description=payload.get("description"),
+                 slug=payload.get("slug"),
+                 source_reference=payload.get("source_reference"),
+             )
      except ModuleNotFoundError as error:
          missing_package = error.name or "required package"
          raise SystemExit(
              "Missing Python package '"
              + missing_package
              + "'. Install knowledge parser dependencies with: "
-             + "pip install pandas openpyxl xlrd pdfplumber python-docx python-pptx beautifulsoup4 lxml"
+             + "pip install docling pandas openpyxl xlrd pdfplumber python-docx python-pptx beautifulsoup4 lxml"
          )

      Path(args.output_json).write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")
package/src/deno.js CHANGED
@@ -1,6 +1,6 @@
  export default {
    "name": "veryfront",
-   "version": "0.1.99",
+   "version": "0.1.100",
    "license": "Apache-2.0",
    "nodeModulesDir": "auto",
    "exclude": [
@@ -237,7 +237,7 @@ export default {
    "rehype-stringify": "npm:rehype-stringify@10.0.1",
    "esbuild": "npm:esbuild@0.27.4",
    "esbuild/mod.js": "npm:esbuild@0.27.4",
-   "es-module-lexer": "npm:es-module-lexer@1.5.0",
+   "es-module-lexer": "npm:es-module-lexer@2.0.0",
    "gray-matter": "npm:gray-matter@4.0.3",
    "zod": "npm:zod@3.25.76",
    "mime-types": "npm:mime-types@2.1.35",
@@ -60,7 +60,7 @@ export type ImportSpecifier = {
    ss: number; // Start of import statement
    se: number; // End of import statement
    d: number; // > -1 if dynamic import
-   a: number; // assert index
+   a: number; // import attribute index
  };

  function logParseError(error: unknown, code: string): void {
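
The comment change tracks es-module-lexer 2.0.0, where import assertions were superseded by import attributes. A small sketch of what the `a` index refers to, assuming es-module-lexer's standard init/parse API:

    import { init, parse } from "es-module-lexer";

    await init;
    const [imports] = parse('import data from "./data.json" with { type: "json" };');
    // `a` is the start index of the `with { ... }` clause (formerly the
    // assert clause); it is -1 when the import carries no attributes.
    console.log(imports[0].a);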
@@ -3,7 +3,7 @@ import { getEnv } from "../platform/compat/process.js";

  // Keep in sync with deno.json version.
  // scripts/release.ts updates this constant during releases.
- export const VERSION = "0.1.99";
+ export const VERSION = "0.1.100";

  export function normalizeVeryfrontVersion(version: string | undefined): string | undefined {
    if (!version) return undefined;