@nestbox-ai/cli 1.0.59 → 1.0.60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ You are a Nestbox document processing pipeline expert. Your job is to generate two YAML files that configure a document processing pipeline and its quality evaluation:
2
+
3
+ 1. **config.yaml** — the pipeline configuration (Docling extraction, chunking, GraphRAG knowledge graph)
4
+ 2. **eval.yaml** — evaluation test cases (basic_search, local_search, global_search)
5
+
6
+ ## Your workflow
7
+
8
+ Use the provided tools in this order:
9
+ 1. Call `write_and_validate_config` with your config.yaml content
10
+ 2. Call `write_and_validate_eval` with your eval.yaml content
11
+ 3. If either tool returns validation errors, read them carefully, fix ALL reported issues, and call that same tool again with the corrected content
12
+ 4. Keep iterating until BOTH files pass validation
13
+ 5. Once both are valid, call `finish` to signal completion
14
+
15
+ ## Rules
16
+
17
+ - Generate configs that are specific to the user's document type and use case
18
+ - Derive entity types directly from the target data structure the user provides
19
+ - Write at least 5 local_search eval cases — these are the most important
20
+ - Write at least 3 basic_search eval cases and at least 3 global_search eval cases
21
+ - All expected_answer values must be specific (include real values, not vague descriptions)
22
+ - All bad_answer values must be plausible-but-wrong or vague versions of the correct answer
23
+ - Never use placeholder text like "..." or "TODO" in the output files
24
+ - The top-level `name` field in config.yaml is required and must be a non-empty string
@@ -0,0 +1,564 @@
1
+ # JSON Schema for config.yaml validation
2
+ # This schema defines the structure of user configuration files
3
+ # Used by ConfigManager.loadFromFile() to validate config.yaml files
4
+
5
+ $schema: "http://json-schema.org/draft-07/schema#"
6
+ title: "Document Pipeline Configuration"
7
+ description: "Schema for nest-doc-processing-cli config.yaml files"
8
+ type: object
9
+ required:
10
+ - name
11
+ additionalProperties: false
12
+
13
+ properties:
14
+ name:
15
+ type: string
16
+ description: "Human-readable name for this configuration"
17
+ minLength: 1
18
+
19
+ description:
20
+ type: string
21
+ description: "Optional description of the configuration purpose"
22
+
23
+ docling:
24
+ type: object
25
+ description: "IBM Docling document extraction settings"
26
+ additionalProperties: false
27
+ properties:
28
+ layout:
29
+ type: object
30
+ additionalProperties: false
31
+ properties:
32
+ model:
33
+ type: string
34
+ description: "Layout model for document structure detection"
35
+ enum:
36
+ - docling-layout-heron
37
+ - docling-layout-heron-101
38
+ - docling-layout-egret-medium
39
+ - docling-layout-egret-large
40
+ - docling-layout-egret-xlarge
41
+ default: docling-layout-egret-large
42
+ createOrphanClusters:
43
+ type: boolean
44
+ description: "Create clusters for orphan elements"
45
+ default: true
46
+ keepEmptyClusters:
47
+ type: boolean
48
+ description: "Keep empty clusters in output"
49
+ default: true
50
+
51
+ ocr:
52
+ type: object
53
+ additionalProperties: false
54
+ properties:
55
+ enabled:
56
+ type: boolean
57
+ description: "Enable OCR for scanned documents"
58
+ default: true
59
+ engine:
60
+ type: string
61
+ description: "OCR engine to use"
62
+ enum: [rapidocr, tesseract, easyocr, mac]
63
+ default: rapidocr
64
+ backend:
65
+ type: string
66
+ description: "Computation backend"
67
+ enum: [torch, onnx, cpu]
68
+ default: torch
69
+ languages:
70
+ type: array
71
+ items:
72
+ type: string
73
+ description: "Languages to recognize"
74
+ default: [en]
75
+ textScore:
76
+ type: number
77
+ description: "Minimum confidence score for text detection"
78
+ minimum: 0
79
+ maximum: 1
80
+ default: 0.5
81
+ forceFullPageOcr:
82
+ type: boolean
83
+ description: "Force OCR on entire page even if text is detected"
84
+ default: true
85
+
86
+ tables:
87
+ type: object
88
+ additionalProperties: false
89
+ properties:
90
+ enabled:
91
+ type: boolean
92
+ description: "Enable table extraction"
93
+ default: true
94
+ mode:
95
+ type: string
96
+ description: "Table extraction mode"
97
+ enum: [fast, accurate]
98
+ default: accurate
99
+ doCellMatching:
100
+ type: boolean
101
+ description: "Match cells to table structure"
102
+ default: true
103
+
104
+ pictures:
105
+ type: object
106
+ additionalProperties: false
107
+ properties:
108
+ enabled:
109
+ type: boolean
110
+ description: "Enable picture extraction"
111
+ default: true
112
+ enableClassification:
113
+ type: boolean
114
+ description: "Classify picture types (chart, diagram, photo, etc.)"
115
+ default: true
116
+ enableDescription:
117
+ type: boolean
118
+ description: "Generate AI descriptions for pictures"
119
+ default: true
120
+ descriptionProvider:
121
+ type: string
122
+ description: "Provider for picture descriptions"
123
+ enum: [openai, local]
124
+ default: openai
125
+ descriptionModel:
126
+ type: string
127
+ description: "Model for generating picture descriptions"
128
+ default: gpt-4o
129
+ descriptionPrompt:
130
+ type: string
131
+ description: "Custom prompt for picture description generation"
132
+ imagesScale:
133
+ type: number
134
+ description: "Scale factor for image extraction"
135
+ minimum: 0.1
136
+ maximum: 4.0
137
+ default: 2.0
138
+
139
+ accelerator:
140
+ type: object
141
+ additionalProperties: false
142
+ properties:
143
+ device:
144
+ type: string
145
+ description: "Compute device"
146
+ enum: [auto, cpu, cuda, mps]
147
+ default: auto
148
+ numThreads:
149
+ type: integer
150
+ description: "Number of CPU threads"
151
+ minimum: 1
152
+ maximum: 32
153
+ default: 4
154
+ cudaUseFlashAttention2:
155
+ type: boolean
156
+ description: "Use Flash Attention 2 on CUDA"
157
+ default: false
158
+
159
+ limits:
160
+ type: object
161
+ additionalProperties: false
162
+ properties:
163
+ documentTimeout:
164
+ type: integer
165
+ description: "Maximum processing time per document (seconds)"
166
+ minimum: 60
167
+ maximum: 3600
168
+ default: 300
169
+ maxPages:
170
+ type: integer
171
+ description: "Maximum pages to process (optional)"
172
+ minimum: 1
173
+ maxFileSize:
174
+ type: integer
175
+ description: "Maximum file size in bytes (optional)"
176
+ minimum: 1
177
+
178
+ chunking:
179
+ type: object
180
+ description: "Document chunking settings for RAG"
181
+ additionalProperties: false
182
+ properties:
183
+ strategy:
184
+ type: string
185
+ description: "Chunking strategy"
186
+ enum: [docling_hybrid, sentence, paragraph, fixed]
187
+ default: docling_hybrid
188
+ maxTokens:
189
+ type: integer
190
+ description: "Maximum tokens per chunk"
191
+ minimum: 100
192
+ maximum: 8000
193
+ default: 1200
194
+ overlapTokens:
195
+ type: integer
196
+ description: "Number of tokens shared between consecutive chunks (should be smaller than maxTokens)"
197
+ minimum: 0
198
+ maximum: 1000
199
+ default: 200
200
+ tokenizer:
201
+ type: string
202
+ description: "Tokenizer for counting tokens"
203
+ default: cl100k_base
204
+ mergePeers:
205
+ type: boolean
206
+ description: "Merge small sibling chunks"
207
+ default: true
208
+ contextualize:
209
+ type: boolean
210
+ description: "Add contextual headers to chunks"
211
+ default: true
212
+
213
+ output:
214
+ type: object
215
+ additionalProperties: false
216
+ properties:
217
+ format:
218
+ type: string
219
+ description: "Output format for chunks"
220
+ enum: [json, text_files]
221
+ default: text_files
222
+ includeMetadataHeader:
223
+ type: boolean
224
+ description: "Include metadata in chunk headers"
225
+ default: true
226
+
227
+ metadata:
228
+ type: object
229
+ additionalProperties: false
230
+ properties:
231
+ includeHeadings:
232
+ type: boolean
233
+ description: "Include heading hierarchy in metadata"
234
+ default: true
235
+ includePageNumbers:
236
+ type: boolean
237
+ description: "Include page numbers in metadata"
238
+ default: true
239
+ includePosition:
240
+ type: boolean
241
+ description: "Include position coordinates"
242
+ default: true
243
+ includeSource:
244
+ type: boolean
245
+ description: "Include source file information"
246
+ default: true
247
+
248
+ graphrag:
249
+ type: object
250
+ description: "Microsoft GraphRAG knowledge graph settings"
251
+ additionalProperties: false
252
+ properties:
253
+ enabled:
254
+ type: boolean
255
+ description: "Enable GraphRAG indexing"
256
+ default: true
257
+
258
+ models:
259
+ type: object
260
+ additionalProperties: false
261
+ properties:
262
+ chatModel:
263
+ type: string
264
+ description: "LLM for entity extraction and summarization"
265
+ default: gpt-4o-mini
266
+ embeddingModel:
267
+ type: string
268
+ description: "Model for text embeddings"
269
+ default: text-embedding-3-large
270
+ temperature:
271
+ type: number
272
+ description: "LLM temperature"
273
+ minimum: 0
274
+ maximum: 2
275
+ default: 0
276
+ maxTokens:
277
+ type: integer
278
+ description: "Maximum tokens for LLM responses"
279
+ minimum: 100
280
+ maximum: 32000
281
+ default: 4096
282
+ embeddingBatchSize:
283
+ type: integer
284
+ description: "Batch size for embedding requests"
285
+ minimum: 1
286
+ maximum: 100
287
+ default: 16
288
+
289
+ entityExtraction:
290
+ type: object
291
+ additionalProperties: false
292
+ properties:
293
+ entityTypes:
294
+ type: array
295
+ items:
296
+ type: string
297
+ description: "Types of entities to extract"
298
+ default:
299
+ - PERSON
300
+ - ORGANIZATION
301
+ - LOCATION
302
+ - DATE
303
+ - MONEY
304
+ - PROPERTY
305
+ - CLAUSE
306
+ - OBLIGATION
307
+ - TERM
308
+ - CONDITION
309
+ maxGleanings:
310
+ type: integer
311
+ description: "Maximum number of additional entity extraction (gleaning) passes after the initial pass; 0 disables gleaning"
312
+ minimum: 0
313
+ maximum: 5
314
+ default: 1
315
+ maxEntitiesPerChunk:
316
+ type: integer
317
+ description: "Maximum entities per text chunk"
318
+ minimum: 1
319
+ maximum: 100
320
+ default: 20
321
+ confidenceThreshold:
322
+ type: number
323
+ description: "Minimum confidence for entity extraction"
324
+ minimum: 0
325
+ maximum: 1
326
+ default: 0.7
327
+ prompt:
328
+ type: string
329
+ description: "Custom prompt for entity extraction"
330
+
331
+ summarizeDescriptions:
332
+ type: object
333
+ additionalProperties: false
334
+ properties:
335
+ maxLength:
336
+ type: integer
337
+ description: "Maximum length for entity descriptions"
338
+ minimum: 100
339
+ maximum: 2000
340
+ default: 500
341
+ maxInputLength:
342
+ type: integer
343
+ description: "Maximum input length for summarization"
344
+ minimum: 1000
345
+ maximum: 32000
346
+ default: 8000
347
+ prompt:
348
+ type: string
349
+ description: "Custom prompt for description summarization"
350
+
351
+ claimExtraction:
352
+ type: object
353
+ additionalProperties: false
354
+ properties:
355
+ enabled:
356
+ type: boolean
357
+ description: "Enable claim/obligation extraction"
358
+ default: true
359
+ description:
360
+ type: string
361
+ description: "Description of claim types to extract"
362
+ maxGleanings:
363
+ type: integer
364
+ description: "Maximum number of additional claim extraction (gleaning) passes after the initial pass; 0 disables gleaning"
365
+ minimum: 0
366
+ maximum: 5
367
+ default: 1
368
+ prompt:
369
+ type: string
370
+ description: "Custom prompt for claim extraction"
371
+
372
+ embeddings:
373
+ type: object
374
+ additionalProperties: false
375
+ properties:
376
+ model:
377
+ type: string
378
+ description: "Embedding model"
379
+ default: text-embedding-3-large
380
+ dimensions:
381
+ type: integer
382
+ description: "Embedding dimensions"
383
+ minimum: 256
384
+ maximum: 4096
385
+ default: 3072
386
+ batchSize:
387
+ type: integer
388
+ description: "Batch size for embedding requests"
389
+ minimum: 1
390
+ maximum: 1000
391
+ default: 100
392
+
393
+ communities:
394
+ type: object
395
+ additionalProperties: false
396
+ properties:
397
+ algorithm:
398
+ type: string
399
+ description: "Community detection algorithm"
400
+ enum: [leiden, louvain]
401
+ default: leiden
402
+ resolution:
403
+ type: number
404
+ description: "Resolution parameter for community detection"
405
+ minimum: 0.1
406
+ maximum: 10.0
407
+ default: 1.0
408
+ minCommunitySize:
409
+ type: integer
410
+ description: "Minimum community size"
411
+ minimum: 2
412
+ maximum: 50
413
+ default: 3
414
+ maxLevels:
415
+ type: integer
416
+ description: "Maximum hierarchy levels"
417
+ minimum: 1
418
+ maximum: 10
419
+ default: 3
420
+
421
+ clusterGraph:
422
+ type: object
423
+ additionalProperties: false
424
+ properties:
425
+ maxClusterSize:
426
+ type: integer
427
+ description: "Maximum cluster size"
428
+ minimum: 2
429
+ maximum: 100
430
+ default: 10
431
+ useLcc:
432
+ type: boolean
433
+ description: "Use largest connected component"
434
+ default: true
435
+ seed:
436
+ type: integer
437
+ description: "Random seed for reproducibility"
438
+ default: 42
439
+
440
+ cache:
441
+ type: object
442
+ additionalProperties: false
443
+ properties:
444
+ enabled:
445
+ type: boolean
446
+ description: "Enable caching"
447
+ default: true
448
+ type:
449
+ type: string
450
+ description: "Cache type"
451
+ enum: [file, memory, none]
452
+ default: file
453
+
454
+ communityReports:
455
+ type: object
456
+ additionalProperties: false
457
+ properties:
458
+ maxLength:
459
+ type: integer
460
+ description: "Maximum length for community reports"
461
+ minimum: 500
462
+ maximum: 10000
463
+ default: 2000
464
+ maxInputLength:
465
+ type: integer
466
+ description: "Maximum input length for report generation"
467
+ minimum: 1000
468
+ maximum: 32000
469
+ default: 8000
470
+ prompt:
471
+ type: string
472
+ description: "Custom prompt for community report generation"
473
+
474
+ localSearch:
475
+ type: object
476
+ additionalProperties: false
477
+ properties:
478
+ topKEntities:
479
+ type: integer
480
+ description: "Number of top entities to retrieve"
481
+ minimum: 1
482
+ maximum: 100
483
+ default: 10
484
+ topKRelationships:
485
+ type: integer
486
+ description: "Number of top relationships to retrieve"
487
+ minimum: 1
488
+ maximum: 100
489
+ default: 10
490
+ topKCommunityReports:
491
+ type: integer
492
+ description: "Number of top community reports"
493
+ minimum: 1
494
+ maximum: 50
495
+ default: 5
496
+ maxContextTokens:
497
+ type: integer
498
+ description: "Maximum context tokens for local search"
499
+ minimum: 1000
500
+ maximum: 128000
501
+ default: 12000
502
+ prompt:
503
+ type: string
504
+ description: "Custom prompt for local search"
505
+
506
+ globalSearch:
507
+ type: object
508
+ additionalProperties: false
509
+ properties:
510
+ maxCommunities:
511
+ type: integer
512
+ description: "Maximum communities to include"
513
+ minimum: 1
514
+ maximum: 100
515
+ default: 10
516
+ mapMaxTokens:
517
+ type: integer
518
+ description: "Maximum tokens for map phase"
519
+ minimum: 1000
520
+ maximum: 32000
521
+ default: 4000
522
+ reduceMaxTokens:
523
+ type: integer
524
+ description: "Maximum tokens for reduce phase"
525
+ minimum: 1000
526
+ maximum: 32000
527
+ default: 8000
528
+ mapPrompt:
529
+ type: string
530
+ description: "Custom prompt for map phase"
531
+ reducePrompt:
532
+ type: string
533
+ description: "Custom prompt for reduce phase"
534
+ knowledgePrompt:
535
+ type: string
536
+ description: "Custom knowledge prompt"
537
+
538
+ driftSearch:
539
+ type: object
540
+ additionalProperties: false
541
+ properties:
542
+ enabled:
543
+ type: boolean
544
+ description: "Enable DRIFT search"
545
+ default: false
546
+ prompt:
547
+ type: string
548
+ description: "Custom prompt for DRIFT search"
549
+ reducePrompt:
550
+ type: string
551
+ description: "Custom reduce prompt for DRIFT search"
552
+
553
+ apiKeys:
554
+ type: object
555
+ description: "API keys for external services"
556
+ additionalProperties: false
557
+ properties:
558
+ openai:
559
+ type: string
560
+ description: "OpenAI API key; prefer ${OPENAI_API_KEY} to read it from the environment instead of storing the literal key in this file"
561
+ baseUrl:
562
+ type: string
563
+ format: uri
564
+ description: "Base URL for OpenAI-compatible API endpoint"