datalab-python-sdk 0.2.2__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/PKG-INFO +1 -1
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/datalab_sdk/__init__.py +11 -1
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/datalab_sdk/cli.py +308 -28
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/datalab_sdk/client.py +480 -31
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/datalab_sdk/models.py +76 -7
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/datalab_sdk/settings.py +1 -1
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/pyproject.toml +1 -1
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/tests/test_client_methods.py +315 -131
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/uv.lock +1 -1
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/.github/workflows/ci.yml +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/.github/workflows/publish.yml +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/.gitignore +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/.pre-commit-config.yaml +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/.python-version +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/LICENSE +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/README.md +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/08-Lambda-Calculus.pptx +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/adversarial.pdf +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/bid_evaluation.docx +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/book_review.ppt +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/book_store.xls +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/chi_hind.png +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/how_to_read.doc +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/normandy.epub +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/sample-1-sheet.xlsx +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/thinkpython.pdf +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/data/vibe.html +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/datalab_sdk/exceptions.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/datalab_sdk/mimetypes.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/integration/README.md +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/integration/__init__.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/integration/test_live_api.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/integration/test_readme_examples.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/poetry.lock +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/pytest.ini +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/README.md +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/end_to_end_workflow.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_api_tutorial/1_get_step_types.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_api_tutorial/2_get_workflows.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_api_tutorial/3_create_workflow.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_api_tutorial/4_execute_workflow.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_api_tutorial/5_poll_workflow.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_api_tutorial/README.md +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_definitions/README.md +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_definitions/eval_segmentation.json +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_definitions/parse_segment.json +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_definitions/segment_parallel_extract.json +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/recipes/workflows/workflow_definitions/slack_alert.json +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/tests/__init__.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/tests/conftest.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/tests/test_cli_simple.py +0 -0
- {datalab_python_sdk-0.2.2 → datalab_python_sdk-0.3.0}/tests/test_workflows.py +0 -0
|
@@ -2,15 +2,20 @@
|
|
|
2
2
|
Datalab SDK - Python client for Datalab API
|
|
3
3
|
|
|
4
4
|
This SDK provides both synchronous and asynchronous interfaces to the Datalab API,
|
|
5
|
-
supporting document conversion,
|
|
5
|
+
supporting document conversion, extraction, segmentation, and more.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
from .client import DatalabClient, AsyncDatalabClient
|
|
9
9
|
from .exceptions import DatalabError, DatalabAPIError, DatalabTimeoutError
|
|
10
10
|
from .models import (
|
|
11
11
|
ConversionResult,
|
|
12
|
+
CreateDocumentResult,
|
|
12
13
|
OCRResult,
|
|
13
14
|
ConvertOptions,
|
|
15
|
+
ExtractOptions,
|
|
16
|
+
SegmentOptions,
|
|
17
|
+
CustomPipelineOptions,
|
|
18
|
+
TrackChangesOptions,
|
|
14
19
|
OCROptions,
|
|
15
20
|
FormFillingOptions,
|
|
16
21
|
FormFillingResult,
|
|
@@ -30,8 +35,13 @@ __all__ = [
|
|
|
30
35
|
"DatalabAPIError",
|
|
31
36
|
"DatalabTimeoutError",
|
|
32
37
|
"ConversionResult",
|
|
38
|
+
"CreateDocumentResult",
|
|
33
39
|
"OCRResult",
|
|
34
40
|
"ConvertOptions",
|
|
41
|
+
"ExtractOptions",
|
|
42
|
+
"SegmentOptions",
|
|
43
|
+
"CustomPipelineOptions",
|
|
44
|
+
"TrackChangesOptions",
|
|
35
45
|
"OCROptions",
|
|
36
46
|
"FormFillingOptions",
|
|
37
47
|
"FormFillingResult",
|
|
@@ -16,6 +16,10 @@ from datalab_sdk.mimetypes import SUPPORTED_EXTENSIONS
|
|
|
16
16
|
from datalab_sdk.models import (
|
|
17
17
|
OCROptions,
|
|
18
18
|
ConvertOptions,
|
|
19
|
+
ExtractOptions,
|
|
20
|
+
SegmentOptions,
|
|
21
|
+
CustomPipelineOptions,
|
|
22
|
+
TrackChangesOptions,
|
|
19
23
|
ProcessingOptions,
|
|
20
24
|
WorkflowStep,
|
|
21
25
|
InputConfig,
|
|
@@ -60,7 +64,7 @@ def common_options(func):
|
|
|
60
64
|
|
|
61
65
|
|
|
62
66
|
def marker_options(func):
|
|
63
|
-
"""Options specific to
|
|
67
|
+
"""Options specific to convert command"""
|
|
64
68
|
func = click.option(
|
|
65
69
|
"--format",
|
|
66
70
|
"output_format",
|
|
@@ -79,17 +83,14 @@ def marker_options(func):
|
|
|
79
83
|
is_flag=True,
|
|
80
84
|
help="Disable synthetic image captions/descriptions in output",
|
|
81
85
|
)(func)
|
|
82
|
-
func = click.option(
|
|
83
|
-
"--page_schema", help="Schema to set to do structured extraction"
|
|
84
|
-
)(func)
|
|
85
86
|
func = click.option(
|
|
86
87
|
"--add_block_ids", is_flag=True, help="Add block IDs to HTML output"
|
|
87
88
|
)(func)
|
|
88
89
|
func = click.option(
|
|
89
90
|
"--mode",
|
|
90
91
|
type=click.Choice(["fast", "balanced", "accurate"]),
|
|
91
|
-
default="
|
|
92
|
-
help="
|
|
92
|
+
default="fast",
|
|
93
|
+
help="Processing mode",
|
|
93
94
|
)(func)
|
|
94
95
|
return func
|
|
95
96
|
|
|
@@ -125,22 +126,27 @@ async def process_files_async(
|
|
|
125
126
|
|
|
126
127
|
async def call_api(client, file_path, output_path):
|
|
127
128
|
"""Make API call - client handles retries for rate limits"""
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
return await client.ocr(
|
|
138
|
-
file_path,
|
|
129
|
+
api_method = getattr(client, method)
|
|
130
|
+
# For extract/segment with checkpoint_id, don't pass file_path
|
|
131
|
+
has_checkpoint = (
|
|
132
|
+
options is not None
|
|
133
|
+
and hasattr(options, "checkpoint_id")
|
|
134
|
+
and options.checkpoint_id is not None
|
|
135
|
+
)
|
|
136
|
+
if has_checkpoint:
|
|
137
|
+
return await api_method(
|
|
139
138
|
options=options,
|
|
140
139
|
save_output=output_path,
|
|
141
140
|
max_polls=max_polls,
|
|
142
141
|
poll_interval=poll_interval,
|
|
143
142
|
)
|
|
143
|
+
return await api_method(
|
|
144
|
+
file_path,
|
|
145
|
+
options=options,
|
|
146
|
+
save_output=output_path,
|
|
147
|
+
max_polls=max_polls,
|
|
148
|
+
poll_interval=poll_interval,
|
|
149
|
+
)
|
|
144
150
|
|
|
145
151
|
async def process_single_file(file_path: Path) -> dict:
|
|
146
152
|
async with semaphore:
|
|
@@ -260,9 +266,18 @@ def process_documents(
|
|
|
260
266
|
paginate: bool = False,
|
|
261
267
|
disable_image_extraction: bool = False,
|
|
262
268
|
disable_image_captions: bool = False,
|
|
263
|
-
page_schema: Optional[str] = None,
|
|
264
269
|
add_block_ids: bool = False,
|
|
265
|
-
mode: str = "
|
|
270
|
+
mode: str = "fast",
|
|
271
|
+
# Extract-specific
|
|
272
|
+
page_schema: Optional[str] = None,
|
|
273
|
+
checkpoint_id: Optional[str] = None,
|
|
274
|
+
# Segment-specific
|
|
275
|
+
segmentation_schema: Optional[str] = None,
|
|
276
|
+
# Custom pipeline-specific
|
|
277
|
+
pipeline_id: Optional[str] = None,
|
|
278
|
+
run_eval: bool = False,
|
|
279
|
+
# Options object override
|
|
280
|
+
options_override: Optional[ProcessingOptions] = None,
|
|
266
281
|
):
|
|
267
282
|
"""Unified document processing function"""
|
|
268
283
|
try:
|
|
@@ -278,7 +293,7 @@ def process_documents(
|
|
|
278
293
|
if base_url is None:
|
|
279
294
|
base_url = settings.DATALAB_HOST
|
|
280
295
|
|
|
281
|
-
|
|
296
|
+
output_dir_path = setup_output_directory(output_dir)
|
|
282
297
|
file_extensions = parse_extensions(extensions)
|
|
283
298
|
|
|
284
299
|
# Get files to process
|
|
@@ -292,19 +307,57 @@ def process_documents(
|
|
|
292
307
|
click.echo(f"Found {len(to_process)} files to process")
|
|
293
308
|
|
|
294
309
|
# Create processing options based on method
|
|
295
|
-
if
|
|
310
|
+
if options_override:
|
|
311
|
+
options = options_override
|
|
312
|
+
elif method == "convert":
|
|
296
313
|
options = ConvertOptions(
|
|
297
|
-
output_format=output_format,
|
|
314
|
+
output_format=output_format or "markdown",
|
|
298
315
|
max_pages=max_pages,
|
|
299
316
|
paginate=paginate,
|
|
300
317
|
disable_image_extraction=disable_image_extraction,
|
|
301
318
|
disable_image_captions=disable_image_captions,
|
|
302
319
|
page_range=page_range,
|
|
303
320
|
skip_cache=skip_cache,
|
|
304
|
-
page_schema=page_schema,
|
|
305
321
|
add_block_ids=add_block_ids,
|
|
306
322
|
mode=mode,
|
|
307
323
|
)
|
|
324
|
+
elif method == "extract":
|
|
325
|
+
options = ExtractOptions(
|
|
326
|
+
page_schema=page_schema or "",
|
|
327
|
+
checkpoint_id=checkpoint_id,
|
|
328
|
+
mode=mode,
|
|
329
|
+
output_format=output_format or "markdown",
|
|
330
|
+
max_pages=max_pages,
|
|
331
|
+
page_range=page_range,
|
|
332
|
+
skip_cache=skip_cache,
|
|
333
|
+
)
|
|
334
|
+
elif method == "segment":
|
|
335
|
+
options = SegmentOptions(
|
|
336
|
+
segmentation_schema=segmentation_schema or "",
|
|
337
|
+
checkpoint_id=checkpoint_id,
|
|
338
|
+
mode=mode,
|
|
339
|
+
max_pages=max_pages,
|
|
340
|
+
page_range=page_range,
|
|
341
|
+
skip_cache=skip_cache,
|
|
342
|
+
)
|
|
343
|
+
elif method == "run_custom_pipeline":
|
|
344
|
+
options = CustomPipelineOptions(
|
|
345
|
+
pipeline_id=pipeline_id or "",
|
|
346
|
+
run_eval=run_eval,
|
|
347
|
+
mode=mode,
|
|
348
|
+
output_format=output_format or "markdown",
|
|
349
|
+
max_pages=max_pages,
|
|
350
|
+
page_range=page_range,
|
|
351
|
+
skip_cache=skip_cache,
|
|
352
|
+
)
|
|
353
|
+
elif method == "track_changes":
|
|
354
|
+
options = TrackChangesOptions(
|
|
355
|
+
output_format=output_format or "markdown,html,chunks",
|
|
356
|
+
paginate=paginate,
|
|
357
|
+
max_pages=max_pages,
|
|
358
|
+
page_range=page_range,
|
|
359
|
+
skip_cache=skip_cache,
|
|
360
|
+
)
|
|
308
361
|
else: # method == "ocr"
|
|
309
362
|
options = OCROptions(
|
|
310
363
|
max_pages=max_pages,
|
|
@@ -315,7 +368,7 @@ def process_documents(
|
|
|
315
368
|
results = asyncio.run(
|
|
316
369
|
process_files_async(
|
|
317
370
|
to_process,
|
|
318
|
-
|
|
371
|
+
output_dir_path,
|
|
319
372
|
method,
|
|
320
373
|
options=options,
|
|
321
374
|
max_concurrent=max_concurrent,
|
|
@@ -327,8 +380,16 @@ def process_documents(
|
|
|
327
380
|
)
|
|
328
381
|
|
|
329
382
|
# Show results
|
|
330
|
-
|
|
331
|
-
|
|
383
|
+
operation_names = {
|
|
384
|
+
"convert": "Conversion",
|
|
385
|
+
"extract": "Extraction",
|
|
386
|
+
"segment": "Segmentation",
|
|
387
|
+
"run_custom_pipeline": "Custom Pipeline",
|
|
388
|
+
"track_changes": "Track Changes",
|
|
389
|
+
"ocr": "OCR",
|
|
390
|
+
}
|
|
391
|
+
operation = operation_names.get(method, method.title())
|
|
392
|
+
show_results(results, operation, output_dir_path)
|
|
332
393
|
|
|
333
394
|
except DatalabError as e:
|
|
334
395
|
click.echo(f"Error: {e}", err=True)
|
|
@@ -361,7 +422,6 @@ def convert(
|
|
|
361
422
|
paginate: bool,
|
|
362
423
|
disable_image_extraction: bool,
|
|
363
424
|
disable_image_captions: bool,
|
|
364
|
-
page_schema: Optional[str],
|
|
365
425
|
add_block_ids: bool,
|
|
366
426
|
mode: str,
|
|
367
427
|
):
|
|
@@ -383,12 +443,227 @@ def convert(
|
|
|
383
443
|
paginate=paginate,
|
|
384
444
|
disable_image_extraction=disable_image_extraction,
|
|
385
445
|
disable_image_captions=disable_image_captions,
|
|
386
|
-
page_schema=page_schema,
|
|
387
446
|
add_block_ids=add_block_ids,
|
|
388
447
|
mode=mode,
|
|
389
448
|
)
|
|
390
449
|
|
|
391
450
|
|
|
451
|
+
@click.command()
|
|
452
|
+
@click.argument("path", type=click.Path(exists=True))
|
|
453
|
+
@click.option("--page_schema", required=True, help="JSON schema for structured extraction (must contain 'properties' key)")
|
|
454
|
+
@click.option("--checkpoint_id", help="Checkpoint ID from a previous convert (skips re-parsing)")
|
|
455
|
+
@click.option("--format", "output_format", default="markdown", type=click.Choice(["markdown", "html", "json", "chunks"]), help="Output format")
|
|
456
|
+
@click.option("--mode", type=click.Choice(["fast", "balanced", "accurate"]), default="fast", help="Processing mode")
|
|
457
|
+
@common_options
|
|
458
|
+
def extract(
|
|
459
|
+
path: str,
|
|
460
|
+
page_schema: str,
|
|
461
|
+
checkpoint_id: Optional[str],
|
|
462
|
+
output_format: str,
|
|
463
|
+
mode: str,
|
|
464
|
+
api_key: str,
|
|
465
|
+
output_dir: str,
|
|
466
|
+
max_pages: Optional[int],
|
|
467
|
+
extensions: Optional[str],
|
|
468
|
+
max_concurrent: int,
|
|
469
|
+
base_url: str,
|
|
470
|
+
page_range: Optional[str],
|
|
471
|
+
skip_cache: bool,
|
|
472
|
+
max_polls: int,
|
|
473
|
+
poll_interval: int,
|
|
474
|
+
):
|
|
475
|
+
"""Extract structured data from documents using a JSON schema"""
|
|
476
|
+
process_documents(
|
|
477
|
+
path=path,
|
|
478
|
+
method="extract",
|
|
479
|
+
api_key=api_key,
|
|
480
|
+
output_dir=output_dir,
|
|
481
|
+
max_pages=max_pages,
|
|
482
|
+
extensions=extensions,
|
|
483
|
+
max_concurrent=max_concurrent,
|
|
484
|
+
base_url=base_url,
|
|
485
|
+
page_range=page_range,
|
|
486
|
+
skip_cache=skip_cache,
|
|
487
|
+
max_polls=max_polls,
|
|
488
|
+
poll_interval=poll_interval,
|
|
489
|
+
output_format=output_format,
|
|
490
|
+
mode=mode,
|
|
491
|
+
page_schema=page_schema,
|
|
492
|
+
checkpoint_id=checkpoint_id,
|
|
493
|
+
)
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
@click.command()
|
|
497
|
+
@click.argument("path", type=click.Path(exists=True))
|
|
498
|
+
@click.option("--segmentation_schema", required=True, help="JSON schema with segment names and descriptions")
|
|
499
|
+
@click.option("--checkpoint_id", help="Checkpoint ID from a previous convert (skips re-parsing)")
|
|
500
|
+
@click.option("--mode", type=click.Choice(["fast", "balanced", "accurate"]), default="fast", help="Processing mode")
|
|
501
|
+
@common_options
|
|
502
|
+
def segment(
|
|
503
|
+
path: str,
|
|
504
|
+
segmentation_schema: str,
|
|
505
|
+
checkpoint_id: Optional[str],
|
|
506
|
+
mode: str,
|
|
507
|
+
api_key: str,
|
|
508
|
+
output_dir: str,
|
|
509
|
+
max_pages: Optional[int],
|
|
510
|
+
extensions: Optional[str],
|
|
511
|
+
max_concurrent: int,
|
|
512
|
+
base_url: str,
|
|
513
|
+
page_range: Optional[str],
|
|
514
|
+
skip_cache: bool,
|
|
515
|
+
max_polls: int,
|
|
516
|
+
poll_interval: int,
|
|
517
|
+
):
|
|
518
|
+
"""Segment documents into sections using a schema"""
|
|
519
|
+
process_documents(
|
|
520
|
+
path=path,
|
|
521
|
+
method="segment",
|
|
522
|
+
api_key=api_key,
|
|
523
|
+
output_dir=output_dir,
|
|
524
|
+
max_pages=max_pages,
|
|
525
|
+
extensions=extensions,
|
|
526
|
+
max_concurrent=max_concurrent,
|
|
527
|
+
base_url=base_url,
|
|
528
|
+
page_range=page_range,
|
|
529
|
+
skip_cache=skip_cache,
|
|
530
|
+
max_polls=max_polls,
|
|
531
|
+
poll_interval=poll_interval,
|
|
532
|
+
mode=mode,
|
|
533
|
+
segmentation_schema=segmentation_schema,
|
|
534
|
+
checkpoint_id=checkpoint_id,
|
|
535
|
+
)
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
@click.command("custom-pipeline")
|
|
539
|
+
@click.argument("path", type=click.Path(exists=True))
|
|
540
|
+
@click.option("--pipeline_id", required=True, help="Custom pipeline ID to execute (cp_XXXXX format)")
|
|
541
|
+
@click.option("--run_eval", is_flag=True, help="Run evaluation rules for this pipeline")
|
|
542
|
+
@click.option("--format", "output_format", default="markdown", type=click.Choice(["markdown", "html", "json", "chunks"]), help="Output format")
|
|
543
|
+
@click.option("--mode", type=click.Choice(["fast", "balanced", "accurate"]), default="fast", help="Processing mode")
|
|
544
|
+
@common_options
|
|
545
|
+
def custom_pipeline(
|
|
546
|
+
path: str,
|
|
547
|
+
pipeline_id: str,
|
|
548
|
+
run_eval: bool,
|
|
549
|
+
output_format: str,
|
|
550
|
+
mode: str,
|
|
551
|
+
api_key: str,
|
|
552
|
+
output_dir: str,
|
|
553
|
+
max_pages: Optional[int],
|
|
554
|
+
extensions: Optional[str],
|
|
555
|
+
max_concurrent: int,
|
|
556
|
+
base_url: str,
|
|
557
|
+
page_range: Optional[str],
|
|
558
|
+
skip_cache: bool,
|
|
559
|
+
max_polls: int,
|
|
560
|
+
poll_interval: int,
|
|
561
|
+
):
|
|
562
|
+
"""Run a custom pipeline on documents"""
|
|
563
|
+
process_documents(
|
|
564
|
+
path=path,
|
|
565
|
+
method="run_custom_pipeline",
|
|
566
|
+
api_key=api_key,
|
|
567
|
+
output_dir=output_dir,
|
|
568
|
+
max_pages=max_pages,
|
|
569
|
+
extensions=extensions,
|
|
570
|
+
max_concurrent=max_concurrent,
|
|
571
|
+
base_url=base_url,
|
|
572
|
+
page_range=page_range,
|
|
573
|
+
skip_cache=skip_cache,
|
|
574
|
+
max_polls=max_polls,
|
|
575
|
+
poll_interval=poll_interval,
|
|
576
|
+
output_format=output_format,
|
|
577
|
+
mode=mode,
|
|
578
|
+
pipeline_id=pipeline_id,
|
|
579
|
+
run_eval=run_eval,
|
|
580
|
+
)
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
@click.command("track-changes")
|
|
584
|
+
@click.argument("path", type=click.Path(exists=True))
|
|
585
|
+
@click.option("--format", "output_format", default="markdown,html,chunks", help="Comma-separated output formats (markdown, html, chunks)")
|
|
586
|
+
@click.option("--paginate", is_flag=True, help="Separate output by page")
|
|
587
|
+
@common_options
|
|
588
|
+
def track_changes(
|
|
589
|
+
path: str,
|
|
590
|
+
output_format: str,
|
|
591
|
+
paginate: bool,
|
|
592
|
+
api_key: str,
|
|
593
|
+
output_dir: str,
|
|
594
|
+
max_pages: Optional[int],
|
|
595
|
+
extensions: Optional[str],
|
|
596
|
+
max_concurrent: int,
|
|
597
|
+
base_url: str,
|
|
598
|
+
page_range: Optional[str],
|
|
599
|
+
skip_cache: bool,
|
|
600
|
+
max_polls: int,
|
|
601
|
+
poll_interval: int,
|
|
602
|
+
):
|
|
603
|
+
"""Extract tracked changes from DOCX documents"""
|
|
604
|
+
process_documents(
|
|
605
|
+
path=path,
|
|
606
|
+
method="track_changes",
|
|
607
|
+
api_key=api_key,
|
|
608
|
+
output_dir=output_dir,
|
|
609
|
+
max_pages=max_pages,
|
|
610
|
+
extensions=extensions,
|
|
611
|
+
max_concurrent=max_concurrent,
|
|
612
|
+
base_url=base_url,
|
|
613
|
+
page_range=page_range,
|
|
614
|
+
skip_cache=skip_cache,
|
|
615
|
+
max_polls=max_polls,
|
|
616
|
+
poll_interval=poll_interval,
|
|
617
|
+
output_format=output_format,
|
|
618
|
+
paginate=paginate,
|
|
619
|
+
)
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
@click.command("create-document")
|
|
623
|
+
@click.option("--markdown", "markdown_input", required=True, help="Markdown content or path to markdown file")
|
|
624
|
+
@click.option("--output", "-o", "output_path", required=True, type=click.Path(), help="Output file path for the DOCX")
|
|
625
|
+
@click.option("--api_key", required=False, help="Datalab API key")
|
|
626
|
+
@click.option("--base_url", default=settings.DATALAB_HOST, help="API base URL")
|
|
627
|
+
def create_document(
|
|
628
|
+
markdown_input: str,
|
|
629
|
+
output_path: str,
|
|
630
|
+
api_key: Optional[str],
|
|
631
|
+
base_url: str,
|
|
632
|
+
):
|
|
633
|
+
"""Create a DOCX document from markdown"""
|
|
634
|
+
try:
|
|
635
|
+
if api_key is None:
|
|
636
|
+
api_key = settings.DATALAB_API_KEY
|
|
637
|
+
|
|
638
|
+
if api_key is None:
|
|
639
|
+
raise DatalabError(
|
|
640
|
+
"You must either pass in an api key via --api_key or set the DATALAB_API_KEY env variable."
|
|
641
|
+
)
|
|
642
|
+
|
|
643
|
+
# Check if markdown_input is a file path
|
|
644
|
+
md_path = Path(markdown_input)
|
|
645
|
+
if md_path.exists() and md_path.is_file():
|
|
646
|
+
markdown_content = md_path.read_text(encoding="utf-8")
|
|
647
|
+
else:
|
|
648
|
+
markdown_content = markdown_input
|
|
649
|
+
|
|
650
|
+
client = DatalabClient(api_key=api_key, base_url=base_url)
|
|
651
|
+
result = client.create_document(
|
|
652
|
+
markdown=markdown_content,
|
|
653
|
+
save_output=output_path,
|
|
654
|
+
)
|
|
655
|
+
|
|
656
|
+
if result.success:
|
|
657
|
+
click.echo(f"Document created successfully: {Path(output_path).with_suffix('.docx')}")
|
|
658
|
+
else:
|
|
659
|
+
click.echo(f"Document creation failed: {result.error}", err=True)
|
|
660
|
+
sys.exit(1)
|
|
661
|
+
|
|
662
|
+
except DatalabError as e:
|
|
663
|
+
click.echo(f"Error: {e}", err=True)
|
|
664
|
+
sys.exit(1)
|
|
665
|
+
|
|
666
|
+
|
|
392
667
|
# Workflow commands
|
|
393
668
|
@click.command()
|
|
394
669
|
@click.option("--name", required=True, help="Name of the workflow")
|
|
@@ -869,6 +1144,11 @@ def _render_dag_simple(layers, children, step_map):
|
|
|
869
1144
|
|
|
870
1145
|
# Add commands to CLI group
|
|
871
1146
|
cli.add_command(convert)
|
|
1147
|
+
cli.add_command(extract)
|
|
1148
|
+
cli.add_command(segment)
|
|
1149
|
+
cli.add_command(custom_pipeline)
|
|
1150
|
+
cli.add_command(track_changes)
|
|
1151
|
+
cli.add_command(create_document)
|
|
872
1152
|
cli.add_command(create_workflow)
|
|
873
1153
|
cli.add_command(get_workflow)
|
|
874
1154
|
cli.add_command(get_step_types)
|