datalab-python-sdk 0.1.15__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/PKG-INFO +1 -5
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/README.md +0 -4
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/__init__.py +4 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/cli.py +17 -69
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/client.py +97 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/models.py +88 -7
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/integration/test_readme_examples.py +0 -6
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/pyproject.toml +1 -1
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/3_create_workflow.py +16 -24
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/tests/conftest.py +1 -3
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/tests/test_cli_simple.py +0 -91
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/tests/test_client_methods.py +4 -4
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/uv.lock +1 -1
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/.github/workflows/ci.yml +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/.github/workflows/publish.yml +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/.gitignore +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/.pre-commit-config.yaml +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/.python-version +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/LICENSE +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/08-Lambda-Calculus.pptx +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/adversarial.pdf +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/bid_evaluation.docx +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/book_review.ppt +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/book_store.xls +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/chi_hind.png +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/how_to_read.doc +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/normandy.epub +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/sample-1-sheet.xlsx +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/thinkpython.pdf +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/vibe.html +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/exceptions.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/mimetypes.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/settings.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/integration/README.md +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/integration/__init__.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/integration/test_live_api.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/poetry.lock +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/pytest.ini +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/README.md +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/end_to_end_workflow.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/1_get_step_types.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/2_get_workflows.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/4_execute_workflow.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/5_poll_workflow.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/README.md +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_definitions/README.md +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_definitions/eval_segmentation.json +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_definitions/parse_segment.json +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_definitions/segment_parallel_extract.json +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_definitions/slack_alert.json +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/tests/__init__.py +0 -0
- {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/tests/test_workflows.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datalab-python-sdk
|
|
3
|
-
Version: 0.1.15
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: SDK for the Datalab document intelligence API
|
|
5
5
|
Author-email: Datalab Team <hi@datalab.to>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -46,10 +46,6 @@ client = DatalabClient() # use env var from above, or pass api_key="your_api_key
|
|
|
46
46
|
# Convert PDF to markdown
|
|
47
47
|
result = client.convert("document.pdf")
|
|
48
48
|
print(result.markdown)
|
|
49
|
-
|
|
50
|
-
# OCR a document
|
|
51
|
-
ocr_result = client.ocr("document.pdf")
|
|
52
|
-
print(ocr_result.pages) # Get all text as string
|
|
53
49
|
```
|
|
54
50
|
|
|
55
51
|
## Workflows
|
|
@@ -30,10 +30,6 @@ client = DatalabClient() # use env var from above, or pass api_key="your_api_key
|
|
|
30
30
|
# Convert PDF to markdown
|
|
31
31
|
result = client.convert("document.pdf")
|
|
32
32
|
print(result.markdown)
|
|
33
|
-
|
|
34
|
-
# OCR a document
|
|
35
|
-
ocr_result = client.ocr("document.pdf")
|
|
36
|
-
print(ocr_result.pages) # Get all text as string
|
|
37
33
|
```
|
|
38
34
|
|
|
39
35
|
## Workflows
|
|
@@ -12,6 +12,8 @@ from .models import (
|
|
|
12
12
|
OCRResult,
|
|
13
13
|
ConvertOptions,
|
|
14
14
|
OCROptions,
|
|
15
|
+
FormFillingOptions,
|
|
16
|
+
FormFillingResult,
|
|
15
17
|
Workflow,
|
|
16
18
|
WorkflowStep,
|
|
17
19
|
WorkflowExecution,
|
|
@@ -31,6 +33,8 @@ __all__ = [
|
|
|
31
33
|
"OCRResult",
|
|
32
34
|
"ConvertOptions",
|
|
33
35
|
"OCROptions",
|
|
36
|
+
"FormFillingOptions",
|
|
37
|
+
"FormFillingResult",
|
|
34
38
|
"Workflow",
|
|
35
39
|
"WorkflowStep",
|
|
36
40
|
"WorkflowExecution",
|
|
@@ -67,28 +67,16 @@ def marker_options(func):
|
|
|
67
67
|
type=click.Choice(["markdown", "html", "json", "chunks"]),
|
|
68
68
|
help="Output format",
|
|
69
69
|
)(func)
|
|
70
|
-
func = click.option("--force_ocr", is_flag=True, help="Force OCR on every page")(
|
|
71
|
-
func
|
|
72
|
-
)
|
|
73
|
-
func = click.option(
|
|
74
|
-
"--format_lines", is_flag=True, help="Partially OCR lines for better formatting"
|
|
75
|
-
)(func)
|
|
76
70
|
func = click.option(
|
|
77
71
|
"--paginate", is_flag=True, help="Add page delimiters to output"
|
|
78
72
|
)(func)
|
|
79
|
-
func = click.option("--use_llm", is_flag=True, help="Use LLM to enhance accuracy")(
|
|
80
|
-
func
|
|
81
|
-
)
|
|
82
|
-
func = click.option(
|
|
83
|
-
"--strip_existing_ocr",
|
|
84
|
-
is_flag=True,
|
|
85
|
-
help="Remove existing OCR text and redo OCR",
|
|
86
|
-
)(func)
|
|
87
73
|
func = click.option(
|
|
88
74
|
"--disable_image_extraction", is_flag=True, help="Disable extraction of images"
|
|
89
75
|
)(func)
|
|
90
76
|
func = click.option(
|
|
91
|
-
"--
|
|
77
|
+
"--disable_image_captions",
|
|
78
|
+
is_flag=True,
|
|
79
|
+
help="Disable synthetic image captions/descriptions in output",
|
|
92
80
|
)(func)
|
|
93
81
|
func = click.option(
|
|
94
82
|
"--page_schema", help="Schema to set to do structured extraction"
|
|
@@ -96,6 +84,12 @@ def marker_options(func):
|
|
|
96
84
|
func = click.option(
|
|
97
85
|
"--add_block_ids", is_flag=True, help="Add block IDs to HTML output"
|
|
98
86
|
)(func)
|
|
87
|
+
func = click.option(
|
|
88
|
+
"--mode",
|
|
89
|
+
type=click.Choice(["fast", "balanced", "accurate"]),
|
|
90
|
+
default="balanced",
|
|
91
|
+
help="OCR mode",
|
|
92
|
+
)(func)
|
|
99
93
|
return func
|
|
100
94
|
|
|
101
95
|
|
|
@@ -248,15 +242,12 @@ def process_documents(
|
|
|
248
242
|
poll_interval: int,
|
|
249
243
|
# Convert-specific options
|
|
250
244
|
output_format: Optional[str] = None,
|
|
251
|
-
force_ocr: bool = False,
|
|
252
|
-
format_lines: bool = False,
|
|
253
245
|
paginate: bool = False,
|
|
254
|
-
use_llm: bool = False,
|
|
255
|
-
strip_existing_ocr: bool = False,
|
|
256
246
|
disable_image_extraction: bool = False,
|
|
257
|
-
|
|
247
|
+
disable_image_captions: bool = False,
|
|
258
248
|
page_schema: Optional[str] = None,
|
|
259
249
|
add_block_ids: bool = False,
|
|
250
|
+
mode: str = "balanced",
|
|
260
251
|
):
|
|
261
252
|
"""Unified document processing function"""
|
|
262
253
|
try:
|
|
@@ -290,17 +281,14 @@ def process_documents(
|
|
|
290
281
|
options = ConvertOptions(
|
|
291
282
|
output_format=output_format,
|
|
292
283
|
max_pages=max_pages,
|
|
293
|
-
force_ocr=force_ocr,
|
|
294
|
-
format_lines=format_lines,
|
|
295
284
|
paginate=paginate,
|
|
296
|
-
use_llm=use_llm,
|
|
297
|
-
strip_existing_ocr=strip_existing_ocr,
|
|
298
285
|
disable_image_extraction=disable_image_extraction,
|
|
286
|
+
disable_image_captions=disable_image_captions,
|
|
299
287
|
page_range=page_range,
|
|
300
|
-
block_correction_prompt=block_correction_prompt,
|
|
301
288
|
skip_cache=skip_cache,
|
|
302
289
|
page_schema=page_schema,
|
|
303
290
|
add_block_ids=add_block_ids,
|
|
291
|
+
mode=mode,
|
|
304
292
|
)
|
|
305
293
|
else: # method == "ocr"
|
|
306
294
|
options = OCROptions(
|
|
@@ -355,15 +343,12 @@ def convert(
|
|
|
355
343
|
max_polls: int,
|
|
356
344
|
poll_interval: int,
|
|
357
345
|
output_format: str,
|
|
358
|
-
force_ocr: bool,
|
|
359
|
-
format_lines: bool,
|
|
360
346
|
paginate: bool,
|
|
361
|
-
use_llm: bool,
|
|
362
|
-
strip_existing_ocr: bool,
|
|
363
347
|
disable_image_extraction: bool,
|
|
364
|
-
|
|
348
|
+
disable_image_captions: bool,
|
|
365
349
|
page_schema: Optional[str],
|
|
366
350
|
add_block_ids: bool,
|
|
351
|
+
mode: str,
|
|
367
352
|
):
|
|
368
353
|
"""Convert documents to markdown, HTML, or JSON"""
|
|
369
354
|
process_documents(
|
|
@@ -380,48 +365,12 @@ def convert(
|
|
|
380
365
|
max_polls=max_polls,
|
|
381
366
|
poll_interval=poll_interval,
|
|
382
367
|
output_format=output_format,
|
|
383
|
-
force_ocr=force_ocr,
|
|
384
|
-
format_lines=format_lines,
|
|
385
368
|
paginate=paginate,
|
|
386
|
-
use_llm=use_llm,
|
|
387
|
-
strip_existing_ocr=strip_existing_ocr,
|
|
388
369
|
disable_image_extraction=disable_image_extraction,
|
|
389
|
-
|
|
370
|
+
disable_image_captions=disable_image_captions,
|
|
390
371
|
page_schema=page_schema,
|
|
391
372
|
add_block_ids=add_block_ids,
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
@click.command()
|
|
396
|
-
@click.argument("path", type=click.Path(exists=True))
|
|
397
|
-
@common_options
|
|
398
|
-
def ocr(
|
|
399
|
-
path: str,
|
|
400
|
-
api_key: str,
|
|
401
|
-
output_dir: str,
|
|
402
|
-
max_pages: Optional[int],
|
|
403
|
-
extensions: Optional[str],
|
|
404
|
-
max_concurrent: int,
|
|
405
|
-
base_url: str,
|
|
406
|
-
page_range: Optional[str],
|
|
407
|
-
skip_cache: bool,
|
|
408
|
-
max_polls: int,
|
|
409
|
-
poll_interval: int,
|
|
410
|
-
):
|
|
411
|
-
"""Perform OCR on documents"""
|
|
412
|
-
process_documents(
|
|
413
|
-
path=path,
|
|
414
|
-
method="ocr",
|
|
415
|
-
api_key=api_key,
|
|
416
|
-
output_dir=output_dir,
|
|
417
|
-
max_pages=max_pages,
|
|
418
|
-
extensions=extensions,
|
|
419
|
-
max_concurrent=max_concurrent,
|
|
420
|
-
base_url=base_url,
|
|
421
|
-
page_range=page_range,
|
|
422
|
-
skip_cache=skip_cache,
|
|
423
|
-
max_polls=max_polls,
|
|
424
|
-
poll_interval=poll_interval,
|
|
373
|
+
mode=mode,
|
|
425
374
|
)
|
|
426
375
|
|
|
427
376
|
|
|
@@ -905,7 +854,6 @@ def _render_dag_simple(layers, children, step_map):
|
|
|
905
854
|
|
|
906
855
|
# Add commands to CLI group
|
|
907
856
|
cli.add_command(convert)
|
|
908
|
-
cli.add_command(ocr)
|
|
909
857
|
cli.add_command(create_workflow)
|
|
910
858
|
cli.add_command(get_workflow)
|
|
911
859
|
cli.add_command(get_step_types)
|
|
@@ -27,6 +27,8 @@ from datalab_sdk.models import (
|
|
|
27
27
|
ProcessingOptions,
|
|
28
28
|
ConvertOptions,
|
|
29
29
|
OCROptions,
|
|
30
|
+
FormFillingOptions,
|
|
31
|
+
FormFillingResult,
|
|
30
32
|
Workflow,
|
|
31
33
|
WorkflowStep,
|
|
32
34
|
WorkflowExecution,
|
|
@@ -345,6 +347,70 @@ class AsyncDatalabClient:
|
|
|
345
347
|
|
|
346
348
|
return result
|
|
347
349
|
|
|
350
|
+
async def fill(
|
|
351
|
+
self,
|
|
352
|
+
file_path: Optional[Union[str, Path]] = None,
|
|
353
|
+
file_url: Optional[str] = None,
|
|
354
|
+
options: Optional[FormFillingOptions] = None,
|
|
355
|
+
save_output: Optional[Union[str, Path]] = None,
|
|
356
|
+
max_polls: int = 300,
|
|
357
|
+
poll_interval: int = 1,
|
|
358
|
+
) -> FormFillingResult:
|
|
359
|
+
"""
|
|
360
|
+
Fill PDF or image forms with provided field data
|
|
361
|
+
|
|
362
|
+
Args:
|
|
363
|
+
file_path: Path to the file to fill
|
|
364
|
+
file_url: URL of the file to fill
|
|
365
|
+
options: Form filling options (must include field_data)
|
|
366
|
+
save_output: Optional path to save output files
|
|
367
|
+
max_polls: Maximum number of polling attempts
|
|
368
|
+
poll_interval: Seconds between polling attempts
|
|
369
|
+
"""
|
|
370
|
+
if options is None:
|
|
371
|
+
raise ValueError("options must be provided with field_data")
|
|
372
|
+
|
|
373
|
+
initial_data = await self._make_request(
|
|
374
|
+
"POST",
|
|
375
|
+
"/api/v1/fill",
|
|
376
|
+
data=self.get_form_params(
|
|
377
|
+
file_path=file_path, file_url=file_url, options=options
|
|
378
|
+
),
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
if not initial_data.get("success"):
|
|
382
|
+
raise DatalabAPIError(
|
|
383
|
+
f"Request failed: {initial_data.get('error', 'Unknown error')}"
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
result_data = await self._poll_result(
|
|
387
|
+
initial_data["request_check_url"],
|
|
388
|
+
max_polls=max_polls,
|
|
389
|
+
poll_interval=poll_interval,
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
result = FormFillingResult(
|
|
393
|
+
status=result_data.get("status", "complete"),
|
|
394
|
+
success=result_data.get("success"),
|
|
395
|
+
error=result_data.get("error"),
|
|
396
|
+
output_format=result_data.get("output_format"),
|
|
397
|
+
output_base64=result_data.get("output_base64"),
|
|
398
|
+
fields_filled=result_data.get("fields_filled"),
|
|
399
|
+
fields_not_found=result_data.get("fields_not_found"),
|
|
400
|
+
runtime=result_data.get("runtime"),
|
|
401
|
+
page_count=result_data.get("page_count"),
|
|
402
|
+
cost_breakdown=result_data.get("cost_breakdown"),
|
|
403
|
+
versions=result_data.get("versions"),
|
|
404
|
+
)
|
|
405
|
+
|
|
406
|
+
# Save output if requested
|
|
407
|
+
if save_output and result.success and result.output_base64:
|
|
408
|
+
output_path = Path(save_output)
|
|
409
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
410
|
+
result.save_output(output_path)
|
|
411
|
+
|
|
412
|
+
return result
|
|
413
|
+
|
|
348
414
|
# Workflow methods
|
|
349
415
|
async def create_workflow(
|
|
350
416
|
self,
|
|
@@ -1016,6 +1082,37 @@ class DatalabClient:
|
|
|
1016
1082
|
)
|
|
1017
1083
|
)
|
|
1018
1084
|
|
|
1085
|
+
def fill(
|
|
1086
|
+
self,
|
|
1087
|
+
file_path: Optional[Union[str, Path]] = None,
|
|
1088
|
+
file_url: Optional[str] = None,
|
|
1089
|
+
options: Optional[FormFillingOptions] = None,
|
|
1090
|
+
save_output: Optional[Union[str, Path]] = None,
|
|
1091
|
+
max_polls: int = 300,
|
|
1092
|
+
poll_interval: int = 1,
|
|
1093
|
+
) -> FormFillingResult:
|
|
1094
|
+
"""
|
|
1095
|
+
Fill PDF or image forms with provided field data (sync version)
|
|
1096
|
+
|
|
1097
|
+
Args:
|
|
1098
|
+
file_path: Path to the file to fill
|
|
1099
|
+
file_url: URL of the file to fill
|
|
1100
|
+
options: Form filling options (must include field_data)
|
|
1101
|
+
save_output: Optional path to save output files
|
|
1102
|
+
max_polls: Maximum number of polling attempts
|
|
1103
|
+
poll_interval: Seconds between polling attempts
|
|
1104
|
+
"""
|
|
1105
|
+
return self._run_async(
|
|
1106
|
+
self._async_client.fill(
|
|
1107
|
+
file_path=file_path,
|
|
1108
|
+
file_url=file_url,
|
|
1109
|
+
options=options,
|
|
1110
|
+
save_output=save_output,
|
|
1111
|
+
max_polls=max_polls,
|
|
1112
|
+
poll_interval=poll_interval,
|
|
1113
|
+
)
|
|
1114
|
+
)
|
|
1115
|
+
|
|
1019
1116
|
# Workflow methods (sync)
|
|
1020
1117
|
def create_workflow(
|
|
1021
1118
|
self,
|
|
@@ -38,14 +38,9 @@ class ConvertOptions(ProcessingOptions):
|
|
|
38
38
|
"""Options for marker conversion"""
|
|
39
39
|
|
|
40
40
|
# Marker specific options
|
|
41
|
-
force_ocr: bool = False
|
|
42
|
-
format_lines: bool = False
|
|
43
41
|
paginate: bool = False
|
|
44
|
-
use_llm: bool = False
|
|
45
|
-
strip_existing_ocr: bool = False
|
|
46
42
|
disable_image_extraction: bool = False
|
|
47
|
-
|
|
48
|
-
block_correction_prompt: Optional[str] = None
|
|
43
|
+
disable_image_captions: bool = False
|
|
49
44
|
additional_config: Optional[Dict[str, Any]] = None
|
|
50
45
|
page_schema: Optional[Dict[str, Any]] = None
|
|
51
46
|
segmentation_schema: Optional[str] = None # JSON string for document segmentation
|
|
@@ -54,7 +49,7 @@ class ConvertOptions(ProcessingOptions):
|
|
|
54
49
|
None # Comma-separated list: 'track_changes', 'chart_understanding'
|
|
55
50
|
)
|
|
56
51
|
output_format: str = "markdown" # markdown, json, html, chunks
|
|
57
|
-
mode: str = "
|
|
52
|
+
mode: str = "balanced" # fast, balanced, accurate
|
|
58
53
|
keep_spreadsheet_formatting: bool = False
|
|
59
54
|
webhook_url: Optional[str] = None
|
|
60
55
|
extras: Optional[str] = None # comma-separated extras
|
|
@@ -85,6 +80,32 @@ class OCROptions(ProcessingOptions):
|
|
|
85
80
|
pass
|
|
86
81
|
|
|
87
82
|
|
|
83
|
+
@dataclass
|
|
84
|
+
class FormFillingOptions(ProcessingOptions):
|
|
85
|
+
"""Options for form filling"""
|
|
86
|
+
|
|
87
|
+
field_data: Dict[str, Dict[str, str]] = field(default_factory=dict)
|
|
88
|
+
context: Optional[str] = None # Optional context to guide form filling
|
|
89
|
+
confidence_threshold: float = 0.5 # Minimum confidence for field matching (0.0-1.0)
|
|
90
|
+
|
|
91
|
+
def to_form_data(self) -> Dict[str, Any]:
|
|
92
|
+
"""Convert to form data format for API requests"""
|
|
93
|
+
# Start with parent's form data
|
|
94
|
+
form_data = super().to_form_data()
|
|
95
|
+
|
|
96
|
+
# field_data must be JSON string
|
|
97
|
+
form_data["field_data"] = (None, json.dumps(self.field_data))
|
|
98
|
+
|
|
99
|
+
# Add context if provided
|
|
100
|
+
if self.context is not None:
|
|
101
|
+
form_data["context"] = (None, self.context)
|
|
102
|
+
|
|
103
|
+
# Add confidence_threshold
|
|
104
|
+
form_data["confidence_threshold"] = (None, str(self.confidence_threshold))
|
|
105
|
+
|
|
106
|
+
return form_data
|
|
107
|
+
|
|
108
|
+
|
|
88
109
|
@dataclass
|
|
89
110
|
class ConversionResult:
|
|
90
111
|
"""Result from document conversion (marker endpoint)"""
|
|
@@ -345,3 +366,63 @@ class OCRResult:
|
|
|
345
366
|
f,
|
|
346
367
|
indent=2,
|
|
347
368
|
)
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
@dataclass
|
|
372
|
+
class FormFillingResult:
|
|
373
|
+
"""Result from form filling"""
|
|
374
|
+
|
|
375
|
+
status: str
|
|
376
|
+
success: Optional[bool] = None
|
|
377
|
+
error: Optional[str] = None
|
|
378
|
+
output_format: Optional[str] = None # "pdf" or "png"
|
|
379
|
+
output_base64: Optional[str] = None # Base64-encoded filled form
|
|
380
|
+
fields_filled: Optional[List[str]] = (
|
|
381
|
+
None # List of field keys that were successfully filled
|
|
382
|
+
)
|
|
383
|
+
fields_not_found: Optional[List[str]] = (
|
|
384
|
+
None # List of field keys that couldn't be matched
|
|
385
|
+
)
|
|
386
|
+
runtime: Optional[float] = None
|
|
387
|
+
page_count: Optional[int] = None
|
|
388
|
+
cost_breakdown: Optional[Dict[str, Any]] = None
|
|
389
|
+
versions: Optional[Union[Dict[str, Any], str]] = None
|
|
390
|
+
|
|
391
|
+
def save_output(self, output_path: Union[str, Path]) -> None:
|
|
392
|
+
"""Save the filled form to a file"""
|
|
393
|
+
output_path = Path(output_path)
|
|
394
|
+
|
|
395
|
+
if not self.output_base64:
|
|
396
|
+
raise ValueError("No output data available to save")
|
|
397
|
+
|
|
398
|
+
# Determine file extension based on output_format
|
|
399
|
+
if self.output_format == "png":
|
|
400
|
+
output_path = output_path.with_suffix(".png")
|
|
401
|
+
elif self.output_format == "pdf":
|
|
402
|
+
output_path = output_path.with_suffix(".pdf")
|
|
403
|
+
else:
|
|
404
|
+
# Default to PDF if format is unknown
|
|
405
|
+
output_path = output_path.with_suffix(".pdf")
|
|
406
|
+
|
|
407
|
+
# Decode and save base64 data
|
|
408
|
+
with open(output_path, "wb") as f:
|
|
409
|
+
f.write(base64.b64decode(self.output_base64))
|
|
410
|
+
|
|
411
|
+
# Save metadata if available
|
|
412
|
+
metadata = {
|
|
413
|
+
"status": self.status,
|
|
414
|
+
"success": self.success,
|
|
415
|
+
"error": self.error,
|
|
416
|
+
"output_format": self.output_format,
|
|
417
|
+
"fields_filled": self.fields_filled,
|
|
418
|
+
"fields_not_found": self.fields_not_found,
|
|
419
|
+
"runtime": self.runtime,
|
|
420
|
+
"page_count": self.page_count,
|
|
421
|
+
"cost_breakdown": self.cost_breakdown,
|
|
422
|
+
"versions": self.versions,
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
with open(
|
|
426
|
+
output_path.with_suffix(".metadata.json"), "w", encoding="utf-8"
|
|
427
|
+
) as f:
|
|
428
|
+
json.dump(metadata, f, indent=2)
|
|
@@ -98,9 +98,7 @@ class TestAPIMethodExamples:
|
|
|
98
98
|
|
|
99
99
|
# With options
|
|
100
100
|
options = ConvertOptions(
|
|
101
|
-
force_ocr=True,
|
|
102
101
|
output_format="html",
|
|
103
|
-
use_llm=False, # Keep false for cost reasons
|
|
104
102
|
max_pages=1,
|
|
105
103
|
)
|
|
106
104
|
result = client.convert(DATA_DIR / "adversarial.pdf", options=options)
|
|
@@ -294,9 +292,7 @@ class TestProcessingOptionsVariations:
|
|
|
294
292
|
from datalab_sdk import ConvertOptions
|
|
295
293
|
|
|
296
294
|
options = ConvertOptions()
|
|
297
|
-
assert options.force_ocr is False
|
|
298
295
|
assert options.output_format == "markdown"
|
|
299
|
-
assert options.use_llm is False
|
|
300
296
|
assert options.max_pages is None
|
|
301
297
|
|
|
302
298
|
def test_processing_options_custom_values(self):
|
|
@@ -304,9 +300,7 @@ class TestProcessingOptionsVariations:
|
|
|
304
300
|
from datalab_sdk import ConvertOptions
|
|
305
301
|
|
|
306
302
|
options = ConvertOptions(
|
|
307
|
-
force_ocr=True,
|
|
308
303
|
output_format="html",
|
|
309
|
-
use_llm=False, # Keep false for cost reasons
|
|
310
304
|
max_pages=1,
|
|
311
305
|
)
|
|
312
306
|
|
|
@@ -7,7 +7,7 @@ readme = "README.md"
|
|
|
7
7
|
license = "MIT"
|
|
8
8
|
repository = "https://github.com/datalab-to/sdk"
|
|
9
9
|
keywords = ["datalab", "sdk", "document-intelligence", "api"]
|
|
10
|
-
version = "0.1.15"
|
|
10
|
+
version = "0.2.0"
|
|
11
11
|
description = "SDK for the Datalab document intelligence API"
|
|
12
12
|
requires-python = ">=3.10"
|
|
13
13
|
dependencies = [
|
|
@@ -29,7 +29,7 @@ from datalab_sdk import DatalabClient, WorkflowStep
|
|
|
29
29
|
|
|
30
30
|
def load_workflow_definition(definition_path: str) -> dict:
|
|
31
31
|
"""Load workflow definition from JSON file"""
|
|
32
|
-
with open(definition_path, 'r') as f:
|
|
32
|
+
with open(definition_path, "r") as f:
|
|
33
33
|
return json.load(f)
|
|
34
34
|
|
|
35
35
|
|
|
@@ -40,15 +40,12 @@ def create_workflow_from_definition(client: DatalabClient, workflow_def: dict):
|
|
|
40
40
|
step_key=step["step_key"],
|
|
41
41
|
unique_name=step["unique_name"],
|
|
42
42
|
settings=step["settings"],
|
|
43
|
-
depends_on=step.get("depends_on", [])
|
|
43
|
+
depends_on=step.get("depends_on", []),
|
|
44
44
|
)
|
|
45
45
|
for step in workflow_def["steps"]
|
|
46
46
|
]
|
|
47
47
|
|
|
48
|
-
return client.create_workflow(
|
|
49
|
-
name=workflow_def["name"],
|
|
50
|
-
steps=steps
|
|
51
|
-
)
|
|
48
|
+
return client.create_workflow(name=workflow_def["name"], steps=steps)
|
|
52
49
|
|
|
53
50
|
|
|
54
51
|
def create_simple_workflow(client: DatalabClient):
|
|
@@ -60,9 +57,8 @@ def create_simple_workflow(client: DatalabClient):
|
|
|
60
57
|
settings={
|
|
61
58
|
"max_pages": 10,
|
|
62
59
|
"output_format": "json",
|
|
63
|
-
"force_ocr": False
|
|
64
60
|
},
|
|
65
|
-
depends_on=[]
|
|
61
|
+
depends_on=[],
|
|
66
62
|
),
|
|
67
63
|
WorkflowStep(
|
|
68
64
|
step_key="marker_extract",
|
|
@@ -72,16 +68,15 @@ def create_simple_workflow(client: DatalabClient):
|
|
|
72
68
|
"title": {"type": "string"},
|
|
73
69
|
"author": {"type": "string"},
|
|
74
70
|
"date": {"type": "string"},
|
|
75
|
-
"summary": {"type": "string"}
|
|
71
|
+
"summary": {"type": "string"},
|
|
76
72
|
}
|
|
77
73
|
},
|
|
78
|
-
depends_on=["parse_document"]
|
|
79
|
-
)
|
|
74
|
+
depends_on=["parse_document"],
|
|
75
|
+
),
|
|
80
76
|
]
|
|
81
77
|
|
|
82
78
|
return client.create_workflow(
|
|
83
|
-
name="Document Parser with Metadata Extraction",
|
|
84
|
-
steps=steps
|
|
79
|
+
name="Document Parser with Metadata Extraction", steps=steps
|
|
85
80
|
)
|
|
86
81
|
|
|
87
82
|
|
|
@@ -89,16 +84,13 @@ def main():
|
|
|
89
84
|
parser = argparse.ArgumentParser(
|
|
90
85
|
description="Create a workflow from JSON definition or hardcoded example"
|
|
91
86
|
)
|
|
92
|
-
parser.add_argument(
|
|
93
|
-
"--definition",
|
|
94
|
-
help="Path to workflow definition JSON file"
|
|
95
|
-
)
|
|
87
|
+
parser.add_argument("--definition", help="Path to workflow definition JSON file")
|
|
96
88
|
parser.add_argument(
|
|
97
89
|
"--replace",
|
|
98
90
|
action="append",
|
|
99
91
|
nargs=2,
|
|
100
92
|
metavar=("TOKEN", "VALUE"),
|
|
101
|
-
help="Replace tokens in definition (e.g., --replace YOUR_API_KEY abc123)"
|
|
93
|
+
help="Replace tokens in definition (e.g., --replace YOUR_API_KEY abc123)",
|
|
102
94
|
)
|
|
103
95
|
args = parser.parse_args()
|
|
104
96
|
|
|
@@ -132,7 +124,7 @@ def main():
|
|
|
132
124
|
|
|
133
125
|
# Display results
|
|
134
126
|
print("✅ Workflow created successfully!\n")
|
|
135
|
-
print(
|
|
127
|
+
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
|
|
136
128
|
print(f"ID: {workflow.id}")
|
|
137
129
|
print(f"Name: {workflow.name}")
|
|
138
130
|
print(f"Team ID: {workflow.team_id}")
|
|
@@ -148,13 +140,13 @@ def main():
|
|
|
148
140
|
print(f" Depends on: {', '.join(step.depends_on)}")
|
|
149
141
|
print()
|
|
150
142
|
|
|
151
|
-
print(
|
|
152
|
-
print(
|
|
153
|
-
print(
|
|
143
|
+
print("💡 Next steps:")
|
|
144
|
+
print(" 1. Execute this workflow:")
|
|
145
|
+
print(" python recipes/workflows/tutorial/execute_workflow.py \\")
|
|
154
146
|
print(f" --workflow_id {workflow.id} \\")
|
|
155
|
-
print(
|
|
147
|
+
print(" --file_url https://example.com/doc.pdf")
|
|
156
148
|
print()
|
|
157
|
-
print(
|
|
149
|
+
print(" 2. Or use the CLI:")
|
|
158
150
|
print(f" datalab execute-workflow --workflow_id {workflow.id} \\")
|
|
159
151
|
json_example = '{"file_urls": ["https://example.com/doc.pdf"]}'
|
|
160
152
|
print(f" --input_config '{json_example}'")
|
|
@@ -143,9 +143,7 @@ async def mock_async_client(mock_server):
|
|
|
143
143
|
@pytest.fixture
|
|
144
144
|
def processing_options():
|
|
145
145
|
"""Create sample processing options"""
|
|
146
|
-
return ConvertOptions(
|
|
147
|
-
force_ocr=True, output_format="markdown", use_llm=False, max_pages=10
|
|
148
|
-
)
|
|
146
|
+
return ConvertOptions(output_format="markdown", max_pages=10)
|
|
149
147
|
|
|
150
148
|
|
|
151
149
|
@pytest.fixture
|
|
@@ -104,94 +104,3 @@ class TestConvertCommand:
|
|
|
104
104
|
|
|
105
105
|
finally:
|
|
106
106
|
os.unlink(tmp_file.name)
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
class TestOCRCommand:
|
|
110
|
-
"""Test the OCR command"""
|
|
111
|
-
|
|
112
|
-
@patch("datalab_sdk.cli.asyncio.run")
|
|
113
|
-
def test_ocr_successful_single_file(self, mock_client_class):
|
|
114
|
-
"""Test successful OCR of a single file"""
|
|
115
|
-
|
|
116
|
-
mock_client_class.return_value = ASYNC_RETURN_VALUE
|
|
117
|
-
|
|
118
|
-
runner = CliRunner()
|
|
119
|
-
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
|
|
120
|
-
try:
|
|
121
|
-
result = runner.invoke(
|
|
122
|
-
cli,
|
|
123
|
-
[
|
|
124
|
-
"ocr",
|
|
125
|
-
tmp_file.name,
|
|
126
|
-
"--api_key",
|
|
127
|
-
"test-key",
|
|
128
|
-
"--output_dir",
|
|
129
|
-
"/tmp/output",
|
|
130
|
-
],
|
|
131
|
-
)
|
|
132
|
-
|
|
133
|
-
assert result.exit_code == 0
|
|
134
|
-
assert "Successfully processed: 2 files" in result.output
|
|
135
|
-
|
|
136
|
-
# Verify client was called correctly
|
|
137
|
-
mock_client_class.assert_called_once()
|
|
138
|
-
|
|
139
|
-
finally:
|
|
140
|
-
os.unlink(tmp_file.name)
|
|
141
|
-
|
|
142
|
-
@patch("datalab_sdk.cli.asyncio.run")
|
|
143
|
-
def test_ocr_with_max_pages(self, mock_asyncio_run):
|
|
144
|
-
"""Test OCR command with max_pages option"""
|
|
145
|
-
# Mock the client
|
|
146
|
-
mock_asyncio_run.return_value = ASYNC_RETURN_VALUE
|
|
147
|
-
|
|
148
|
-
runner = CliRunner()
|
|
149
|
-
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
|
|
150
|
-
try:
|
|
151
|
-
result = runner.invoke(
|
|
152
|
-
cli,
|
|
153
|
-
[
|
|
154
|
-
"ocr",
|
|
155
|
-
tmp_file.name,
|
|
156
|
-
"--api_key",
|
|
157
|
-
"test-key",
|
|
158
|
-
"--output_dir",
|
|
159
|
-
"/tmp/output",
|
|
160
|
-
"--max_pages",
|
|
161
|
-
"5",
|
|
162
|
-
],
|
|
163
|
-
)
|
|
164
|
-
|
|
165
|
-
assert result.exit_code == 0
|
|
166
|
-
assert "Successfully processed: 2 files" in result.output
|
|
167
|
-
|
|
168
|
-
finally:
|
|
169
|
-
os.unlink(tmp_file.name)
|
|
170
|
-
|
|
171
|
-
@patch("datalab_sdk.cli.asyncio.run")
|
|
172
|
-
def test_ocr_multiple_files(self, mock_asyncio_run):
|
|
173
|
-
"""Test OCR of multiple files"""
|
|
174
|
-
# Mock async processing results
|
|
175
|
-
mock_asyncio_run.return_value = ASYNC_RETURN_VALUE
|
|
176
|
-
|
|
177
|
-
runner = CliRunner()
|
|
178
|
-
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
179
|
-
with open(os.path.join(tmp_dir, "test1.pdf"), "w") as f:
|
|
180
|
-
f.write("Dummy content for test1.pdf")
|
|
181
|
-
with open(os.path.join(tmp_dir, "test2.pdf"), "w") as f:
|
|
182
|
-
f.write("Dummy content for test2.pdf")
|
|
183
|
-
result = runner.invoke(
|
|
184
|
-
cli,
|
|
185
|
-
[
|
|
186
|
-
"ocr",
|
|
187
|
-
tmp_dir,
|
|
188
|
-
"--api_key",
|
|
189
|
-
"test-key",
|
|
190
|
-
"--output_dir",
|
|
191
|
-
"/tmp/output",
|
|
192
|
-
],
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
assert result.exit_code == 0
|
|
196
|
-
assert "OCR Summary:" in result.output
|
|
197
|
-
assert "Successfully processed: 2 files" in result.output
|
|
@@ -123,7 +123,9 @@ class TestConvertMethod:
|
|
|
123
123
|
)
|
|
124
124
|
|
|
125
125
|
assert (output_path.with_suffix(".chunks.json")).exists()
|
|
126
|
-
saved_chunks = json.loads((output_path.with_suffix(".chunks.json")).read_text())
|
|
126
|
+
saved_chunks = json.loads(
|
|
127
|
+
(output_path.with_suffix(".chunks.json")).read_text()
|
|
128
|
+
)
|
|
127
129
|
assert saved_chunks == {"some_content": True}
|
|
128
130
|
|
|
129
131
|
def test_convert_sync_with_processing_options(self, temp_dir):
|
|
@@ -133,9 +135,7 @@ class TestConvertMethod:
|
|
|
133
135
|
pdf_file.write_bytes(b"%PDF-1.4\n%Test PDF content\n%%EOF\n")
|
|
134
136
|
|
|
135
137
|
# Create processing options
|
|
136
|
-
options = ConvertOptions(
|
|
137
|
-
force_ocr=True, output_format="html", use_llm=True, max_pages=5
|
|
138
|
-
)
|
|
138
|
+
options = ConvertOptions(output_format="html", max_pages=5)
|
|
139
139
|
|
|
140
140
|
# Mock API responses
|
|
141
141
|
mock_initial_response = {
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/end_to_end_workflow.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|