datalab-python-sdk 0.1.15__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/PKG-INFO +1 -5
  2. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/README.md +0 -4
  3. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/__init__.py +4 -0
  4. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/cli.py +17 -69
  5. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/client.py +97 -0
  6. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/models.py +88 -7
  7. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/integration/test_readme_examples.py +0 -6
  8. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/pyproject.toml +1 -1
  9. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/3_create_workflow.py +16 -24
  10. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/tests/conftest.py +1 -3
  11. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/tests/test_cli_simple.py +0 -91
  12. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/tests/test_client_methods.py +4 -4
  13. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/uv.lock +1 -1
  14. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/.github/workflows/ci.yml +0 -0
  15. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/.github/workflows/publish.yml +0 -0
  16. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/.gitignore +0 -0
  17. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/.pre-commit-config.yaml +0 -0
  18. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/.python-version +0 -0
  19. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/LICENSE +0 -0
  20. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/08-Lambda-Calculus.pptx +0 -0
  21. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/adversarial.pdf +0 -0
  22. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/bid_evaluation.docx +0 -0
  23. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/book_review.ppt +0 -0
  24. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/book_store.xls +0 -0
  25. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/chi_hind.png +0 -0
  26. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/how_to_read.doc +0 -0
  27. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/normandy.epub +0 -0
  28. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/sample-1-sheet.xlsx +0 -0
  29. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/thinkpython.pdf +0 -0
  30. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/data/vibe.html +0 -0
  31. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/exceptions.py +0 -0
  32. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/mimetypes.py +0 -0
  33. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/datalab_sdk/settings.py +0 -0
  34. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/integration/README.md +0 -0
  35. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/integration/__init__.py +0 -0
  36. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/integration/test_live_api.py +0 -0
  37. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/poetry.lock +0 -0
  38. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/pytest.ini +0 -0
  39. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/README.md +0 -0
  40. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/end_to_end_workflow.py +0 -0
  41. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/1_get_step_types.py +0 -0
  42. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/2_get_workflows.py +0 -0
  43. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/4_execute_workflow.py +0 -0
  44. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/5_poll_workflow.py +0 -0
  45. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_api_tutorial/README.md +0 -0
  46. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_definitions/README.md +0 -0
  47. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_definitions/eval_segmentation.json +0 -0
  48. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_definitions/parse_segment.json +0 -0
  49. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_definitions/segment_parallel_extract.json +0 -0
  50. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/recipes/workflows/workflow_definitions/slack_alert.json +0 -0
  51. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/tests/__init__.py +0 -0
  52. {datalab_python_sdk-0.1.15 → datalab_python_sdk-0.2.0}/tests/test_workflows.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datalab-python-sdk
3
- Version: 0.1.15
3
+ Version: 0.2.0
4
4
  Summary: SDK for the Datalab document intelligence API
5
5
  Author-email: Datalab Team <hi@datalab.to>
6
6
  License-Expression: MIT
@@ -46,10 +46,6 @@ client = DatalabClient() # use env var from above, or pass api_key="your_api_key
46
46
  # Convert PDF to markdown
47
47
  result = client.convert("document.pdf")
48
48
  print(result.markdown)
49
-
50
- # OCR a document
51
- ocr_result = client.ocr("document.pdf")
52
- print(ocr_result.pages) # Get all text as string
53
49
  ```
54
50
 
55
51
  ## Workflows
@@ -30,10 +30,6 @@ client = DatalabClient() # use env var from above, or pass api_key="your_api_key
30
30
  # Convert PDF to markdown
31
31
  result = client.convert("document.pdf")
32
32
  print(result.markdown)
33
-
34
- # OCR a document
35
- ocr_result = client.ocr("document.pdf")
36
- print(ocr_result.pages) # Get all text as string
37
33
  ```
38
34
 
39
35
  ## Workflows
@@ -12,6 +12,8 @@ from .models import (
12
12
  OCRResult,
13
13
  ConvertOptions,
14
14
  OCROptions,
15
+ FormFillingOptions,
16
+ FormFillingResult,
15
17
  Workflow,
16
18
  WorkflowStep,
17
19
  WorkflowExecution,
@@ -31,6 +33,8 @@ __all__ = [
31
33
  "OCRResult",
32
34
  "ConvertOptions",
33
35
  "OCROptions",
36
+ "FormFillingOptions",
37
+ "FormFillingResult",
34
38
  "Workflow",
35
39
  "WorkflowStep",
36
40
  "WorkflowExecution",
@@ -67,28 +67,16 @@ def marker_options(func):
67
67
  type=click.Choice(["markdown", "html", "json", "chunks"]),
68
68
  help="Output format",
69
69
  )(func)
70
- func = click.option("--force_ocr", is_flag=True, help="Force OCR on every page")(
71
- func
72
- )
73
- func = click.option(
74
- "--format_lines", is_flag=True, help="Partially OCR lines for better formatting"
75
- )(func)
76
70
  func = click.option(
77
71
  "--paginate", is_flag=True, help="Add page delimiters to output"
78
72
  )(func)
79
- func = click.option("--use_llm", is_flag=True, help="Use LLM to enhance accuracy")(
80
- func
81
- )
82
- func = click.option(
83
- "--strip_existing_ocr",
84
- is_flag=True,
85
- help="Remove existing OCR text and redo OCR",
86
- )(func)
87
73
  func = click.option(
88
74
  "--disable_image_extraction", is_flag=True, help="Disable extraction of images"
89
75
  )(func)
90
76
  func = click.option(
91
- "--block_correction_prompt", help="Custom prompt for block correction"
77
+ "--disable_image_captions",
78
+ is_flag=True,
79
+ help="Disable synthetic image captions/descriptions in output",
92
80
  )(func)
93
81
  func = click.option(
94
82
  "--page_schema", help="Schema to set to do structured extraction"
@@ -96,6 +84,12 @@ def marker_options(func):
96
84
  func = click.option(
97
85
  "--add_block_ids", is_flag=True, help="Add block IDs to HTML output"
98
86
  )(func)
87
+ func = click.option(
88
+ "--mode",
89
+ type=click.Choice(["fast", "balanced", "accurate"]),
90
+ default="balanced",
91
+ help="OCR mode",
92
+ )(func)
99
93
  return func
100
94
 
101
95
 
@@ -248,15 +242,12 @@ def process_documents(
248
242
  poll_interval: int,
249
243
  # Convert-specific options
250
244
  output_format: Optional[str] = None,
251
- force_ocr: bool = False,
252
- format_lines: bool = False,
253
245
  paginate: bool = False,
254
- use_llm: bool = False,
255
- strip_existing_ocr: bool = False,
256
246
  disable_image_extraction: bool = False,
257
- block_correction_prompt: Optional[str] = None,
247
+ disable_image_captions: bool = False,
258
248
  page_schema: Optional[str] = None,
259
249
  add_block_ids: bool = False,
250
+ mode: str = "balanced",
260
251
  ):
261
252
  """Unified document processing function"""
262
253
  try:
@@ -290,17 +281,14 @@ def process_documents(
290
281
  options = ConvertOptions(
291
282
  output_format=output_format,
292
283
  max_pages=max_pages,
293
- force_ocr=force_ocr,
294
- format_lines=format_lines,
295
284
  paginate=paginate,
296
- use_llm=use_llm,
297
- strip_existing_ocr=strip_existing_ocr,
298
285
  disable_image_extraction=disable_image_extraction,
286
+ disable_image_captions=disable_image_captions,
299
287
  page_range=page_range,
300
- block_correction_prompt=block_correction_prompt,
301
288
  skip_cache=skip_cache,
302
289
  page_schema=page_schema,
303
290
  add_block_ids=add_block_ids,
291
+ mode=mode,
304
292
  )
305
293
  else: # method == "ocr"
306
294
  options = OCROptions(
@@ -355,15 +343,12 @@ def convert(
355
343
  max_polls: int,
356
344
  poll_interval: int,
357
345
  output_format: str,
358
- force_ocr: bool,
359
- format_lines: bool,
360
346
  paginate: bool,
361
- use_llm: bool,
362
- strip_existing_ocr: bool,
363
347
  disable_image_extraction: bool,
364
- block_correction_prompt: Optional[str],
348
+ disable_image_captions: bool,
365
349
  page_schema: Optional[str],
366
350
  add_block_ids: bool,
351
+ mode: str,
367
352
  ):
368
353
  """Convert documents to markdown, HTML, or JSON"""
369
354
  process_documents(
@@ -380,48 +365,12 @@ def convert(
380
365
  max_polls=max_polls,
381
366
  poll_interval=poll_interval,
382
367
  output_format=output_format,
383
- force_ocr=force_ocr,
384
- format_lines=format_lines,
385
368
  paginate=paginate,
386
- use_llm=use_llm,
387
- strip_existing_ocr=strip_existing_ocr,
388
369
  disable_image_extraction=disable_image_extraction,
389
- block_correction_prompt=block_correction_prompt,
370
+ disable_image_captions=disable_image_captions,
390
371
  page_schema=page_schema,
391
372
  add_block_ids=add_block_ids,
392
- )
393
-
394
-
395
- @click.command()
396
- @click.argument("path", type=click.Path(exists=True))
397
- @common_options
398
- def ocr(
399
- path: str,
400
- api_key: str,
401
- output_dir: str,
402
- max_pages: Optional[int],
403
- extensions: Optional[str],
404
- max_concurrent: int,
405
- base_url: str,
406
- page_range: Optional[str],
407
- skip_cache: bool,
408
- max_polls: int,
409
- poll_interval: int,
410
- ):
411
- """Perform OCR on documents"""
412
- process_documents(
413
- path=path,
414
- method="ocr",
415
- api_key=api_key,
416
- output_dir=output_dir,
417
- max_pages=max_pages,
418
- extensions=extensions,
419
- max_concurrent=max_concurrent,
420
- base_url=base_url,
421
- page_range=page_range,
422
- skip_cache=skip_cache,
423
- max_polls=max_polls,
424
- poll_interval=poll_interval,
373
+ mode=mode,
425
374
  )
426
375
 
427
376
 
@@ -905,7 +854,6 @@ def _render_dag_simple(layers, children, step_map):
905
854
 
906
855
  # Add commands to CLI group
907
856
  cli.add_command(convert)
908
- cli.add_command(ocr)
909
857
  cli.add_command(create_workflow)
910
858
  cli.add_command(get_workflow)
911
859
  cli.add_command(get_step_types)
@@ -27,6 +27,8 @@ from datalab_sdk.models import (
27
27
  ProcessingOptions,
28
28
  ConvertOptions,
29
29
  OCROptions,
30
+ FormFillingOptions,
31
+ FormFillingResult,
30
32
  Workflow,
31
33
  WorkflowStep,
32
34
  WorkflowExecution,
@@ -345,6 +347,70 @@ class AsyncDatalabClient:
345
347
 
346
348
  return result
347
349
 
350
+ async def fill(
351
+ self,
352
+ file_path: Optional[Union[str, Path]] = None,
353
+ file_url: Optional[str] = None,
354
+ options: Optional[FormFillingOptions] = None,
355
+ save_output: Optional[Union[str, Path]] = None,
356
+ max_polls: int = 300,
357
+ poll_interval: int = 1,
358
+ ) -> FormFillingResult:
359
+ """
360
+ Fill PDF or image forms with provided field data
361
+
362
+ Args:
363
+ file_path: Path to the file to fill
364
+ file_url: URL of the file to fill
365
+ options: Form filling options (must include field_data)
366
+ save_output: Optional path to save output files
367
+ max_polls: Maximum number of polling attempts
368
+ poll_interval: Seconds between polling attempts
369
+ """
370
+ if options is None:
371
+ raise ValueError("options must be provided with field_data")
372
+
373
+ initial_data = await self._make_request(
374
+ "POST",
375
+ "/api/v1/fill",
376
+ data=self.get_form_params(
377
+ file_path=file_path, file_url=file_url, options=options
378
+ ),
379
+ )
380
+
381
+ if not initial_data.get("success"):
382
+ raise DatalabAPIError(
383
+ f"Request failed: {initial_data.get('error', 'Unknown error')}"
384
+ )
385
+
386
+ result_data = await self._poll_result(
387
+ initial_data["request_check_url"],
388
+ max_polls=max_polls,
389
+ poll_interval=poll_interval,
390
+ )
391
+
392
+ result = FormFillingResult(
393
+ status=result_data.get("status", "complete"),
394
+ success=result_data.get("success"),
395
+ error=result_data.get("error"),
396
+ output_format=result_data.get("output_format"),
397
+ output_base64=result_data.get("output_base64"),
398
+ fields_filled=result_data.get("fields_filled"),
399
+ fields_not_found=result_data.get("fields_not_found"),
400
+ runtime=result_data.get("runtime"),
401
+ page_count=result_data.get("page_count"),
402
+ cost_breakdown=result_data.get("cost_breakdown"),
403
+ versions=result_data.get("versions"),
404
+ )
405
+
406
+ # Save output if requested
407
+ if save_output and result.success and result.output_base64:
408
+ output_path = Path(save_output)
409
+ output_path.parent.mkdir(parents=True, exist_ok=True)
410
+ result.save_output(output_path)
411
+
412
+ return result
413
+
348
414
  # Workflow methods
349
415
  async def create_workflow(
350
416
  self,
@@ -1016,6 +1082,37 @@ class DatalabClient:
1016
1082
  )
1017
1083
  )
1018
1084
 
1085
+ def fill(
1086
+ self,
1087
+ file_path: Optional[Union[str, Path]] = None,
1088
+ file_url: Optional[str] = None,
1089
+ options: Optional[FormFillingOptions] = None,
1090
+ save_output: Optional[Union[str, Path]] = None,
1091
+ max_polls: int = 300,
1092
+ poll_interval: int = 1,
1093
+ ) -> FormFillingResult:
1094
+ """
1095
+ Fill PDF or image forms with provided field data (sync version)
1096
+
1097
+ Args:
1098
+ file_path: Path to the file to fill
1099
+ file_url: URL of the file to fill
1100
+ options: Form filling options (must include field_data)
1101
+ save_output: Optional path to save output files
1102
+ max_polls: Maximum number of polling attempts
1103
+ poll_interval: Seconds between polling attempts
1104
+ """
1105
+ return self._run_async(
1106
+ self._async_client.fill(
1107
+ file_path=file_path,
1108
+ file_url=file_url,
1109
+ options=options,
1110
+ save_output=save_output,
1111
+ max_polls=max_polls,
1112
+ poll_interval=poll_interval,
1113
+ )
1114
+ )
1115
+
1019
1116
  # Workflow methods (sync)
1020
1117
  def create_workflow(
1021
1118
  self,
@@ -38,14 +38,9 @@ class ConvertOptions(ProcessingOptions):
38
38
  """Options for marker conversion"""
39
39
 
40
40
  # Marker specific options
41
- force_ocr: bool = False
42
- format_lines: bool = False
43
41
  paginate: bool = False
44
- use_llm: bool = False
45
- strip_existing_ocr: bool = False
46
42
  disable_image_extraction: bool = False
47
- disable_ocr_math: bool = False
48
- block_correction_prompt: Optional[str] = None
43
+ disable_image_captions: bool = False
49
44
  additional_config: Optional[Dict[str, Any]] = None
50
45
  page_schema: Optional[Dict[str, Any]] = None
51
46
  segmentation_schema: Optional[str] = None # JSON string for document segmentation
@@ -54,7 +49,7 @@ class ConvertOptions(ProcessingOptions):
54
49
  None # Comma-separated list: 'track_changes', 'chart_understanding'
55
50
  )
56
51
  output_format: str = "markdown" # markdown, json, html, chunks
57
- mode: str = "fast" # fast, balanced, accurate
52
+ mode: str = "balanced" # fast, balanced, accurate
58
53
  keep_spreadsheet_formatting: bool = False
59
54
  webhook_url: Optional[str] = None
60
55
  extras: Optional[str] = None # comma-separated extras
@@ -85,6 +80,32 @@ class OCROptions(ProcessingOptions):
85
80
  pass
86
81
 
87
82
 
83
+ @dataclass
84
+ class FormFillingOptions(ProcessingOptions):
85
+ """Options for form filling"""
86
+
87
+ field_data: Dict[str, Dict[str, str]] = field(default_factory=dict)
88
+ context: Optional[str] = None # Optional context to guide form filling
89
+ confidence_threshold: float = 0.5 # Minimum confidence for field matching (0.0-1.0)
90
+
91
+ def to_form_data(self) -> Dict[str, Any]:
92
+ """Convert to form data format for API requests"""
93
+ # Start with parent's form data
94
+ form_data = super().to_form_data()
95
+
96
+ # field_data must be JSON string
97
+ form_data["field_data"] = (None, json.dumps(self.field_data))
98
+
99
+ # Add context if provided
100
+ if self.context is not None:
101
+ form_data["context"] = (None, self.context)
102
+
103
+ # Add confidence_threshold
104
+ form_data["confidence_threshold"] = (None, str(self.confidence_threshold))
105
+
106
+ return form_data
107
+
108
+
88
109
  @dataclass
89
110
  class ConversionResult:
90
111
  """Result from document conversion (marker endpoint)"""
@@ -345,3 +366,63 @@ class OCRResult:
345
366
  f,
346
367
  indent=2,
347
368
  )
369
+
370
+
371
+ @dataclass
372
+ class FormFillingResult:
373
+ """Result from form filling"""
374
+
375
+ status: str
376
+ success: Optional[bool] = None
377
+ error: Optional[str] = None
378
+ output_format: Optional[str] = None # "pdf" or "png"
379
+ output_base64: Optional[str] = None # Base64-encoded filled form
380
+ fields_filled: Optional[List[str]] = (
381
+ None # List of field keys that were successfully filled
382
+ )
383
+ fields_not_found: Optional[List[str]] = (
384
+ None # List of field keys that couldn't be matched
385
+ )
386
+ runtime: Optional[float] = None
387
+ page_count: Optional[int] = None
388
+ cost_breakdown: Optional[Dict[str, Any]] = None
389
+ versions: Optional[Union[Dict[str, Any], str]] = None
390
+
391
+ def save_output(self, output_path: Union[str, Path]) -> None:
392
+ """Save the filled form to a file"""
393
+ output_path = Path(output_path)
394
+
395
+ if not self.output_base64:
396
+ raise ValueError("No output data available to save")
397
+
398
+ # Determine file extension based on output_format
399
+ if self.output_format == "png":
400
+ output_path = output_path.with_suffix(".png")
401
+ elif self.output_format == "pdf":
402
+ output_path = output_path.with_suffix(".pdf")
403
+ else:
404
+ # Default to PDF if format is unknown
405
+ output_path = output_path.with_suffix(".pdf")
406
+
407
+ # Decode and save base64 data
408
+ with open(output_path, "wb") as f:
409
+ f.write(base64.b64decode(self.output_base64))
410
+
411
+ # Save metadata if available
412
+ metadata = {
413
+ "status": self.status,
414
+ "success": self.success,
415
+ "error": self.error,
416
+ "output_format": self.output_format,
417
+ "fields_filled": self.fields_filled,
418
+ "fields_not_found": self.fields_not_found,
419
+ "runtime": self.runtime,
420
+ "page_count": self.page_count,
421
+ "cost_breakdown": self.cost_breakdown,
422
+ "versions": self.versions,
423
+ }
424
+
425
+ with open(
426
+ output_path.with_suffix(".metadata.json"), "w", encoding="utf-8"
427
+ ) as f:
428
+ json.dump(metadata, f, indent=2)
@@ -98,9 +98,7 @@ class TestAPIMethodExamples:
98
98
 
99
99
  # With options
100
100
  options = ConvertOptions(
101
- force_ocr=True,
102
101
  output_format="html",
103
- use_llm=False, # Keep false for cost reasons
104
102
  max_pages=1,
105
103
  )
106
104
  result = client.convert(DATA_DIR / "adversarial.pdf", options=options)
@@ -294,9 +292,7 @@ class TestProcessingOptionsVariations:
294
292
  from datalab_sdk import ConvertOptions
295
293
 
296
294
  options = ConvertOptions()
297
- assert options.force_ocr is False
298
295
  assert options.output_format == "markdown"
299
- assert options.use_llm is False
300
296
  assert options.max_pages is None
301
297
 
302
298
  def test_processing_options_custom_values(self):
@@ -304,9 +300,7 @@ class TestProcessingOptionsVariations:
304
300
  from datalab_sdk import ConvertOptions
305
301
 
306
302
  options = ConvertOptions(
307
- force_ocr=True,
308
303
  output_format="html",
309
- use_llm=False, # Keep false for cost reasons
310
304
  max_pages=1,
311
305
  )
312
306
 
@@ -7,7 +7,7 @@ readme = "README.md"
7
7
  license = "MIT"
8
8
  repository = "https://github.com/datalab-to/sdk"
9
9
  keywords = ["datalab", "sdk", "document-intelligence", "api"]
10
- version = "0.1.15"
10
+ version = "0.2.0"
11
11
  description = "SDK for the Datalab document intelligence API"
12
12
  requires-python = ">=3.10"
13
13
  dependencies = [
@@ -29,7 +29,7 @@ from datalab_sdk import DatalabClient, WorkflowStep
29
29
 
30
30
  def load_workflow_definition(definition_path: str) -> dict:
31
31
  """Load workflow definition from JSON file"""
32
- with open(definition_path, 'r') as f:
32
+ with open(definition_path, "r") as f:
33
33
  return json.load(f)
34
34
 
35
35
 
@@ -40,15 +40,12 @@ def create_workflow_from_definition(client: DatalabClient, workflow_def: dict):
40
40
  step_key=step["step_key"],
41
41
  unique_name=step["unique_name"],
42
42
  settings=step["settings"],
43
- depends_on=step.get("depends_on", [])
43
+ depends_on=step.get("depends_on", []),
44
44
  )
45
45
  for step in workflow_def["steps"]
46
46
  ]
47
47
 
48
- return client.create_workflow(
49
- name=workflow_def["name"],
50
- steps=steps
51
- )
48
+ return client.create_workflow(name=workflow_def["name"], steps=steps)
52
49
 
53
50
 
54
51
  def create_simple_workflow(client: DatalabClient):
@@ -60,9 +57,8 @@ def create_simple_workflow(client: DatalabClient):
60
57
  settings={
61
58
  "max_pages": 10,
62
59
  "output_format": "json",
63
- "force_ocr": False
64
60
  },
65
- depends_on=[]
61
+ depends_on=[],
66
62
  ),
67
63
  WorkflowStep(
68
64
  step_key="marker_extract",
@@ -72,16 +68,15 @@ def create_simple_workflow(client: DatalabClient):
72
68
  "title": {"type": "string"},
73
69
  "author": {"type": "string"},
74
70
  "date": {"type": "string"},
75
- "summary": {"type": "string"}
71
+ "summary": {"type": "string"},
76
72
  }
77
73
  },
78
- depends_on=["parse_document"]
79
- )
74
+ depends_on=["parse_document"],
75
+ ),
80
76
  ]
81
77
 
82
78
  return client.create_workflow(
83
- name="Document Parser with Metadata Extraction",
84
- steps=steps
79
+ name="Document Parser with Metadata Extraction", steps=steps
85
80
  )
86
81
 
87
82
 
@@ -89,16 +84,13 @@ def main():
89
84
  parser = argparse.ArgumentParser(
90
85
  description="Create a workflow from JSON definition or hardcoded example"
91
86
  )
92
- parser.add_argument(
93
- "--definition",
94
- help="Path to workflow definition JSON file"
95
- )
87
+ parser.add_argument("--definition", help="Path to workflow definition JSON file")
96
88
  parser.add_argument(
97
89
  "--replace",
98
90
  action="append",
99
91
  nargs=2,
100
92
  metavar=("TOKEN", "VALUE"),
101
- help="Replace tokens in definition (e.g., --replace YOUR_API_KEY abc123)"
93
+ help="Replace tokens in definition (e.g., --replace YOUR_API_KEY abc123)",
102
94
  )
103
95
  args = parser.parse_args()
104
96
 
@@ -132,7 +124,7 @@ def main():
132
124
 
133
125
  # Display results
134
126
  print("✅ Workflow created successfully!\n")
135
- print(f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
127
+ print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
136
128
  print(f"ID: {workflow.id}")
137
129
  print(f"Name: {workflow.name}")
138
130
  print(f"Team ID: {workflow.team_id}")
@@ -148,13 +140,13 @@ def main():
148
140
  print(f" Depends on: {', '.join(step.depends_on)}")
149
141
  print()
150
142
 
151
- print(f"💡 Next steps:")
152
- print(f" 1. Execute this workflow:")
153
- print(f" python recipes/workflows/tutorial/execute_workflow.py \\")
143
+ print("💡 Next steps:")
144
+ print(" 1. Execute this workflow:")
145
+ print(" python recipes/workflows/tutorial/execute_workflow.py \\")
154
146
  print(f" --workflow_id {workflow.id} \\")
155
- print(f" --file_url https://example.com/doc.pdf")
147
+ print(" --file_url https://example.com/doc.pdf")
156
148
  print()
157
- print(f" 2. Or use the CLI:")
149
+ print(" 2. Or use the CLI:")
158
150
  print(f" datalab execute-workflow --workflow_id {workflow.id} \\")
159
151
  json_example = '{"file_urls": ["https://example.com/doc.pdf"]}'
160
152
  print(f" --input_config '{json_example}'")
@@ -143,9 +143,7 @@ async def mock_async_client(mock_server):
143
143
  @pytest.fixture
144
144
  def processing_options():
145
145
  """Create sample processing options"""
146
- return ConvertOptions(
147
- force_ocr=True, output_format="markdown", use_llm=False, max_pages=10
148
- )
146
+ return ConvertOptions(output_format="markdown", max_pages=10)
149
147
 
150
148
 
151
149
  @pytest.fixture
@@ -104,94 +104,3 @@ class TestConvertCommand:
104
104
 
105
105
  finally:
106
106
  os.unlink(tmp_file.name)
107
-
108
-
109
- class TestOCRCommand:
110
- """Test the OCR command"""
111
-
112
- @patch("datalab_sdk.cli.asyncio.run")
113
- def test_ocr_successful_single_file(self, mock_client_class):
114
- """Test successful OCR of a single file"""
115
-
116
- mock_client_class.return_value = ASYNC_RETURN_VALUE
117
-
118
- runner = CliRunner()
119
- with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
120
- try:
121
- result = runner.invoke(
122
- cli,
123
- [
124
- "ocr",
125
- tmp_file.name,
126
- "--api_key",
127
- "test-key",
128
- "--output_dir",
129
- "/tmp/output",
130
- ],
131
- )
132
-
133
- assert result.exit_code == 0
134
- assert "Successfully processed: 2 files" in result.output
135
-
136
- # Verify client was called correctly
137
- mock_client_class.assert_called_once()
138
-
139
- finally:
140
- os.unlink(tmp_file.name)
141
-
142
- @patch("datalab_sdk.cli.asyncio.run")
143
- def test_ocr_with_max_pages(self, mock_asyncio_run):
144
- """Test OCR command with max_pages option"""
145
- # Mock the client
146
- mock_asyncio_run.return_value = ASYNC_RETURN_VALUE
147
-
148
- runner = CliRunner()
149
- with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
150
- try:
151
- result = runner.invoke(
152
- cli,
153
- [
154
- "ocr",
155
- tmp_file.name,
156
- "--api_key",
157
- "test-key",
158
- "--output_dir",
159
- "/tmp/output",
160
- "--max_pages",
161
- "5",
162
- ],
163
- )
164
-
165
- assert result.exit_code == 0
166
- assert "Successfully processed: 2 files" in result.output
167
-
168
- finally:
169
- os.unlink(tmp_file.name)
170
-
171
- @patch("datalab_sdk.cli.asyncio.run")
172
- def test_ocr_multiple_files(self, mock_asyncio_run):
173
- """Test OCR of multiple files"""
174
- # Mock async processing results
175
- mock_asyncio_run.return_value = ASYNC_RETURN_VALUE
176
-
177
- runner = CliRunner()
178
- with tempfile.TemporaryDirectory() as tmp_dir:
179
- with open(os.path.join(tmp_dir, "test1.pdf"), "w") as f:
180
- f.write("Dummy content for test1.pdf")
181
- with open(os.path.join(tmp_dir, "test2.pdf"), "w") as f:
182
- f.write("Dummy content for test2.pdf")
183
- result = runner.invoke(
184
- cli,
185
- [
186
- "ocr",
187
- tmp_dir,
188
- "--api_key",
189
- "test-key",
190
- "--output_dir",
191
- "/tmp/output",
192
- ],
193
- )
194
-
195
- assert result.exit_code == 0
196
- assert "OCR Summary:" in result.output
197
- assert "Successfully processed: 2 files" in result.output
@@ -123,7 +123,9 @@ class TestConvertMethod:
123
123
  )
124
124
 
125
125
  assert (output_path.with_suffix(".chunks.json")).exists()
126
- saved_chunks = json.loads((output_path.with_suffix(".chunks.json")).read_text())
126
+ saved_chunks = json.loads(
127
+ (output_path.with_suffix(".chunks.json")).read_text()
128
+ )
127
129
  assert saved_chunks == {"some_content": True}
128
130
 
129
131
  def test_convert_sync_with_processing_options(self, temp_dir):
@@ -133,9 +135,7 @@ class TestConvertMethod:
133
135
  pdf_file.write_bytes(b"%PDF-1.4\n%Test PDF content\n%%EOF\n")
134
136
 
135
137
  # Create processing options
136
- options = ConvertOptions(
137
- force_ocr=True, output_format="html", use_llm=True, max_pages=5
138
- )
138
+ options = ConvertOptions(output_format="html", max_pages=5)
139
139
 
140
140
  # Mock API responses
141
141
  mock_initial_response = {
@@ -212,7 +212,7 @@ wheels = [
212
212
 
213
213
  [[package]]
214
214
  name = "datalab-python-sdk"
215
- version = "0.1.15"
215
+ version = "0.2.0"
216
216
  source = { editable = "." }
217
217
  dependencies = [
218
218
  { name = "aiohttp" },