datalab-python-sdk 0.1.6__tar.gz → 0.1.8__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (38)
  1. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/PKG-INFO +1 -1
  2. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/cli.py +25 -3
  3. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/client.py +1 -0
  4. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/models.py +6 -1
  5. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/pyproject.toml +1 -1
  6. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/test_client_methods.py +6 -1
  7. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/uv.lock +2 -2
  8. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.github/workflows/ci.yml +0 -0
  9. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.github/workflows/publish.yml +0 -0
  10. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.gitignore +0 -0
  11. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.pre-commit-config.yaml +0 -0
  12. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/.python-version +0 -0
  13. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/LICENSE +0 -0
  14. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/README.md +0 -0
  15. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/08-Lambda-Calculus.pptx +0 -0
  16. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/adversarial.pdf +0 -0
  17. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/bid_evaluation.docx +0 -0
  18. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/book_review.ppt +0 -0
  19. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/book_store.xls +0 -0
  20. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/chi_hind.png +0 -0
  21. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/how_to_read.doc +0 -0
  22. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/normandy.epub +0 -0
  23. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/sample-1-sheet.xlsx +0 -0
  24. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/thinkpython.pdf +0 -0
  25. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/data/vibe.html +0 -0
  26. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/__init__.py +0 -0
  27. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/exceptions.py +0 -0
  28. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/mimetypes.py +0 -0
  29. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/datalab_sdk/settings.py +0 -0
  30. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/README.md +0 -0
  31. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/__init__.py +0 -0
  32. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/test_live_api.py +0 -0
  33. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/integration/test_readme_examples.py +0 -0
  34. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/poetry.lock +0 -0
  35. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/pytest.ini +0 -0
  36. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/__init__.py +0 -0
  37. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/conftest.py +0 -0
  38. {datalab_python_sdk-0.1.6 → datalab_python_sdk-0.1.8}/tests/test_cli_simple.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datalab-python-sdk
3
- Version: 0.1.6
3
+ Version: 0.1.8
4
4
  Summary: SDK for the Datalab document intelligence API
5
5
  Author-email: Datalab Team <hi@datalab.to>
6
6
  License-Expression: MIT
@@ -42,6 +42,12 @@ def common_options(func):
42
42
  func = click.option("--skip_cache", help="Skip the cache when running inference")(
43
43
  func
44
44
  )
45
+ func = click.option(
46
+ "--max_polls", default=300, type=int, help="Maximum number of polling attempts"
47
+ )(func)
48
+ func = click.option(
49
+ "--poll_interval", default=1, type=int, help="Polling interval in seconds"
50
+ )(func)
45
51
  return func
46
52
 
47
53
 
@@ -51,7 +57,7 @@ def marker_options(func):
51
57
  "--format",
52
58
  "output_format",
53
59
  default="markdown",
54
- type=click.Choice(["markdown", "html", "json"]),
60
+ type=click.Choice(["markdown", "html", "json", "chunks"]),
55
61
  help="Output format",
56
62
  )(func)
57
63
  func = click.option("--force_ocr", is_flag=True, help="Force OCR on every page")(
@@ -106,6 +112,8 @@ async def process_files_async(
106
112
  max_concurrent: int = 5,
107
113
  api_key: str | None = None,
108
114
  base_url: str | None = None,
115
+ max_polls: int = 300,
116
+ poll_interval: int = 1,
109
117
  ) -> List[dict]:
110
118
  """Process files asynchronously"""
111
119
  semaphore = asyncio.Semaphore(max_concurrent)
@@ -124,11 +132,13 @@ async def process_files_async(
124
132
  ) as client:
125
133
  if method == "convert":
126
134
  result = await client.convert(
127
- file_path, options=options, save_output=output_path
135
+ file_path, options=options, save_output=output_path,
136
+ max_polls=max_polls, poll_interval=poll_interval
128
137
  )
129
138
  else: # method == 'ocr'
130
139
  result = await client.ocr(
131
- file_path, options=options, save_output=output_path
140
+ file_path, options=options, save_output=output_path,
141
+ max_polls=max_polls, poll_interval=poll_interval
132
142
  )
133
143
 
134
144
  return {
@@ -218,6 +228,8 @@ def process_documents(
218
228
  base_url: str,
219
229
  page_range: Optional[str],
220
230
  skip_cache: bool,
231
+ max_polls: int,
232
+ poll_interval: int,
221
233
  # Convert-specific options
222
234
  output_format: Optional[str] = None,
223
235
  force_ocr: bool = False,
@@ -288,6 +300,8 @@ def process_documents(
288
300
  max_concurrent=max_concurrent,
289
301
  api_key=api_key,
290
302
  base_url=base_url,
303
+ max_polls=max_polls,
304
+ poll_interval=poll_interval,
291
305
  )
292
306
  )
293
307
 
@@ -320,6 +334,8 @@ def convert(
320
334
  base_url: str,
321
335
  page_range: Optional[str],
322
336
  skip_cache: bool,
337
+ max_polls: int,
338
+ poll_interval: int,
323
339
  output_format: str,
324
340
  force_ocr: bool,
325
341
  format_lines: bool,
@@ -342,6 +358,8 @@ def convert(
342
358
  base_url=base_url,
343
359
  page_range=page_range,
344
360
  skip_cache=skip_cache,
361
+ max_polls=max_polls,
362
+ poll_interval=poll_interval,
345
363
  output_format=output_format,
346
364
  force_ocr=force_ocr,
347
365
  format_lines=format_lines,
@@ -367,6 +385,8 @@ def ocr(
367
385
  base_url: str,
368
386
  page_range: Optional[str],
369
387
  skip_cache: bool,
388
+ max_polls: int,
389
+ poll_interval: int,
370
390
  ):
371
391
  """Perform OCR on documents"""
372
392
  process_documents(
@@ -380,6 +400,8 @@ def ocr(
380
400
  base_url=base_url,
381
401
  page_range=page_range,
382
402
  skip_cache=skip_cache,
403
+ max_polls=max_polls,
404
+ poll_interval=poll_interval,
383
405
  )
384
406
 
385
407
 
@@ -249,6 +249,7 @@ class AsyncDatalabClient:
249
249
  markdown=result_data.get("markdown"),
250
250
  html=result_data.get("html"),
251
251
  json=result_data.get("json"),
252
+ chunks=result_data.get("chunks"),
252
253
  extraction_schema_json=result_data.get("extraction_schema_json"),
253
254
  images=result_data.get("images"),
254
255
  metadata=result_data.get("metadata"),
@@ -13,7 +13,7 @@ import base64
13
13
  class ProcessingOptions:
14
14
  # Common options
15
15
  max_pages: Optional[int] = None
16
- skip_cache: bool = True
16
+ skip_cache: bool = False
17
17
  page_range: Optional[str] = None
18
18
 
19
19
  def to_form_data(self) -> Dict[str, Any]:
@@ -65,6 +65,7 @@ class ConversionResult:
65
65
  markdown: Optional[str] = None
66
66
  html: Optional[str] = None
67
67
  json: Optional[Dict[str, Any]] = None
68
+ chunks: Optional[Dict[str, Any]] = None
68
69
  extraction_schema_json: Optional[str] = None
69
70
  images: Optional[Dict[str, str]] = None
70
71
  metadata: Optional[Dict[str, Any]] = None
@@ -91,6 +92,10 @@ class ConversionResult:
91
92
  with open(output_path.with_suffix(".json"), "w", encoding="utf-8") as f:
92
93
  json.dump(self.json, f, indent=2)
93
94
 
95
+ if self.chunks:
96
+ with open(output_path.with_suffix(".chunks.json"), "w", encoding="utf-8") as f:
97
+ json.dump(self.chunks, f, indent=2)
98
+
94
99
  if self.extraction_schema_json:
95
100
  with open(
96
101
  output_path.with_suffix("_extraction_results.json"),
@@ -7,7 +7,7 @@ readme = "README.md"
7
7
  license = "MIT"
8
8
  repository = "https://github.com/datalab-to/sdk"
9
9
  keywords = ["datalab", "sdk", "document-intelligence", "api"]
10
- version = "0.1.6"
10
+ version = "0.1.8"
11
11
  description = "SDK for the Datalab document intelligence API"
12
12
  requires-python = ">=3.10"
13
13
  dependencies = [
@@ -89,6 +89,7 @@ class TestConvertMethod:
89
89
  "markdown": "# Test Document\n\nThis is a test document.",
90
90
  "html": None,
91
91
  "json": None,
92
+ "chunks": {"some_content": True},
92
93
  "images": {},
93
94
  "metadata": {"pages": 1},
94
95
  "error": "",
@@ -114,13 +115,17 @@ class TestConvertMethod:
114
115
  # Verify result
115
116
  assert result.success is True
116
117
 
117
- # Verify file was saved
118
+ # Verify Markdown file was saved
118
119
  assert (output_path.with_suffix(".md")).exists()
119
120
  saved_content = (output_path.with_suffix(".md")).read_text()
120
121
  assert (
121
122
  saved_content == "# Test Document\n\nThis is a test document."
122
123
  )
123
124
 
125
+ assert (output_path.with_suffix(".chunks.json")).exists()
126
+ saved_chunks = json.loads((output_path.with_suffix(".chunks.json")).read_text())
127
+ assert saved_chunks == {"some_content": True}
128
+
124
129
  def test_convert_sync_with_processing_options(self, temp_dir):
125
130
  """Test synchronous conversion with processing options"""
126
131
  # Create test file
@@ -1,5 +1,5 @@
1
1
  version = 1
2
- revision = 2
2
+ revision = 3
3
3
  requires-python = ">=3.10"
4
4
 
5
5
  [[package]]
@@ -169,7 +169,7 @@ wheels = [
169
169
 
170
170
  [[package]]
171
171
  name = "datalab-python-sdk"
172
- version = "0.1.6"
172
+ version = "0.1.8"
173
173
  source = { editable = "." }
174
174
  dependencies = [
175
175
  { name = "aiohttp" },