datalab-python-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. datalab_python_sdk-0.1.0/.github/workflows/ci.yml +19 -0
  2. datalab_python_sdk-0.1.0/.github/workflows/publish.yml +39 -0
  3. datalab_python_sdk-0.1.0/.gitignore +219 -0
  4. datalab_python_sdk-0.1.0/.pre-commit-config.yaml +10 -0
  5. datalab_python_sdk-0.1.0/.python-version +1 -0
  6. datalab_python_sdk-0.1.0/LICENSE +18 -0
  7. datalab_python_sdk-0.1.0/PKG-INFO +17 -0
  8. datalab_python_sdk-0.1.0/README.md +174 -0
  9. datalab_python_sdk-0.1.0/data/08-Lambda-Calculus.pptx +0 -0
  10. datalab_python_sdk-0.1.0/data/adversarial.pdf +16700 -3
  11. datalab_python_sdk-0.1.0/data/bid_evaluation.docx +0 -0
  12. datalab_python_sdk-0.1.0/data/book_review.ppt +0 -0
  13. datalab_python_sdk-0.1.0/data/book_store.xls +0 -0
  14. datalab_python_sdk-0.1.0/data/chi_hind.png +0 -0
  15. datalab_python_sdk-0.1.0/data/how_to_read.doc +0 -0
  16. datalab_python_sdk-0.1.0/data/normandy.epub +0 -0
  17. datalab_python_sdk-0.1.0/data/sample-1-sheet.xlsx +0 -0
  18. datalab_python_sdk-0.1.0/data/thinkpython.pdf +0 -0
  19. datalab_python_sdk-0.1.0/data/vibe.html +918 -0
  20. datalab_python_sdk-0.1.0/datalab_sdk/__init__.py +26 -0
  21. datalab_python_sdk-0.1.0/datalab_sdk/cli.py +416 -0
  22. datalab_python_sdk-0.1.0/datalab_sdk/client.py +301 -0
  23. datalab_python_sdk-0.1.0/datalab_sdk/exceptions.py +38 -0
  24. datalab_python_sdk-0.1.0/datalab_sdk/mimetypes.py +18 -0
  25. datalab_python_sdk-0.1.0/datalab_sdk/models.py +155 -0
  26. datalab_python_sdk-0.1.0/datalab_sdk/settings.py +15 -0
  27. datalab_python_sdk-0.1.0/integration/README.md +71 -0
  28. datalab_python_sdk-0.1.0/integration/__init__.py +1 -0
  29. datalab_python_sdk-0.1.0/integration/test_live_api.py +265 -0
  30. datalab_python_sdk-0.1.0/integration/test_readme_examples.py +330 -0
  31. datalab_python_sdk-0.1.0/poetry.lock +1201 -0
  32. datalab_python_sdk-0.1.0/pyproject.toml +39 -0
  33. datalab_python_sdk-0.1.0/pytest.ini +4 -0
  34. datalab_python_sdk-0.1.0/tests/__init__.py +0 -0
  35. datalab_python_sdk-0.1.0/tests/conftest.py +157 -0
  36. datalab_python_sdk-0.1.0/tests/test_cli_simple.py +251 -0
  37. datalab_python_sdk-0.1.0/tests/test_client_methods.py +414 -0
  38. datalab_python_sdk-0.1.0/uv.lock +1126 -0
@@ -0,0 +1,19 @@
1
+ name: Unit tests
2
+
3
+ on: [push]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v3
10
+ - name: Set up Python 3.13
11
+ uses: actions/setup-python@v4
12
+ with:
13
+ python-version: 3.13
14
+ - name: Install python dependencies
15
+ run: |
16
+ pip install uv
17
+ uv sync
18
+ - name: Run tests
19
+ run: uv run pytest tests/
@@ -0,0 +1,39 @@
1
+ name: Publish python package
2
+ on:
3
+ push:
4
+ tags:
5
+ - "v*.*.*"
6
+ jobs:
7
+ build:
8
+ runs-on: ubuntu-latest
9
+ steps:
10
+ - uses: actions/checkout@v3
11
+ - name: Set up Python 3.11
12
+ uses: actions/setup-python@v4
13
+ with:
14
+ python-version: 3.11
15
+ - name: Install python dependencies
16
+ run: |
17
+ pip install uv
18
+ uv sync --group dev
19
+ - name: Build package
20
+ run: |
21
+ uv build
22
+ - name: Extract version from pyproject.toml
23
+ id: version
24
+ run: |
25
+ VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['project']['version'])")
26
+ echo "version=v$VERSION" >> $GITHUB_OUTPUT
27
+
28
+ - name: Validate tag matches version
29
+ run: |
30
+ if [ "${{ github.ref_name }}" != "${{ steps.version.outputs.version }}" ]; then
31
+ echo "Tag ${{ github.ref_name }} doesn't match pyproject.toml version ${{ steps.version.outputs.version }}"
32
+ exit 1
33
+ fi
34
+ - name: Publish package
35
+ env:
36
+ PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
37
+ run: |
38
+ # package is built at the repo root (pyproject.toml is top-level); no cd needed
39
+ uv publish --token "$PYPI_TOKEN"
@@ -0,0 +1,219 @@
1
+ arxiv_papers/
2
+ .env
3
+ examples/*
4
+ examples_all/
5
+ python/examples
6
+ e2e.sh
7
+ TODO.md
8
+ .vscode/
9
+
10
+ # Byte-compiled / optimized / DLL files
11
+ __pycache__/
12
+ *.py[codz]
13
+ *$py.class
14
+
15
+ # C extensions
16
+ *.so
17
+
18
+ # Distribution / packaging
19
+ .Python
20
+ build/
21
+ develop-eggs/
22
+ dist/
23
+ downloads/
24
+ eggs/
25
+ .eggs/
26
+ lib/
27
+ lib64/
28
+ parts/
29
+ sdist/
30
+ var/
31
+ wheels/
32
+ share/python-wheels/
33
+ *.egg-info/
34
+ .installed.cfg
35
+ *.egg
36
+ MANIFEST
37
+
38
+ # PyInstaller
39
+ # Usually these files are written by a python script from a template
40
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
41
+ *.manifest
42
+ *.spec
43
+
44
+ # Installer logs
45
+ pip-log.txt
46
+ pip-delete-this-directory.txt
47
+
48
+ # Unit test / coverage reports
49
+ htmlcov/
50
+ .tox/
51
+ .nox/
52
+ .coverage
53
+ .coverage.*
54
+ .cache
55
+ nosetests.xml
56
+ coverage.xml
57
+ *.cover
58
+ *.py.cover
59
+ .hypothesis/
60
+ .pytest_cache/
61
+ cover/
62
+
63
+ # Translations
64
+ *.mo
65
+ *.pot
66
+
67
+ # Django stuff:
68
+ *.log
69
+ local_settings.py
70
+ db.sqlite3
71
+ db.sqlite3-journal
72
+
73
+ # Flask stuff:
74
+ instance/
75
+ .webassets-cache
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ .pybuilder/
85
+ target/
86
+
87
+ # Jupyter Notebook
88
+ .ipynb_checkpoints
89
+
90
+ # IPython
91
+ profile_default/
92
+ ipython_config.py
93
+
94
+ # pyenv
95
+ # For a library or package, you might want to ignore these files since the code is
96
+ # intended to run in multiple environments; otherwise, check them in:
97
+ # .python-version
98
+
99
+ # pipenv
100
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
102
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
103
+ # install all needed dependencies.
104
+ #Pipfile.lock
105
+
106
+ # UV
107
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ #uv.lock
111
+
112
+ # poetry
113
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
114
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
115
+ # commonly ignored for libraries.
116
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
117
+ #poetry.lock
118
+ #poetry.toml
119
+
120
+ # pdm
121
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
122
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
123
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
124
+ #pdm.lock
125
+ #pdm.toml
126
+ .pdm-python
127
+ .pdm-build/
128
+
129
+ # pixi
130
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
131
+ #pixi.lock
132
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
133
+ # in the .venv directory. It is recommended not to include this directory in version control.
134
+ .pixi
135
+
136
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
137
+ __pypackages__/
138
+
139
+ # Celery stuff
140
+ celerybeat-schedule
141
+ celerybeat.pid
142
+
143
+ # SageMath parsed files
144
+ *.sage.py
145
+
146
+ # Environments
147
+ .env
148
+ .envrc
149
+ .venv
150
+ env/
151
+ venv/
152
+ ENV/
153
+ env.bak/
154
+ venv.bak/
155
+
156
+ # Spyder project settings
157
+ .spyderproject
158
+ .spyproject
159
+
160
+ # Rope project settings
161
+ .ropeproject
162
+
163
+ # mkdocs documentation
164
+ /site
165
+
166
+ # mypy
167
+ .mypy_cache/
168
+ .dmypy.json
169
+ dmypy.json
170
+
171
+ # Pyre type checker
172
+ .pyre/
173
+
174
+ # pytype static type analyzer
175
+ .pytype/
176
+
177
+ # Cython debug symbols
178
+ cython_debug/
179
+
180
+ # PyCharm
181
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
182
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
183
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
184
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
185
+ .idea/
186
+
187
+ # Abstra
188
+ # Abstra is an AI-powered process automation framework.
189
+ # Ignore directories containing user credentials, local state, and settings.
190
+ # Learn more at https://abstra.io/docs
191
+ .abstra/
192
+
193
+ # Visual Studio Code
194
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
195
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
196
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
197
+ # you could uncomment the following to ignore the entire vscode folder
198
+ # .vscode/
199
+
200
+ # Ruff stuff:
201
+ .ruff_cache/
202
+
203
+ # PyPI configuration file
204
+ .pypirc
205
+
206
+ # Cursor
207
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
208
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
209
+ # refer to https://docs.cursor.com/context/ignore-files
210
+ .cursorignore
211
+ .cursorindexingignore
212
+
213
+ # Marimo
214
+ marimo/_static/
215
+ marimo/_lsp/
216
+ __marimo__/
217
+
218
+ # Streamlit
219
+ .streamlit/secrets.toml
@@ -0,0 +1,10 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ # Ruff version.
4
+ rev: v0.12.3
5
+ hooks:
6
+ # Run the linter.
7
+ - id: ruff-check
8
+ args: [ --fix ]
9
+ # Run the formatter.
10
+ - id: ruff-format
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,18 @@
1
+ Copyright 2025 Endless Labs Inc
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the “Software”), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7
+ the Software, and to permit persons to whom the Software is furnished to do so,
8
+ subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: datalab-python-sdk
3
+ Version: 0.1.0
4
+ Summary: Auto-generated SDK for Datalab API
5
+ License-File: LICENSE
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: aiohttp>=3.12.14
8
+ Requires-Dist: click>=8.2.1
9
+ Requires-Dist: pydantic-settings<3.0.0,>=2.10.1
10
+ Requires-Dist: pydantic<3.0.0,>=2.11.7
11
+ Requires-Dist: pytest-asyncio>=1.0.0
12
+ Provides-Extra: test
13
+ Requires-Dist: aiofiles>=23.2.0; extra == 'test'
14
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == 'test'
15
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'test'
16
+ Requires-Dist: pytest-mock>=3.11.0; extra == 'test'
17
+ Requires-Dist: pytest>=7.4.0; extra == 'test'
@@ -0,0 +1,174 @@
1
+ # Datalab SDK
2
+
3
+ A Python SDK for the [Datalab API](https://www.datalab.to) - a document intelligence platform powered by [marker](https://github.com/VikParuchuri/marker) and [surya](https://github.com/VikParuchuri/surya).
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install datalab-sdk
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ### Authentication
14
+
15
+ Get your API key from [https://www.datalab.to/app/keys](https://www.datalab.to/app/keys):
16
+
17
+ ```bash
18
+ export DATALAB_API_KEY="your_api_key_here"
19
+ ```
20
+
21
+ ### Basic Usage
22
+
23
+ ```python
24
+ from datalab_sdk import DatalabClient
25
+
26
+ client = DatalabClient() # use env var from above, or pass api_key="your_api_key_here"
27
+
28
+ # Convert PDF to markdown
29
+ result = client.convert("document.pdf")
30
+ print(result.markdown)
31
+
32
+ # OCR a document
33
+ ocr_result = client.ocr("document.pdf")
34
+ print(ocr_result.get_text()) # Get all text as string
35
+ ```
36
+
37
+ ### Async Usage
38
+
39
+ ```python
40
+ import asyncio
41
+ from datalab_sdk import AsyncDatalabClient
42
+
43
+ async def main():
44
+ async with AsyncDatalabClient(api_key="YOUR_API_KEY") as client:
45
+ # Convert PDF to markdown
46
+ result = await client.convert("document.pdf")
47
+ print(result.markdown)
48
+
49
+ # OCR a document
50
+ ocr_result = await client.ocr("document.pdf")
51
+ print(f"OCR found {len(ocr_result.pages)} pages")
52
+
53
+ asyncio.run(main())
54
+ ```
55
+
56
+ ## API Methods
57
+
58
+ ### Document Conversion
59
+
60
+ Convert PDFs, Office documents, and images to markdown, HTML, or JSON.
61
+
62
+ ```python
63
+ # Basic conversion
64
+ result = client.convert("document.pdf")
65
+
66
+ # With options
67
+ from datalab_sdk import ProcessingOptions
68
+ options = ProcessingOptions(
69
+ force_ocr=True,
70
+ output_format="html",
71
+ use_llm=True,
72
+ max_pages=10
73
+ )
74
+ result = client.convert("document.pdf", options=options)
75
+
76
+ # Convert and save automatically
77
+ result = client.convert("document.pdf", save_output="output/result")
78
+ ```
79
+
80
+ ### OCR
81
+
82
+ Extract text with bounding boxes from documents.
83
+
84
+ ```python
85
+ # Basic OCR
86
+ result = client.ocr("document.pdf")
87
+ print(result.get_text())
88
+
89
+ # OCR with options
90
+ result = client.ocr("document.pdf", max_pages=5)
91
+
92
+ # OCR and save automatically
93
+ result = client.ocr("document.pdf", save_output="output/ocr_result")
94
+ ```
95
+
96
+ ## CLI Usage
97
+
98
+ The SDK includes a command-line interface:
99
+
100
+ ```bash
101
+ # Convert document to markdown
102
+ datalab convert document.pdf
103
+
104
+ # OCR with JSON output
105
+ datalab ocr document.pdf --output-format json
106
+ ```
107
+
108
+ ## Error Handling
109
+
110
+ ```python
111
+ from datalab_sdk import DatalabAPIError, DatalabTimeoutError
112
+
113
+ try:
114
+ result = client.convert("document.pdf")
115
+ except DatalabAPIError as e:
116
+ print(f"API Error: {e}")
117
+ except DatalabTimeoutError as e:
118
+ print(f"Timeout: {e}")
119
+ ```
120
+
121
+ ## Supported File Types
122
+
123
+ - **PDF**: `pdf`
124
+ - **Images**: `png`, `jpeg`, `webp`, `gif`, `tiff`
125
+ - **Office Documents**: `docx`, `xlsx`, `pptx`, `doc`, `xls`, `ppt`
126
+ - **Other**: `html`, `epub`, `odt`, `ods`, `odp`
127
+
128
+ ## Rate Limits
129
+
130
+ - 200 requests per 60 seconds
131
+ - Maximum 200 concurrent requests
132
+ - 200MB file size limit
133
+
134
+ Email hi@datalab.to for higher limits.
135
+
136
+ ## Examples
137
+
138
+ ### Extract JSON Data
139
+
140
+ ```python
141
+ from datalab_sdk import DatalabClient, ProcessingOptions
142
+
143
+ client = DatalabClient(api_key="YOUR_API_KEY")
144
+ options = ProcessingOptions(output_format="json")
145
+ result = client.convert("research_paper.pdf", options=options)
146
+
147
+ # Parse JSON to find equations
148
+ import json
149
+ data = json.loads(result.json)
150
+ equations = [block for block in data if block.get('btype') == 'Formula']
151
+ print(f"Found {len(equations)} equations")
152
+ ```
153
+
154
+ ### Batch Process Documents
155
+
156
+ ```python
157
+ import asyncio
158
+ from pathlib import Path
159
+ from datalab_sdk import AsyncDatalabClient
160
+
161
+ async def process_documents():
162
+ files = list(Path("documents/").glob("*.pdf"))
163
+
164
+ async with AsyncDatalabClient(api_key="YOUR_API_KEY") as client:
165
+ for file in files[:5]:
166
+ result = await client.convert(str(file), save_output=f"output/{file.stem}")
167
+ print(f"{file.name}: {result.page_count} pages")
168
+
169
+ asyncio.run(process_documents())
170
+ ```
171
+
172
+ ## License
173
+
174
+ MIT License