byteit 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. byteit-0.1.2/CHANGELOG.md +45 -0
  2. {byteit-0.1.0 → byteit-0.1.2}/MANIFEST.in +1 -0
  3. byteit-0.1.2/PKG-INFO +275 -0
  4. byteit-0.1.2/README.md +237 -0
  5. {byteit-0.1.0 → byteit-0.1.2}/byteit/ByteITClient.py +56 -28
  6. {byteit-0.1.0 → byteit-0.1.2}/byteit/__init__.py +11 -1
  7. byteit-0.1.2/byteit/progress.py +167 -0
  8. byteit-0.1.2/byteit.egg-info/PKG-INFO +275 -0
  9. {byteit-0.1.0 → byteit-0.1.2}/byteit.egg-info/SOURCES.txt +4 -1
  10. {byteit-0.1.0 → byteit-0.1.2}/byteit.egg-info/requires.txt +1 -0
  11. {byteit-0.1.0 → byteit-0.1.2}/pyproject.toml +5 -3
  12. {byteit-0.1.0 → byteit-0.1.2}/tests/test_client.py +134 -5
  13. byteit-0.1.2/tests/test_progress.py +133 -0
  14. byteit-0.1.0/PKG-INFO +0 -424
  15. byteit-0.1.0/README.md +0 -387
  16. byteit-0.1.0/byteit.egg-info/PKG-INFO +0 -424
  17. {byteit-0.1.0 → byteit-0.1.2}/LICENSE +0 -0
  18. {byteit-0.1.0 → byteit-0.1.2}/byteit/connectors/LocalFileInputConnector.py +0 -0
  19. {byteit-0.1.0 → byteit-0.1.2}/byteit/connectors/LocalFileOutputConnector.py +0 -0
  20. {byteit-0.1.0 → byteit-0.1.2}/byteit/connectors/S3InputConnector.py +0 -0
  21. {byteit-0.1.0 → byteit-0.1.2}/byteit/connectors/S3OutputConnector.py +0 -0
  22. {byteit-0.1.0 → byteit-0.1.2}/byteit/connectors/__init__.py +0 -0
  23. {byteit-0.1.0 → byteit-0.1.2}/byteit/connectors/base.py +0 -0
  24. {byteit-0.1.0 → byteit-0.1.2}/byteit/exceptions.py +0 -0
  25. {byteit-0.1.0 → byteit-0.1.2}/byteit/models/DocumentMetadata.py +0 -0
  26. {byteit-0.1.0 → byteit-0.1.2}/byteit/models/Job.py +0 -0
  27. {byteit-0.1.0 → byteit-0.1.2}/byteit/models/JobList.py +0 -0
  28. {byteit-0.1.0 → byteit-0.1.2}/byteit/models/OutputFormat.py +0 -0
  29. {byteit-0.1.0 → byteit-0.1.2}/byteit/models/ProcessingOptions.py +0 -0
  30. {byteit-0.1.0 → byteit-0.1.2}/byteit/validations.py +0 -0
  31. {byteit-0.1.0 → byteit-0.1.2}/byteit.egg-info/dependency_links.txt +0 -0
  32. {byteit-0.1.0 → byteit-0.1.2}/byteit.egg-info/top_level.txt +0 -0
  33. {byteit-0.1.0 → byteit-0.1.2}/setup.cfg +0 -0
  34. {byteit-0.1.0 → byteit-0.1.2}/setup.py +0 -0
  35. {byteit-0.1.0 → byteit-0.1.2}/tests/__init__.py +0 -0
  36. {byteit-0.1.0 → byteit-0.1.2}/tests/test_connectors.py +0 -0
  37. {byteit-0.1.0 → byteit-0.1.2}/tests/test_exceptions.py +0 -0
  38. {byteit-0.1.0 → byteit-0.1.2}/tests/test_integration.py +0 -0
  39. {byteit-0.1.0 → byteit-0.1.2}/tests/test_models.py +0 -0
@@ -0,0 +1,45 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+
9
+ ## [0.1.2] - 2026-01-31
10
+
11
+ ### Changed
12
+ - Processing migrated to upgraded production servers, reducing processing time by ~50%.
13
+
14
+ - Adaptive polling improved for smoother operation.
15
+
16
+ - Progress bar and output display enhanced for notebooks.
17
+
18
+ - Increased `DEFAULT_TIMEOUT` from 10 minutes to 30 minutes for large documents.
19
+
20
+ ## [0.1.1] - 2026-01-24
21
+
22
+ ### Changed
23
+ - Increased `DEFAULT_TIMEOUT` from 30 seconds to 10 minutes for large documents
24
+ - Adaptive polling in `_wait_for_completion`: starts at 2s, increases to max 10s
25
+
26
+ ### Added
27
+ - Progress logging during document parsing
28
+
29
+ ## [0.1.0] - 2026-01-18
30
+
31
+ ### Added
32
+ - Initial release of the ByteIT Python SDK
33
+ - `ByteITClient` for AI-powered document parsing
34
+ - Multiple output formats: text, JSON, Markdown, HTML
35
+ - Input connectors:
36
+ - `LocalFileInputConnector`
37
+ - `S3InputConnector`
38
+ - Output connector:
39
+ - `LocalFileOutputConnector`
40
+ - Job management (list jobs, check status, download results)
41
+ - Support for PDF, Word, Excel, and other common document formats
42
+ - Batch processing support
43
+ - Environment variable configuration
44
+ - Custom base URL support (testing & staging)
45
+ - Python 3.8+ support
@@ -2,4 +2,5 @@ recursive-include byteit *.py
2
2
  recursive-include tests *.py
3
3
  include README.md
4
4
  include LICENSE
5
+ include CHANGELOG.md
5
6
  include pyproject.toml
byteit-0.1.2/PKG-INFO ADDED
@@ -0,0 +1,275 @@
1
+ Metadata-Version: 2.1
2
+ Name: byteit
3
+ Version: 0.1.2
4
+ Summary: AI-powered document intelligence platform - Turn your data into structured data with a single line of code.
5
+ Author-email: ByteIT GmbH <support@byteit.ai>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://byteit.ai
8
+ Project-URL: Repository, https://github.com/byteit-ai/byteit-api
9
+ Project-URL: Pricing, https://byteit.ai/pricing
10
+ Project-URL: Support, https://byteit.ai/support
11
+ Project-URL: Contact, https://byteit.ai/contact
12
+ Keywords: document-processing,ai,document-intelligence,pdf,data-extraction,machine-learning,ocr
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Classifier: Topic :: Office/Business
26
+ Requires-Python: >=3.8
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: requests>=2.28.0
30
+ Requires-Dist: tqdm>=4.65.0
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
33
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
34
+ Requires-Dist: black>=23.0.0; extra == "dev"
35
+ Requires-Dist: isort>=5.0.0; extra == "dev"
36
+ Requires-Dist: flake8>=6.0.0; extra == "dev"
37
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
38
+
39
+ # ByteIT Python SDK
40
+
41
+ ByteIT's Python library for extracting structured data from documents.
42
+ It is designed for backend services and ETL pipelines that require reliable, consistent document parsing at scale through a simple API.
43
+
44
+ ---
45
+
46
+ ## Installation
47
+
48
+ Install from PyPI:
49
+
50
+ ```bash
51
+ pip install byteit
52
+ ```
53
+
54
+ Python 3.8 or newer is required.
55
+
56
+ ---
57
+
58
+ ## Quick Start
59
+
60
+ ```python
61
+ from byteit import ByteITClient
62
+
63
+ client = ByteITClient(api_key="your_api_key")
64
+
65
+ result = client.parse("document.pdf")
66
+ print(result.decode())
67
+ ```
68
+
69
+ The returned value is raw bytes containing the parsed document content.
70
+
71
+ ---
72
+
73
+ ## Supported Input File Types
74
+
75
+ ByteIT supports the following file types as input:
76
+
77
+ * PDF (`.pdf`)
78
+ * Word (`.docx`)
79
+ * PowerPoint (`.pptx`)
80
+ * HTML (`.html`)
81
+ * Markdown (`.md`)
82
+ * Plain text (`.txt`)
83
+ * JSON (`.json`)
84
+ * XML (`.xml`)
85
+
86
+ ---
87
+
88
+ ## Basic Usage
89
+
90
+ ### Parse a Local File
91
+
92
+ ```python
93
+ result = client.parse("invoice.pdf")
94
+ ```
95
+
96
+ By default, the output format is **Markdown (`md`)**.
97
+
98
+ ---
99
+
100
+ ## Output Formats
101
+
102
+ You can choose the output format depending on your pipeline needs:
103
+
104
+ ```python
105
+ txt = client.parse("doc.pdf", output_format="txt")
106
+ json = client.parse("doc.pdf", output_format="json")
107
+ md = client.parse("doc.pdf", output_format="md")
108
+ html = client.parse("doc.pdf", output_format="html")
109
+ ```
110
+
111
+ Supported output formats:
112
+
113
+ * Plain text (`txt`)
114
+ * JSON (`json`)
115
+ * Markdown (`md`) *(default)*
116
+ * HTML (`html`)
117
+
118
+ ---
119
+
120
+ ## Save Output to File
121
+
122
+ ```python
123
+ client.parse(
124
+ "doc.pdf",
125
+ output_format="md",
126
+ output="result.md"
127
+ )
128
+ ```
129
+
130
+ When `output` is provided, the parsed result is written directly to disk.
131
+
132
+ ---
133
+
134
+ ## Notebook Integration
135
+
136
+ When used in Jupyter notebooks, ByteIT automatically displays results in a readable format:
137
+
138
+ * **JSON**: Interactive, expandable/collapsible tree view
139
+ * **Markdown**: Rendered with formatting (headers, lists, etc.)
140
+ * **HTML**: Rendered as HTML
141
+ * **Text**: Code block with syntax highlighting
142
+
143
+ ```python
144
+ # In a Jupyter notebook - automatically displays formatted result
145
+ result = client.parse("document.pdf", output_format="json")
146
+ ```
147
+
148
+ To disable auto-display, save to a file instead:
149
+
150
+ ```python
151
+ # Saves to file, no auto-display
152
+ result = client.parse("doc.pdf", output_format="json", output="output.json")
153
+ ```
154
+
155
+ ---
156
+
157
+ ## Typical Use Cases
158
+
159
+ * Extracting structured data from documents in ETL pipelines
160
+ * Preprocessing documents before indexing or downstream processing
161
+ * Automating ingestion of invoices, contracts, or reports
162
+ * Interactive document exploration in Jupyter notebooks
163
+
164
+ ---
165
+
166
+ ## API Reference
167
+
168
+ ### `ByteITClient`
169
+
170
+ ```python
171
+ ByteITClient(api_key: str)
172
+ ```
173
+
174
+ Creates a new ByteIT client.
175
+
176
+ #### Parameters
177
+
178
+ * `api_key` (`str`): Your ByteIT API key
179
+
180
+ ---
181
+
182
+ ### `parse(...)`
183
+
184
+ ```python
185
+ parse(
186
+ input,
187
+ output_format: str = "md",
188
+ output = None
189
+ )
190
+ ```
191
+
192
+ Parse a document and return the extracted content.
193
+
194
+ #### Parameters
195
+
196
+ * `input` (`str | Path`): Path to a local document
197
+ * `output_format` (`str`): Output format (`txt`, `json`, `md`, `html`)
198
+ * `output` (`str | Path | None`): Optional path to save the result
199
+
200
+ #### Returns
201
+
202
+ * `bytes`: Parsed document content
203
+
204
+ ---
205
+
206
+ ## Error Handling
207
+
208
+ The SDK exposes specific exceptions for common error cases:
209
+
210
+ ```python
211
+ from byteit.exceptions import (
212
+ ByteITError,
213
+ ValidationError,
214
+ AuthenticationError,
215
+ RateLimitError,
216
+ ServerError,
217
+ )
218
+
219
+ try:
220
+ result = client.parse("document.pdf")
221
+ except ValidationError as e:
222
+ print("Invalid input:", e.message)
223
+ except AuthenticationError:
224
+ print("Invalid API key")
225
+ except RateLimitError:
226
+ print("Rate limit exceeded")
227
+ except ByteITError as e:
228
+ print("ByteIT error:", e.message)
229
+ ```
230
+
231
+ All exceptions inherit from `ByteITError`.
232
+
233
+ ---
234
+
235
+ ## Configuration
236
+
237
+ ### Environment Variable
238
+
239
+ You can provide the API key via environment variable:
240
+
241
+ ```bash
242
+ export BYTEIT_API_KEY="your_api_key"
243
+ ```
244
+
245
+ ```python
246
+ import os
247
+ from byteit import ByteITClient
248
+
249
+ client = ByteITClient(api_key=os.getenv("BYTEIT_API_KEY"))
250
+ ```
251
+
252
+ ---
253
+
254
+ ## Requirements
255
+
256
+ * Python 3.8+
257
+ * `requests` >= 2.28.0 and `tqdm` >= 4.65.0
258
+
259
+ ---
260
+
261
+ ## About ByteIT
262
+
263
+ ByteIT provides document parsing and data extraction APIs designed for backend systems and automation workflows.
264
+
265
+ Website: [https://byteit.ai](https://byteit.ai)
266
+
267
+ ---
268
+
269
+ ## License
270
+
271
+ This project is licensed under the Apache License 2.0; see the [LICENSE](LICENSE) file for the full terms.
272
+
273
+ © 2026 ByteIT GmbH
274
+
275
+ ---
byteit-0.1.2/README.md ADDED
@@ -0,0 +1,237 @@
1
+ # ByteIT Python SDK
2
+
3
+ ByteIT's Python library for extracting structured data from documents.
4
+ It is designed for backend services and ETL pipelines that require reliable, consistent document parsing at scale through a simple API.
5
+
6
+ ---
7
+
8
+ ## Installation
9
+
10
+ Install from PyPI:
11
+
12
+ ```bash
13
+ pip install byteit
14
+ ```
15
+
16
+ Python 3.8 or newer is required.
17
+
18
+ ---
19
+
20
+ ## Quick Start
21
+
22
+ ```python
23
+ from byteit import ByteITClient
24
+
25
+ client = ByteITClient(api_key="your_api_key")
26
+
27
+ result = client.parse("document.pdf")
28
+ print(result.decode())
29
+ ```
30
+
31
+ The returned value is raw bytes containing the parsed document content.
32
+
33
+ ---
34
+
35
+ ## Supported Input File Types
36
+
37
+ ByteIT supports the following file types as input:
38
+
39
+ * PDF (`.pdf`)
40
+ * Word (`.docx`)
41
+ * PowerPoint (`.pptx`)
42
+ * HTML (`.html`)
43
+ * Markdown (`.md`)
44
+ * Plain text (`.txt`)
45
+ * JSON (`.json`)
46
+ * XML (`.xml`)
47
+
48
+ ---
49
+
50
+ ## Basic Usage
51
+
52
+ ### Parse a Local File
53
+
54
+ ```python
55
+ result = client.parse("invoice.pdf")
56
+ ```
57
+
58
+ By default, the output format is **Markdown (`md`)**.
59
+
60
+ ---
61
+
62
+ ## Output Formats
63
+
64
+ You can choose the output format depending on your pipeline needs:
65
+
66
+ ```python
67
+ txt = client.parse("doc.pdf", output_format="txt")
68
+ json = client.parse("doc.pdf", output_format="json")
69
+ md = client.parse("doc.pdf", output_format="md")
70
+ html = client.parse("doc.pdf", output_format="html")
71
+ ```
72
+
73
+ Supported output formats:
74
+
75
+ * Plain text (`txt`)
76
+ * JSON (`json`)
77
+ * Markdown (`md`) *(default)*
78
+ * HTML (`html`)
79
+
80
+ ---
81
+
82
+ ## Save Output to File
83
+
84
+ ```python
85
+ client.parse(
86
+ "doc.pdf",
87
+ output_format="md",
88
+ output="result.md"
89
+ )
90
+ ```
91
+
92
+ When `output` is provided, the parsed result is written directly to disk.
93
+
94
+ ---
95
+
96
+ ## Notebook Integration
97
+
98
+ When used in Jupyter notebooks, ByteIT automatically displays results in a readable format:
99
+
100
+ * **JSON**: Interactive, expandable/collapsible tree view
101
+ * **Markdown**: Rendered with formatting (headers, lists, etc.)
102
+ * **HTML**: Rendered as HTML
103
+ * **Text**: Code block with syntax highlighting
104
+
105
+ ```python
106
+ # In a Jupyter notebook - automatically displays formatted result
107
+ result = client.parse("document.pdf", output_format="json")
108
+ ```
109
+
110
+ To disable auto-display, save to a file instead:
111
+
112
+ ```python
113
+ # Saves to file, no auto-display
114
+ result = client.parse("doc.pdf", output_format="json", output="output.json")
115
+ ```
116
+
117
+ ---
118
+
119
+ ## Typical Use Cases
120
+
121
+ * Extracting structured data from documents in ETL pipelines
122
+ * Preprocessing documents before indexing or downstream processing
123
+ * Automating ingestion of invoices, contracts, or reports
124
+ * Interactive document exploration in Jupyter notebooks
125
+
126
+ ---
127
+
128
+ ## API Reference
129
+
130
+ ### `ByteITClient`
131
+
132
+ ```python
133
+ ByteITClient(api_key: str)
134
+ ```
135
+
136
+ Creates a new ByteIT client.
137
+
138
+ #### Parameters
139
+
140
+ * `api_key` (`str`): Your ByteIT API key
141
+
142
+ ---
143
+
144
+ ### `parse(...)`
145
+
146
+ ```python
147
+ parse(
148
+ input,
149
+ output_format: str = "md",
150
+ output = None
151
+ )
152
+ ```
153
+
154
+ Parse a document and return the extracted content.
155
+
156
+ #### Parameters
157
+
158
+ * `input` (`str | Path`): Path to a local document
159
+ * `output_format` (`str`): Output format (`txt`, `json`, `md`, `html`)
160
+ * `output` (`str | Path | None`): Optional path to save the result
161
+
162
+ #### Returns
163
+
164
+ * `bytes`: Parsed document content
165
+
166
+ ---
167
+
168
+ ## Error Handling
169
+
170
+ The SDK exposes specific exceptions for common error cases:
171
+
172
+ ```python
173
+ from byteit.exceptions import (
174
+ ByteITError,
175
+ ValidationError,
176
+ AuthenticationError,
177
+ RateLimitError,
178
+ ServerError,
179
+ )
180
+
181
+ try:
182
+ result = client.parse("document.pdf")
183
+ except ValidationError as e:
184
+ print("Invalid input:", e.message)
185
+ except AuthenticationError:
186
+ print("Invalid API key")
187
+ except RateLimitError:
188
+ print("Rate limit exceeded")
189
+ except ByteITError as e:
190
+ print("ByteIT error:", e.message)
191
+ ```
192
+
193
+ All exceptions inherit from `ByteITError`.
194
+
195
+ ---
196
+
197
+ ## Configuration
198
+
199
+ ### Environment Variable
200
+
201
+ You can provide the API key via environment variable:
202
+
203
+ ```bash
204
+ export BYTEIT_API_KEY="your_api_key"
205
+ ```
206
+
207
+ ```python
208
+ import os
209
+ from byteit import ByteITClient
210
+
211
+ client = ByteITClient(api_key=os.getenv("BYTEIT_API_KEY"))
212
+ ```
213
+
214
+ ---
215
+
216
+ ## Requirements
217
+
218
+ * Python 3.8+
219
+ * `requests` >= 2.28.0 and `tqdm` >= 4.65.0
220
+
221
+ ---
222
+
223
+ ## About ByteIT
224
+
225
+ ByteIT provides document parsing and data extraction APIs designed for backend systems and automation workflows.
226
+
227
+ Website: [https://byteit.ai](https://byteit.ai)
228
+
229
+ ---
230
+
231
+ ## License
232
+
233
+ This project is licensed under the Apache License 2.0; see the [LICENSE](LICENSE) file for the full terms.
234
+
235
+ © 2026 ByteIT GmbH
236
+
237
+ ---