doctra 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {doctra-0.3.0/doctra.egg-info → doctra-0.3.2}/PKG-INFO +29 -3
- {doctra-0.3.0 → doctra-0.3.2}/README.md +25 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/utils/progress.py +39 -10
- {doctra-0.3.0 → doctra-0.3.2}/doctra/version.py +1 -1
- {doctra-0.3.0 → doctra-0.3.2/doctra.egg-info}/PKG-INFO +29 -3
- {doctra-0.3.0 → doctra-0.3.2}/doctra.egg-info/requires.txt +3 -3
- {doctra-0.3.0 → doctra-0.3.2}/pyproject.toml +3 -1
- {doctra-0.3.0 → doctra-0.3.2}/setup.py +3 -1
- {doctra-0.3.0 → doctra-0.3.2}/LICENSE +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/MANIFEST.in +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/__init__.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/cli/__init__.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/cli/main.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/cli/utils.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/__init__.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/layout/__init__.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/layout/layout_models.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/layout/paddle_layout.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/ocr/__init__.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/ocr/api.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/ocr/path_resolver.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/ocr/pytesseract_engine.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/vlm/__init__.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/vlm/outlines_types.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/vlm/provider.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/engines/vlm/service.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/exporters/__init__.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/exporters/excel_writer.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/exporters/html_writer.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/exporters/image_saver.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/exporters/markdown_table.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/exporters/markdown_writer.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/parsers/__init__.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/parsers/layout_order.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/parsers/structured_pdf_parser.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/parsers/table_chart_extractor.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/ui/__init__.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/ui/app.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/utils/__init__.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/utils/bbox.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/utils/constants.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/utils/file_ops.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/utils/io_utils.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/utils/ocr_utils.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/utils/pdf_io.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/utils/quiet.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra/utils/structured_utils.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra.egg-info/SOURCES.txt +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra.egg-info/dependency_links.txt +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra.egg-info/not-zip-safe +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/doctra.egg-info/top_level.txt +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/requirements.txt +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/setup.cfg +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/tests/test_structured_pdf_parser.py +0 -0
- {doctra-0.3.0 → doctra-0.3.2}/tests/test_table_chart_extractor.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: doctra
|
3
|
-
Version: 0.3.0
|
3
|
+
Version: 0.3.2
|
4
4
|
Summary: Parse, extract, and analyze documents with ease
|
5
5
|
Home-page: https://github.com/AdemBoukhris457/Doctra
|
6
6
|
Author: Adem Boukhris
|
@@ -234,6 +234,9 @@ Requires-Dist: opencv-python>=4.5.0
|
|
234
234
|
Requires-Dist: pandas>=1.3.0
|
235
235
|
Requires-Dist: openpyxl>=3.0.0
|
236
236
|
Requires-Dist: tesseract>=0.1.3
|
237
|
+
Requires-Dist: pytesseract>=0.3.10
|
238
|
+
Requires-Dist: pdf2image>=1.16.0
|
239
|
+
Requires-Dist: anthropic>=0.40.0
|
237
240
|
Requires-Dist: outlines>=0.0.34
|
238
241
|
Requires-Dist: tqdm>=4.62.0
|
239
242
|
Requires-Dist: matplotlib>=3.5.0
|
@@ -241,8 +244,6 @@ Provides-Extra: openai
|
|
241
244
|
Requires-Dist: openai>=1.0.0; extra == "openai"
|
242
245
|
Provides-Extra: gemini
|
243
246
|
Requires-Dist: google-generativeai>=0.3.0; extra == "gemini"
|
244
|
-
Provides-Extra: anthropic
|
245
|
-
Requires-Dist: anthropic>=0.40.0; extra == "anthropic"
|
246
247
|
Provides-Extra: dev
|
247
248
|
Requires-Dist: pytest>=6.0; extra == "dev"
|
248
249
|
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
@@ -295,6 +296,31 @@ cd Doctra
|
|
295
296
|
pip install .
|
296
297
|
```
|
297
298
|
|
299
|
+
### System Dependencies
|
300
|
+
|
301
|
+
Doctra requires **Poppler** for PDF processing. Install it based on your operating system:
|
302
|
+
|
303
|
+
#### Ubuntu/Debian
|
304
|
+
```bash
|
305
|
+
sudo apt install poppler-utils
|
306
|
+
```
|
307
|
+
|
308
|
+
#### macOS
|
309
|
+
```bash
|
310
|
+
brew install poppler
|
311
|
+
```
|
312
|
+
|
313
|
+
#### Windows
|
314
|
+
Download and install from [Poppler for Windows](http://blog.alivate.com.au/poppler-windows/) or use conda:
|
315
|
+
```bash
|
316
|
+
conda install -c conda-forge poppler
|
317
|
+
```
|
318
|
+
|
319
|
+
#### Google Colab
|
320
|
+
```bash
|
321
|
+
!sudo apt install poppler-utils
|
322
|
+
```
|
323
|
+
|
298
324
|
## ⚡ Quick Start
|
299
325
|
|
300
326
|
```python
|
@@ -37,6 +37,31 @@ cd Doctra
|
|
37
37
|
pip install .
|
38
38
|
```
|
39
39
|
|
40
|
+
### System Dependencies
|
41
|
+
|
42
|
+
Doctra requires **Poppler** for PDF processing. Install it based on your operating system:
|
43
|
+
|
44
|
+
#### Ubuntu/Debian
|
45
|
+
```bash
|
46
|
+
sudo apt install poppler-utils
|
47
|
+
```
|
48
|
+
|
49
|
+
#### macOS
|
50
|
+
```bash
|
51
|
+
brew install poppler
|
52
|
+
```
|
53
|
+
|
54
|
+
#### Windows
|
55
|
+
Download and install from [Poppler for Windows](http://blog.alivate.com.au/poppler-windows/) or use conda:
|
56
|
+
```bash
|
57
|
+
conda install -c conda-forge poppler
|
58
|
+
```
|
59
|
+
|
60
|
+
#### Google Colab
|
61
|
+
```bash
|
62
|
+
!sudo apt install poppler-utils
|
63
|
+
```
|
64
|
+
|
40
65
|
## ⚡ Quick Start
|
41
66
|
|
42
67
|
```python
|
@@ -354,10 +354,11 @@ def create_notebook_friendly_bar(
|
|
354
354
|
**kwargs
|
355
355
|
) -> tqdm:
|
356
356
|
"""
|
357
|
-
Create a notebook-friendly progress bar with
|
357
|
+
Create a notebook-friendly progress bar with consistent sizing and static display.
|
358
358
|
|
359
359
|
This function creates progress bars specifically optimized for Jupyter notebooks
|
360
|
-
to avoid display issues and ANSI code problems
|
360
|
+
to avoid display issues and ANSI code problems while maintaining consistency
|
361
|
+
with the main progress bar styling.
|
361
362
|
|
362
363
|
:param total: Total number of items to process
|
363
364
|
:param desc: Description text for the progress bar
|
@@ -384,24 +385,52 @@ def create_notebook_friendly_bar(
|
|
384
385
|
if prefix:
|
385
386
|
desc = f"{prefix} {desc}"
|
386
387
|
|
387
|
-
#
|
388
|
-
bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt}"
|
388
|
+
# Use same format as main progress bar for consistency
|
389
|
+
bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
|
390
|
+
|
391
|
+
# Color schemes based on operation type (same as main progress bar)
|
392
|
+
color_schemes = {
|
393
|
+
"loading": {"colour": "cyan", "ncols": 100},
|
394
|
+
"charts": {"colour": "green", "ncols": 100},
|
395
|
+
"tables": {"colour": "blue", "ncols": 100},
|
396
|
+
"figures": {"colour": "magenta", "ncols": 100},
|
397
|
+
"ocr": {"colour": "yellow", "ncols": 100},
|
398
|
+
"vlm": {"colour": "red", "ncols": 100},
|
399
|
+
"processing": {"colour": "white", "ncols": 100},
|
400
|
+
}
|
401
|
+
|
402
|
+
# Determine color scheme based on description
|
403
|
+
if "loading" in desc_lower or "model" in desc_lower:
|
404
|
+
color_scheme = color_schemes["loading"]
|
405
|
+
elif "chart" in desc_lower:
|
406
|
+
color_scheme = color_schemes["charts"]
|
407
|
+
elif "table" in desc_lower:
|
408
|
+
color_scheme = color_schemes["tables"]
|
409
|
+
elif "figure" in desc_lower:
|
410
|
+
color_scheme = color_schemes["figures"]
|
411
|
+
elif "ocr" in desc_lower:
|
412
|
+
color_scheme = color_schemes["ocr"]
|
413
|
+
elif "vlm" in desc_lower:
|
414
|
+
color_scheme = color_schemes["vlm"]
|
415
|
+
else:
|
416
|
+
color_scheme = color_schemes["processing"]
|
389
417
|
|
390
418
|
tqdm_config = {
|
391
419
|
"total": total,
|
392
420
|
"desc": desc,
|
393
421
|
"leave": True,
|
394
422
|
"bar_format": bar_format,
|
395
|
-
"ncols": _PROGRESS_CONFIG.ncols_env or
|
423
|
+
"ncols": _PROGRESS_CONFIG.ncols_env or color_scheme["ncols"], # Use same width as main progress bar
|
396
424
|
"ascii": kwargs.get("ascii", False),
|
397
|
-
"dynamic_ncols":
|
398
|
-
"smoothing": 0.
|
399
|
-
"mininterval": 0.
|
400
|
-
"maxinterval": 0
|
425
|
+
"dynamic_ncols": True, # Enable responsive width like main progress bar
|
426
|
+
"smoothing": 0.3, # Use same smoothing as main progress bar
|
427
|
+
"mininterval": 0.1, # Use same intervals as main progress bar
|
428
|
+
"maxinterval": 1.0,
|
401
429
|
**kwargs
|
402
430
|
}
|
403
431
|
|
404
|
-
|
432
|
+
# Use regular tqdm instead of tqdm_auto to avoid interactive widgets
|
433
|
+
return tqdm(**tqdm_config)
|
405
434
|
|
406
435
|
|
407
436
|
def progress_for(iterable: Iterable[Any], desc: str, total: Optional[int] = None, leave: bool = True, **kwargs) -> Iterator[Any]:
|
@@ -1,2 +1,2 @@
|
|
1
1
|
"""Version information for Doctra."""
|
2
|
-
__version__ = '0.3.0'
|
2
|
+
__version__ = '0.3.2'
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: doctra
|
3
|
-
Version: 0.3.0
|
3
|
+
Version: 0.3.2
|
4
4
|
Summary: Parse, extract, and analyze documents with ease
|
5
5
|
Home-page: https://github.com/AdemBoukhris457/Doctra
|
6
6
|
Author: Adem Boukhris
|
@@ -234,6 +234,9 @@ Requires-Dist: opencv-python>=4.5.0
|
|
234
234
|
Requires-Dist: pandas>=1.3.0
|
235
235
|
Requires-Dist: openpyxl>=3.0.0
|
236
236
|
Requires-Dist: tesseract>=0.1.3
|
237
|
+
Requires-Dist: pytesseract>=0.3.10
|
238
|
+
Requires-Dist: pdf2image>=1.16.0
|
239
|
+
Requires-Dist: anthropic>=0.40.0
|
237
240
|
Requires-Dist: outlines>=0.0.34
|
238
241
|
Requires-Dist: tqdm>=4.62.0
|
239
242
|
Requires-Dist: matplotlib>=3.5.0
|
@@ -241,8 +244,6 @@ Provides-Extra: openai
|
|
241
244
|
Requires-Dist: openai>=1.0.0; extra == "openai"
|
242
245
|
Provides-Extra: gemini
|
243
246
|
Requires-Dist: google-generativeai>=0.3.0; extra == "gemini"
|
244
|
-
Provides-Extra: anthropic
|
245
|
-
Requires-Dist: anthropic>=0.40.0; extra == "anthropic"
|
246
247
|
Provides-Extra: dev
|
247
248
|
Requires-Dist: pytest>=6.0; extra == "dev"
|
248
249
|
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
@@ -295,6 +296,31 @@ cd Doctra
|
|
295
296
|
pip install .
|
296
297
|
```
|
297
298
|
|
299
|
+
### System Dependencies
|
300
|
+
|
301
|
+
Doctra requires **Poppler** for PDF processing. Install it based on your operating system:
|
302
|
+
|
303
|
+
#### Ubuntu/Debian
|
304
|
+
```bash
|
305
|
+
sudo apt install poppler-utils
|
306
|
+
```
|
307
|
+
|
308
|
+
#### macOS
|
309
|
+
```bash
|
310
|
+
brew install poppler
|
311
|
+
```
|
312
|
+
|
313
|
+
#### Windows
|
314
|
+
Download and install from [Poppler for Windows](http://blog.alivate.com.au/poppler-windows/) or use conda:
|
315
|
+
```bash
|
316
|
+
conda install -c conda-forge poppler
|
317
|
+
```
|
318
|
+
|
319
|
+
#### Google Colab
|
320
|
+
```bash
|
321
|
+
!sudo apt install poppler-utils
|
322
|
+
```
|
323
|
+
|
298
324
|
## ⚡ Quick Start
|
299
325
|
|
300
326
|
```python
|
@@ -5,13 +5,13 @@ opencv-python>=4.5.0
|
|
5
5
|
pandas>=1.3.0
|
6
6
|
openpyxl>=3.0.0
|
7
7
|
tesseract>=0.1.3
|
8
|
+
pytesseract>=0.3.10
|
9
|
+
pdf2image>=1.16.0
|
10
|
+
anthropic>=0.40.0
|
8
11
|
outlines>=0.0.34
|
9
12
|
tqdm>=4.62.0
|
10
13
|
matplotlib>=3.5.0
|
11
14
|
|
12
|
-
[anthropic]
|
13
|
-
anthropic>=0.40.0
|
14
|
-
|
15
15
|
[dev]
|
16
16
|
pytest>=6.0
|
17
17
|
pytest-cov>=2.0
|
@@ -36,6 +36,9 @@ dependencies = [
|
|
36
36
|
"pandas>=1.3.0",
|
37
37
|
"openpyxl>=3.0.0",
|
38
38
|
"tesseract>=0.1.3",
|
39
|
+
"pytesseract>=0.3.10",
|
40
|
+
"pdf2image>=1.16.0",
|
41
|
+
"anthropic>=0.40.0",
|
39
42
|
"outlines>=0.0.34",
|
40
43
|
"tqdm>=4.62.0",
|
41
44
|
"matplotlib>=3.5.0",
|
@@ -44,7 +47,6 @@ dependencies = [
|
|
44
47
|
[project.optional-dependencies]
|
45
48
|
openai = ["openai>=1.0.0"]
|
46
49
|
gemini = ["google-generativeai>=0.3.0"]
|
47
|
-
anthropic = ["anthropic>=0.40.0"]
|
48
50
|
dev = [
|
49
51
|
"pytest>=6.0",
|
50
52
|
"pytest-cov>=2.0",
|
@@ -43,6 +43,9 @@ setup(
|
|
43
43
|
"pandas>=1.3.0",
|
44
44
|
"openpyxl>=3.0.0",
|
45
45
|
"tesseract>=0.1.3",
|
46
|
+
"pytesseract>=0.3.10",
|
47
|
+
"pdf2image>=1.16.0",
|
48
|
+
"anthropic>=0.40.0",
|
46
49
|
"outlines>=0.0.34",
|
47
50
|
"tqdm>=4.62.0",
|
48
51
|
"matplotlib>=3.5.0",
|
@@ -50,7 +53,6 @@ setup(
|
|
50
53
|
extras_require={
|
51
54
|
"openai": ["openai>=1.0.0"],
|
52
55
|
"gemini": ["google-generativeai>=0.3.0"],
|
53
|
-
"anthropic": ["anthropic>=0.40.0"],
|
54
56
|
"dev": [
|
55
57
|
"pytest>=6.0",
|
56
58
|
"pytest-cov>=2.0",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|