kreuzberg 3.19.0__py3-none-any.whl → 3.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_ocr/_tesseract.py +16 -3
- {kreuzberg-3.19.0.dist-info → kreuzberg-3.19.1.dist-info}/METADATA +1 -1
- {kreuzberg-3.19.0.dist-info → kreuzberg-3.19.1.dist-info}/RECORD +6 -6
- {kreuzberg-3.19.0.dist-info → kreuzberg-3.19.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.19.0.dist-info → kreuzberg-3.19.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.19.0.dist-info → kreuzberg-3.19.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -301,8 +301,15 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
301
301
|
"OFF",
|
302
302
|
]
|
303
303
|
|
304
|
-
|
305
|
-
|
304
|
+
# Handle output format - use config option for HOCR to ensure Windows compatibility
|
305
|
+
# Windows Tesseract 5.5.0 doesn't respect 'hocr' configfile, needs explicit config
|
306
|
+
tesseract_format = run_config["tesseract_format"]
|
307
|
+
if tesseract_format == "hocr":
|
308
|
+
command.extend(["-c", "tessedit_create_hocr=1"])
|
309
|
+
elif tesseract_format == "tsv":
|
310
|
+
command.append("tsv")
|
311
|
+
elif tesseract_format != "text":
|
312
|
+
command.append(tesseract_format)
|
306
313
|
|
307
314
|
for kwarg, value in run_config["remaining_kwargs"].items():
|
308
315
|
if kwarg.startswith("table_"):
|
@@ -1162,7 +1169,13 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1162
1169
|
"OFF",
|
1163
1170
|
]
|
1164
1171
|
|
1165
|
-
|
1172
|
+
# Handle output format - use config option for HOCR to ensure Windows compatibility
|
1173
|
+
# Windows Tesseract 5.5.0 doesn't respect 'hocr' configfile, needs explicit config
|
1174
|
+
if output_format == "hocr":
|
1175
|
+
command.extend(["-c", "tessedit_create_hocr=1"])
|
1176
|
+
elif output_format == "tsv":
|
1177
|
+
command.append("tsv")
|
1178
|
+
elif output_format != "text":
|
1166
1179
|
command.append(output_format)
|
1167
1180
|
|
1168
1181
|
for kwarg, value in kwargs.items():
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.19.0
|
3
|
+
Version: 3.19.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -36,7 +36,7 @@ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
|
|
36
36
|
kreuzberg/_ocr/_easyocr.py,sha256=bHz2S_8nNHaPHPemcJK-U0al9_qP-vUmWE4ECVlf7AA,15485
|
37
37
|
kreuzberg/_ocr/_paddleocr.py,sha256=CV9cCjkRe-3cNJ5tRu_sBXd_HNghEwfPIgWwxAZTeRY,15026
|
38
38
|
kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
|
39
|
-
kreuzberg/_ocr/_tesseract.py,sha256=
|
39
|
+
kreuzberg/_ocr/_tesseract.py,sha256=6NSKSvXpH66j_ACKwz2rs_b9Exg0xOWRHelhZAcuw80,53162
|
40
40
|
kreuzberg/_token_reduction/__init__.py,sha256=y_2WgPxJes8_PD-VMfx7vQT0hGjFIixzS8PjaIseAGg,311
|
41
41
|
kreuzberg/_token_reduction/_reducer.py,sha256=shAfMPznP69sTSzwX_bE1LpcBmoia9cpd7r6bSc4R5Q,13609
|
42
42
|
kreuzberg/_token_reduction/_stopwords.py,sha256=mu-5CapG0RCP7LYzjhdTM6WWLtmt3cjZ08OOsyQkJVg,3608
|
@@ -122,8 +122,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
|
|
122
122
|
kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
|
123
123
|
kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
|
124
124
|
kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
|
125
|
-
kreuzberg-3.19.
|
126
|
-
kreuzberg-3.19.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
127
|
-
kreuzberg-3.19.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
128
|
-
kreuzberg-3.19.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
129
|
-
kreuzberg-3.19.0.dist-info/RECORD,,
|
125
|
+
kreuzberg-3.19.1.dist-info/METADATA,sha256=CpVI7_0uQVLHP1NnQBB0mXqgjwfAW-BCO3kxTiKdvnY,12492
|
126
|
+
kreuzberg-3.19.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
127
|
+
kreuzberg-3.19.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
128
|
+
kreuzberg-3.19.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
129
|
+
kreuzberg-3.19.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|