kreuzberg 3.19.0__py3-none-any.whl → 3.19.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -301,8 +301,15 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
301
301
  "OFF",
302
302
  ]
303
303
 
304
- if run_config["tesseract_format"] != "text":
305
- command.append(run_config["tesseract_format"])
304
+ # Handle output format - use config option for HOCR to ensure Windows compatibility
305
+ # Windows Tesseract 5.5.0 doesn't respect 'hocr' configfile, needs explicit config
306
+ tesseract_format = run_config["tesseract_format"]
307
+ if tesseract_format == "hocr":
308
+ command.extend(["-c", "tessedit_create_hocr=1"])
309
+ elif tesseract_format == "tsv":
310
+ command.append("tsv")
311
+ elif tesseract_format != "text":
312
+ command.append(tesseract_format)
306
313
 
307
314
  for kwarg, value in run_config["remaining_kwargs"].items():
308
315
  if kwarg.startswith("table_"):
@@ -1162,7 +1169,13 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1162
1169
  "OFF",
1163
1170
  ]
1164
1171
 
1165
- if output_format != "text":
1172
+ # Handle output format - use config option for HOCR to ensure Windows compatibility
1173
+ # Windows Tesseract 5.5.0 doesn't respect 'hocr' configfile, needs explicit config
1174
+ if output_format == "hocr":
1175
+ command.extend(["-c", "tessedit_create_hocr=1"])
1176
+ elif output_format == "tsv":
1177
+ command.append("tsv")
1178
+ elif output_format != "text":
1166
1179
  command.append(output_format)
1167
1180
 
1168
1181
  for kwarg, value in kwargs.items():
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.19.0
3
+ Version: 3.19.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -36,7 +36,7 @@ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
36
36
  kreuzberg/_ocr/_easyocr.py,sha256=bHz2S_8nNHaPHPemcJK-U0al9_qP-vUmWE4ECVlf7AA,15485
37
37
  kreuzberg/_ocr/_paddleocr.py,sha256=CV9cCjkRe-3cNJ5tRu_sBXd_HNghEwfPIgWwxAZTeRY,15026
38
38
  kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
39
- kreuzberg/_ocr/_tesseract.py,sha256=Uu6H1LMh1WSC1OmKhPx-miG98r9KEfc0GF7b8isS33E,52420
39
+ kreuzberg/_ocr/_tesseract.py,sha256=6NSKSvXpH66j_ACKwz2rs_b9Exg0xOWRHelhZAcuw80,53162
40
40
  kreuzberg/_token_reduction/__init__.py,sha256=y_2WgPxJes8_PD-VMfx7vQT0hGjFIixzS8PjaIseAGg,311
41
41
  kreuzberg/_token_reduction/_reducer.py,sha256=shAfMPznP69sTSzwX_bE1LpcBmoia9cpd7r6bSc4R5Q,13609
42
42
  kreuzberg/_token_reduction/_stopwords.py,sha256=mu-5CapG0RCP7LYzjhdTM6WWLtmt3cjZ08OOsyQkJVg,3608
@@ -122,8 +122,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
122
122
  kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
123
123
  kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
124
124
  kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
125
- kreuzberg-3.19.0.dist-info/METADATA,sha256=fV1j2iWA2-rcZodFFV3kmSsuBJhoDsW6OuyIu9Myf4A,12492
126
- kreuzberg-3.19.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
127
- kreuzberg-3.19.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
128
- kreuzberg-3.19.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
129
- kreuzberg-3.19.0.dist-info/RECORD,,
125
+ kreuzberg-3.19.1.dist-info/METADATA,sha256=CpVI7_0uQVLHP1NnQBB0mXqgjwfAW-BCO3kxTiKdvnY,12492
126
+ kreuzberg-3.19.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
127
+ kreuzberg-3.19.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
128
+ kreuzberg-3.19.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
129
+ kreuzberg-3.19.1.dist-info/RECORD,,