kreuzberg 2.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -1,6 +1,13 @@
1
1
  from ._types import ExtractionResult, Metadata
2
2
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
3
- from .extraction import extract_bytes, extract_file
3
+ from .extraction import (
4
+ batch_extract_bytes,
5
+ batch_extract_bytes_sync,
6
+ batch_extract_file,
7
+ batch_extract_file_sync,
8
+ extract_bytes,
9
+ extract_file,
10
+ )
4
11
 
5
12
  __all__ = [
6
13
  "ExtractionResult",
@@ -10,6 +17,10 @@ __all__ = [
10
17
  "OCRError",
11
18
  "ParsingError",
12
19
  "ValidationError",
20
+ "batch_extract_bytes",
21
+ "batch_extract_bytes_sync",
22
+ "batch_extract_file",
23
+ "batch_extract_file_sync",
13
24
  "extract_bytes",
14
25
  "extract_file",
15
26
  ]
kreuzberg/_pdf.py CHANGED
@@ -11,7 +11,7 @@ from kreuzberg import ExtractionResult
11
11
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
12
  from kreuzberg._string import normalize_spaces
13
13
  from kreuzberg._sync import run_sync
14
- from kreuzberg._tesseract import PSMMode, SupportedLanguage, batch_process_images
14
+ from kreuzberg._tesseract import PSMMode, batch_process_images
15
15
  from kreuzberg.exceptions import ParsingError
16
16
 
17
17
  if TYPE_CHECKING: # pragma: no cover
@@ -80,7 +80,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
80
80
  async def _extract_pdf_text_with_ocr(
81
81
  input_file: Path,
82
82
  *,
83
- language: SupportedLanguage = "eng",
83
+ language: str = "eng",
84
84
  max_processes: int,
85
85
  psm: PSMMode = PSMMode.AUTO,
86
86
  ) -> ExtractionResult:
@@ -132,7 +132,7 @@ async def extract_pdf_file(
132
132
  input_file: Path,
133
133
  *,
134
134
  force_ocr: bool,
135
- language: SupportedLanguage = "eng",
135
+ language: str = "eng",
136
136
  max_processes: int,
137
137
  psm: PSMMode = PSMMode.AUTO,
138
138
  ) -> ExtractionResult:
@@ -162,7 +162,7 @@ async def extract_pdf_content(
162
162
  content: bytes,
163
163
  *,
164
164
  force_ocr: bool,
165
- language: SupportedLanguage = "eng",
165
+ language: str = "eng",
166
166
  max_processes: int,
167
167
  psm: PSMMode = PSMMode.AUTO,
168
168
  ) -> ExtractionResult:
kreuzberg/_tesseract.py CHANGED
@@ -6,7 +6,7 @@ import sys
6
6
  from enum import Enum
7
7
  from functools import partial
8
8
  from os import PathLike
9
- from typing import Final, Literal, TypeVar, Union, cast
9
+ from typing import Final, TypeVar, Union, cast
10
10
 
11
11
  from anyio import CapacityLimiter, create_task_group, to_process
12
12
  from anyio import Path as AsyncPath
@@ -29,136 +29,6 @@ version_ref = {"checked": False}
29
29
 
30
30
  T = TypeVar("T", bound=Union[Image, PathLike[str], str])
31
31
 
32
- SupportedLanguage = Literal[
33
- "afr",
34
- "amh",
35
- "ara",
36
- "asm",
37
- "aze",
38
- "aze_cyrl",
39
- "bel",
40
- "ben",
41
- "bod",
42
- "bos",
43
- "bre",
44
- "bul",
45
- "cat",
46
- "ceb",
47
- "ces",
48
- "chi_sim",
49
- "chi_tra",
50
- "chr",
51
- "cos",
52
- "cym",
53
- "dan",
54
- "dan_frak",
55
- "deu",
56
- "deu_frak",
57
- "deu_latf",
58
- "dzo",
59
- "ell",
60
- "eng",
61
- "enm",
62
- "epo",
63
- "equ",
64
- "est",
65
- "eus",
66
- "fao",
67
- "fas",
68
- "fil",
69
- "fin",
70
- "fra",
71
- "frk",
72
- "frm",
73
- "fry",
74
- "gla",
75
- "gle",
76
- "glg",
77
- "grc",
78
- "guj",
79
- "hat",
80
- "heb",
81
- "hin",
82
- "hrv",
83
- "hun",
84
- "hye",
85
- "iku",
86
- "ind",
87
- "isl",
88
- "ita",
89
- "ita_old",
90
- "jav",
91
- "jpn",
92
- "kan",
93
- "kat",
94
- "kat_old",
95
- "kaz",
96
- "khm",
97
- "kir",
98
- "kmr",
99
- "kor",
100
- "kor_vert",
101
- "kur",
102
- "lao",
103
- "lat",
104
- "lav",
105
- "lit",
106
- "ltz",
107
- "mal",
108
- "mar",
109
- "mkd",
110
- "mlt",
111
- "mon",
112
- "mri",
113
- "msa",
114
- "mya",
115
- "nep",
116
- "nld",
117
- "nor",
118
- "oci",
119
- "ori",
120
- "osd",
121
- "pan",
122
- "pol",
123
- "por",
124
- "pus",
125
- "que",
126
- "ron",
127
- "rus",
128
- "san",
129
- "sin",
130
- "slk",
131
- "slk_frak",
132
- "slv",
133
- "snd",
134
- "spa",
135
- "spa_old",
136
- "sqi",
137
- "srp",
138
- "srp_latn",
139
- "sun",
140
- "swa",
141
- "swe",
142
- "syr",
143
- "tam",
144
- "tat",
145
- "tel",
146
- "tgk",
147
- "tgl",
148
- "tha",
149
- "tir",
150
- "ton",
151
- "tur",
152
- "uig",
153
- "ukr",
154
- "urd",
155
- "uzb",
156
- "uzb_cyrl",
157
- "vie",
158
- "yid",
159
- "yor",
160
- ]
161
-
162
32
 
163
33
  class PSMMode(Enum):
164
34
  """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
@@ -211,7 +81,7 @@ async def validate_tesseract_version() -> None:
211
81
  async def process_file(
212
82
  input_file: str | PathLike[str],
213
83
  *,
214
- language: SupportedLanguage,
84
+ language: str,
215
85
  psm: PSMMode,
216
86
  max_processes: int = DEFAULT_MAX_PROCESSES,
217
87
  ) -> ExtractionResult:
@@ -263,7 +133,7 @@ async def process_file(
263
133
  async def process_image(
264
134
  image: Image,
265
135
  *,
266
- language: SupportedLanguage,
136
+ language: str,
267
137
  psm: PSMMode,
268
138
  max_processes: int = DEFAULT_MAX_PROCESSES,
269
139
  ) -> ExtractionResult:
@@ -288,7 +158,7 @@ async def process_image(
288
158
  async def process_image_with_tesseract(
289
159
  image: Image | PathLike[str] | str,
290
160
  *,
291
- language: SupportedLanguage = "eng",
161
+ language: str = "eng",
292
162
  psm: PSMMode = PSMMode.AUTO,
293
163
  max_processes: int = DEFAULT_MAX_PROCESSES,
294
164
  ) -> ExtractionResult:
@@ -320,7 +190,7 @@ async def process_image_with_tesseract(
320
190
  async def batch_process_images(
321
191
  images: list[T],
322
192
  *,
323
- language: SupportedLanguage = "eng",
193
+ language: str = "eng",
324
194
  psm: PSMMode = PSMMode.AUTO,
325
195
  max_processes: int = DEFAULT_MAX_PROCESSES,
326
196
  ) -> list[ExtractionResult]:
kreuzberg/extraction.py CHANGED
@@ -38,7 +38,7 @@ from kreuzberg._pdf import (
38
38
  )
39
39
  from kreuzberg._pptx import extract_pptx_file_content
40
40
  from kreuzberg._string import safe_decode
41
- from kreuzberg._tesseract import PSMMode, SupportedLanguage, process_image_with_tesseract
41
+ from kreuzberg._tesseract import PSMMode, process_image_with_tesseract
42
42
  from kreuzberg._xlsx import extract_xlsx_content, extract_xlsx_file
43
43
  from kreuzberg.exceptions import ValidationError
44
44
 
@@ -52,7 +52,7 @@ async def extract_bytes(
52
52
  mime_type: str,
53
53
  *,
54
54
  force_ocr: bool = False,
55
- language: SupportedLanguage = "eng",
55
+ language: str = "eng",
56
56
  max_processes: int = DEFAULT_MAX_PROCESSES,
57
57
  psm: PSMMode = PSMMode.AUTO,
58
58
  ) -> ExtractionResult:
@@ -114,7 +114,7 @@ async def extract_file(
114
114
  mime_type: str | None = None,
115
115
  *,
116
116
  force_ocr: bool = False,
117
- language: SupportedLanguage = "eng",
117
+ language: str = "eng",
118
118
  max_processes: int = DEFAULT_MAX_PROCESSES,
119
119
  psm: PSMMode = PSMMode.AUTO,
120
120
  ) -> ExtractionResult:
@@ -170,7 +170,7 @@ async def batch_extract_file(
170
170
  file_paths: Sequence[PathLike[str] | str],
171
171
  *,
172
172
  force_ocr: bool = False,
173
- language: SupportedLanguage = "eng",
173
+ language: str = "eng",
174
174
  max_processes: int = DEFAULT_MAX_PROCESSES,
175
175
  psm: PSMMode = PSMMode.AUTO,
176
176
  ) -> list[ExtractionResult]:
@@ -209,7 +209,7 @@ async def batch_extract_bytes(
209
209
  contents: Sequence[tuple[bytes, str]],
210
210
  *,
211
211
  force_ocr: bool = False,
212
- language: SupportedLanguage = "eng",
212
+ language: str = "eng",
213
213
  max_processes: int = DEFAULT_MAX_PROCESSES,
214
214
  psm: PSMMode = PSMMode.AUTO,
215
215
  ) -> list[ExtractionResult]:
@@ -253,7 +253,7 @@ def extract_bytes_sync(
253
253
  mime_type: str,
254
254
  *,
255
255
  force_ocr: bool = False,
256
- language: SupportedLanguage = "eng",
256
+ language: str = "eng",
257
257
  max_processes: int = DEFAULT_MAX_PROCESSES,
258
258
  psm: PSMMode = PSMMode.AUTO,
259
259
  ) -> ExtractionResult:
@@ -281,7 +281,7 @@ def extract_file_sync(
281
281
  mime_type: str | None = None,
282
282
  *,
283
283
  force_ocr: bool = False,
284
- language: SupportedLanguage = "eng",
284
+ language: str = "eng",
285
285
  max_processes: int = DEFAULT_MAX_PROCESSES,
286
286
  psm: PSMMode = PSMMode.AUTO,
287
287
  ) -> ExtractionResult:
@@ -308,7 +308,7 @@ def batch_extract_file_sync(
308
308
  file_paths: Sequence[PathLike[str] | str],
309
309
  *,
310
310
  force_ocr: bool = False,
311
- language: SupportedLanguage = "eng",
311
+ language: str = "eng",
312
312
  max_processes: int = DEFAULT_MAX_PROCESSES,
313
313
  psm: PSMMode = PSMMode.AUTO,
314
314
  ) -> list[ExtractionResult]:
@@ -339,7 +339,7 @@ def batch_extract_bytes_sync(
339
339
  contents: Sequence[tuple[bytes, str]],
340
340
  *,
341
341
  force_ocr: bool = False,
342
- language: SupportedLanguage = "eng",
342
+ language: str = "eng",
343
343
  max_processes: int = DEFAULT_MAX_PROCESSES,
344
344
  psm: PSMMode = PSMMode.AUTO,
345
345
  ) -> list[ExtractionResult]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 2.0.0
3
+ Version: 2.0.1
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -64,7 +64,31 @@ Kreuzberg requires two system level dependencies:
64
64
  - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
65
65
  - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
66
66
 
67
- Please install these using their respective installation guides.
67
+ You can install these with:
68
+
69
+ #### Linux (Ubuntu)
70
+
71
+ ```shell
72
+ sudo apt-get install pandoc tesseract-ocr
73
+ ```
74
+
75
+ #### MacOS
76
+
77
+ ```shell
78
+ # MacOS
79
+ brew install tesseract pandoc
80
+ ```
81
+
82
+ #### Windows
83
+
84
+ ```shell
85
+ choco install -y tesseract pandoc
86
+ ```
87
+
88
+ Notes:
89
+
90
+ - in most distributions the tesseract-ocr package is split into multiple packages; you may need to separately install language models for any language other than English.
91
+ - please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
68
92
 
69
93
  ## Architecture
70
94
 
@@ -152,18 +176,26 @@ All extraction functions accept the following optional parameters for configurin
152
176
 
153
177
  #### OCR Configuration
154
178
 
155
- - `language` (default: "eng"): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for non-English documents. Examples:
156
- - "eng" for English
157
- - "deu" for German
158
- - "fra" for French
179
+ - `force_ocr` (default: `False`): Forces OCR processing even for searchable PDFs.
180
+ - `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
181
+
182
+ - `eng` for English
183
+ - `deu` for German
184
+ - `eng+deu` for English and German
185
+
186
+ Notes: - the order of languages affects processing time; the first language is the primary language, the second language is the secondary language, and so on.
187
+
188
+ - `psm` (Page Segmentation Mode, default: `PSMMode.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
189
+
190
+ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.
159
191
 
160
- Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information.
192
+ #### Processing Configuration
161
193
 
162
- - `psm` (Page Segmentation Mode, default: PSM.AUTO): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
194
+ - `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc.
163
195
 
164
- #### Performance Configuration
196
+ Notes:
165
197
 
166
- - `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc. Higher values can lead to performance improvements, but may cause resource exhaustion and deadlocks (especially for tesseract).
198
+ - Higher values can lead to performance improvements when batch processing, especially with OCR, but may cause resource exhaustion and deadlocks (especially for Tesseract).
167
199
 
168
200
  ### Quick Start
169
201
 
@@ -171,7 +203,7 @@ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/)
171
203
  from pathlib import Path
172
204
  from kreuzberg import extract_file
173
205
  from kreuzberg.extraction import ExtractionResult
174
- from kreuzberg._tesseract import PSMMode, SupportedLanguage
206
+ from kreuzberg._tesseract import PSMMode
175
207
 
176
208
 
177
209
  # Basic file extraction
@@ -193,7 +225,7 @@ async def extract_document():
193
225
  docx_result = await extract_file(Path("document.docx"))
194
226
  if docx_result.metadata:
195
227
  print(f"Title: {docx_result.metadata.get('title')}")
196
- print(f"Author: {docx_result.metadata.get('author')}")
228
+ print(f"Author: {docx_result.metadata.get('creator')}")
197
229
  ```
198
230
 
199
231
  ### Extracting Bytes
@@ -236,7 +268,7 @@ Kreuzberg supports efficient batch processing of multiple files or byte contents
236
268
 
237
269
  ```python
238
270
  from pathlib import Path
239
- from kreuzberg import batch_extract_file, batch_extract_bytes
271
+ from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
240
272
 
241
273
 
242
274
  # Process multiple files concurrently
@@ -1,21 +1,21 @@
1
- kreuzberg/__init__.py,sha256=3opnj4Q8Ci151QuVqPaM3sCb8mpFIRhZbZUgBmp1LI0,410
1
+ kreuzberg/__init__.py,sha256=CBRHXPhjdslaSXaUjZO5V0k57uz5_x12cwo0HTtxOcU,647
2
2
  kreuzberg/_constants.py,sha256=BXICWxbtN-22BEQDcGwCH5rLk3HZObtc9fJim1fXzDA,161
3
3
  kreuzberg/_html.py,sha256=lj4GjvCGiUeDcBjotKZuMNNsG3wOuFwP1-bJLsI99YQ,978
4
4
  kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
5
5
  kreuzberg/_pandoc.py,sha256=8sggl4nE-BWLKBecGGPnUX-gfNjnKxX-2SInuWmtWKQ,13763
6
- kreuzberg/_pdf.py,sha256=V1TVwPpGyrE0YJqnmW_5kh4Y1qWwZI5SSF-lwT_Bbac,6288
6
+ kreuzberg/_pdf.py,sha256=9YErIrRvMMFXKHckXzBDCEMzDAEnC0JVOR38gFhvHKQ,6227
7
7
  kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
8
8
  kreuzberg/_string.py,sha256=Z1c53A1-9JtzNthsnrPENxUaMyPS1VD6Oj3SNagSNgg,1093
9
9
  kreuzberg/_sync.py,sha256=3biXw0UDwcaxz-PGmfjWV5JaDE7olFpGKZdG12onxO0,981
10
- kreuzberg/_tesseract.py,sha256=xt_4MU7PfN1nZWlWBVQF6zmJnMs9pJq8yWTzPUxTqm0,9240
10
+ kreuzberg/_tesseract.py,sha256=SZsv0gFWvzR8iLaMyGr4Oc0lXE7atCR3sNxXR7TQzEE,7686
11
11
  kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
12
12
  kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
13
13
  kreuzberg/_xlsx.py,sha256=dDsNwJ_AGjUU5CQ8ExDFbiIYBauc3cEYAD-7zcP3Op0,2850
14
14
  kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
15
- kreuzberg/extraction.py,sha256=1RIs7YaUK0wcOpY1eDcIqh3n-UlJY7ZeulZPdaAxdvo,13345
15
+ kreuzberg/extraction.py,sha256=kuEKvOGhPBRcFeGX7eKmup9BukX6o55740F_KdZ15qQ,13214
16
16
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- kreuzberg-2.0.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
18
- kreuzberg-2.0.0.dist-info/METADATA,sha256=cvD9ypz004yHqePKuw8eZZcuZ2lanyN1y2jlB5FMG0Q,14201
19
- kreuzberg-2.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
20
- kreuzberg-2.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
21
- kreuzberg-2.0.0.dist-info/RECORD,,
17
+ kreuzberg-2.0.1.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
18
+ kreuzberg-2.0.1.dist-info/METADATA,sha256=KmKLubQ89i0_JwpK96kYbhuq1MuucrqHe2bCLNcbyic,15023
19
+ kreuzberg-2.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
20
+ kreuzberg-2.0.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
21
+ kreuzberg-2.0.1.dist-info/RECORD,,