kreuzberg 2.1.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_mime_types.py CHANGED
@@ -15,6 +15,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
15
15
  PDF_MIME_TYPE: Final = "application/pdf"
16
16
  PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
17
17
  POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
18
+ DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
18
19
  # Excel formats
19
20
  EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
20
21
  EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
@@ -73,7 +74,7 @@ PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
73
74
  "application/epub+zip",
74
75
  "application/rtf",
75
76
  "application/vnd.oasis.opendocument.text",
76
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
77
+ DOCX_MIME_TYPE,
77
78
  "application/x-biblatex",
78
79
  "application/x-bibtex",
79
80
  "application/x-endnote+xml",
@@ -146,7 +147,7 @@ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
146
147
  ".epub": "application/epub+zip",
147
148
  ".rtf": "application/rtf",
148
149
  ".odt": "application/vnd.oasis.opendocument.text",
149
- ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
150
+ ".docx": DOCX_MIME_TYPE,
150
151
  ".bib": "application/x-bibtex",
151
152
  ".ipynb": "application/x-ipynb+json",
152
153
  ".tex": "application/x-latex",
kreuzberg/_pandoc.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
2
2
 
3
3
  import re
4
4
  import sys
5
- from functools import partial
6
5
  from json import JSONDecodeError, loads
7
6
  from typing import TYPE_CHECKING, Any, Final, Literal, cast
8
7
 
@@ -333,14 +332,14 @@ async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type
333
332
  _get_pandoc_type_from_mime_type(mime_type)
334
333
 
335
334
  try:
336
- metadata, content = await run_taskgroup(
337
- partial(_handle_extract_metadata, input_file, mime_type=mime_type),
338
- partial(_handle_extract_file, input_file, mime_type=mime_type),
339
- )
335
+ metadata_task = _handle_extract_metadata(input_file, mime_type=mime_type)
336
+ content_task = _handle_extract_file(input_file, mime_type=mime_type)
337
+ results = await run_taskgroup(metadata_task, content_task)
338
+ metadata, content = cast(tuple[Metadata, str], results)
340
339
 
341
340
  return ExtractionResult(
342
- content=normalize_spaces(cast(str, content)),
343
- metadata=cast(Metadata, metadata),
341
+ content=normalize_spaces(content),
342
+ metadata=metadata,
344
343
  mime_type=MARKDOWN_MIME_TYPE,
345
344
  )
346
345
  except ExceptionGroup as eg:
kreuzberg/_pdf.py CHANGED
@@ -24,32 +24,36 @@ if TYPE_CHECKING: # pragma: no cover
24
24
  # - Control and non-printable characters
25
25
  # - Unicode replacement and invalid characters
26
26
  # - Zero-width spaces and other invisible characters
27
- CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(
28
- r"[\x00-\x08\x0B-\x1F\x7F-\x9F]|\uFFFD|[\u200B-\u200F\u2028-\u202F]"
29
- )
27
+ CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]|\uFFFD")
28
+ SHORT_TEXT_THRESHOLD: Final[int] = 50
29
+ MINIMUM_CORRUPTED_RESULTS: Final[int] = 2
30
30
 
31
31
 
32
- def _validate_extracted_text(text: str) -> bool:
32
+ def _validate_extracted_text(text: str, corruption_threshold: float = 0.05) -> bool:
33
33
  """Check if text extracted from PDF is valid or corrupted.
34
34
 
35
- This checks for common indicators of corrupted PDF text extraction:
35
+ This checks for indicators of corrupted PDF text extraction:
36
36
  1. Empty or whitespace-only text
37
- 2. Control characters and other non-printable characters
38
- 3. Unicode replacement characters
39
- 4. Zero-width spaces and other invisible characters
37
+ 2. High concentration of control characters and null bytes
38
+ 3. High concentration of Unicode replacement characters
40
39
 
41
40
  Args:
42
41
  text: The extracted text to validate
42
+ corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
43
+ characters (default: 0.05 or 5%)
43
44
 
44
45
  Returns:
45
46
  True if the text appears valid, False if it seems corrupted
46
47
  """
47
- # Check for empty or whitespace-only text
48
48
  if not text or not text.strip():
49
49
  return False
50
50
 
51
- # Check for corruption indicators
52
- return not bool(CORRUPTED_PATTERN.search(text))
51
+ corruption_matches = CORRUPTED_PATTERN.findall(text)
52
+
53
+ if len(text) < SHORT_TEXT_THRESHOLD:
54
+ return len(corruption_matches) <= MINIMUM_CORRUPTED_RESULTS
55
+
56
+ return (len(corruption_matches) / len(text)) < corruption_threshold
53
57
 
54
58
 
55
59
  async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
@@ -148,12 +152,10 @@ async def extract_pdf_file(
148
152
  Returns:
149
153
  The extracted text.
150
154
  """
151
- if (
152
- not force_ocr
153
- and (content := await _extract_pdf_searchable_text(input_file))
154
- and _validate_extracted_text(content)
155
- ):
156
- return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
155
+ if not force_ocr:
156
+ content = await _extract_pdf_searchable_text(input_file)
157
+ if _validate_extracted_text(content):
158
+ return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
157
159
  return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
158
160
 
159
161
 
kreuzberg/_sync.py CHANGED
@@ -2,13 +2,13 @@ from __future__ import annotations
2
2
 
3
3
  import sys
4
4
  from functools import partial
5
- from typing import TYPE_CHECKING, TypeVar, cast
5
+ from typing import TYPE_CHECKING, Any, TypeVar, cast
6
6
 
7
7
  from anyio import create_task_group
8
8
  from anyio.to_thread import run_sync as any_io_run_sync
9
9
 
10
10
  if TYPE_CHECKING: # pragma: no cover
11
- from collections.abc import Callable, Coroutine
11
+ from collections.abc import Awaitable, Callable
12
12
 
13
13
  if sys.version_info >= (3, 10):
14
14
  from typing import ParamSpec
@@ -34,7 +34,7 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
34
34
  return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
35
35
 
36
36
 
37
- async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) -> list[T]:
37
+ async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
38
38
  """Run a list of coroutines concurrently.
39
39
 
40
40
  Args:
@@ -43,10 +43,10 @@ async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) ->
43
43
  Returns:
44
44
  The results of the coroutines.
45
45
  """
46
- results = cast(list[T], [None] * len(async_tasks))
46
+ results: list[Any] = [None] * len(async_tasks)
47
47
 
48
- async def run_task(index: int, task: Callable[[], Coroutine[None, None, T]]) -> None:
49
- results[index] = await task()
48
+ async def run_task(index: int, task: Awaitable[T]) -> None:
49
+ results[index] = await task
50
50
 
51
51
  async with create_task_group() as tg:
52
52
  for i, t in enumerate(async_tasks):
@@ -55,7 +55,7 @@ async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) ->
55
55
  return results
56
56
 
57
57
 
58
- async def run_taskgroup_batched(*async_tasks: Callable[[], Coroutine[None, None, T]], batch_size: int) -> list[T]:
58
+ async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
59
59
  """Run a list of coroutines concurrently in batches.
60
60
 
61
61
  Args:
@@ -65,7 +65,7 @@ async def run_taskgroup_batched(*async_tasks: Callable[[], Coroutine[None, None,
65
65
  Returns:
66
66
  The results of the coroutines.
67
67
  """
68
- results: list[T] = []
68
+ results: list[Any] = []
69
69
 
70
70
  for i in range(0, len(async_tasks), batch_size):
71
71
  batch = async_tasks[i : i + batch_size]
kreuzberg/_tesseract.py CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
3
3
  import re
4
4
  import sys
5
5
  from enum import Enum
6
- from functools import partial
7
6
  from os import PathLike
8
7
  from typing import Any, TypeVar, Union
9
8
 
@@ -226,9 +225,7 @@ async def batch_process_images(
226
225
  """
227
226
  await validate_tesseract_version()
228
227
  try:
229
- return await run_taskgroup_batched(
230
- *[partial(process_image_with_tesseract, image, language=language, psm=psm) for image in images],
231
- batch_size=max_processes,
232
- )
228
+ tasks = [process_image_with_tesseract(image, language=language, psm=psm) for image in images]
229
+ return await run_taskgroup_batched(*tasks, batch_size=max_processes)
233
230
  except ExceptionGroup as eg:
234
231
  raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg
kreuzberg/_xlsx.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
2
2
 
3
3
  import csv
4
4
  import sys
5
- from functools import partial
6
5
  from io import StringIO
7
6
  from typing import TYPE_CHECKING
8
7
 
@@ -57,9 +56,8 @@ async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
57
56
  """
58
57
  try:
59
58
  workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
60
- results = await run_taskgroup(
61
- *[partial(convert_sheet_to_text, workbook, sheet_name) for sheet_name in workbook.sheet_names]
62
- )
59
+ tasks = [convert_sheet_to_text(workbook, sheet_name) for sheet_name in workbook.sheet_names]
60
+ results: list[str] = await run_taskgroup(*tasks)
63
61
 
64
62
  return ExtractionResult(
65
63
  content="\n\n".join(results),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 2.1.0
3
+ Version: 2.1.1
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -31,7 +31,7 @@ Requires-Dist: html-to-markdown>=1.2.0
31
31
  Requires-Dist: pypdfium2>=4.30.1
32
32
  Requires-Dist: python-calamine>=0.3.1
33
33
  Requires-Dist: python-pptx>=1.0.2
34
- Requires-Dist: typing-extensions>=4.12.2; python_version < "3.11"
34
+ Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
35
35
 
36
36
  # Kreuzberg
37
37
 
@@ -62,7 +62,7 @@ pip install kreuzberg
62
62
  Kreuzberg requires two system level dependencies:
63
63
 
64
64
  - [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
65
- - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 4.
65
+ - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 5.
66
66
 
67
67
  You can install these with:
68
68
 
@@ -75,7 +75,6 @@ sudo apt-get install pandoc tesseract-ocr
75
75
  #### MacOS
76
76
 
77
77
  ```shell
78
- #
79
78
  brew install tesseract pandoc
80
79
  ```
81
80
 
@@ -1,21 +1,21 @@
1
1
  kreuzberg/__init__.py,sha256=WgGo3x09JKCk89htZuodbnYysu0ZYpkAP29dcRl5Sg0,694
2
2
  kreuzberg/_constants.py,sha256=N61ZF8xuEso8GzRGiVpqIv5yfMkQmLeH_EN9fVARYV0,249
3
3
  kreuzberg/_html.py,sha256=yM78bPjyKRaXqMp5QW9xOYe0CBd9uUhDZfjnFB1tZOY,925
4
- kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
5
- kreuzberg/_pandoc.py,sha256=lUqG1GQqezz011fLn12AUKJ_xw9gElj-S7xRO5g-Rlw,12513
6
- kreuzberg/_pdf.py,sha256=BI7ooYvvLPEX3y7lKyri4r0k6bW4pj_cmBQW1UqZiF8,6227
4
+ kreuzberg/_mime_types.py,sha256=Kuu0yWY4p0Eck8b_vdp9oamqRZc1RJaS_ZKikVD2Z2o,6431
5
+ kreuzberg/_pandoc.py,sha256=YIXaFC11N2tgVHjBd3JD_21GZ6OOVQ0UY3aKrWNfK-I,12531
6
+ kreuzberg/_pdf.py,sha256=AIwxlydZkJOU4878SaeF9cKUmzSN7o3X40Hye7z017U,6479
7
7
  kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
8
8
  kreuzberg/_string.py,sha256=pE92BF2E7BXrQ5if3uATM2enwH82ntViBpshxK-797E,1106
9
- kreuzberg/_sync.py,sha256=DepezWTfsyyeEq7VYjhWD6XFRiaEz-uCvXFUYkQMswQ,2191
10
- kreuzberg/_tesseract.py,sha256=gKGyZpa_MLLsMTpzi_VvSXFAmLxagRE-sfqH2oKFmPM,7662
9
+ kreuzberg/_sync.py,sha256=sDVH4GrpYW9SOnmu3BqKPL76xl0hxzHjTAC78aovbQA,2122
10
+ kreuzberg/_tesseract.py,sha256=0BkguZJIKlOFHkrN2mjVgaycWwolmuEv6DwpQY7n7Os,7610
11
11
  kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
12
12
  kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
13
- kreuzberg/_xlsx.py,sha256=JcQTdV38uiNdyRmHQ1DI6khN8ng4W38tIRaxonIoaHs,2703
13
+ kreuzberg/_xlsx.py,sha256=kSH7PJ33vdLgoh5LmL_bqbc4I0VgZlZUeF4ckKl6NJM,2675
14
14
  kreuzberg/exceptions.py,sha256=syDCjy8PNqVMGhD-zAuhkurLMg9bk1j1yJtvJN8cN9A,1679
15
15
  kreuzberg/extraction.py,sha256=7oc2C1_bIxrLx2r4NEyGrL9Jt6YpPxfQKMRJm6QQayo,13076
16
16
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- kreuzberg-2.1.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
18
- kreuzberg-2.1.0.dist-info/METADATA,sha256=t1NeglNqJFjWpr6WeIp-d33OikT_HIrS8FrEMGSk1hA,14844
19
- kreuzberg-2.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
20
- kreuzberg-2.1.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
21
- kreuzberg-2.1.0.dist-info/RECORD,,
17
+ kreuzberg-2.1.1.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
18
+ kreuzberg-2.1.1.dist-info/METADATA,sha256=tWRsv1bx9os2dQnU5KrQpUd4fNeQ4x-J2fXWKdcuQAA,14842
19
+ kreuzberg-2.1.1.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
20
+ kreuzberg-2.1.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
21
+ kreuzberg-2.1.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5