kreuzberg 1.0.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -46,7 +46,7 @@ Hence, this library.
46
46
 
47
47
  ## Features
48
48
 
49
- - Extract text from PDFs, images, and office documents
49
+ - Extract text from PDFs, images, office documents and more (see supported formats below)
50
50
  - Use modern Python with async (via `anyio`) and proper type hints
51
51
  - Extensive error handling for easy debugging
52
52
 
@@ -164,6 +164,21 @@ async def process_uploaded_image(image_content: bytes):
164
164
  return result.content
165
165
  ```
166
166
 
167
+ ### Forcing OCR
168
+
169
+ When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that have text that should be extracted etc.
170
+ You can do this by passing `force_ocr=True`:
171
+
172
+ ```python
173
+ from kreuzberg import extract_bytes
174
+
175
+
176
+ # Extract text from PDF bytes and force OCR
177
+ async def process_uploaded_pdf(pdf_content: bytes):
178
+ result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
179
+ return result.content
180
+ ```
181
+
167
182
  ### Error Handling
168
183
 
169
184
  Kreuzberg raises two exception types:
@@ -173,8 +188,8 @@ Kreuzberg raises two exception types:
173
188
  Raised when there are issues with input validation:
174
189
 
175
190
  - Unsupported mime types
176
- - Non-existent files
177
191
  - Undetectable mime types
192
+ - Path doesn't point at an exist file
178
193
 
179
194
  #### ParsingError
180
195
 
@@ -218,8 +233,8 @@ except ParsingError as e:
218
233
 
219
234
  All extraction functions return an ExtractionResult named tuple containing:
220
235
 
221
- - content: The extracted text as a string
222
- - mime_type: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
236
+ - `content`: The extracted text as a string
237
+ - `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
223
238
 
224
239
  ```python
225
240
  from kreuzberg import ExtractionResult
@@ -14,7 +14,7 @@ Hence, this library.
14
14
 
15
15
  ## Features
16
16
 
17
- - Extract text from PDFs, images, and office documents
17
+ - Extract text from PDFs, images, office documents and more (see supported formats below)
18
18
  - Use modern Python with async (via `anyio`) and proper type hints
19
19
  - Extensive error handling for easy debugging
20
20
 
@@ -132,6 +132,21 @@ async def process_uploaded_image(image_content: bytes):
132
132
  return result.content
133
133
  ```
134
134
 
135
+ ### Forcing OCR
136
+
137
+ When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that have text that should be extracted etc.
138
+ You can do this by passing `force_ocr=True`:
139
+
140
+ ```python
141
+ from kreuzberg import extract_bytes
142
+
143
+
144
+ # Extract text from PDF bytes and force OCR
145
+ async def process_uploaded_pdf(pdf_content: bytes):
146
+ result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
147
+ return result.content
148
+ ```
149
+
135
150
  ### Error Handling
136
151
 
137
152
  Kreuzberg raises two exception types:
@@ -141,8 +156,8 @@ Kreuzberg raises two exception types:
141
156
  Raised when there are issues with input validation:
142
157
 
143
158
  - Unsupported mime types
144
- - Non-existent files
145
159
  - Undetectable mime types
160
+ - Path doesn't point at an exist file
146
161
 
147
162
  #### ParsingError
148
163
 
@@ -186,8 +201,8 @@ except ParsingError as e:
186
201
 
187
202
  All extraction functions return an ExtractionResult named tuple containing:
188
203
 
189
- - content: The extracted text as a string
190
- - mime_type: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
204
+ - `content`: The extracted text as a string
205
+ - `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
191
206
 
192
207
  ```python
193
208
  from kreuzberg import ExtractionResult
@@ -11,7 +11,7 @@ from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
11
11
  from kreuzberg._sync import run_sync
12
12
  from kreuzberg.exceptions import ParsingError
13
13
 
14
- if TYPE_CHECKING:
14
+ if TYPE_CHECKING: # pragma: no cover
15
15
  from pathlib import Path
16
16
 
17
17
 
@@ -35,6 +35,7 @@ def _extract_pdf_with_tesseract(file_path: Path) -> str:
35
35
  text = "\n".join(image_to_string(img) for img in images)
36
36
  return text.strip()
37
37
  except (PdfiumError, TesseractError) as e:
38
+ # TODO: add test case
38
39
  raise ParsingError(
39
40
  "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
40
41
  ) from e
@@ -57,21 +58,23 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
57
58
  text = "\n".join(page.get_textpage().get_text_range() for page in document)
58
59
  return text.strip()
59
60
  except PdfiumError as e:
61
+ # TODO: add test case
60
62
  raise ParsingError(
61
63
  "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
62
64
  ) from e
63
65
 
64
66
 
65
- async def _extract_pdf_file(file_path: Path) -> str:
67
+ async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
66
68
  """Extract text from a PDF file.
67
69
 
68
70
  Args:
69
71
  file_path: The path to the PDF file.
72
+ force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
70
73
 
71
74
  Returns:
72
75
  The extracted text.
73
76
  """
74
- if content := await run_sync(_extract_pdf_with_pdfium2, file_path):
77
+ if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
75
78
  return content
76
79
 
77
80
  return await run_sync(_extract_pdf_with_tesseract, file_path)
@@ -96,6 +99,7 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
96
99
  try:
97
100
  return cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
98
101
  except RuntimeError as e:
102
+ # TODO: add test case
99
103
  raise ParsingError(
100
104
  f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
101
105
  context={"error": str(e)},
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  from typing import TYPE_CHECKING, Final
4
4
 
5
- if TYPE_CHECKING:
5
+ if TYPE_CHECKING: # pragma: no cover
6
6
  from collections.abc import Mapping
7
7
 
8
8
  MARKDOWN_MIME_TYPE: Final[str] = "text/markdown"
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from contextlib import suppress
4
+
3
5
  from charset_normalizer import detect
4
6
 
5
7
 
@@ -16,20 +18,18 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
16
18
  if not byte_data:
17
19
  return ""
18
20
 
21
+ encodings = ["utf-8", "latin-1"]
22
+
19
23
  if encoding:
20
- try:
24
+ with suppress(UnicodeDecodeError):
21
25
  return byte_data.decode(encoding, errors="ignore")
22
- except UnicodeDecodeError: # pragma: no cover
23
- pass
24
26
 
25
- encodings = ["utf-8", "latin-1"]
26
27
  if encoding := detect(byte_data).get("encoding"):
27
28
  encodings.append(encoding)
28
29
 
29
30
  for encoding in encodings:
30
- try:
31
+ with suppress(UnicodeDecodeError):
31
32
  return byte_data.decode(encoding, errors="ignore")
32
- except UnicodeDecodeError: # pragma: no cover # noqa: PERF203
33
- pass
34
33
 
34
+ # TODO: add test case
35
35
  return byte_data.decode("latin-1", errors="replace")
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, TypeVar, cast
6
6
  from anyio.to_thread import run_sync as any_io_run_sync
7
7
  from typing_extensions import ParamSpec
8
8
 
9
- if TYPE_CHECKING:
9
+ if TYPE_CHECKING: # pragma: no cover
10
10
  from collections.abc import Callable
11
11
 
12
12
  T = TypeVar("T")
@@ -35,12 +35,13 @@ class ExtractionResult(NamedTuple):
35
35
  """The mime type of the content."""
36
36
 
37
37
 
38
- async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
38
+ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False) -> ExtractionResult:
39
39
  """Extract the textual content from a given byte string representing a file's contents.
40
40
 
41
41
  Args:
42
42
  content: The content to extract.
43
43
  mime_type: The mime type of the content.
44
+ force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
44
45
 
45
46
  Raises:
46
47
  ValidationError: If the mime type is not supported.
@@ -58,7 +59,7 @@ async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
58
59
  with NamedTemporaryFile(suffix=".pdf") as temp_file:
59
60
  temp_file.write(content)
60
61
  return ExtractionResult(
61
- content=await _extract_pdf_file(Path(temp_file.name)), mime_type=PLAIN_TEXT_MIME_TYPE
62
+ content=await _extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
62
63
  )
63
64
 
64
65
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
@@ -81,12 +82,15 @@ async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
81
82
  )
82
83
 
83
84
 
84
- async def extract_file(file_path: Path | str, mime_type: str | None = None) -> ExtractionResult:
85
+ async def extract_file(
86
+ file_path: Path | str, mime_type: str | None = None, force_ocr: bool = False
87
+ ) -> ExtractionResult:
85
88
  """Extract the textual content from a given file.
86
89
 
87
90
  Args:
88
91
  file_path: The path to the file.
89
92
  mime_type: The mime type of the file.
93
+ force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
90
94
 
91
95
  Raises:
92
96
  ValidationError: If the mime type is not supported.
@@ -109,7 +113,7 @@ async def extract_file(file_path: Path | str, mime_type: str | None = None) -> E
109
113
  raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
110
114
 
111
115
  if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
112
- return ExtractionResult(content=await _extract_pdf_file(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
116
+ return ExtractionResult(content=await _extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
113
117
 
114
118
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
115
119
  return ExtractionResult(content=await _extract_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -46,7 +46,7 @@ Hence, this library.
46
46
 
47
47
  ## Features
48
48
 
49
- - Extract text from PDFs, images, and office documents
49
+ - Extract text from PDFs, images, office documents and more (see supported formats below)
50
50
  - Use modern Python with async (via `anyio`) and proper type hints
51
51
  - Extensive error handling for easy debugging
52
52
 
@@ -164,6 +164,21 @@ async def process_uploaded_image(image_content: bytes):
164
164
  return result.content
165
165
  ```
166
166
 
167
+ ### Forcing OCR
168
+
169
+ When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that have text that should be extracted etc.
170
+ You can do this by passing `force_ocr=True`:
171
+
172
+ ```python
173
+ from kreuzberg import extract_bytes
174
+
175
+
176
+ # Extract text from PDF bytes and force OCR
177
+ async def process_uploaded_pdf(pdf_content: bytes):
178
+ result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
179
+ return result.content
180
+ ```
181
+
167
182
  ### Error Handling
168
183
 
169
184
  Kreuzberg raises two exception types:
@@ -173,8 +188,8 @@ Kreuzberg raises two exception types:
173
188
  Raised when there are issues with input validation:
174
189
 
175
190
  - Unsupported mime types
176
- - Non-existent files
177
191
  - Undetectable mime types
192
+ - Path doesn't point at an exist file
178
193
 
179
194
  #### ParsingError
180
195
 
@@ -218,8 +233,8 @@ except ParsingError as e:
218
233
 
219
234
  All extraction functions return an ExtractionResult named tuple containing:
220
235
 
221
- - content: The extracted text as a string
222
- - mime_type: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
236
+ - `content`: The extracted text as a string
237
+ - `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
223
238
 
224
239
  ```python
225
240
  from kreuzberg import ExtractionResult
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "kreuzberg"
3
- version = "1.0.0"
3
+ version = "1.1.0"
4
4
  description = "A text extraction library supporting PDFs, images, office documents and more"
5
5
  readme = "README.md"
6
6
  keywords = [
@@ -128,7 +128,7 @@ source = [ "kreuzberg" ]
128
128
 
129
129
  [tool.coverage.report]
130
130
  exclude_lines = [ 'if TYPE_CHECKING:' ]
131
- fail_under = 100
131
+ fail_under = 90
132
132
 
133
133
  [tool.mypy]
134
134
  packages = [ "kreuzberg", "tests" ]
File without changes
File without changes
File without changes