kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. kreuzberg/__init__.py +16 -2
  2. kreuzberg/_chunker.py +51 -0
  3. kreuzberg/_constants.py +2 -3
  4. kreuzberg/_extractors/__init__.py +0 -0
  5. kreuzberg/_extractors/_base.py +92 -0
  6. kreuzberg/_extractors/_html.py +34 -0
  7. kreuzberg/_extractors/_image.py +74 -0
  8. kreuzberg/_extractors/_pandoc.py +613 -0
  9. kreuzberg/_extractors/_pdf.py +163 -0
  10. kreuzberg/_extractors/_presentation.py +233 -0
  11. kreuzberg/_extractors/_spread_sheet.py +125 -0
  12. kreuzberg/_mime_types.py +19 -26
  13. kreuzberg/_ocr/__init__.py +17 -0
  14. kreuzberg/_ocr/_base.py +54 -0
  15. kreuzberg/_ocr/_easyocr.py +376 -0
  16. kreuzberg/_ocr/_paddleocr.py +291 -0
  17. kreuzberg/_ocr/_tesseract.py +342 -0
  18. kreuzberg/_playa.py +276 -0
  19. kreuzberg/_registry.py +108 -0
  20. kreuzberg/_types.py +133 -36
  21. kreuzberg/_utils/__init__.py +0 -0
  22. kreuzberg/{_string.py → _utils/_string.py} +0 -2
  23. kreuzberg/_utils/_sync.py +121 -0
  24. kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
  25. kreuzberg/exceptions.py +25 -0
  26. kreuzberg/extraction.py +114 -227
  27. kreuzberg-3.0.1.dist-info/METADATA +178 -0
  28. kreuzberg-3.0.1.dist-info/RECORD +32 -0
  29. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
  30. kreuzberg/_html.py +0 -31
  31. kreuzberg/_pandoc.py +0 -366
  32. kreuzberg/_pdf.py +0 -190
  33. kreuzberg/_pptx.py +0 -88
  34. kreuzberg/_sync.py +0 -74
  35. kreuzberg/_tesseract.py +0 -231
  36. kreuzberg/_xlsx.py +0 -88
  37. kreuzberg-2.1.2.dist-info/METADATA +0 -446
  38. kreuzberg-2.1.2.dist-info/RECORD +0 -21
  39. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
  40. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,32 @@
1
+ kreuzberg/__init__.py,sha256=KZ_y21m64cafWL7goGeG3EIDutM184st28n4UGajADs,1131
2
+ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
3
+ kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
4
+ kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
5
+ kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
6
+ kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
7
+ kreuzberg/_types.py,sha256=sZMxjRZQ1c_MzxdumhYSWghW6yXBwohTUIBa5eR-FKA,6582
8
+ kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
9
+ kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
10
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
13
+ kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
14
+ kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
15
+ kreuzberg/_extractors/_pandoc.py,sha256=a6cYQxoh5G9EMrDWVcQhrTkE4Mar24sNiGCY0zOOzw4,20121
16
+ kreuzberg/_extractors/_pdf.py,sha256=dcSAXyqH8SZ-z45OUAjjwdboSEbrli0YekS8PxCaVGA,6384
17
+ kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
18
+ kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
19
+ kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
20
+ kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
21
+ kreuzberg/_ocr/_easyocr.py,sha256=VfYW66SkB2Bigbrtd7WEeJ6QZ_1Y5d8Z_rZYBPMsuk0,11037
22
+ kreuzberg/_ocr/_paddleocr.py,sha256=X5es69QMl0P6DZuuRNKWHaRtLi1OJqFs-mWHR_gVKvY,10837
23
+ kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
24
+ kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
+ kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
26
+ kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
27
+ kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
28
+ kreuzberg-3.0.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
29
+ kreuzberg-3.0.1.dist-info/METADATA,sha256=5Kt0w9rFBAina8SzbO-m2umEMRJQL-4mcPGAQASko_k,6545
30
+ kreuzberg-3.0.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
31
+ kreuzberg-3.0.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
32
+ kreuzberg-3.0.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.2)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
kreuzberg/_html.py DELETED
@@ -1,31 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import TYPE_CHECKING
4
-
5
- import html_to_markdown
6
- from anyio import Path as AsyncPath
7
-
8
- from kreuzberg import ExtractionResult
9
- from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
10
- from kreuzberg._string import normalize_spaces, safe_decode
11
-
12
- if TYPE_CHECKING:
13
- from pathlib import Path
14
-
15
-
16
- async def extract_html_string(file_path_or_contents: Path | bytes) -> ExtractionResult:
17
- """Extract text from an HTML string.
18
-
19
- Args:
20
- file_path_or_contents: The HTML content.
21
-
22
- Returns:
23
- The extracted text content.
24
- """
25
- content = (
26
- safe_decode(file_path_or_contents)
27
- if isinstance(file_path_or_contents, bytes)
28
- else await AsyncPath(file_path_or_contents).read_text()
29
- )
30
- result = html_to_markdown.convert_to_markdown(content)
31
- return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
kreuzberg/_pandoc.py DELETED
@@ -1,366 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import re
4
- import sys
5
- from json import JSONDecodeError, loads
6
- from typing import TYPE_CHECKING, Any, Final, Literal, cast
7
-
8
- from anyio import Path as AsyncPath
9
- from anyio import run_process
10
-
11
- from kreuzberg import ValidationError
12
- from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
13
- from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
14
- from kreuzberg._string import normalize_spaces
15
- from kreuzberg._sync import run_taskgroup
16
- from kreuzberg._tmp import create_temp_file
17
- from kreuzberg._types import ExtractionResult, Metadata
18
- from kreuzberg.exceptions import MissingDependencyError, ParsingError
19
-
20
- if TYPE_CHECKING: # pragma: no cover
21
- from collections.abc import Mapping
22
- from os import PathLike
23
-
24
- if sys.version_info < (3, 11): # pragma: no cover
25
- from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
26
-
27
- version_ref: Final[dict[str, bool]] = {"checked": False}
28
-
29
- # Block-level node types in Pandoc AST
30
- BLOCK_HEADER: Final = "Header" # Header with level, attributes and inline content
31
- BLOCK_PARA: Final = "Para" # Paragraph containing inline content
32
- BLOCK_CODE: Final = "CodeBlock" # Code block with attributes and string content
33
- BLOCK_QUOTE: Final = "BlockQuote" # Block quote containing blocks
34
- BLOCK_LIST: Final = "BulletList" # Bullet list containing items (blocks)
35
- BLOCK_ORDERED: Final = "OrderedList" # Numbered list with attrs and items
36
-
37
- # Inline-level node types in Pandoc AST
38
- INLINE_STR: Final = "Str" # Plain text string
39
- INLINE_SPACE: Final = "Space" # Single space
40
- INLINE_EMPH: Final = "Emph" # Emphasized text (contains inlines)
41
- INLINE_STRONG: Final = "Strong" # Strong/bold text (contains inlines)
42
- INLINE_LINK: Final = "Link" # Link with text and target
43
- INLINE_IMAGE: Final = "Image" # Image with alt text and source
44
- INLINE_CODE: Final = "Code" # Inline code span
45
- INLINE_MATH: Final = "Math" # Math expression
46
-
47
- # Metadata node types in Pandoc AST
48
- META_MAP: Final = "MetaMap" # Key-value mapping of metadata
49
- META_LIST: Final = "MetaList" # List of metadata values
50
- META_INLINES: Final = "MetaInlines" # Inline content in metadata
51
- META_STRING: Final = "MetaString" # Plain string in metadata
52
- META_BLOCKS: Final = "MetaBlocks" # Block content in metadata
53
-
54
- # Node content field name
55
- CONTENT_FIELD: Final = "c"
56
- TYPE_FIELD: Final = "t"
57
-
58
- # Valid node types
59
- NodeType = Literal[
60
- # Block types
61
- "Header",
62
- "Para",
63
- "CodeBlock",
64
- "BlockQuote",
65
- "BulletList",
66
- "OrderedList",
67
- # Inline types
68
- "Str",
69
- "Space",
70
- "Emph",
71
- "Strong",
72
- "Link",
73
- "Image",
74
- "Code",
75
- "Math",
76
- # Meta types
77
- "MetaMap",
78
- "MetaList",
79
- "MetaInlines",
80
- "MetaString",
81
- "MetaBlocks",
82
- ]
83
-
84
- MIMETYPE_TO_PANDOC_TYPE_MAPPING: Final[Mapping[str, str]] = {
85
- "application/csl+json": "csljson",
86
- "application/docbook+xml": "docbook",
87
- "application/epub+zip": "epub",
88
- "application/rtf": "rtf",
89
- "application/vnd.oasis.opendocument.text": "odt",
90
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
91
- "application/x-biblatex": "biblatex",
92
- "application/x-bibtex": "bibtex",
93
- "application/x-endnote+xml": "endnotexml",
94
- "application/x-fictionbook+xml": "fb2",
95
- "application/x-ipynb+json": "ipynb",
96
- "application/x-jats+xml": "jats",
97
- "application/x-latex": "latex",
98
- "application/x-opml+xml": "opml",
99
- "application/x-research-info-systems": "ris",
100
- "application/x-typst": "typst",
101
- "text/csv": "csv",
102
- "text/tab-separated-values": "tsv",
103
- "text/troff": "man",
104
- "text/x-commonmark": "commonmark",
105
- "text/x-dokuwiki": "dokuwiki",
106
- "text/x-gfm": "gfm",
107
- "text/x-markdown": "markdown",
108
- "text/x-markdown-extra": "markdown_phpextra",
109
- "text/x-mdoc": "mdoc",
110
- "text/x-multimarkdown": "markdown_mmd",
111
- "text/x-org": "org",
112
- "text/x-pod": "pod",
113
- "text/x-rst": "rst",
114
- }
115
-
116
- MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
117
- "application/csl+json": "json",
118
- "application/docbook+xml": "xml",
119
- "application/epub+zip": "epub",
120
- "application/rtf": "rtf",
121
- "application/vnd.oasis.opendocument.text": "odt",
122
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
123
- "application/x-biblatex": "bib",
124
- "application/x-bibtex": "bib",
125
- "application/x-endnote+xml": "xml",
126
- "application/x-fictionbook+xml": "fb2",
127
- "application/x-ipynb+json": "ipynb",
128
- "application/x-jats+xml": "xml",
129
- "application/x-latex": "tex",
130
- "application/x-opml+xml": "opml",
131
- "application/x-research-info-systems": "ris",
132
- "application/x-typst": "typst",
133
- "text/csv": "csv",
134
- "text/tab-separated-values": "tsv",
135
- "text/troff": "1",
136
- "text/x-commonmark": "md",
137
- "text/x-dokuwiki": "wiki",
138
- "text/x-gfm": "md",
139
- "text/x-markdown": "md",
140
- "text/x-markdown-extra": "md",
141
- "text/x-mdoc": "md",
142
- "text/x-multimarkdown": "md",
143
- "text/x-org": "org",
144
- "text/x-pod": "pod",
145
- "text/x-rst": "rst",
146
- }
147
-
148
-
149
- def _extract_inline_text(node: dict[str, Any]) -> str | None:
150
- if node_type := node.get(TYPE_FIELD):
151
- if node_type == INLINE_STR:
152
- return node.get(CONTENT_FIELD)
153
- if node_type == INLINE_SPACE:
154
- return " "
155
- if node_type in (INLINE_EMPH, INLINE_STRONG):
156
- return _extract_inlines(node.get(CONTENT_FIELD, []))
157
- return None # pragma: no cover
158
-
159
-
160
- def _extract_inlines(nodes: list[dict[str, Any]]) -> str | None:
161
- texts = [text for node in nodes if (text := _extract_inline_text(node))]
162
- result = "".join(texts).strip()
163
- return result if result else None
164
-
165
-
166
- def _extract_meta_value(node: Any) -> str | list[str] | None:
167
- if not isinstance(node, dict) or CONTENT_FIELD not in node or TYPE_FIELD not in node:
168
- return None
169
-
170
- content = node[CONTENT_FIELD]
171
- node_type = node[TYPE_FIELD]
172
-
173
- if not content or node_type not in {
174
- META_STRING,
175
- META_INLINES,
176
- META_LIST,
177
- META_BLOCKS,
178
- }:
179
- return None
180
-
181
- if node_type == META_STRING and isinstance(content, str):
182
- return content
183
-
184
- if isinstance(content, list) and (content := [v for v in content if isinstance(v, dict)]):
185
- if node_type == META_INLINES:
186
- return _extract_inlines(cast(list[dict[str, Any]], content))
187
-
188
- if node_type == META_LIST:
189
- results = []
190
- for value in [value for item in content if (value := _extract_meta_value(item))]:
191
- if isinstance(value, list): # pragma: no cover
192
- results.extend(value)
193
- else:
194
- results.append(value)
195
- return results
196
-
197
- # This branch is only taken for complex metadata blocks which we don't use
198
- if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]: # pragma: no cover
199
- block_texts = []
200
- for block in blocks:
201
- block_content = block.get(CONTENT_FIELD, [])
202
- if isinstance(block_content, list) and (text := _extract_inlines(block_content)):
203
- block_texts.append(text)
204
- return block_texts if block_texts else None
205
-
206
- return None
207
-
208
-
209
- def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
210
- meta: Metadata = {}
211
-
212
- for key, value in raw_meta.items():
213
- if extracted := _extract_meta_value(value):
214
- meta[key] = extracted # type: ignore[literal-required]
215
-
216
- citations = [
217
- cite["citationId"]
218
- for block in raw_meta.get("blocks", [])
219
- if block.get(TYPE_FIELD) == "Cite"
220
- for cite in block.get(CONTENT_FIELD, [[{}]])[0]
221
- if isinstance(cite, dict)
222
- ]
223
- if citations:
224
- meta["citations"] = citations
225
-
226
- return meta
227
-
228
-
229
- def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
230
- if pandoc_type := (MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
231
- return pandoc_type
232
-
233
- if any(k.startswith(mime_type) for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING):
234
- return next(
235
- MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
236
- )
237
-
238
- raise ValidationError(f"Unsupported mime type: {mime_type}")
239
-
240
-
241
- async def _validate_pandoc_version() -> None:
242
- try:
243
- if version_ref["checked"]:
244
- return
245
-
246
- command = ["pandoc", "--version"]
247
- result = await run_process(command)
248
-
249
- version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
250
- if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
251
- raise MissingDependencyError("Pandoc version 2 or above is required")
252
-
253
- version_ref["checked"] = True
254
-
255
- except FileNotFoundError as e:
256
- raise MissingDependencyError("Pandoc is not installed") from e
257
-
258
-
259
- async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
260
- pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
261
- metadata_file, unlink = await create_temp_file(".json")
262
- try:
263
- command = [
264
- "pandoc",
265
- str(input_file),
266
- f"--from={pandoc_type}",
267
- "--to=json",
268
- "--standalone",
269
- "--quiet",
270
- "--output",
271
- str(metadata_file),
272
- ]
273
-
274
- result = await run_process(command)
275
-
276
- if result.returncode != 0:
277
- raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
278
-
279
- json_data = loads(await AsyncPath(metadata_file).read_text("utf-8"))
280
- return _extract_metadata(json_data)
281
- except (RuntimeError, OSError, JSONDecodeError) as e:
282
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
283
- finally:
284
- await unlink()
285
-
286
-
287
- async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
288
- pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
289
- output_path, unlink = await create_temp_file(".md")
290
- try:
291
- command = [
292
- "pandoc",
293
- str(input_file),
294
- f"--from={pandoc_type}",
295
- "--to=markdown",
296
- "--standalone",
297
- "--wrap=preserve",
298
- "--quiet",
299
- ]
300
-
301
- command.extend(["--output", str(output_path)])
302
-
303
- result = await run_process(command)
304
-
305
- if result.returncode != 0:
306
- raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
307
-
308
- text = await AsyncPath(output_path).read_text("utf-8")
309
-
310
- return normalize_spaces(text)
311
- except (RuntimeError, OSError) as e:
312
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
313
- finally:
314
- await unlink()
315
-
316
-
317
- async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
318
- """Process a single file using Pandoc and convert to markdown.
319
-
320
- Args:
321
- input_file: The path to the file to process.
322
- mime_type: The mime type of the file.
323
-
324
- Raises:
325
- ParsingError: If the file data could not be extracted.
326
-
327
- Returns:
328
- ExtractionResult
329
- """
330
- await _validate_pandoc_version()
331
-
332
- _get_pandoc_type_from_mime_type(mime_type)
333
-
334
- try:
335
- metadata_task = _handle_extract_metadata(input_file, mime_type=mime_type)
336
- content_task = _handle_extract_file(input_file, mime_type=mime_type)
337
- results = await run_taskgroup(metadata_task, content_task)
338
- metadata, content = cast(tuple[Metadata, str], results)
339
-
340
- return ExtractionResult(
341
- content=normalize_spaces(content),
342
- metadata=metadata,
343
- mime_type=MARKDOWN_MIME_TYPE,
344
- )
345
- except ExceptionGroup as eg:
346
- raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg
347
-
348
-
349
- async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
350
- """Process content using Pandoc and convert to markdown.
351
-
352
- Args:
353
- content: The content to process.
354
- mime_type: The mime type of the content.
355
-
356
- Returns:
357
- ExtractionResult
358
- """
359
- extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
360
- input_file, unlink = await create_temp_file(f".{extension}")
361
-
362
- await AsyncPath(input_file).write_bytes(content)
363
- result = await process_file_with_pandoc(input_file, mime_type=mime_type)
364
-
365
- await unlink()
366
- return result
kreuzberg/_pdf.py DELETED
@@ -1,190 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from re import Pattern
4
- from re import compile as compile_regex
5
- from typing import TYPE_CHECKING, Final, cast
6
-
7
- import pypdfium2
8
- from anyio import Path as AsyncPath
9
-
10
- from kreuzberg import ExtractionResult
11
- from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
- from kreuzberg._string import normalize_spaces
13
- from kreuzberg._sync import run_sync
14
- from kreuzberg._tesseract import PSMMode, batch_process_images
15
- from kreuzberg.exceptions import ParsingError
16
-
17
- if TYPE_CHECKING: # pragma: no cover
18
- from pathlib import Path
19
-
20
- from PIL.Image import Image
21
-
22
-
23
- # Pattern to detect common PDF text extraction corruption:
24
- # - Control and non-printable characters
25
- # - Unicode replacement and invalid characters
26
- # - Zero-width spaces and other invisible characters
27
- CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]|\uFFFD")
28
- SHORT_TEXT_THRESHOLD: Final[int] = 50
29
- MINIMUM_CORRUPTED_RESULTS: Final[int] = 2
30
-
31
-
32
- def _validate_extracted_text(text: str, corruption_threshold: float = 0.05) -> bool:
33
- """Check if text extracted from PDF is valid or corrupted.
34
-
35
- This checks for indicators of corrupted PDF text extraction:
36
- 1. Empty or whitespace-only text
37
- 2. High concentration of control characters and null bytes
38
- 3. High concentration of Unicode replacement characters
39
-
40
- Args:
41
- text: The extracted text to validate
42
- corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
43
- characters (default: 0.05 or 5%)
44
-
45
- Returns:
46
- True if the text appears valid, False if it seems corrupted
47
- """
48
- if not text or not text.strip():
49
- return False
50
-
51
- corruption_matches = CORRUPTED_PATTERN.findall(text)
52
-
53
- if len(text) < SHORT_TEXT_THRESHOLD:
54
- return len(corruption_matches) <= MINIMUM_CORRUPTED_RESULTS
55
-
56
- return (len(corruption_matches) / len(text)) < corruption_threshold
57
-
58
-
59
- async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
60
- """Convert a PDF file to images.
61
-
62
- Args:
63
- input_file: The path to the PDF file.
64
-
65
- Raises:
66
- ParsingError: If the PDF file could not be converted to images.
67
-
68
- Returns:
69
- A list of Pillow Images.
70
- """
71
- document: pypdfium2.PdfDocument | None = None
72
- try:
73
- document = await run_sync(pypdfium2.PdfDocument, str(input_file))
74
- return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
75
- except pypdfium2.PdfiumError as e:
76
- raise ParsingError(
77
- "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
78
- ) from e
79
- finally:
80
- if document:
81
- await run_sync(document.close)
82
-
83
-
84
- async def _extract_pdf_text_with_ocr(
85
- input_file: Path,
86
- *,
87
- language: str = "eng",
88
- max_processes: int,
89
- psm: PSMMode = PSMMode.AUTO,
90
- ) -> ExtractionResult:
91
- """Extract text from a scanned PDF file using pytesseract.
92
-
93
- Args:
94
- input_file: The path to the PDF file.
95
- language: The language code for OCR. Defaults to "eng".
96
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
97
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
98
-
99
- Returns:
100
- The extracted text.
101
- """
102
- images = await _convert_pdf_to_images(input_file)
103
- ocr_results = await batch_process_images(images, max_processes=max_processes, psm=psm, language=language)
104
- return ExtractionResult(
105
- content="\n".join([v.content for v in ocr_results]), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
106
- )
107
-
108
-
109
- async def _extract_pdf_searchable_text(input_file: Path) -> str:
110
- """Extract text from a searchable PDF file using pypdfium2.
111
-
112
- Args:
113
- input_file: The path to the PDF file.
114
-
115
- Raises:
116
- ParsingError: If the text could not be extracted from the PDF file.
117
-
118
- Returns:
119
- The extracted text.
120
- """
121
- document: pypdfium2.PdfDocument | None = None
122
- try:
123
- document = await run_sync(pypdfium2.PdfDocument, str(input_file))
124
- text = "\n".join(page.get_textpage().get_text_bounded() for page in cast(pypdfium2.PdfDocument, document))
125
- return normalize_spaces(text)
126
- except pypdfium2.PdfiumError as e:
127
- raise ParsingError(
128
- "Could not extract text from PDF file", context={"file_path": str(input_file), "error": str(e)}
129
- ) from e
130
- finally:
131
- if document:
132
- await run_sync(document.close)
133
-
134
-
135
- async def extract_pdf_file(
136
- input_file: Path,
137
- *,
138
- force_ocr: bool,
139
- language: str = "eng",
140
- max_processes: int,
141
- psm: PSMMode = PSMMode.AUTO,
142
- ) -> ExtractionResult:
143
- """Extract text from a PDF file.
144
-
145
- Args:
146
- input_file: The path to the PDF file.
147
- force_ocr: Whether to force OCR on PDF files that have a text layer.
148
- language: The language code for OCR. Defaults to "eng".
149
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
150
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
151
-
152
- Returns:
153
- The extracted text.
154
- """
155
- if not force_ocr:
156
- content = await _extract_pdf_searchable_text(input_file)
157
- if _validate_extracted_text(content):
158
- return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
159
- return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
160
-
161
-
162
- async def extract_pdf_content(
163
- content: bytes,
164
- *,
165
- force_ocr: bool,
166
- language: str = "eng",
167
- max_processes: int,
168
- psm: PSMMode = PSMMode.AUTO,
169
- ) -> ExtractionResult:
170
- """Extract text from a PDF file content.
171
-
172
- Args:
173
- content: The PDF file content.
174
- force_ocr: Whether to force OCR on PDF files that have a text layer.
175
- language: The language code for OCR. Defaults to "eng".
176
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
177
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
178
-
179
- Returns:
180
- The extracted text.
181
- """
182
- from kreuzberg._tmp import create_temp_file
183
-
184
- file_path, unlink = await create_temp_file(".pdf")
185
- await AsyncPath(file_path).write_bytes(content)
186
- result = await extract_pdf_file(
187
- file_path, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
188
- )
189
- await unlink()
190
- return result