kreuzberg 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_extractors.py CHANGED
@@ -4,24 +4,24 @@ import re
  from contextlib import suppress
  from html import escape
  from io import BytesIO
- from typing import TYPE_CHECKING, cast
+ from pathlib import Path
+ from tempfile import NamedTemporaryFile
+ from typing import TYPE_CHECKING

  import html_to_markdown
  import pptx
- import pypandoc
  import pypdfium2
  from anyio import Path as AsyncPath
- from charset_normalizer import detect
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
+ from xlsx2csv import Xlsx2csv

- from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
+ from kreuzberg._pandoc import process_content, process_file
  from kreuzberg._string import normalize_spaces, safe_decode
  from kreuzberg._sync import run_sync
  from kreuzberg._tesseract import batch_process_images
  from kreuzberg.exceptions import ParsingError

  if TYPE_CHECKING: # pragma: no cover
-     from pathlib import Path
-
      from PIL.Image import Image


@@ -98,32 +98,18 @@ async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
      return await extract_pdf_with_tesseract(file_path)


- async def extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
+ async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
      """Extract text using pandoc.

      Args:
          file_data: The content of the file.
          mime_type: The mime type of the file.
-         encoding: An optional encoding to use when decoding the string.
-
-     Raises:
-         ParsingError: If the text could not be extracted from the file using pandoc.

      Returns:
          The extracted text.
      """
-     ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
-     encoding = encoding or detect(file_data)["encoding"] or "utf-8"
-     try:
-         return normalize_spaces(
-             cast(str, await run_sync(pypandoc.convert_text, file_data, to="md", format=ext, encoding=encoding))
-         )
-     except RuntimeError as e:
-         # TODO: add test case
-         raise ParsingError(
-             f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
-             context={"error": str(e)},
-         ) from e
+     result = await process_content(file_data, mime_type=mime_type)
+     return normalize_spaces(result.content)


  async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
@@ -133,20 +119,11 @@ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str
          file_path: The path to the file.
          mime_type: The mime type of the file.

-     Raises:
-         ParsingError: If the text could not be extracted from the file using pandoc.
-
      Returns:
          The extracted text.
      """
-     ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
-     try:
-         return normalize_spaces(cast(str, await run_sync(pypandoc.convert_file, file_path, to="md", format=ext)))
-     except RuntimeError as e:
-         raise ParsingError(
-             f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
-             context={"file_path": str(file_path), "error": str(e)},
-         ) from e
+     result = await process_file(file_path, mime_type=mime_type)
+     return normalize_spaces(result.content)


  async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
@@ -161,8 +138,6 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
      Returns:
          The extracted text content
      """
-     from pptx.enum.shapes import MSO_SHAPE_TYPE
-
      md_content = ""
      file_contents = (
          file_path_or_contents
@@ -221,6 +196,40 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
      return normalize_spaces(md_content)


+ async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
+     """Extract text from an XLSX file by converting it to CSV and then to markdown.
+
+     Args:
+         file_path_or_contents: The path to the XLSX file or its contents as bytes.
+
+     Returns:
+         The extracted text content.
+
+     Raises:
+         ParsingError: If the XLSX file could not be parsed.
+     """
+     try:
+         with NamedTemporaryFile(suffix=".xlsx") as xlsx_file, NamedTemporaryFile(suffix=".csv") as csv_file:
+             if isinstance(file_path_or_contents, bytes):
+                 xlsx_file.write(file_path_or_contents)
+                 xlsx_file.flush()
+                 xlsx_path = xlsx_file.name
+             else:
+                 xlsx_path = str(file_path_or_contents)
+
+             await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
+             result = await process_file(csv_file.name, mime_type="text/csv")
+             return normalize_spaces(result.content)
+     except Exception as e:
+         raise ParsingError(
+             "Could not extract text from XLSX file",
+             context={
+                 "error": str(e),
+                 "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
+             },
+         ) from e
+
+
  async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
      """Extract text from an HTML string.

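For callers, the practical effect of this change is that `.xlsx` input is now routed through `extract_xlsx_file` above (xlsx2csv converts the workbook to CSV, then Pandoc converts the CSV to markdown). A minimal usage sketch against the public API; the file name `report.xlsx` is a placeholder and the MIME type is assumed to be detectable from the extension:

```python
import asyncio

from kreuzberg import extract_file


async def main() -> None:
    # Spreadsheets go through the CSV/Pandoc path, so the result is markdown
    # rather than the plain text returned for PDFs and images.
    result = await extract_file("report.xlsx")
    print(result.mime_type)  # "text/markdown"
    print(result.content)


asyncio.run(main())
```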
kreuzberg/_mime_types.py CHANGED
@@ -10,7 +10,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
  PDF_MIME_TYPE: Final = "application/pdf"
  PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
  POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
-
+ EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
  PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}

  IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -54,49 +54,40 @@ IMAGE_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
      "image/x-portable-pixmap": "ppm",
  }
  PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
-     "application/csv",
-     "application/latex",
+     "application/csl+json",
+     "application/docbook+xml",
+     "application/epub+zip",
      "application/rtf",
      "application/vnd.oasis.opendocument.text",
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-     "application/x-csv",
+     "application/x-biblatex",
+     "application/x-bibtex",
+     "application/x-endnote+xml",
+     "application/x-fictionbook+xml",
+     "application/x-ipynb+json",
+     "application/x-jats+xml",
      "application/x-latex",
-     "application/x-rtf",
-     "application/x-vnd.oasis.opendocument.text",
+     "application/x-opml+xml",
+     "application/x-research-info-systems",
+     "application/x-typst",
      "text/csv",
-     "text/latex",
-     "text/rst",
-     "text/rtf",
      "text/tab-separated-values",
-     "text/x-csv",
-     "text/x-latex",
+     "text/troff",
+     "text/x-commonmark",
+     "text/x-dokuwiki",
+     "text/x-gfm",
+     "text/x-markdown",
+     "text/x-markdown-extra",
+     "text/x-mdoc",
+     "text/x-multimarkdown",
+     "text/x-org",
+     "text/x-pod",
      "text/x-rst",
-     "text/x-tsv",
- }
- PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
-     "application/csv": "csv",
-     "application/latex": "latex",
-     "application/rtf": "rtf",
-     "application/vnd.oasis.opendocument.text": "odt",
-     "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
-     "application/x-csv": "csv",
-     "application/x-latex": "latex",
-     "application/x-rtf": "rtf",
-     "application/x-vnd.oasis.opendocument.text": "odt",
-     "text/csv": "csv",
-     "text/latex": "latex",
-     "text/rst": "rst",
-     "text/rtf": "rtf",
-     "text/tab-separated-values": "tsv",
-     "text/x-csv": "csv",
-     "text/x-latex": "latex",
-     "text/x-rst": "rst",
-     "text/x-tsv": "tsv",
  }

  SUPPORTED_MIME_TYPES: Final[set[str]] = (
      PLAIN_TEXT_MIME_TYPES
      | IMAGE_MIME_TYPES
      | PANDOC_SUPPORTED_MIME_TYPES
-     | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
+     | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
  )
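With `PANDOC_MIME_TYPE_EXT_MAP` gone, what remains in this module are the MIME-type set constants. A short sketch of how a caller might pre-validate input against them; note that `kreuzberg._mime_types` is a private module, so importing from it is an assumption that may not hold across releases:

```python
from kreuzberg._mime_types import EXCEL_MIME_TYPE, SUPPORTED_MIME_TYPES


def is_supported(mime_type: str) -> bool:
    # SUPPORTED_MIME_TYPES is the union built at the bottom of _mime_types.py.
    return mime_type in SUPPORTED_MIME_TYPES


print(is_supported(EXCEL_MIME_TYPE))      # True: new in 1.6.0
print(is_supported("application/x-rtf"))  # False: dropped from the Pandoc set
```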
kreuzberg/_pandoc.py ADDED
@@ -0,0 +1,416 @@
+ from __future__ import annotations
+
+ import json
+ import subprocess
+ from asyncio import gather
+ from dataclasses import dataclass
+ from tempfile import NamedTemporaryFile
+ from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
+
+ from anyio import Path as AsyncPath
+
+ from kreuzberg._string import normalize_spaces
+ from kreuzberg._sync import run_sync
+ from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
+
+ if TYPE_CHECKING:
+     from collections.abc import Mapping
+     from os import PathLike
+
+ try: # pragma: no cover
+     from typing import NotRequired # type: ignore[attr-defined]
+ except ImportError: # pragma: no cover
+     from typing_extensions import NotRequired
+
+ version_ref: Final[dict[str, bool]] = {"checked": False}
+
+
+ # Block-level node types in Pandoc AST
+ BLOCK_HEADER: Final = "Header" # Header with level, attributes and inline content
+ BLOCK_PARA: Final = "Para" # Paragraph containing inline content
+ BLOCK_CODE: Final = "CodeBlock" # Code block with attributes and string content
+ BLOCK_QUOTE: Final = "BlockQuote" # Block quote containing blocks
+ BLOCK_LIST: Final = "BulletList" # Bullet list containing items (blocks)
+ BLOCK_ORDERED: Final = "OrderedList" # Numbered list with attrs and items
+
+ # Inline-level node types in Pandoc AST
+ INLINE_STR: Final = "Str" # Plain text string
+ INLINE_SPACE: Final = "Space" # Single space
+ INLINE_EMPH: Final = "Emph" # Emphasized text (contains inlines)
+ INLINE_STRONG: Final = "Strong" # Strong/bold text (contains inlines)
+ INLINE_LINK: Final = "Link" # Link with text and target
+ INLINE_IMAGE: Final = "Image" # Image with alt text and source
+ INLINE_CODE: Final = "Code" # Inline code span
+ INLINE_MATH: Final = "Math" # Math expression
+
+ # Metadata node types in Pandoc AST
+ META_MAP: Final = "MetaMap" # Key-value mapping of metadata
+ META_LIST: Final = "MetaList" # List of metadata values
+ META_INLINES: Final = "MetaInlines" # Inline content in metadata
+ META_STRING: Final = "MetaString" # Plain string in metadata
+ META_BLOCKS: Final = "MetaBlocks" # Block content in metadata
+
+ # Node content field name
+ CONTENT_FIELD: Final = "c"
+ TYPE_FIELD: Final = "t"
+
+ # Valid node types
+ NodeType = Literal[
+     # Block types
+     "Header",
+     "Para",
+     "CodeBlock",
+     "BlockQuote",
+     "BulletList",
+     "OrderedList",
+     # Inline types
+     "Str",
+     "Space",
+     "Emph",
+     "Strong",
+     "Link",
+     "Image",
+     "Code",
+     "Math",
+     # Meta types
+     "MetaMap",
+     "MetaList",
+     "MetaInlines",
+     "MetaString",
+     "MetaBlocks",
+ ]
+
+ PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
+     "application/csl+json": "csljson",
+     "application/docbook+xml": "docbook",
+     "application/epub+zip": "epub",
+     "application/rtf": "rtf",
+     "application/vnd.oasis.opendocument.text": "odt",
+     "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+     "application/x-biblatex": "biblatex",
+     "application/x-bibtex": "bibtex",
+     "application/x-endnote+xml": "endnotexml",
+     "application/x-fictionbook+xml": "fb2",
+     "application/x-ipynb+json": "ipynb",
+     "application/x-jats+xml": "jats",
+     "application/x-latex": "latex",
+     "application/x-opml+xml": "opml",
+     "application/x-research-info-systems": "ris",
+     "application/x-typst": "typst",
+     "text/csv": "csv",
+     "text/tab-separated-values": "tsv",
+     "text/troff": "man",
+     "text/x-commonmark": "commonmark",
+     "text/x-dokuwiki": "dokuwiki",
+     "text/x-gfm": "gfm",
+     "text/x-markdown": "markdown",
+     "text/x-markdown-extra": "markdown_phpextra",
+     "text/x-mdoc": "mdoc",
+     "text/x-multimarkdown": "markdown_mmd",
+     "text/x-org": "org",
+     "text/x-pod": "pod",
+     "text/x-rst": "rst",
+ }
+
+
+ class Metadata(TypedDict, total=False):
+     """Document metadata extracted from Pandoc document.
+
+     All fields are optional but will only be included if they contain non-empty values.
+     Any field that would be empty or None is omitted from the dictionary.
+     """
+
+     title: NotRequired[str]
+     """Document title."""
+     subtitle: NotRequired[str]
+     """Document subtitle."""
+     abstract: NotRequired[str | list[str]]
+     """Document abstract, summary or description."""
+     authors: NotRequired[list[str]]
+     """List of document authors."""
+     date: NotRequired[str]
+     """Document date as string to preserve original format."""
+     subject: NotRequired[str]
+     """Document subject or topic."""
+     description: NotRequired[str]
+     """Extended description."""
+     keywords: NotRequired[list[str]]
+     """Keywords or tags."""
+     categories: NotRequired[list[str]]
+     """Categories or classifications."""
+     version: NotRequired[str]
+     """Version identifier."""
+     language: NotRequired[str]
+     """Document language code."""
+     references: NotRequired[list[str]]
+     """Reference entries."""
+     citations: NotRequired[list[str]]
+     """Citation identifiers."""
+     copyright: NotRequired[str]
+     """Copyright information."""
+     license: NotRequired[str]
+     """License information."""
+     identifier: NotRequired[str]
+     """Document identifier."""
+     publisher: NotRequired[str]
+     """Publisher name."""
+     contributors: NotRequired[list[str]]
+     """Additional contributors."""
+     creator: NotRequired[str]
+     """Document creator."""
+     institute: NotRequired[str | list[str]]
+     """Institute or organization."""
+
+
+ @dataclass
+ class PandocResult:
+     """Result of a pandoc conversion including content and metadata."""
+
+     content: str
+     """The processed markdown content."""
+     metadata: Metadata
+     """Document metadata extracted from the source."""
+
+
+ def _extract_inline_text(node: dict[str, Any]) -> str | None:
+     if node_type := node.get(TYPE_FIELD):
+         if node_type == INLINE_STR:
+             return node.get(CONTENT_FIELD)
+         if node_type == INLINE_SPACE:
+             return " "
+         if node_type in (INLINE_EMPH, INLINE_STRONG):
+             return _extract_inlines(node.get(CONTENT_FIELD, []))
+     return None # pragma: no cover
+
+
+ def _extract_inlines(nodes: list[dict[str, Any]]) -> str | None:
+     texts = [text for node in nodes if (text := _extract_inline_text(node))]
+     result = "".join(texts).strip()
+     return result if result else None
+
+
+ def _extract_meta_value(node: Any) -> str | list[str] | None:
+     if not isinstance(node, dict) or CONTENT_FIELD not in node or TYPE_FIELD not in node:
+         return None
+
+     content = node[CONTENT_FIELD]
+     node_type = node[TYPE_FIELD]
+
+     if not content or node_type not in {
+         META_STRING,
+         META_INLINES,
+         META_LIST,
+         META_BLOCKS,
+     }:
+         return None
+
+     if node_type == META_STRING and isinstance(content, str):
+         return content
+
+     if isinstance(content, list) and (content := [v for v in content if isinstance(v, dict)]):
+         if node_type == META_INLINES:
+             return _extract_inlines(cast(list[dict[str, Any]], content))
+
+         if node_type == META_LIST:
+             results = []
+             for value in [value for item in content if (value := _extract_meta_value(item))]:
+                 if isinstance(value, list):
+                     results.extend(value) # pragma: no cover
+                 else:
+                     results.append(value)
+             return results
+
+         if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]:
+             block_texts = []
+             for block in blocks:
+                 block_content = block.get(CONTENT_FIELD, [])
+                 if isinstance(block_content, list) and (text := _extract_inlines(block_content)):
+                     block_texts.append(text)
+             return block_texts if block_texts else None
+
+     return None
+
+
+ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
+     """Extract all non-empty metadata values from Pandoc AST metadata."""
+     meta: Metadata = {}
+
+     for key, value in raw_meta.items():
+         if extracted := _extract_meta_value(value):
+             meta[key] = extracted # type: ignore[literal-required]
+
+     citations = [
+         cite["citationId"]
+         for block in raw_meta.get("blocks", [])
+         if block.get(TYPE_FIELD) == "Cite"
+         for cite in block.get(CONTENT_FIELD, [[{}]])[0]
+         if isinstance(cite, dict)
+     ]
+     if citations:
+         meta["citations"] = citations
+
+     return meta
+
+
+ def _get_extension_from_mime_type(mime_type: str) -> str:
+     if mime_type not in PANDOC_MIMETYPE_TO_FORMAT_MAPPING or not any(
+         mime_type.startswith(value) for value in PANDOC_MIMETYPE_TO_FORMAT_MAPPING
+     ):
+         raise ValidationError(
+             f"Unsupported mime type: {mime_type}",
+             context={
+                 "mime_type": mime_type,
+                 "supported_mimetypes": ",".join(sorted(PANDOC_MIMETYPE_TO_FORMAT_MAPPING)),
+             },
+         )
+
+     return PANDOC_MIMETYPE_TO_FORMAT_MAPPING.get(mime_type) or next(
+         PANDOC_MIMETYPE_TO_FORMAT_MAPPING[k] for k in PANDOC_MIMETYPE_TO_FORMAT_MAPPING if k.startswith(mime_type)
+     )
+
+
+ async def validate_pandoc_version() -> None:
+     """Validate that Pandoc is installed and is version 3 or above.
+
+     Raises:
+         MissingDependencyError: If Pandoc is not installed or is below version 3.
+     """
+     try:
+         if version_ref["checked"]:
+             return
+
+         result = await run_sync(subprocess.run, ["pandoc", "--version"], capture_output=True)
+         version = result.stdout.decode().split("\n")[0].split()[1]
+         if not version.startswith("3."):
+             raise MissingDependencyError("Pandoc version 3 or above is required.")
+
+         version_ref["checked"] = True
+
+     except FileNotFoundError as e:
+         raise MissingDependencyError("Pandoc is not installed.") from e
+
+
+ async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
+     """Extract metadata from a document using pandoc.
+
+     Args:
+         input_file: The path to the file to process.
+         mime_type: The mime type of the file.
+
+     Raises:
+         ParsingError: If Pandoc fails to extract metadata.
+
+     Returns:
+         Dictionary containing document metadata.
+     """
+     extension = _get_extension_from_mime_type(mime_type)
+
+     with NamedTemporaryFile(suffix=".json") as metadata_file:
+         try:
+             command = [
+                 "pandoc",
+                 str(input_file),
+                 f"--from={extension}",
+                 "--to=json",
+                 "--standalone",
+                 "--quiet",
+                 "--output",
+                 metadata_file.name,
+             ]
+
+             result = await run_sync(
+                 subprocess.run,
+                 command,
+                 capture_output=True,
+             )
+
+             if result.returncode != 0:
+                 raise ParsingError(
+                     "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
+                 )
+
+             json_data = json.loads(await AsyncPath(metadata_file.name).read_text())
+             return _extract_metadata(json_data)
+
+         except (RuntimeError, OSError, json.JSONDecodeError) as e:
+             raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
+
+
+ async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
+     extension = _get_extension_from_mime_type(mime_type)
+
+     with NamedTemporaryFile(suffix=".md") as output_file:
+         command = [
+             "pandoc",
+             str(input_file),
+             f"--from={extension}",
+             "--to=markdown",
+             "--standalone",
+             "--wrap=preserve",
+             "--quiet",
+             "--output",
+             output_file.name,
+         ]
+
+         if extra_args:
+             command.extend(extra_args)
+
+         result = await run_sync(
+             subprocess.run,
+             command,
+             capture_output=True,
+         )
+
+         if result.returncode != 0:
+             raise ParsingError(
+                 "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
+             )
+
+         text = await AsyncPath(output_file.name).read_text()
+
+         return normalize_spaces(text)
+
+
+ async def process_file(
+     input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
+ ) -> PandocResult:
+     """Process a single file using Pandoc and convert to markdown.
+
+     Args:
+         input_file: The path to the file to process.
+         mime_type: The mime type of the file.
+         extra_args: Additional Pandoc command line arguments.
+
+     Returns:
+         PandocResult containing processed content and metadata.
+     """
+     await validate_pandoc_version()
+
+     metadata, content = await gather(
+         *[
+             extract_metadata(input_file, mime_type=mime_type),
+             _extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
+         ]
+     )
+     return PandocResult(
+         content=content, # type: ignore[arg-type]
+         metadata=metadata, # type: ignore[arg-type]
+     )
+
+
+ async def process_content(content: bytes, *, mime_type: str, extra_args: list[str] | None = None) -> PandocResult:
+     """Process content using Pandoc and convert to markdown.
+
+     Args:
+         content: The content to process.
+         mime_type: The mime type of the content.
+         extra_args: Additional Pandoc command line arguments.
+
+     Returns:
+         PandocResult containing processed content and metadata.
+     """
+     extension = _get_extension_from_mime_type(mime_type)
+
+     with NamedTemporaryFile(suffix=f".{extension}") as input_file:
+         await AsyncPath(input_file.name).write_bytes(content)
+         return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
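The module's two entry points, `process_file` and `process_content`, can also be exercised directly. A minimal sketch, assuming Pandoc 3.x is on PATH (otherwise `MissingDependencyError` is raised) and using `notes.epub` as a placeholder input; `kreuzberg._pandoc` is a private module, so this is illustrative rather than a documented API:

```python
import asyncio

from kreuzberg._pandoc import process_file
from kreuzberg.exceptions import MissingDependencyError, ParsingError


async def main() -> None:
    try:
        result = await process_file("notes.epub", mime_type="application/epub+zip")
    except MissingDependencyError as e:
        print(f"Pandoc 3.x is required: {e}")
    except ParsingError as e:
        print(f"Conversion failed: {e.context}")
    else:
        # PandocResult bundles the markdown output with the extracted metadata.
        print(result.metadata.get("title"))
        print(result.content[:200])


asyncio.run(main())
```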
kreuzberg/extraction.py CHANGED
@@ -22,8 +22,10 @@ from kreuzberg._extractors import (
      extract_html_string,
      extract_pdf_file,
      extract_pptx_file,
+     extract_xlsx_file,
  )
  from kreuzberg._mime_types import (
+     EXCEL_MIME_TYPE,
      HTML_MIME_TYPE,
      IMAGE_MIME_TYPE_EXT_MAP,
      IMAGE_MIME_TYPES,
@@ -75,6 +77,9 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
                  content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
              )

+     if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
+         return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
+
      if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
          with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
              temp_file.write(content)
@@ -134,6 +139,9 @@ async def extract_file(
      if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
          return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)

+     if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
+         return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
+
      if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
          return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)

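The byte-oriented path mirrors this: passing the Excel MIME type to `extract_bytes` writes the bytes to a temporary `.xlsx` file and converts from there. A short sketch (the surrounding upload handling is assumed):

```python
from kreuzberg import extract_bytes

XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


async def handle_spreadsheet_upload(data: bytes) -> str:
    # Routed to extract_xlsx_file(); the returned content is markdown.
    result = await extract_bytes(data, mime_type=XLSX_MIME)
    return result.content
```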
kreuzberg-1.6.0.dist-info/METADATA ADDED
@@ -0,0 +1,317 @@
+ Metadata-Version: 2.2
+ Name: kreuzberg
+ Version: 1.6.0
+ Summary: A text extraction library supporting PDFs, images, office documents and more
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
+ License: MIT
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Text Processing :: General
+ Classifier: Topic :: Utilities
+ Classifier: Typing :: Typed
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: anyio>=4.8.0
+ Requires-Dist: charset-normalizer>=3.4.1
+ Requires-Dist: html-to-markdown>=1.2.0
+ Requires-Dist: pypdfium2>=4.30.1
+ Requires-Dist: python-pptx>=1.0.2
+ Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
+ Requires-Dist: xlsx2csv>=0.8.4
+
+ # Kreuzberg
+
+ Kreuzberg is a modern Python library for text extraction from documents, designed for simplicity and efficiency. It provides a unified async interface for extracting text from a wide range of file formats including PDFs, images, office documents, and more.
+
+ ## Why Kreuzberg?
+
+ - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
+ - **Local Processing**: No external API calls or cloud dependencies required
+ - **Resource Efficient**: Lightweight processing without GPU requirements
+ - **Format Support**: Comprehensive support for documents, images, and text formats
+ - **Modern Python**: Built with async/await, type hints, and current best practices
+
+ Kreuzberg was created to solve text extraction needs in RAG (Retrieval Augmented Generation) applications, but it's suitable for any text extraction use case. Unlike many commercial solutions that require API calls or complex setups, Kreuzberg focuses on local processing with minimal dependencies.
+
+ ## Features
+
+ - **Universal Text Extraction**: Extract text from PDFs (both searchable and scanned), images, office documents, and more
+ - **Smart Processing**: Automatic OCR for scanned documents, encoding detection for text files
+ - **Modern Python Design**:
+   - Async-first API using `anyio`
+   - Comprehensive type hints for better IDE support
+   - Detailed error handling with context information
+ - **Production Ready**:
+   - Robust error handling
+   - Detailed debugging information
+   - Memory efficient processing
+
+ ## Installation
+
+ ### 1. Install the Python Package
+
+ ```shell
+ pip install kreuzberg
+ ```
+
+ ### 2. Install System Dependencies
+
+ Kreuzberg requires two system level dependencies:
+
+ - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
+ - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
+
+ Please install these using their respective installation guides.
+
+ ## Architecture
+
+ Kreuzberg is designed as a high-level async abstraction over established open-source tools. It integrates:
+
+ - **PDF Processing**:
+   - `pdfium2` for searchable PDFs
+   - Tesseract OCR for scanned content
+ - **Document Conversion**:
+   - Pandoc for many document and markup formats
+   - `python-pptx` for PowerPoint files
+   - `html-to-markdown` for HTML content
+   - `xlsx2csv` for Excel spreadsheets
+ - **Text Processing**:
+   - Smart encoding detection
+   - Markdown and plain text handling
+
+ ### Supported Formats
+
+ #### Document Formats
+
+ - PDF (`.pdf`, both searchable and scanned documents)
+ - Microsoft Word (`.docx`, `.doc`)
+ - PowerPoint presentations (`.pptx`)
+ - OpenDocument Text (`.odt`)
+ - Rich Text Format (`.rtf`)
+ - EPUB (`.epub`)
+ - DocBook XML (`.dbk`, `.xml`)
+ - FictionBook (`.fb2`)
+ - LaTeX (`.tex`, `.latex`)
+ - Typst (`.typ`)
+
+ #### Markup and Text Formats
+
+ - HTML (`.html`, `.htm`)
+ - Plain text (`.txt`) and Markdown (`.md`, `.markdown`)
+ - reStructuredText (`.rst`)
+ - Org-mode (`.org`)
+ - DokuWiki (`.txt`)
+ - Pod (`.pod`)
+ - Man pages (`.1`, `.2`, etc.)
+
+ #### Data and Research Formats
+
+ - Excel spreadsheets (`.xlsx`)
+ - CSV (`.csv`) and TSV (`.tsv`) files
+ - Jupyter Notebooks (`.ipynb`)
+ - BibTeX (`.bib`) and BibLaTeX (`.bib`)
+ - CSL-JSON (`.json`)
+ - EndNote XML (`.xml`)
+ - RIS (`.ris`)
+ - JATS XML (`.xml`)
+
+ #### Image Formats
+
+ - JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
+ - PNG (`.png`)
+ - TIFF (`.tiff`, `.tif`)
+ - BMP (`.bmp`)
+ - GIF (`.gif`)
+ - WebP (`.webp`)
+ - JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
+ - Portable Anymap (`.pnm`)
+ - Portable Bitmap (`.pbm`)
+ - Portable Graymap (`.pgm`)
+ - Portable Pixmap (`.ppm`)
+
+ ## Usage
+
+ Kreuzberg provides a simple, async-first API for text extraction. The library exports two main functions:
+
+ - `extract_file()`: Extract text from a file (accepts string path or `pathlib.Path`)
+ - `extract_bytes()`: Extract text from bytes (accepts a byte string)
+
+ ### Quick Start
+
+ ```python
+ from pathlib import Path
+ from kreuzberg import extract_file, extract_bytes
+
+ # Basic file extraction
+ async def extract_document():
+     # Extract from a PDF file
+     pdf_result = await extract_file("document.pdf")
+     print(f"PDF text: {pdf_result.content}")
+
+     # Extract from an image
+     img_result = await extract_file("scan.png")
+     print(f"Image text: {img_result.content}")
+
+     # Extract from Word document
+     docx_result = await extract_file(Path("document.docx"))
+     print(f"Word text: {docx_result.content}")
+ ```
+
+ ### Processing Uploaded Files
+
+ ```python
+ from kreuzberg import extract_bytes
+
+ async def process_upload(file_content: bytes, mime_type: str):
+     """Process uploaded file content with known MIME type."""
+     result = await extract_bytes(file_content, mime_type=mime_type)
+     return result.content
+
+ # Example usage with different file types
+ async def handle_uploads():
+     # Process PDF upload
+     pdf_result = await extract_bytes(pdf_bytes, mime_type="application/pdf")
+
+     # Process image upload
+     img_result = await extract_bytes(image_bytes, mime_type="image/jpeg")
+
+     # Process Word document upload
+     docx_result = await extract_bytes(docx_bytes,
+         mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
+ ```
+
+ ### Advanced Features
+
+ #### PDF Processing Options
+
+ ```python
+ from kreuzberg import extract_file
+
+ async def process_pdf():
+     # Force OCR for PDFs with embedded images or scanned content
+     result = await extract_file("document.pdf", force_ocr=True)
+
+     # Process a scanned PDF (automatically uses OCR)
+     scanned = await extract_file("scanned.pdf")
+ ```
+
+ #### ExtractionResult Object
+
+ All extraction functions return an `ExtractionResult` containing:
+
+ - `content`: The extracted text (str)
+ - `mime_type`: Output format ("text/plain" or "text/markdown" for Pandoc conversions)
+
+ ```python
+ from kreuzberg import ExtractionResult
+
+ async def process_document(path: str) -> tuple[str, str]:
+     # Access as a named tuple
+     result: ExtractionResult = await extract_file(path)
+     print(f"Content: {result.content}")
+     print(f"Format: {result.mime_type}")
+
+     # Or unpack as a tuple
+     content, mime_type = await extract_file(path)
+     return content, mime_type
+ ```
+
+ ### Error Handling
+
+ Kreuzberg provides detailed error handling with two main exception types:
+
+ ```python
+ from kreuzberg import extract_file
+ from kreuzberg.exceptions import ValidationError, ParsingError
+
+ async def safe_extract(path: str) -> str:
+     try:
+         result = await extract_file(path)
+         return result.content
+
+     except ValidationError as e:
+         # Handles input validation issues:
+         # - Unsupported file types
+         # - Missing files
+         # - Invalid MIME types
+         print(f"Invalid input: {e.message}")
+         print(f"Details: {e.context}")
+
+     except ParsingError as e:
+         # Handles processing errors:
+         # - PDF parsing failures
+         # - OCR errors
+         # - Format conversion issues
+         print(f"Processing failed: {e.message}")
+         print(f"Details: {e.context}")
+
+     return ""
+
+ # Example error contexts
+ try:
+     result = await extract_file("document.xyz")
+ except ValidationError as e:
+     # e.context might contain:
+     # {
+     #     "file_path": "document.xyz",
+     #     "error": "Unsupported file type",
+     #     "supported_types": ["pdf", "docx", ...]
+     # }
+
+ try:
+     result = await extract_file("scan.pdf")
+ except ParsingError as e:
+     # e.context might contain:
+     # {
+     #     "file_path": "scan.pdf",
+     #     "error": "OCR processing failed",
+     #     "details": "Tesseract error: Unable to process image"
+     # }
+ ```
+
+ ## Roadmap
+
+ V1:
+
+ - [x] - html file text extraction
+ - [ ] - better PDF table extraction
+ - [ ] - batch APIs
+ - [ ] - sync APIs
+
+ V2:
+
+ - [ ] - metadata extraction (breaking change)
+ - [ ] - TBD
+
+ ## Contribution
+
+ This library is open to contribution. Feel free to open issues or submit PRs. Its better to discuss issues before
+ submitting PRs to avoid disappointment.
+
+ ### Local Development
+
+ 1. Clone the repo
+ 2. Install the system dependencies
+ 3. Install the full dependencies with `uv sync`
+ 4. Install the pre-commit hooks with:
+ ```shell
+ pre-commit install && pre-commit install --hook-type commit-msg
+ ```
+ 5. Make your changes and submit a PR
+
+ ## License
+
+ This library uses the MIT license.
kreuzberg-1.6.0.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
+ kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
+ kreuzberg/_extractors.py,sha256=cbDjitvqI35Gimh27iXvEE0Zczf9jZRJZS7Do8ugVNE,7934
+ kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
+ kreuzberg/_pandoc.py,sha256=DC6y_NN_CG9dF6fhAj3WumXqKIJLjYmnql2H53_KHnE,13766
+ kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
+ kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
+ kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
+ kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
+ kreuzberg/extraction.py,sha256=G3_Uyzhe99qEib4WLE7_l1oC9JKlvoVdn3WEY56J_Wo,6572
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ kreuzberg-1.6.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+ kreuzberg-1.6.0.dist-info/METADATA,sha256=GQNbGnxmym5vAcXDivDUccdVBUGnYh-4M38xYEkKTJk,9663
+ kreuzberg-1.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ kreuzberg-1.6.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+ kreuzberg-1.6.0.dist-info/RECORD,,
kreuzberg-1.4.0.dist-info/METADATA DELETED
@@ -1,304 +0,0 @@
- Metadata-Version: 2.2
- Name: kreuzberg
- Version: 1.4.0
- Summary: A text extraction library supporting PDFs, images, office documents and more
- Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
- License: MIT
- Project-URL: homepage, https://github.com/Goldziher/kreuzberg
- Keywords: document-processing,docx,image-to-text,latex,markdown,ocr,odt,office-documents,pandoc,pdf,pdf-extraction,rag,text-extraction,text-processing
- Classifier: Development Status :: 4 - Beta
- Classifier: Intended Audience :: Developers
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
- Classifier: Topic :: Text Processing :: General
- Classifier: Topic :: Utilities
- Classifier: Typing :: Typed
- Requires-Python: >=3.9
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: anyio>=4.8.0
- Requires-Dist: charset-normalizer>=3.4.1
- Requires-Dist: html-to-markdown>=1.2.0
- Requires-Dist: pypandoc>=1.15
- Requires-Dist: pypdfium2>=4.30.1
- Requires-Dist: python-pptx>=1.0.2
-
- # Kreuzberg
-
- Kreuzberg is a library for simplified text extraction from PDF files. It's meant to offer simple, hassle free text
- extraction.
-
- Why?
-
- I am building, like many do now, a RAG focused service (checkout https://grantflow.ai). I have text extraction needs.
- There are quite a lot of commercial options out there, and several open-source + paid options.
- But I wanted something simple, which does not require expansive round-trips to an external API.
- Furthermore, I wanted something that is easy to run locally and isn't very heavy / requires a GPU.
-
- Hence, this library.
-
- ## Features
-
- - Extract text from PDFs, images, office documents and more (see supported formats below)
- - Use modern Python with async (via `anyio`) and proper type hints
- - Extensive error handling for easy debugging
-
- ## Installation
-
- 1. Begin by installing the python package:
-
- ```shell
-
- pip install kreuzberg
-
- ```
-
- 2. Install the system dependencies:
-
- - [pandoc](https://pandoc.org/installing.html) (non-pdf text extraction, GPL v2.0 licensed but used via CLI only)
- - [tesseract-ocr](https://tesseract-ocr.github.io/) (for image/PDF OCR, Apache License)
-
- ## Dependencies and Philosophy
-
- This library is built to be minimalist and simple. It also aims to utilize OSS tools for the job. Its fundamentally a
- high order async abstraction on top of other tools, think of it like the library you would bake in your code base, but
- polished and well maintained.
-
- ### Dependencies
-
- - PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
- - Images are processed using Tesseract OCR
- - Office documents and other formats are processed using Pandoc
- - PPTX files are converted using python-pptx
- - HTML files are converted using html-to-markdown
- - Plain text files are read directly with appropriate encoding detection
-
- ### Roadmap
-
- V1:
-
- - [x] - html file text extraction
- - [ ] - better PDF table extraction
- - [ ] - TBD
-
- V2:
-
- - [ ] - extra install groups (to make dependencies optional)
- - [ ] - metadata extraction (possible breaking change)
- - [ ] - TBD
-
- ### Feature Requests
-
- Feel free to open a discussion in GitHub or an issue if you have any feature requests
-
- ### Contribution
-
- Is welcome! Read guidelines below.
-
- ## Supported File Types
-
- Kreuzberg supports a wide range of file formats:
-
- ### Document Formats
-
- - PDF (`.pdf`) - both searchable and scanned documents
- - Word Documents (`.docx`, `.doc`)
- - Power Point Presentations (`.pptx`)
- - OpenDocument Text (`.odt`)
- - Rich Text Format (`.rtf`)
-
- ### Image Formats
-
- - JPEG, JPG (`.jpg`, `.jpeg`, `.pjpeg`)
- - PNG (`.png`)
- - TIFF (`.tiff`, `.tif`)
- - BMP (`.bmp`)
- - GIF (`.gif`)
- - WebP (`.webp`)
- - JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
- - Portable Anymap (`.pnm`)
- - Portable Bitmap (`.pbm`)
- - Portable Graymap (`.pgm`)
- - Portable Pixmap (`.ppm`)
-
- #### Text and Markup Formats
-
- - HTML (`.html`, `.htm`)
- - Plain Text (`.txt`)
- - Markdown (`.md`)
- - reStructuredText (`.rst`)
- - LaTeX (`.tex`)
-
- #### Data Formats
-
- - Comma-Separated Values (`.csv`)
- - Tab-Separated Values (`.tsv`)
-
- ## Usage
-
- Kreuzberg exports two async functions:
-
- - Extract text from a file (string path or `pathlib.Path`) using `extract_file()`
- - Extract text from a byte-string using `extract_bytes()`
-
- ### Extract from File
-
- ```python
- from pathlib import Path
- from kreuzberg import extract_file
-
-
- # Extract text from a PDF file
- async def extract_pdf():
-     result = await extract_file("document.pdf")
-     print(f"Extracted text: {result.content}")
-     print(f"Output mime type: {result.mime_type}")
-
-
- # Extract text from an image
- async def extract_image():
-     result = await extract_file("scan.png")
-     print(f"Extracted text: {result.content}")
-
-
- # or use Path
-
- async def extract_pdf():
-     result = await extract_file(Path("document.pdf"))
-     print(f"Extracted text: {result.content}")
-     print(f"Output mime type: {result.mime_type}")
- ```
-
- ### Extract from Bytes
-
- ```python
- from kreuzberg import extract_bytes
-
-
- # Extract text from PDF bytes
- async def process_uploaded_pdf(pdf_content: bytes):
-     result = await extract_bytes(pdf_content, mime_type="application/pdf")
-     return result.content
-
-
- # Extract text from image bytes
- async def process_uploaded_image(image_content: bytes):
-     result = await extract_bytes(image_content, mime_type="image/jpeg")
-     return result.content
- ```
-
- ### Forcing OCR
-
- When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that have text that should be extracted etc.
- You can do this by passing `force_ocr=True`:
-
- ```python
- from kreuzberg import extract_bytes
-
-
- # Extract text from PDF bytes and force OCR
- async def process_uploaded_pdf(pdf_content: bytes):
-     result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
-     return result.content
- ```
-
- ### Error Handling
-
- Kreuzberg raises two exception types:
-
- #### ValidationError
-
- Raised when there are issues with input validation:
-
- - Unsupported mime types
- - Undetectable mime types
- - Path doesn't point at an exist file
-
- #### ParsingError
-
- Raised when there are issues during the text extraction process:
-
- - PDF parsing failures
- - OCR errors
- - Pandoc conversion errors
-
- ```python
- from kreuzberg import extract_file
- from kreuzberg.exceptions import ValidationError, ParsingError
-
-
- async def safe_extract():
-     try:
-         result = await extract_file("document.doc")
-         return result.content
-     except ValidationError as e:
-         print(f"Validation error: {e.message}")
-         print(f"Context: {e.context}")
-     except ParsingError as e:
-         print(f"Parsing error: {e.message}")
-         print(f"Context: {e.context}") # Contains detailed error information
- ```
-
- Both error types include helpful context information for debugging:
-
- ```python
- try:
-     result = await extract_file("scanned.pdf")
- except ParsingError as e:
-     # e.context might contain:
-     # {
-     #     "file_path": "scanned.pdf",
-     #     "error": "Tesseract OCR failed: Unable to process image"
-     # }
- ```
-
- ### ExtractionResult
-
- All extraction functions return an ExtractionResult named tuple containing:
-
- - `content`: The extracted text as a string
- - `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
-
- ```python
- from kreuzberg import ExtractionResult
-
-
- async def process_document(path: str) -> str:
-     result: ExtractionResult = await extract_file(path)
-     return result.content
-
-
- # or access the result as tuple
-
- async def process_document(path: str) -> str:
-     content, mime_type = await extract_file(path)
-     # do something with mime_type
-     return content
- ```
-
- ## Contribution
-
- This library is open to contribution. Feel free to open issues or submit PRs. Its better to discuss issues before
- submitting PRs to avoid disappointment.
-
- ### Local Development
-
- 1. Clone the repo
- 2. Install the system dependencies
- 3. Install the full dependencies with `uv sync`
- 4. Install the pre-commit hooks with:
- ```shell
- pre-commit install && pre-commit install --hook-type commit-msg
- ```
- 5. Make your changes and submit a PR
-
- ## License
-
- This library uses the MIT license.
kreuzberg-1.4.0.dist-info/RECORD DELETED
@@ -1,14 +0,0 @@
- kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
- kreuzberg/_extractors.py,sha256=Z6fxNMODsiNGPBv8gYpZ0jrc2hPbX-56xdrVPJ-6SQ4,7658
- kreuzberg/_mime_types.py,sha256=hR6LFXWn8dtCDB05PkADYk2l__HpmETNyf4YFixhecE,2918
- kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
- kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
- kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
- kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
- kreuzberg/extraction.py,sha256=gux3fkPIs8IbIKtRGuPFWJBLB5jO6Y9JsBfhHRcpQ0k,6160
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- kreuzberg-1.4.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
- kreuzberg-1.4.0.dist-info/METADATA,sha256=ul0iSWSu_1i029aq8X4T4ZboOzWpKK8wZRuvvLVqAoQ,8503
- kreuzberg-1.4.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- kreuzberg-1.4.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
- kreuzberg-1.4.0.dist-info/RECORD,,