kreuzberg 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_extractors.py CHANGED
@@ -4,16 +4,15 @@ import re
4
4
  from contextlib import suppress
5
5
  from html import escape
6
6
  from io import BytesIO
7
- from typing import TYPE_CHECKING, cast
7
+ from typing import TYPE_CHECKING
8
8
 
9
9
  import html_to_markdown
10
10
  import pptx
11
- import pypandoc
12
11
  import pypdfium2
13
12
  from anyio import Path as AsyncPath
14
- from charset_normalizer import detect
13
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
15
14
 
16
- from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
15
+ from kreuzberg._pandoc import process_content, process_file
17
16
  from kreuzberg._string import normalize_spaces, safe_decode
18
17
  from kreuzberg._sync import run_sync
19
18
  from kreuzberg._tesseract import batch_process_images
@@ -98,32 +97,18 @@ async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
98
97
  return await extract_pdf_with_tesseract(file_path)
99
98
 
100
99
 
101
- async def extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
100
+ async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
102
101
  """Extract text using pandoc.
103
102
 
104
103
  Args:
105
104
  file_data: The content of the file.
106
105
  mime_type: The mime type of the file.
107
- encoding: An optional encoding to use when decoding the string.
108
-
109
- Raises:
110
- ParsingError: If the text could not be extracted from the file using pandoc.
111
106
 
112
107
  Returns:
113
108
  The extracted text.
114
109
  """
115
- ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
116
- encoding = encoding or detect(file_data)["encoding"] or "utf-8"
117
- try:
118
- return normalize_spaces(
119
- cast(str, await run_sync(pypandoc.convert_text, file_data, to="md", format=ext, encoding=encoding))
120
- )
121
- except RuntimeError as e:
122
- # TODO: add test case
123
- raise ParsingError(
124
- f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
125
- context={"error": str(e)},
126
- ) from e
110
+ result = await process_content(file_data, mime_type=mime_type)
111
+ return normalize_spaces(result.content)
127
112
 
128
113
 
129
114
  async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
@@ -133,20 +118,11 @@ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str
133
118
  file_path: The path to the file.
134
119
  mime_type: The mime type of the file.
135
120
 
136
- Raises:
137
- ParsingError: If the text could not be extracted from the file using pandoc.
138
-
139
121
  Returns:
140
122
  The extracted text.
141
123
  """
142
- ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
143
- try:
144
- return normalize_spaces(cast(str, await run_sync(pypandoc.convert_file, file_path, to="md", format=ext)))
145
- except RuntimeError as e:
146
- raise ParsingError(
147
- f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
148
- context={"file_path": str(file_path), "error": str(e)},
149
- ) from e
124
+ result = await process_file(file_path, mime_type=mime_type)
125
+ return normalize_spaces(result.content)
150
126
 
151
127
 
152
128
  async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
@@ -161,8 +137,6 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
161
137
  Returns:
162
138
  The extracted text content
163
139
  """
164
- from pptx.enum.shapes import MSO_SHAPE_TYPE
165
-
166
140
  md_content = ""
167
141
  file_contents = (
168
142
  file_path_or_contents
kreuzberg/_mime_types.py CHANGED
@@ -54,44 +54,35 @@ IMAGE_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
54
54
  "image/x-portable-pixmap": "ppm",
55
55
  }
56
56
  PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
57
- "application/csv",
58
- "application/latex",
57
+ "application/csl+json",
58
+ "application/docbook+xml",
59
+ "application/epub+zip",
59
60
  "application/rtf",
60
61
  "application/vnd.oasis.opendocument.text",
61
62
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
62
- "application/x-csv",
63
+ "application/x-biblatex",
64
+ "application/x-bibtex",
65
+ "application/x-endnote+xml",
66
+ "application/x-fictionbook+xml",
67
+ "application/x-ipynb+json",
68
+ "application/x-jats+xml",
63
69
  "application/x-latex",
64
- "application/x-rtf",
65
- "application/x-vnd.oasis.opendocument.text",
70
+ "application/x-opml+xml",
71
+ "application/x-research-info-systems",
72
+ "application/x-typst",
66
73
  "text/csv",
67
- "text/latex",
68
- "text/rst",
69
- "text/rtf",
70
74
  "text/tab-separated-values",
71
- "text/x-csv",
72
- "text/x-latex",
75
+ "text/troff",
76
+ "text/x-commonmark",
77
+ "text/x-dokuwiki",
78
+ "text/x-gfm",
79
+ "text/x-markdown",
80
+ "text/x-markdown-extra",
81
+ "text/x-mdoc",
82
+ "text/x-multimarkdown",
83
+ "text/x-org",
84
+ "text/x-pod",
73
85
  "text/x-rst",
74
- "text/x-tsv",
75
- }
76
- PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
77
- "application/csv": "csv",
78
- "application/latex": "latex",
79
- "application/rtf": "rtf",
80
- "application/vnd.oasis.opendocument.text": "odt",
81
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
82
- "application/x-csv": "csv",
83
- "application/x-latex": "latex",
84
- "application/x-rtf": "rtf",
85
- "application/x-vnd.oasis.opendocument.text": "odt",
86
- "text/csv": "csv",
87
- "text/latex": "latex",
88
- "text/rst": "rst",
89
- "text/rtf": "rtf",
90
- "text/tab-separated-values": "tsv",
91
- "text/x-csv": "csv",
92
- "text/x-latex": "latex",
93
- "text/x-rst": "rst",
94
- "text/x-tsv": "tsv",
95
86
  }
96
87
 
97
88
  SUPPORTED_MIME_TYPES: Final[set[str]] = (
kreuzberg/_pandoc.py ADDED
@@ -0,0 +1,416 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import subprocess
5
+ from asyncio import gather
6
+ from dataclasses import dataclass
7
+ from tempfile import NamedTemporaryFile
8
+ from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
9
+
10
+ from anyio import Path as AsyncPath
11
+
12
+ from kreuzberg._string import normalize_spaces
13
+ from kreuzberg._sync import run_sync
14
+ from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
15
+
16
+ if TYPE_CHECKING:
17
+ from collections.abc import Mapping
18
+ from os import PathLike
19
+
20
+ try: # pragma: no cover
21
+ from typing import NotRequired # type: ignore[attr-defined]
22
+ except ImportError: # pragma: no cover
23
+ from typing_extensions import NotRequired
24
+
25
+ version_ref: Final[dict[str, bool]] = {"checked": False}
26
+
27
+
28
+ # Block-level node types in Pandoc AST
29
+ BLOCK_HEADER: Final = "Header" # Header with level, attributes and inline content
30
+ BLOCK_PARA: Final = "Para" # Paragraph containing inline content
31
+ BLOCK_CODE: Final = "CodeBlock" # Code block with attributes and string content
32
+ BLOCK_QUOTE: Final = "BlockQuote" # Block quote containing blocks
33
+ BLOCK_LIST: Final = "BulletList" # Bullet list containing items (blocks)
34
+ BLOCK_ORDERED: Final = "OrderedList" # Numbered list with attrs and items
35
+
36
+ # Inline-level node types in Pandoc AST
37
+ INLINE_STR: Final = "Str" # Plain text string
38
+ INLINE_SPACE: Final = "Space" # Single space
39
+ INLINE_EMPH: Final = "Emph" # Emphasized text (contains inlines)
40
+ INLINE_STRONG: Final = "Strong" # Strong/bold text (contains inlines)
41
+ INLINE_LINK: Final = "Link" # Link with text and target
42
+ INLINE_IMAGE: Final = "Image" # Image with alt text and source
43
+ INLINE_CODE: Final = "Code" # Inline code span
44
+ INLINE_MATH: Final = "Math" # Math expression
45
+
46
+ # Metadata node types in Pandoc AST
47
+ META_MAP: Final = "MetaMap" # Key-value mapping of metadata
48
+ META_LIST: Final = "MetaList" # List of metadata values
49
+ META_INLINES: Final = "MetaInlines" # Inline content in metadata
50
+ META_STRING: Final = "MetaString" # Plain string in metadata
51
+ META_BLOCKS: Final = "MetaBlocks" # Block content in metadata
52
+
53
+ # Node content field name
54
+ CONTENT_FIELD: Final = "c"
55
+ TYPE_FIELD: Final = "t"
56
+
57
+ # Valid node types
58
+ NodeType = Literal[
59
+ # Block types
60
+ "Header",
61
+ "Para",
62
+ "CodeBlock",
63
+ "BlockQuote",
64
+ "BulletList",
65
+ "OrderedList",
66
+ # Inline types
67
+ "Str",
68
+ "Space",
69
+ "Emph",
70
+ "Strong",
71
+ "Link",
72
+ "Image",
73
+ "Code",
74
+ "Math",
75
+ # Meta types
76
+ "MetaMap",
77
+ "MetaList",
78
+ "MetaInlines",
79
+ "MetaString",
80
+ "MetaBlocks",
81
+ ]
82
+
83
+ PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
84
+ "application/csl+json": "csljson",
85
+ "application/docbook+xml": "docbook",
86
+ "application/epub+zip": "epub",
87
+ "application/rtf": "rtf",
88
+ "application/vnd.oasis.opendocument.text": "odt",
89
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
90
+ "application/x-biblatex": "biblatex",
91
+ "application/x-bibtex": "bibtex",
92
+ "application/x-endnote+xml": "endnotexml",
93
+ "application/x-fictionbook+xml": "fb2",
94
+ "application/x-ipynb+json": "ipynb",
95
+ "application/x-jats+xml": "jats",
96
+ "application/x-latex": "latex",
97
+ "application/x-opml+xml": "opml",
98
+ "application/x-research-info-systems": "ris",
99
+ "application/x-typst": "typst",
100
+ "text/csv": "csv",
101
+ "text/tab-separated-values": "tsv",
102
+ "text/troff": "man",
103
+ "text/x-commonmark": "commonmark",
104
+ "text/x-dokuwiki": "dokuwiki",
105
+ "text/x-gfm": "gfm",
106
+ "text/x-markdown": "markdown",
107
+ "text/x-markdown-extra": "markdown_phpextra",
108
+ "text/x-mdoc": "mdoc",
109
+ "text/x-multimarkdown": "markdown_mmd",
110
+ "text/x-org": "org",
111
+ "text/x-pod": "pod",
112
+ "text/x-rst": "rst",
113
+ }
114
+
115
+
116
+ class Metadata(TypedDict, total=False):
117
+ """Document metadata extracted from Pandoc document.
118
+
119
+ All fields are optional but will only be included if they contain non-empty values.
120
+ Any field that would be empty or None is omitted from the dictionary.
121
+ """
122
+
123
+ title: NotRequired[str]
124
+ """Document title."""
125
+ subtitle: NotRequired[str]
126
+ """Document subtitle."""
127
+ abstract: NotRequired[str | list[str]]
128
+ """Document abstract, summary or description."""
129
+ authors: NotRequired[list[str]]
130
+ """List of document authors."""
131
+ date: NotRequired[str]
132
+ """Document date as string to preserve original format."""
133
+ subject: NotRequired[str]
134
+ """Document subject or topic."""
135
+ description: NotRequired[str]
136
+ """Extended description."""
137
+ keywords: NotRequired[list[str]]
138
+ """Keywords or tags."""
139
+ categories: NotRequired[list[str]]
140
+ """Categories or classifications."""
141
+ version: NotRequired[str]
142
+ """Version identifier."""
143
+ language: NotRequired[str]
144
+ """Document language code."""
145
+ references: NotRequired[list[str]]
146
+ """Reference entries."""
147
+ citations: NotRequired[list[str]]
148
+ """Citation identifiers."""
149
+ copyright: NotRequired[str]
150
+ """Copyright information."""
151
+ license: NotRequired[str]
152
+ """License information."""
153
+ identifier: NotRequired[str]
154
+ """Document identifier."""
155
+ publisher: NotRequired[str]
156
+ """Publisher name."""
157
+ contributors: NotRequired[list[str]]
158
+ """Additional contributors."""
159
+ creator: NotRequired[str]
160
+ """Document creator."""
161
+ institute: NotRequired[str | list[str]]
162
+ """Institute or organization."""
163
+
164
+
165
+ @dataclass
166
+ class PandocResult:
167
+ """Result of a pandoc conversion including content and metadata."""
168
+
169
+ content: str
170
+ """The processed markdown content."""
171
+ metadata: Metadata
172
+ """Document metadata extracted from the source."""
173
+
174
+
175
+ def _extract_inline_text(node: dict[str, Any]) -> str | None:
176
+ if node_type := node.get(TYPE_FIELD):
177
+ if node_type == INLINE_STR:
178
+ return node.get(CONTENT_FIELD)
179
+ if node_type == INLINE_SPACE:
180
+ return " "
181
+ if node_type in (INLINE_EMPH, INLINE_STRONG):
182
+ return _extract_inlines(node.get(CONTENT_FIELD, []))
183
+ return None # pragma: no cover
184
+
185
+
186
+ def _extract_inlines(nodes: list[dict[str, Any]]) -> str | None:
187
+ texts = [text for node in nodes if (text := _extract_inline_text(node))]
188
+ result = "".join(texts).strip()
189
+ return result if result else None
190
+
191
+
192
+ def _extract_meta_value(node: Any) -> str | list[str] | None:
193
+ if not isinstance(node, dict) or CONTENT_FIELD not in node or TYPE_FIELD not in node:
194
+ return None
195
+
196
+ content = node[CONTENT_FIELD]
197
+ node_type = node[TYPE_FIELD]
198
+
199
+ if not content or node_type not in {
200
+ META_STRING,
201
+ META_INLINES,
202
+ META_LIST,
203
+ META_BLOCKS,
204
+ }:
205
+ return None
206
+
207
+ if node_type == META_STRING and isinstance(content, str):
208
+ return content
209
+
210
+ if isinstance(content, list) and (content := [v for v in content if isinstance(v, dict)]):
211
+ if node_type == META_INLINES:
212
+ return _extract_inlines(cast(list[dict[str, Any]], content))
213
+
214
+ if node_type == META_LIST:
215
+ results = []
216
+ for value in [value for item in content if (value := _extract_meta_value(item))]:
217
+ if isinstance(value, list):
218
+ results.extend(value) # pragma: no cover
219
+ else:
220
+ results.append(value)
221
+ return results
222
+
223
+ if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]:
224
+ block_texts = []
225
+ for block in blocks:
226
+ block_content = block.get(CONTENT_FIELD, [])
227
+ if isinstance(block_content, list) and (text := _extract_inlines(block_content)):
228
+ block_texts.append(text)
229
+ return block_texts if block_texts else None
230
+
231
+ return None
232
+
233
+
234
+ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
235
+ """Extract all non-empty metadata values from Pandoc AST metadata."""
236
+ meta: Metadata = {}
237
+
238
+ for key, value in raw_meta.items():
239
+ if extracted := _extract_meta_value(value):
240
+ meta[key] = extracted # type: ignore[literal-required]
241
+
242
+ citations = [
243
+ cite["citationId"]
244
+ for block in raw_meta.get("blocks", [])
245
+ if block.get(TYPE_FIELD) == "Cite"
246
+ for cite in block.get(CONTENT_FIELD, [[{}]])[0]
247
+ if isinstance(cite, dict)
248
+ ]
249
+ if citations:
250
+ meta["citations"] = citations
251
+
252
+ return meta
253
+
254
+
255
+ def _get_extension_from_mime_type(mime_type: str) -> str:
256
+ if mime_type not in PANDOC_MIMETYPE_TO_FORMAT_MAPPING or not any(
257
+ mime_type.startswith(value) for value in PANDOC_MIMETYPE_TO_FORMAT_MAPPING
258
+ ):
259
+ raise ValidationError(
260
+ f"Unsupported mime type: {mime_type}",
261
+ context={
262
+ "mime_type": mime_type,
263
+ "supported_mimetypes": ",".join(sorted(PANDOC_MIMETYPE_TO_FORMAT_MAPPING)),
264
+ },
265
+ )
266
+
267
+ return PANDOC_MIMETYPE_TO_FORMAT_MAPPING.get(mime_type) or next(
268
+ PANDOC_MIMETYPE_TO_FORMAT_MAPPING[k] for k in PANDOC_MIMETYPE_TO_FORMAT_MAPPING if k.startswith(mime_type)
269
+ )
270
+
271
+
272
+ async def validate_pandoc_version() -> None:
273
+ """Validate that Pandoc is installed and is version 3 or above.
274
+
275
+ Raises:
276
+ MissingDependencyError: If Pandoc is not installed or is below version 3.
277
+ """
278
+ try:
279
+ if version_ref["checked"]:
280
+ return
281
+
282
+ result = await run_sync(subprocess.run, ["pandoc", "--version"], capture_output=True)
283
+ version = result.stdout.decode().split("\n")[0].split()[1]
284
+ if not version.startswith("3."):
285
+ raise MissingDependencyError("Pandoc version 3 or above is required.")
286
+
287
+ version_ref["checked"] = True
288
+
289
+ except FileNotFoundError as e:
290
+ raise MissingDependencyError("Pandoc is not installed.") from e
291
+
292
+
293
+ async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
294
+ """Extract metadata from a document using pandoc.
295
+
296
+ Args:
297
+ input_file: The path to the file to process.
298
+ mime_type: The mime type of the file.
299
+
300
+ Raises:
301
+ ParsingError: If Pandoc fails to extract metadata.
302
+
303
+ Returns:
304
+ Dictionary containing document metadata.
305
+ """
306
+ extension = _get_extension_from_mime_type(mime_type)
307
+
308
+ with NamedTemporaryFile(suffix=".json") as metadata_file:
309
+ try:
310
+ command = [
311
+ "pandoc",
312
+ str(input_file),
313
+ f"--from={extension}",
314
+ "--to=json",
315
+ "--standalone",
316
+ "--quiet",
317
+ "--output",
318
+ metadata_file.name,
319
+ ]
320
+
321
+ result = await run_sync(
322
+ subprocess.run,
323
+ command,
324
+ capture_output=True,
325
+ )
326
+
327
+ if result.returncode != 0:
328
+ raise ParsingError(
329
+ "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
330
+ )
331
+
332
+ json_data = json.loads(await AsyncPath(metadata_file.name).read_text())
333
+ return _extract_metadata(json_data)
334
+
335
+ except (RuntimeError, OSError, json.JSONDecodeError) as e:
336
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
337
+
338
+
339
+ async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
340
+ extension = _get_extension_from_mime_type(mime_type)
341
+
342
+ with NamedTemporaryFile(suffix=".md") as output_file:
343
+ command = [
344
+ "pandoc",
345
+ str(input_file),
346
+ f"--from={extension}",
347
+ "--to=markdown",
348
+ "--standalone",
349
+ "--wrap=preserve",
350
+ "--quiet",
351
+ "--output",
352
+ output_file.name,
353
+ ]
354
+
355
+ if extra_args:
356
+ command.extend(extra_args)
357
+
358
+ result = await run_sync(
359
+ subprocess.run,
360
+ command,
361
+ capture_output=True,
362
+ )
363
+
364
+ if result.returncode != 0:
365
+ raise ParsingError(
366
+ "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
367
+ )
368
+
369
+ text = await AsyncPath(output_file.name).read_text()
370
+
371
+ return normalize_spaces(text)
372
+
373
+
374
+ async def process_file(
375
+ input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
376
+ ) -> PandocResult:
377
+ """Process a single file using Pandoc and convert to markdown.
378
+
379
+ Args:
380
+ input_file: The path to the file to process.
381
+ mime_type: The mime type of the file.
382
+ extra_args: Additional Pandoc command line arguments.
383
+
384
+ Returns:
385
+ PandocResult containing processed content and metadata.
386
+ """
387
+ await validate_pandoc_version()
388
+
389
+ metadata, content = await gather(
390
+ *[
391
+ extract_metadata(input_file, mime_type=mime_type),
392
+ _extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
393
+ ]
394
+ )
395
+ return PandocResult(
396
+ content=content, # type: ignore[arg-type]
397
+ metadata=metadata, # type: ignore[arg-type]
398
+ )
399
+
400
+
401
+ async def process_content(content: bytes, *, mime_type: str, extra_args: list[str] | None = None) -> PandocResult:
402
+ """Process content using Pandoc and convert to markdown.
403
+
404
+ Args:
405
+ content: The content to process.
406
+ mime_type: The mime type of the content.
407
+ extra_args: Additional Pandoc command line arguments.
408
+
409
+ Returns:
410
+ PandocResult containing processed content and metadata.
411
+ """
412
+ extension = _get_extension_from_mime_type(mime_type)
413
+
414
+ with NamedTemporaryFile(suffix=f".{extension}") as input_file:
415
+ await AsyncPath(input_file.name).write_bytes(content)
416
+ return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
@@ -0,0 +1,318 @@
1
+ Metadata-Version: 2.2
2
+ Name: kreuzberg
3
+ Version: 1.5.0
4
+ Summary: A text extraction library supporting PDFs, images, office documents and more
5
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
+ License: MIT
7
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
8
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3 :: Only
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Text Processing :: General
22
+ Classifier: Topic :: Utilities
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.9
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: anyio>=4.8.0
28
+ Requires-Dist: charset-normalizer>=3.4.1
29
+ Requires-Dist: html-to-markdown>=1.2.0
30
+ Requires-Dist: pypdfium2>=4.30.1
31
+ Requires-Dist: python-pptx>=1.0.2
32
+ Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
33
+
34
+ # Kreuzberg
35
+
36
+ Kreuzberg is a modern Python library for text extraction from documents, designed for simplicity and efficiency. It provides a unified async interface for extracting text from a wide range of file formats including PDFs, images, office documents, and more.
37
+
38
+ ## Why Kreuzberg?
39
+
40
+ - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
41
+ - **Local Processing**: No external API calls or cloud dependencies required
42
+ - **Resource Efficient**: Lightweight processing without GPU requirements
43
+ - **Format Support**: Comprehensive support for documents, images, and text formats
44
+ - **Modern Python**: Built with async/await, type hints, and current best practices
45
+
46
+ Kreuzberg was created to solve text extraction needs in RAG (Retrieval Augmented Generation) applications, but it's suitable for any text extraction use case. Unlike many commercial solutions that require API calls or complex setups, Kreuzberg focuses on local processing with minimal dependencies.
47
+
48
+ ## Features
49
+
50
+ - **Universal Text Extraction**: Extract text from PDFs (both searchable and scanned), images, office documents, and more
51
+ - **Smart Processing**: Automatic OCR for scanned documents, encoding detection for text files
52
+ - **Modern Python Design**:
53
+ - Async-first API using `anyio`
54
+ - Comprehensive type hints for better IDE support
55
+ - Detailed error handling with context information
56
+ - **Production Ready**:
57
+ - Robust error handling
58
+ - Detailed debugging information
59
+ - Memory efficient processing
60
+
61
+ ## Installation
62
+
63
+ ### 1. Install the Python Package
64
+
65
+ ```shell
66
+ pip install kreuzberg
67
+ ```
68
+
69
+ ### 2. Install System Dependencies
70
+
71
+ Kreuzberg requires two open-source tools:
72
+
73
+ - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
74
+
75
+ - GPL v2.0 licensed (used via CLI only)
76
+ - Handles office documents and markup formats
77
+
78
+ - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
79
+ - Apache License
80
+ - Required for scanned documents and images
81
+
82
+ ## Architecture
83
+
84
+ Kreuzberg is designed as a high-level async abstraction over established open-source tools. It integrates:
85
+
86
+ - **PDF Processing**:
87
+ - `pdfium2` for searchable PDFs
88
+ - Tesseract OCR for scanned content
89
+ - **Document Conversion**:
90
+ - Pandoc for office documents and markup
91
+ - `python-pptx` for PowerPoint files
92
+ - `html-to-markdown` for HTML content
93
+ - **Text Processing**:
94
+ - Smart encoding detection
95
+ - Markdown and plain text handling
96
+
97
+ ### Supported Formats
98
+
99
+ #### Document Formats
100
+
101
+ - PDF (`.pdf`, both searchable and scanned documents)
102
+ - Microsoft Word (`.docx`, `.doc`)
103
+ - PowerPoint presentations (`.pptx`)
104
+ - OpenDocument Text (`.odt`)
105
+ - Rich Text Format (`.rtf`)
106
+ - EPUB (`.epub`)
107
+ - DocBook XML (`.dbk`, `.xml`)
108
+ - FictionBook (`.fb2`)
109
+ - LaTeX (`.tex`, `.latex`)
110
+ - Typst (`.typ`)
111
+
112
+ #### Markup and Text Formats
113
+
114
+ - HTML (`.html`, `.htm`)
115
+ - Plain text (`.txt`) and Markdown (`.md`, `.markdown`)
116
+ - reStructuredText (`.rst`)
117
+ - Org-mode (`.org`)
118
+ - DokuWiki (`.txt`)
119
+ - Pod (`.pod`)
120
+ - Man pages (`.1`, `.2`, etc.)
121
+
122
+ #### Data and Research Formats
123
+
124
+ - CSV (`.csv`) and TSV (`.tsv`) files
125
+ - Jupyter Notebooks (`.ipynb`)
126
+ - BibTeX (`.bib`) and BibLaTeX (`.bib`)
127
+ - CSL-JSON (`.json`)
128
+ - EndNote XML (`.xml`)
129
+ - RIS (`.ris`)
130
+ - JATS XML (`.xml`)
131
+
132
+ #### Image Formats
133
+
134
+ - JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
135
+ - PNG (`.png`)
136
+ - TIFF (`.tiff`, `.tif`)
137
+ - BMP (`.bmp`)
138
+ - GIF (`.gif`)
139
+ - WebP (`.webp`)
140
+ - JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
141
+ - Portable Anymap (`.pnm`)
142
+ - Portable Bitmap (`.pbm`)
143
+ - Portable Graymap (`.pgm`)
144
+ - Portable Pixmap (`.ppm`)
145
+
146
+ ## Usage
147
+
148
+ Kreuzberg provides a simple, async-first API for text extraction. The library exports two main functions:
149
+
150
+ - `extract_file()`: Extract text from a file (accepts string path or `pathlib.Path`)
151
+ - `extract_bytes()`: Extract text from bytes (accepts a byte string)
152
+
153
+ ### Quick Start
154
+
155
+ ```python
156
+ from pathlib import Path
157
+ from kreuzberg import extract_file, extract_bytes
158
+
159
+ # Basic file extraction
160
+ async def extract_document():
161
+ # Extract from a PDF file
162
+ pdf_result = await extract_file("document.pdf")
163
+ print(f"PDF text: {pdf_result.content}")
164
+
165
+ # Extract from an image
166
+ img_result = await extract_file("scan.png")
167
+ print(f"Image text: {img_result.content}")
168
+
169
+ # Extract from Word document
170
+ docx_result = await extract_file(Path("document.docx"))
171
+ print(f"Word text: {docx_result.content}")
172
+ ```
173
+
174
+ ### Processing Uploaded Files
175
+
176
+ ```python
177
+ from kreuzberg import extract_bytes
178
+
179
+ async def process_upload(file_content: bytes, mime_type: str):
180
+ """Process uploaded file content with known MIME type."""
181
+ result = await extract_bytes(file_content, mime_type=mime_type)
182
+ return result.content
183
+
184
+ # Example usage with different file types
185
+ async def handle_uploads():
186
+ # Process PDF upload
187
+ pdf_result = await extract_bytes(pdf_bytes, mime_type="application/pdf")
188
+
189
+ # Process image upload
190
+ img_result = await extract_bytes(image_bytes, mime_type="image/jpeg")
191
+
192
+ # Process Word document upload
193
+ docx_result = await extract_bytes(docx_bytes,
194
+ mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
195
+ ```
196
+
197
+ ### Advanced Features
198
+
199
+ #### PDF Processing Options
200
+
201
+ ```python
202
+ from kreuzberg import extract_file
203
+
204
+ async def process_pdf():
205
+ # Force OCR for PDFs with embedded images or scanned content
206
+ result = await extract_file("document.pdf", force_ocr=True)
207
+
208
+ # Process a scanned PDF (automatically uses OCR)
209
+ scanned = await extract_file("scanned.pdf")
210
+ ```
211
+
212
+ #### ExtractionResult Object
213
+
214
+ All extraction functions return an `ExtractionResult` containing:
215
+
216
+ - `content`: The extracted text (str)
217
+ - `mime_type`: Output format ("text/plain" or "text/markdown" for Pandoc conversions)
218
+
219
+ ```python
220
+ from kreuzberg import ExtractionResult
221
+
222
+ async def process_document(path: str) -> tuple[str, str]:
223
+ # Access as a named tuple
224
+ result: ExtractionResult = await extract_file(path)
225
+ print(f"Content: {result.content}")
226
+ print(f"Format: {result.mime_type}")
227
+
228
+ # Or unpack as a tuple
229
+ content, mime_type = await extract_file(path)
230
+ return content, mime_type
231
+ ```
232
+
233
+ ### Error Handling
234
+
235
+ Kreuzberg provides detailed error handling with two main exception types:
236
+
237
+ ```python
238
+ from kreuzberg import extract_file
239
+ from kreuzberg.exceptions import ValidationError, ParsingError
240
+
241
+ async def safe_extract(path: str) -> str:
242
+ try:
243
+ result = await extract_file(path)
244
+ return result.content
245
+
246
+ except ValidationError as e:
247
+ # Handles input validation issues:
248
+ # - Unsupported file types
249
+ # - Missing files
250
+ # - Invalid MIME types
251
+ print(f"Invalid input: {e.message}")
252
+ print(f"Details: {e.context}")
253
+
254
+ except ParsingError as e:
255
+ # Handles processing errors:
256
+ # - PDF parsing failures
257
+ # - OCR errors
258
+ # - Format conversion issues
259
+ print(f"Processing failed: {e.message}")
260
+ print(f"Details: {e.context}")
261
+
262
+ return ""
263
+
264
+ # Example error contexts
265
+ try:
266
+ result = await extract_file("document.xyz")
267
+ except ValidationError as e:
268
+ # e.context might contain:
269
+ # {
270
+ # "file_path": "document.xyz",
271
+ # "error": "Unsupported file type",
272
+ # "supported_types": ["pdf", "docx", ...]
273
+ # }
274
+
275
+ try:
276
+ result = await extract_file("scan.pdf")
277
+ except ParsingError as e:
278
+ # e.context might contain:
279
+ # {
280
+ # "file_path": "scan.pdf",
281
+ # "error": "OCR processing failed",
282
+ # "details": "Tesseract error: Unable to process image"
283
+ # }
284
+ ```
285
+
286
+ ## Roadmap
287
+
288
+ V1:
289
+
290
+ - [x] - html file text extraction
291
+ - [ ] - better PDF table extraction
292
+ - [ ] - batch APIs
293
+ - [ ] - sync APIs
294
+
295
+ V2:
296
+
297
+ - [ ] - metadata extraction (breaking change)
298
+ - [ ] - TBD
299
+
300
+ ## Contribution
301
+
302
+ This library is open to contribution. Feel free to open issues or submit PRs. Its better to discuss issues before
303
+ submitting PRs to avoid disappointment.
304
+
305
+ ### Local Development
306
+
307
+ 1. Clone the repo
308
+ 2. Install the system dependencies
309
+ 3. Install the full dependencies with `uv sync`
310
+ 4. Install the pre-commit hooks with:
311
+ ```shell
312
+ pre-commit install && pre-commit install --hook-type commit-msg
313
+ ```
314
+ 5. Make your changes and submit a PR
315
+
316
+ ## License
317
+
318
+ This library uses the MIT license.
@@ -1,14 +1,15 @@
1
1
  kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
2
- kreuzberg/_extractors.py,sha256=Z6fxNMODsiNGPBv8gYpZ0jrc2hPbX-56xdrVPJ-6SQ4,7658
3
- kreuzberg/_mime_types.py,sha256=hR6LFXWn8dtCDB05PkADYk2l__HpmETNyf4YFixhecE,2918
2
+ kreuzberg/_extractors.py,sha256=k6xO_2ItaftPmlqzfXyxTn8rdaWdwrJHGziBbo7gCio,6599
3
+ kreuzberg/_mime_types.py,sha256=0ZYtRrMAaKpCMDkhpTbWAXHCsVob5MFRMGlbni8iYSA,2573
4
+ kreuzberg/_pandoc.py,sha256=DC6y_NN_CG9dF6fhAj3WumXqKIJLjYmnql2H53_KHnE,13766
4
5
  kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
5
6
  kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
6
7
  kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
7
8
  kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
8
9
  kreuzberg/extraction.py,sha256=gux3fkPIs8IbIKtRGuPFWJBLB5jO6Y9JsBfhHRcpQ0k,6160
9
10
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- kreuzberg-1.4.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
11
- kreuzberg-1.4.0.dist-info/METADATA,sha256=ul0iSWSu_1i029aq8X4T4ZboOzWpKK8wZRuvvLVqAoQ,8503
12
- kreuzberg-1.4.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
13
- kreuzberg-1.4.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
14
- kreuzberg-1.4.0.dist-info/RECORD,,
11
+ kreuzberg-1.5.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
12
+ kreuzberg-1.5.0.dist-info/METADATA,sha256=O462ss7M6Cb8cO6fJXwqsOdzkzaZekqa1oGwb7Vrgx8,9641
13
+ kreuzberg-1.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
14
+ kreuzberg-1.5.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
15
+ kreuzberg-1.5.0.dist-info/RECORD,,
@@ -1,304 +0,0 @@
1
- Metadata-Version: 2.2
2
- Name: kreuzberg
3
- Version: 1.4.0
4
- Summary: A text extraction library supporting PDFs, images, office documents and more
5
- Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
- License: MIT
7
- Project-URL: homepage, https://github.com/Goldziher/kreuzberg
8
- Keywords: document-processing,docx,image-to-text,latex,markdown,ocr,odt,office-documents,pandoc,pdf,pdf-extraction,rag,text-extraction,text-processing
9
- Classifier: Development Status :: 4 - Beta
10
- Classifier: Intended Audience :: Developers
11
- Classifier: License :: OSI Approved :: MIT License
12
- Classifier: Operating System :: OS Independent
13
- Classifier: Programming Language :: Python :: 3 :: Only
14
- Classifier: Programming Language :: Python :: 3.9
15
- Classifier: Programming Language :: Python :: 3.10
16
- Classifier: Programming Language :: Python :: 3.11
17
- Classifier: Programming Language :: Python :: 3.12
18
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
- Classifier: Topic :: Text Processing :: General
21
- Classifier: Topic :: Utilities
22
- Classifier: Typing :: Typed
23
- Requires-Python: >=3.9
24
- Description-Content-Type: text/markdown
25
- License-File: LICENSE
26
- Requires-Dist: anyio>=4.8.0
27
- Requires-Dist: charset-normalizer>=3.4.1
28
- Requires-Dist: html-to-markdown>=1.2.0
29
- Requires-Dist: pypandoc>=1.15
30
- Requires-Dist: pypdfium2>=4.30.1
31
- Requires-Dist: python-pptx>=1.0.2
32
-
33
- # Kreuzberg
34
-
35
- Kreuzberg is a library for simplified text extraction from PDF files. It's meant to offer simple, hassle-free text
36
- extraction.
37
-
38
- Why?
39
-
40
- I am building, like many do now, a RAG focused service (check out https://grantflow.ai). I have text extraction needs.
41
- There are quite a lot of commercial options out there, and several open-source + paid options.
42
- But I wanted something simple, which does not require expansive round-trips to an external API.
43
- Furthermore, I wanted something that is easy to run locally and isn't very heavy / requires a GPU.
44
-
45
- Hence, this library.
46
-
47
- ## Features
48
-
49
- - Extract text from PDFs, images, office documents and more (see supported formats below)
50
- - Use modern Python with async (via `anyio`) and proper type hints
51
- - Extensive error handling for easy debugging
52
-
53
- ## Installation
54
-
55
- 1. Begin by installing the python package:
56
-
57
- ```shell
58
-
59
- pip install kreuzberg
60
-
61
- ```
62
-
63
- 2. Install the system dependencies:
64
-
65
- - [pandoc](https://pandoc.org/installing.html) (non-pdf text extraction, GPL v2.0 licensed but used via CLI only)
66
- - [tesseract-ocr](https://tesseract-ocr.github.io/) (for image/PDF OCR, Apache License)
67
-
68
- ## Dependencies and Philosophy
69
-
70
- This library is built to be minimalist and simple. It also aims to utilize OSS tools for the job. It's fundamentally a
71
- high order async abstraction on top of other tools, think of it like the library you would bake in your code base, but
72
- polished and well maintained.
73
-
74
- ### Dependencies
75
-
76
- - PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
77
- - Images are processed using Tesseract OCR
78
- - Office documents and other formats are processed using Pandoc
79
- - PPTX files are converted using python-pptx
80
- - HTML files are converted using html-to-markdown
81
- - Plain text files are read directly with appropriate encoding detection
82
-
83
- ### Roadmap
84
-
85
- V1:
86
-
87
- - [x] - html file text extraction
88
- - [ ] - better PDF table extraction
89
- - [ ] - TBD
90
-
91
- V2:
92
-
93
- - [ ] - extra install groups (to make dependencies optional)
94
- - [ ] - metadata extraction (possible breaking change)
95
- - [ ] - TBD
96
-
97
- ### Feature Requests
98
-
99
- Feel free to open a discussion in GitHub or an issue if you have any feature requests
100
-
101
- ### Contribution
102
-
103
- Is welcome! Read guidelines below.
104
-
105
- ## Supported File Types
106
-
107
- Kreuzberg supports a wide range of file formats:
108
-
109
- ### Document Formats
110
-
111
- - PDF (`.pdf`) - both searchable and scanned documents
112
- - Word Documents (`.docx`, `.doc`)
113
- - PowerPoint Presentations (`.pptx`)
114
- - OpenDocument Text (`.odt`)
115
- - Rich Text Format (`.rtf`)
116
-
117
- ### Image Formats
118
-
119
- - JPEG, JPG (`.jpg`, `.jpeg`, `.pjpeg`)
120
- - PNG (`.png`)
121
- - TIFF (`.tiff`, `.tif`)
122
- - BMP (`.bmp`)
123
- - GIF (`.gif`)
124
- - WebP (`.webp`)
125
- - JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
126
- - Portable Anymap (`.pnm`)
127
- - Portable Bitmap (`.pbm`)
128
- - Portable Graymap (`.pgm`)
129
- - Portable Pixmap (`.ppm`)
130
-
131
- #### Text and Markup Formats
132
-
133
- - HTML (`.html`, `.htm`)
134
- - Plain Text (`.txt`)
135
- - Markdown (`.md`)
136
- - reStructuredText (`.rst`)
137
- - LaTeX (`.tex`)
138
-
139
- #### Data Formats
140
-
141
- - Comma-Separated Values (`.csv`)
142
- - Tab-Separated Values (`.tsv`)
143
-
144
- ## Usage
145
-
146
- Kreuzberg exports two async functions:
147
-
148
- - Extract text from a file (string path or `pathlib.Path`) using `extract_file()`
149
- - Extract text from a byte-string using `extract_bytes()`
150
-
151
- ### Extract from File
152
-
153
- ```python
154
- from pathlib import Path
155
- from kreuzberg import extract_file
156
-
157
-
158
- # Extract text from a PDF file
159
- async def extract_pdf():
160
- result = await extract_file("document.pdf")
161
- print(f"Extracted text: {result.content}")
162
- print(f"Output mime type: {result.mime_type}")
163
-
164
-
165
- # Extract text from an image
166
- async def extract_image():
167
- result = await extract_file("scan.png")
168
- print(f"Extracted text: {result.content}")
169
-
170
-
171
- # or use Path
172
-
173
- async def extract_pdf():
174
- result = await extract_file(Path("document.pdf"))
175
- print(f"Extracted text: {result.content}")
176
- print(f"Output mime type: {result.mime_type}")
177
- ```
178
-
179
- ### Extract from Bytes
180
-
181
- ```python
182
- from kreuzberg import extract_bytes
183
-
184
-
185
- # Extract text from PDF bytes
186
- async def process_uploaded_pdf(pdf_content: bytes):
187
- result = await extract_bytes(pdf_content, mime_type="application/pdf")
188
- return result.content
189
-
190
-
191
- # Extract text from image bytes
192
- async def process_uploaded_image(image_content: bytes):
193
- result = await extract_bytes(image_content, mime_type="image/jpeg")
194
- return result.content
195
- ```
196
-
197
- ### Forcing OCR
198
-
199
- When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that have text that should be extracted etc.
200
- You can do this by passing `force_ocr=True`:
201
-
202
- ```python
203
- from kreuzberg import extract_bytes
204
-
205
-
206
- # Extract text from PDF bytes and force OCR
207
- async def process_uploaded_pdf(pdf_content: bytes):
208
- result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
209
- return result.content
210
- ```
211
-
212
- ### Error Handling
213
-
214
- Kreuzberg raises two exception types:
215
-
216
- #### ValidationError
217
-
218
- Raised when there are issues with input validation:
219
-
220
- - Unsupported mime types
221
- - Undetectable mime types
222
- - Path doesn't point at an existing file
223
-
224
- #### ParsingError
225
-
226
- Raised when there are issues during the text extraction process:
227
-
228
- - PDF parsing failures
229
- - OCR errors
230
- - Pandoc conversion errors
231
-
232
- ```python
233
- from kreuzberg import extract_file
234
- from kreuzberg.exceptions import ValidationError, ParsingError
235
-
236
-
237
- async def safe_extract():
238
- try:
239
- result = await extract_file("document.doc")
240
- return result.content
241
- except ValidationError as e:
242
- print(f"Validation error: {e.message}")
243
- print(f"Context: {e.context}")
244
- except ParsingError as e:
245
- print(f"Parsing error: {e.message}")
246
- print(f"Context: {e.context}") # Contains detailed error information
247
- ```
248
-
249
- Both error types include helpful context information for debugging:
250
-
251
- ```python
252
- try:
253
- result = await extract_file("scanned.pdf")
254
- except ParsingError as e:
255
- # e.context might contain:
256
- # {
257
- # "file_path": "scanned.pdf",
258
- # "error": "Tesseract OCR failed: Unable to process image"
259
- # }
260
- ```
261
-
262
- ### ExtractionResult
263
-
264
- All extraction functions return an ExtractionResult named tuple containing:
265
-
266
- - `content`: The extracted text as a string
267
- - `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
268
-
269
- ```python
270
- from kreuzberg import ExtractionResult
271
-
272
-
273
- async def process_document(path: str) -> str:
274
- result: ExtractionResult = await extract_file(path)
275
- return result.content
276
-
277
-
278
- # or access the result as tuple
279
-
280
- async def process_document(path: str) -> str:
281
- content, mime_type = await extract_file(path)
282
- # do something with mime_type
283
- return content
284
- ```
285
-
286
- ## Contribution
287
-
288
- This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
289
- submitting PRs to avoid disappointment.
290
-
291
- ### Local Development
292
-
293
- 1. Clone the repo
294
- 2. Install the system dependencies
295
- 3. Install the full dependencies with `uv sync`
296
- 4. Install the pre-commit hooks with:
297
- ```shell
298
- pre-commit install && pre-commit install --hook-type commit-msg
299
- ```
300
- 5. Make your changes and submit a PR
301
-
302
- ## License
303
-
304
- This library uses the MIT license.