kreuzberg-3.0.1-py3-none-any.whl → kreuzberg-3.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -1,10 +1,11 @@
+ from kreuzberg._gmft import GMFTConfig
  from kreuzberg._ocr._easyocr import EasyOCRConfig
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
  from kreuzberg._ocr._tesseract import TesseractConfig

  from ._ocr._tesseract import PSMMode
  from ._registry import ExtractorRegistry
- from ._types import ExtractionConfig, ExtractionResult, Metadata
+ from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
  from .extraction import (
      batch_extract_bytes,
@@ -22,6 +23,7 @@ __all__ = [
      "ExtractionConfig",
      "ExtractionResult",
      "ExtractorRegistry",
+     "GMFTConfig",
      "KreuzbergError",
      "Metadata",
      "MissingDependencyError",
@@ -29,6 +31,7 @@ __all__ = [
      "PSMMode",
      "PaddleOCRConfig",
      "ParsingError",
+     "TableData",
      "TesseractConfig",
      "ValidationError",
      "batch_extract_bytes",
kreuzberg/_extractors/_pdf.py CHANGED
@@ -45,20 +45,28 @@ class PDFExtractor(Extractor):

      async def extract_path_async(self, path: Path) -> ExtractionResult:
          content_bytes = await AsyncPath(path).read_bytes()
-         metadata = await extract_pdf_metadata(content_bytes)
+
+         result: ExtractionResult | None = None

          if not self.config.force_ocr:
              content = await self._extract_pdf_searchable_text(path)
              if self._validate_extracted_text(content):
-                 return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[])
+                 result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])

-         if self.config.ocr_backend is not None:
+         if not result and self.config.ocr_backend is not None:
              result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)

-         result.metadata = metadata
-         return result
+         if not result:
+             result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
+
+         result.metadata = await extract_pdf_metadata(content_bytes)
+
+         if self.config.extract_tables:
+             from kreuzberg._gmft import extract_tables
+
+             result.tables = await extract_tables(path, self.config.gmft_config)

-         return ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[])
+         return result

      def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
          return anyio.run(self.extract_bytes_async, content)
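The rewritten `extract_path_async` now builds a single `result`, attaches PDF metadata on every branch (searchable text, OCR fallback, or empty result), and appends GMFT tables when `extract_tables` is enabled. A minimal sketch of driving this path through `extract_file` from the public extraction API (`sample.pdf` is a placeholder path; the `gmft` extra is assumed to be installed):

```python
import anyio

from kreuzberg import ExtractionConfig, extract_file

async def main() -> None:
    # force_ocr=False by default: searchable text is tried first, OCR only runs as a fallback.
    result = await extract_file("sample.pdf", config=ExtractionConfig(extract_tables=True))
    print(result.metadata)     # populated on every branch in 3.1.0
    print(len(result.tables))  # empty unless extract_tables=True and gmft is installed

anyio.run(main)
```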
kreuzberg/_gmft.py ADDED
@@ -0,0 +1,174 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Literal
+
+ from kreuzberg._types import TableData
+ from kreuzberg._utils._sync import run_sync
+ from kreuzberg.exceptions import MissingDependencyError
+
+ if TYPE_CHECKING:
+     from os import PathLike
+
+     from gmft.detectors.base import CroppedTable
+     from pandas import DataFrame
+
+
+ @dataclass(unsafe_hash=True)
+ class GMFTConfig:
+     """Configuration options for GMFT.
+
+     This class encapsulates the configuration options for GMFT, providing a way to customize its behavior.
+     """
+
+     verbosity: int = 0
+     """
+     Verbosity level for logging.
+
+     0: errors only
+     1: print warnings
+     2: print warnings and info
+     3: print warnings, info, and debug
+     """
+     formatter_base_threshold: float = 0.3
+     """
+     Base threshold for the confidence demanded of a table feature (row/column).
+
+     Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
+     """
+     cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
+         default_factory=lambda: {
+             0: 0.3,
+             1: 0.3,
+             2: 0.3,
+             3: 0.3,
+             4: 0.5,
+             5: 0.5,
+             6: 99,
+         },
+         hash=False,
+     )
+     """
+     Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label.
+
+     But low confidences may be better than too high confidence (see formatter_base_threshold).
+     """
+     detector_base_threshold: float = 0.9
+     """Minimum confidence score required for a table."""
+     remove_null_rows: bool = True
+     """
+     Flag to remove rows with no text.
+     """
+     enable_multi_header: bool = False
+     """
+     Enable multi-indices in the dataframe.
+
+     If false, then multiple headers will be merged column-wise.
+     """
+     semantic_spanning_cells: bool = False
+     """
+     [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
+     """
+     semantic_hierarchical_left_fill: str | None = "algorithm"
+     """
+     [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
+
+     Possible values: 'algorithm', 'deep', None.
+
+     'algorithm': assumes that the higher-level header is always the first row, followed by several empty rows.
+     'deep': merges headers according to the spanning cells detected by the Table Transformer.
+     None: headers are not duplicated.
+     """
+     large_table_if_n_rows_removed: int = 8
+     """
+     If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
+     """
+     large_table_threshold: int = 10
+     """
+     With large tables, the table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
+
+     The large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set to 9999 to disable; set to 0 to force the large table assumption to run every time.
+     """
+     large_table_row_overlap_threshold: float = 0.2
+     """
+     With large tables, the table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
+
+     The large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
+     """
+     large_table_maximum_rows: int = 1000
+     """
+     Maximum number of rows allowed for a large table.
+     """
+     force_large_table_assumption: bool | None = None
+     """
+     Force the large table assumption to be applied, regardless of the number of rows and overlap.
+     """
+
+
+ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | None = None) -> list[TableData]:
+     """Extracts tables from a PDF file.
+
+     This function takes a file path to a PDF file and an optional configuration object.
+     It returns a list of table data dictionaries, one per detected table.
+
+     Args:
+         file_path: The path to the PDF file.
+         config: An optional configuration object.
+
+     Raises:
+         MissingDependencyError: Raised when the required dependencies are not installed.
+
+     Returns:
+         A list of table data dictionaries.
+     """
+     try:
+         from gmft.auto import AutoTableDetector, AutoTableFormatter
+         from gmft.detectors.tatr import TATRDetectorConfig
+         from gmft.formatters.tatr import TATRFormatConfig
+         from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+         config = config or GMFTConfig()
+         formatter = AutoTableFormatter(
+             config=TATRFormatConfig(
+                 verbosity=config.verbosity,
+                 formatter_base_threshold=config.formatter_base_threshold,
+                 cell_required_confidence=config.cell_required_confidence,
+                 remove_null_rows=config.remove_null_rows,
+                 enable_multi_header=config.enable_multi_header,
+                 semantic_spanning_cells=config.semantic_spanning_cells,
+                 semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                 large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                 large_table_threshold=config.large_table_threshold,
+                 large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                 large_table_maximum_rows=config.large_table_maximum_rows,
+                 force_large_table_assumption=config.force_large_table_assumption,
+             )
+         )
+         detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))
+         doc = await run_sync(PyPDFium2Document, str(file_path))
+         cropped_tables: list[CroppedTable] = []
+         dataframes: list[DataFrame] = []
+         try:
+             for page in doc:
+                 cropped_tables.extend(await run_sync(detector.extract, page))
+
+             for cropped_table in cropped_tables:
+                 formatted_table = await run_sync(formatter.extract, cropped_table)
+                 dataframes.append(await run_sync(formatted_table.df))
+
+             return [
+                 TableData(
+                     cropped_image=cropped_table.image(),
+                     page_number=cropped_table.page.page_number,
+                     text=data_frame.to_markdown(),
+                     df=data_frame,
+                 )
+                 for data_frame, cropped_table in zip(dataframes, cropped_tables)
+             ]
+         finally:
+             await run_sync(doc.close)
+
+     except ImportError as e:
+         raise MissingDependencyError.create_for_package(
+             dependency_group="gmft", functionality="table extraction", package_name="gmft"
+         ) from e
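The new module can also be called directly. A minimal sketch with a custom `GMFTConfig` (`scanned.pdf` is a placeholder path; requires the `gmft` extra):

```python
import anyio

from kreuzberg._gmft import GMFTConfig, extract_tables

async def main() -> None:
    config = GMFTConfig(
        detector_base_threshold=0.85,  # accept slightly lower-confidence table detections
        enable_multi_header=True,      # keep multi-row headers as multi-indices in the DataFrame
    )
    for table in await extract_tables("scanned.pdf", config):
        # Each entry is a TableData dict: DataFrame, markdown text, page number, cropped image.
        print(table["page_number"], table["df"].shape)

anyio.run(main)
```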
kreuzberg/_ocr/_paddleocr.py CHANGED
@@ -1,7 +1,6 @@
  from __future__ import annotations

  import platform
- import sys
  from dataclasses import dataclass
  from importlib.util import find_spec
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
@@ -233,17 +232,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
          Raises:
              MissingDependencyError: If PaddleOCR is not installed.
              OCRError: If initialization fails.
-             ValidationError: If the python version is too high.
          """
          if cls._paddle_ocr is not None:
              return

-         if sys.version_info >= (3, 13):  # pragma: no cover
-             raise ValidationError(
-                 "PaddleOCR is only available in python 3.12 and below. Please downgrade your Python or switch to a different OCR backend.",
-                 context={"issue": "https://github.com/PaddlePaddle/Paddle/issues/71616"},
-             )
-
          try:
              from paddleocr import PaddleOCR
          except ImportError as e:
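With `paddlepaddle>=3.0.0` the Python 3.13 guard is no longer needed, so the backend can be selected on any supported interpreter. A minimal sketch (`scan.png` is a placeholder path; requires the `paddleocr` extra):

```python
import anyio

from kreuzberg import ExtractionConfig, extract_file

async def main() -> None:
    # ocr_backend accepts "tesseract", "easyocr", "paddleocr", or None (OCR disabled).
    result = await extract_file("scan.png", config=ExtractionConfig(ocr_backend="paddleocr"))
    print(result.content)

anyio.run(main)
```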
kreuzberg/_types.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations

  import sys
  from collections.abc import Awaitable
- from dataclasses import asdict, dataclass
+ from dataclasses import asdict, dataclass, field
  from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union

  from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
@@ -14,6 +14,10 @@ else:  # pragma: no cover
      from typing import NotRequired

  if TYPE_CHECKING:
+     from pandas import DataFrame
+     from PIL.Image import Image
+
+     from kreuzberg._gmft import GMFTConfig
      from kreuzberg._ocr._easyocr import EasyOCRConfig
      from kreuzberg._ocr._paddleocr import PaddleOCRConfig
      from kreuzberg._ocr._tesseract import TesseractConfig
@@ -21,6 +25,19 @@ if TYPE_CHECKING:
  OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]


+ class TableData(TypedDict):
+     """Table data, returned from table extraction."""
+
+     cropped_image: Image
+     """The cropped image of the table."""
+     df: DataFrame
+     """The table data as a pandas DataFrame."""
+     page_number: int
+     """The page number of the table."""
+     text: str
+     """The table text as a markdown string."""
+
+
  class Metadata(TypedDict, total=False):
      """Base metadata common to all document types.

@@ -88,12 +105,14 @@ class ExtractionResult:

      content: str
      """The extracted content."""
-     chunks: list[str]
-     """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
      mime_type: str
      """The mime type of the extracted content. Is either text/plain or text/markdown."""
      metadata: Metadata
      """The metadata of the content."""
+     tables: list[TableData] = field(default_factory=list)
+     """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
+     chunks: list[str] = field(default_factory=list)
+     """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""


  PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
@@ -114,14 +133,22 @@ class ExtractionConfig:
      """Whether to force OCR."""
      chunk_content: bool = False
      """Whether to chunk the content into smaller chunks."""
+     extract_tables: bool = False
+     """Whether to extract tables from the content. This requires the 'gmft' dependency."""
      max_chars: int = DEFAULT_MAX_CHARACTERS
      """The size of each chunk in characters."""
      max_overlap: int = DEFAULT_MAX_OVERLAP
      """The overlap between chunks in characters."""
      ocr_backend: OcrBackendType | None = "tesseract"
-     """The OCR backend to use."""
+     """The OCR backend to use.
+
+     Notes:
+         - If set to 'None', OCR will not be performed.
+     """
      ocr_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
      """Configuration to pass to the OCR backend."""
+     gmft_config: GMFTConfig | None = None
+     """GMFT configuration."""
      post_processing_hooks: list[PostProcessingHook] | None = None
      """Post processing hooks to call after processing is done and before the final result is returned."""
      validators: list[ValidationHook] | None = None
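`ExtractionResult` now defaults both `tables` and `chunks` to empty lists, and each extracted table is a `TableData` dict. A minimal sketch of consuming those fields (the `result` value and output directory are placeholders, assumed to come from an extraction run with `extract_tables=True`):

```python
from pathlib import Path

from kreuzberg import ExtractionResult

def dump_tables(result: ExtractionResult, out_dir: Path) -> None:
    out_dir.mkdir(parents=True, exist_ok=True)
    for index, table in enumerate(result.tables):
        stem = f"table_p{table['page_number']}_{index}"
        table["df"].to_csv(out_dir / f"{stem}.csv", index=False)  # pandas DataFrame
        table["cropped_image"].save(out_dir / f"{stem}.png")      # PIL Image crop
        (out_dir / f"{stem}.md").write_text(table["text"])        # markdown rendering
```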
kreuzberg-3.0.1.dist-info/METADATA → kreuzberg-3.1.0.dist-info/METADATA RENAMED
@@ -1,11 +1,11 @@
  Metadata-Version: 2.4
  Name: kreuzberg
- Version: 3.0.1
+ Version: 3.1.0
  Summary: A text extraction library supporting PDFs, images, office documents and more
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
  License: MIT
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
- Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
  Classifier: Development Status :: 4 - Beta
  Classifier: Intended Audience :: Developers
  Classifier: License :: OSI Approved :: MIT License
@@ -27,7 +27,7 @@ License-File: LICENSE
  Requires-Dist: anyio>=4.9.0
  Requires-Dist: charset-normalizer>=3.4.1
  Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
- Requires-Dist: html-to-markdown>=1.2.0
+ Requires-Dist: html-to-markdown>=1.2.1
  Requires-Dist: playa-pdf>=0.4.1
  Requires-Dist: pypdfium2==4.30.0
  Requires-Dist: python-calamine>=0.3.1
@@ -35,19 +35,20 @@ Requires-Dist: python-pptx>=1.0.2
  Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
  Provides-Extra: all
  Requires-Dist: easyocr>=1.7.2; extra == "all"
- Requires-Dist: numpy>=2.0.2; extra == "all"
+ Requires-Dist: gmft>=0.4.1; extra == "all"
  Requires-Dist: paddleocr>=2.10.0; extra == "all"
- Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "all"
- Requires-Dist: semantic-text-splitter>=0.24.1; extra == "all"
+ Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
+ Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
  Requires-Dist: setuptools>=76.0.0; extra == "all"
  Provides-Extra: chunking
- Requires-Dist: semantic-text-splitter>=0.24.1; extra == "chunking"
+ Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
  Provides-Extra: easyocr
  Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
+ Provides-Extra: gmft
+ Requires-Dist: gmft>=0.4.1; extra == "gmft"
  Provides-Extra: paddleocr
- Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
  Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
- Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "paddleocr"
+ Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
  Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
  Dynamic: license-file
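The new extra mirrors the existing optional backends: `pip install "kreuzberg[gmft]"` enables table extraction, and `kreuzberg[all]` now pulls in `gmft>=0.4.1` and `paddlepaddle>=3.0.0` while dropping the explicit `numpy>=2.0.2` pin from the `all` and `paddleocr` extras.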
 
@@ -66,6 +67,8 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
  - **Resource Efficient**: Lightweight processing without GPU requirements
  - **Format Support**: Comprehensive support for documents, images, and text formats
  - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
+ - **Metadata Extraction**: Get document metadata alongside text content
+ - **Table Extraction**: Extract tables from documents using the excellent GMFT library
  - **Modern Python**: Built with async/await, type hints, and a functional-first approach
  - **Permissive OSS**: MIT licensed with permissively licensed dependencies

@@ -160,17 +163,9 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
  ### Local Development

  1. Clone the repo
-
  1. Install the system dependencies
-
  1. Install the full dependencies with `uv sync`
-
- 1. Install the pre-commit hooks with:
-
-    ```shell
-    pre-commit install && pre-commit install --hook-type commit-msg
-    ```
-
+ 1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
  1. Make your changes and submit a PR

  ## License
kreuzberg-3.0.1.dist-info/RECORD → kreuzberg-3.1.0.dist-info/RECORD RENAMED
@@ -1,10 +1,11 @@
- kreuzberg/__init__.py,sha256=KZ_y21m64cafWL7goGeG3EIDutM184st28n4UGajADs,1131
+ kreuzberg/__init__.py,sha256=lT9OwIdf5CEhSX7IVmtSFPgRhz6B2z2A-RE8Zdm0PH4,1216
  kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
  kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
+ kreuzberg/_gmft.py,sha256=qLhfepQuaROjPOdI-tDRqqqnOcqDY1D411ZXzoywnpg,7229
  kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
  kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
  kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
- kreuzberg/_types.py,sha256=sZMxjRZQ1c_MzxdumhYSWghW6yXBwohTUIBa5eR-FKA,6582
+ kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
  kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
  kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -13,20 +14,20 @@ kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqN
  kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
  kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
  kreuzberg/_extractors/_pandoc.py,sha256=a6cYQxoh5G9EMrDWVcQhrTkE4Mar24sNiGCY0zOOzw4,20121
- kreuzberg/_extractors/_pdf.py,sha256=dcSAXyqH8SZ-z45OUAjjwdboSEbrli0YekS8PxCaVGA,6384
+ kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
  kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
  kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
  kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
  kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
  kreuzberg/_ocr/_easyocr.py,sha256=VfYW66SkB2Bigbrtd7WEeJ6QZ_1Y5d8Z_rZYBPMsuk0,11037
- kreuzberg/_ocr/_paddleocr.py,sha256=X5es69QMl0P6DZuuRNKWHaRtLi1OJqFs-mWHR_gVKvY,10837
+ kreuzberg/_ocr/_paddleocr.py,sha256=NDKXiMtHjIy-Uq4hXe4qm5oUWwOrhjJaibyC708Cw5E,10422
  kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
  kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
  kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
- kreuzberg-3.0.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
- kreuzberg-3.0.1.dist-info/METADATA,sha256=5Kt0w9rFBAina8SzbO-m2umEMRJQL-4mcPGAQASko_k,6545
- kreuzberg-3.0.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- kreuzberg-3.0.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
- kreuzberg-3.0.1.dist-info/RECORD,,
+ kreuzberg-3.1.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+ kreuzberg-3.1.0.dist-info/METADATA,sha256=YemIJR6aygDxNgz9aoeg2oIGRHJjm897jD8sHuJYdMY,6651
+ kreuzberg-3.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ kreuzberg-3.1.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+ kreuzberg-3.1.0.dist-info/RECORD,,