kreuzberg 3.1.1__tar.gz → 3.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/PKG-INFO +10 -10
  2. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/README.md +6 -6
  3. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_extractors/_pandoc.py +1 -1
  4. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_ocr/_easyocr.py +3 -1
  5. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg.egg-info/PKG-INFO +10 -10
  6. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg.egg-info/requires.txt +3 -3
  7. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/pyproject.toml +6 -6
  8. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/LICENSE +0 -0
  9. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/__init__.py +0 -0
  10. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_chunker.py +0 -0
  11. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_constants.py +0 -0
  12. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_extractors/__init__.py +0 -0
  13. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_extractors/_base.py +0 -0
  14. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_extractors/_html.py +0 -0
  15. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_extractors/_image.py +0 -0
  16. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_extractors/_pdf.py +0 -0
  17. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_extractors/_presentation.py +0 -0
  18. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  19. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_gmft.py +0 -0
  20. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_mime_types.py +0 -0
  21. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_ocr/__init__.py +0 -0
  22. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_ocr/_base.py +0 -0
  23. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_ocr/_paddleocr.py +0 -0
  24. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_ocr/_tesseract.py +0 -0
  25. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_playa.py +0 -0
  26. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_registry.py +0 -0
  27. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_types.py +0 -0
  28. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_utils/__init__.py +0 -0
  29. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_utils/_string.py +0 -0
  30. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_utils/_sync.py +0 -0
  31. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/_utils/_tmp.py +0 -0
  32. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/exceptions.py +0 -0
  33. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/extraction.py +0 -0
  34. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg/py.typed +0 -0
  35. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg.egg-info/SOURCES.txt +0 -0
  36. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg.egg-info/dependency_links.txt +0 -0
  37. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/kreuzberg.egg-info/top_level.txt +0 -0
  38. {kreuzberg-3.1.1 → kreuzberg-3.1.3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.1.1
3
+ Version: 3.1.3
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -27,12 +27,12 @@ License-File: LICENSE
27
27
  Requires-Dist: anyio>=4.9.0
28
28
  Requires-Dist: charset-normalizer>=3.4.1
29
29
  Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
- Requires-Dist: html-to-markdown>=1.2.1
30
+ Requires-Dist: html-to-markdown>=1.3.0
31
31
  Requires-Dist: playa-pdf>=0.4.1
32
32
  Requires-Dist: pypdfium2==4.30.0
33
- Requires-Dist: python-calamine>=0.3.1
33
+ Requires-Dist: python-calamine>=0.3.2
34
34
  Requires-Dist: python-pptx>=1.0.2
35
- Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
35
+ Requires-Dist: typing-extensions>=4.13.1; python_version < "3.12"
36
36
  Provides-Extra: all
37
37
  Requires-Dist: easyocr>=1.7.2; extra == "all"
38
38
  Requires-Dist: gmft>=0.4.1; extra == "all"
@@ -140,7 +140,7 @@ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.gith
140
140
 
141
141
  Kreuzberg supports a wide range of document formats:
142
142
 
143
- - **Documents**: PDF, DOCX, DOC, RTF, TXT, EPUB, etc.
143
+ - **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
144
144
  - **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
145
145
  - **Spreadsheets**: XLSX, XLS, CSV, etc.
146
146
  - **Presentations**: PPTX, PPT, etc.
@@ -162,11 +162,11 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
162
162
 
163
163
  ### Local Development
164
164
 
165
- 1. Clone the repo
166
- 1. Install the system dependencies
167
- 1. Install the full dependencies with `uv sync`
168
- 1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
169
- 1. Make your changes and submit a PR
165
+ - Clone the repo
166
+ - Install the system dependencies
167
+ - Install the full dependencies with `uv sync`
168
+ - Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
169
+ - Make your changes and submit a PR
170
170
 
171
171
  ## License
172
172
 
@@ -86,7 +86,7 @@ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.gith
86
86
 
87
87
  Kreuzberg supports a wide range of document formats:
88
88
 
89
- - **Documents**: PDF, DOCX, DOC, RTF, TXT, EPUB, etc.
89
+ - **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
90
90
  - **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
91
91
  - **Spreadsheets**: XLSX, XLS, CSV, etc.
92
92
  - **Presentations**: PPTX, PPT, etc.
@@ -108,11 +108,11 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
108
108
 
109
109
  ### Local Development
110
110
 
111
- 1. Clone the repo
112
- 1. Install the system dependencies
113
- 1. Install the full dependencies with `uv sync`
114
- 1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
115
- 1. Make your changes and submit a PR
111
+ - Clone the repo
112
+ - Install the system dependencies
113
+ - Install the full dependencies with `uv sync`
114
+ - Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
115
+ - Make your changes and submit a PR
116
116
 
117
117
  ## License
118
118
 
@@ -228,7 +228,7 @@ class PandocExtractor(Extractor):
228
228
  command = ["pandoc", "--version"]
229
229
  result = await run_process(command)
230
230
 
231
- version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
231
+ version_match = re.search(r"pandoc\s+v?(\d+)\.\d+(\.\d+)?", result.stdout.decode())
232
232
  if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
233
233
  raise MissingDependencyError(
234
234
  "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
@@ -170,6 +170,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
170
170
  Raises:
171
171
  OCRError: If OCR processing fails.
172
172
  """
173
+ import numpy as np
174
+
173
175
  await self._init_easyocr(**kwargs)
174
176
 
175
177
  beam_width = kwargs.pop("beam_width")
@@ -180,7 +182,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
180
182
  try:
181
183
  result = await run_sync(
182
184
  self._reader.readtext,
183
- image.tobytes(),
185
+ np.array(image),
184
186
  beamWidth=beam_width,
185
187
  **kwargs,
186
188
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.1.1
3
+ Version: 3.1.3
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -27,12 +27,12 @@ License-File: LICENSE
27
27
  Requires-Dist: anyio>=4.9.0
28
28
  Requires-Dist: charset-normalizer>=3.4.1
29
29
  Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
- Requires-Dist: html-to-markdown>=1.2.1
30
+ Requires-Dist: html-to-markdown>=1.3.0
31
31
  Requires-Dist: playa-pdf>=0.4.1
32
32
  Requires-Dist: pypdfium2==4.30.0
33
- Requires-Dist: python-calamine>=0.3.1
33
+ Requires-Dist: python-calamine>=0.3.2
34
34
  Requires-Dist: python-pptx>=1.0.2
35
- Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
35
+ Requires-Dist: typing-extensions>=4.13.1; python_version < "3.12"
36
36
  Provides-Extra: all
37
37
  Requires-Dist: easyocr>=1.7.2; extra == "all"
38
38
  Requires-Dist: gmft>=0.4.1; extra == "all"
@@ -140,7 +140,7 @@ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.gith
140
140
 
141
141
  Kreuzberg supports a wide range of document formats:
142
142
 
143
- - **Documents**: PDF, DOCX, DOC, RTF, TXT, EPUB, etc.
143
+ - **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
144
144
  - **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
145
145
  - **Spreadsheets**: XLSX, XLS, CSV, etc.
146
146
  - **Presentations**: PPTX, PPT, etc.
@@ -162,11 +162,11 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
162
162
 
163
163
  ### Local Development
164
164
 
165
- 1. Clone the repo
166
- 1. Install the system dependencies
167
- 1. Install the full dependencies with `uv sync`
168
- 1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
169
- 1. Make your changes and submit a PR
165
+ - Clone the repo
166
+ - Install the system dependencies
167
+ - Install the full dependencies with `uv sync`
168
+ - Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
169
+ - Make your changes and submit a PR
170
170
 
171
171
  ## License
172
172
 
@@ -1,16 +1,16 @@
1
1
  anyio>=4.9.0
2
2
  charset-normalizer>=3.4.1
3
- html-to-markdown>=1.2.1
3
+ html-to-markdown>=1.3.0
4
4
  playa-pdf>=0.4.1
5
5
  pypdfium2==4.30.0
6
- python-calamine>=0.3.1
6
+ python-calamine>=0.3.2
7
7
  python-pptx>=1.0.2
8
8
 
9
9
  [:python_version < "3.11"]
10
10
  exceptiongroup>=1.2.2
11
11
 
12
12
  [:python_version < "3.12"]
13
- typing-extensions>=4.12.2
13
+ typing-extensions>=4.13.1
14
14
 
15
15
  [all]
16
16
  easyocr>=1.7.2
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "kreuzberg"
3
- version = "3.1.1"
3
+ version = "3.1.3"
4
4
  description = "A text extraction library supporting PDFs, images, office documents and more"
5
5
  readme = "README.md"
6
6
  keywords = [
@@ -40,12 +40,12 @@ dependencies = [
40
40
  "anyio>=4.9.0",
41
41
  "charset-normalizer>=3.4.1",
42
42
  "exceptiongroup>=1.2.2; python_version<'3.11'",
43
- "html-to-markdown>=1.2.1",
43
+ "html-to-markdown>=1.3.0",
44
44
  "playa-pdf>=0.4.1",
45
45
  "pypdfium2==4.30.0", # pinned due to bug in 4.30.1, until v5 is stable
46
- "python-calamine>=0.3.1",
46
+ "python-calamine>=0.3.2",
47
47
  "python-pptx>=1.0.2",
48
- "typing-extensions>=4.12.2; python_version<'3.12'",
48
+ "typing-extensions>=4.13.1; python_version<'3.12'",
49
49
  ]
50
50
 
51
51
  optional-dependencies.all = [
@@ -82,10 +82,10 @@ dev = [
82
82
  "mypy>=1.15.0",
83
83
  "pre-commit>=4.2.0",
84
84
  "pytest>=8.3.5",
85
- "pytest-cov>=6.1.0",
85
+ "pytest-cov>=6.1.1",
86
86
  "pytest-mock>=3.14.0",
87
87
  "pytest-timeout>=2.3.1",
88
- "ruff>=0.11.2",
88
+ "ruff>=0.11.4",
89
89
  "trio>=0.29.0",
90
90
  "uv-bump",
91
91
  ]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes