kreuzberg 3.1.1__tar.gz → 3.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/PKG-INFO +10 -10
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/README.md +6 -6
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_extractors/_pandoc.py +1 -1
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg.egg-info/PKG-INFO +10 -10
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg.egg-info/requires.txt +3 -3
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/pyproject.toml +6 -6
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/LICENSE +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg.egg-info/SOURCES.txt +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-3.1.1 → kreuzberg-3.1.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.1.1
|
3
|
+
Version: 3.1.2
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -27,12 +27,12 @@ License-File: LICENSE
|
|
27
27
|
Requires-Dist: anyio>=4.9.0
|
28
28
|
Requires-Dist: charset-normalizer>=3.4.1
|
29
29
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
|
-
Requires-Dist: html-to-markdown>=1.
|
30
|
+
Requires-Dist: html-to-markdown>=1.3.0
|
31
31
|
Requires-Dist: playa-pdf>=0.4.1
|
32
32
|
Requires-Dist: pypdfium2==4.30.0
|
33
|
-
Requires-Dist: python-calamine>=0.3.
|
33
|
+
Requires-Dist: python-calamine>=0.3.2
|
34
34
|
Requires-Dist: python-pptx>=1.0.2
|
35
|
-
Requires-Dist: typing-extensions>=4.
|
35
|
+
Requires-Dist: typing-extensions>=4.13.1; python_version < "3.12"
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
38
38
|
Requires-Dist: gmft>=0.4.1; extra == "all"
|
@@ -140,7 +140,7 @@ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.gith
|
|
140
140
|
|
141
141
|
Kreuzberg supports a wide range of document formats:
|
142
142
|
|
143
|
-
- **Documents**: PDF, DOCX,
|
143
|
+
- **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
|
144
144
|
- **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
|
145
145
|
- **Spreadsheets**: XLSX, XLS, CSV, etc.
|
146
146
|
- **Presentations**: PPTX, PPT, etc.
|
@@ -162,11 +162,11 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
|
|
162
162
|
|
163
163
|
### Local Development
|
164
164
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
165
|
+
- Clone the repo
|
166
|
+
- Install the system dependencies
|
167
|
+
- Install the full dependencies with `uv sync`
|
168
|
+
- Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
|
169
|
+
- Make your changes and submit a PR
|
170
170
|
|
171
171
|
## License
|
172
172
|
|
@@ -86,7 +86,7 @@ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.gith
|
|
86
86
|
|
87
87
|
Kreuzberg supports a wide range of document formats:
|
88
88
|
|
89
|
-
- **Documents**: PDF, DOCX,
|
89
|
+
- **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
|
90
90
|
- **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
|
91
91
|
- **Spreadsheets**: XLSX, XLS, CSV, etc.
|
92
92
|
- **Presentations**: PPTX, PPT, etc.
|
@@ -108,11 +108,11 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
|
|
108
108
|
|
109
109
|
### Local Development
|
110
110
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
111
|
+
- Clone the repo
|
112
|
+
- Install the system dependencies
|
113
|
+
- Install the full dependencies with `uv sync`
|
114
|
+
- Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
|
115
|
+
- Make your changes and submit a PR
|
116
116
|
|
117
117
|
## License
|
118
118
|
|
@@ -228,7 +228,7 @@ class PandocExtractor(Extractor):
|
|
228
228
|
command = ["pandoc", "--version"]
|
229
229
|
result = await run_process(command)
|
230
230
|
|
231
|
-
version_match = re.search(r"pandoc\s+v?(\d+)\.\d
|
231
|
+
version_match = re.search(r"pandoc\s+v?(\d+)\.\d+(\.\d+)?", result.stdout.decode())
|
232
232
|
if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
|
233
233
|
raise MissingDependencyError(
|
234
234
|
"Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.1.1
|
3
|
+
Version: 3.1.2
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -27,12 +27,12 @@ License-File: LICENSE
|
|
27
27
|
Requires-Dist: anyio>=4.9.0
|
28
28
|
Requires-Dist: charset-normalizer>=3.4.1
|
29
29
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
|
-
Requires-Dist: html-to-markdown>=1.
|
30
|
+
Requires-Dist: html-to-markdown>=1.3.0
|
31
31
|
Requires-Dist: playa-pdf>=0.4.1
|
32
32
|
Requires-Dist: pypdfium2==4.30.0
|
33
|
-
Requires-Dist: python-calamine>=0.3.
|
33
|
+
Requires-Dist: python-calamine>=0.3.2
|
34
34
|
Requires-Dist: python-pptx>=1.0.2
|
35
|
-
Requires-Dist: typing-extensions>=4.
|
35
|
+
Requires-Dist: typing-extensions>=4.13.1; python_version < "3.12"
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
38
38
|
Requires-Dist: gmft>=0.4.1; extra == "all"
|
@@ -140,7 +140,7 @@ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.gith
|
|
140
140
|
|
141
141
|
Kreuzberg supports a wide range of document formats:
|
142
142
|
|
143
|
-
- **Documents**: PDF, DOCX,
|
143
|
+
- **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
|
144
144
|
- **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
|
145
145
|
- **Spreadsheets**: XLSX, XLS, CSV, etc.
|
146
146
|
- **Presentations**: PPTX, PPT, etc.
|
@@ -162,11 +162,11 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
|
|
162
162
|
|
163
163
|
### Local Development
|
164
164
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
165
|
+
- Clone the repo
|
166
|
+
- Install the system dependencies
|
167
|
+
- Install the full dependencies with `uv sync`
|
168
|
+
- Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
|
169
|
+
- Make your changes and submit a PR
|
170
170
|
|
171
171
|
## License
|
172
172
|
|
@@ -1,16 +1,16 @@
|
|
1
1
|
anyio>=4.9.0
|
2
2
|
charset-normalizer>=3.4.1
|
3
|
-
html-to-markdown>=1.
|
3
|
+
html-to-markdown>=1.3.0
|
4
4
|
playa-pdf>=0.4.1
|
5
5
|
pypdfium2==4.30.0
|
6
|
-
python-calamine>=0.3.
|
6
|
+
python-calamine>=0.3.2
|
7
7
|
python-pptx>=1.0.2
|
8
8
|
|
9
9
|
[:python_version < "3.11"]
|
10
10
|
exceptiongroup>=1.2.2
|
11
11
|
|
12
12
|
[:python_version < "3.12"]
|
13
|
-
typing-extensions>=4.
|
13
|
+
typing-extensions>=4.13.1
|
14
14
|
|
15
15
|
[all]
|
16
16
|
easyocr>=1.7.2
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "kreuzberg"
|
3
|
-
version = "3.1.1"
|
3
|
+
version = "3.1.2"
|
4
4
|
description = "A text extraction library supporting PDFs, images, office documents and more"
|
5
5
|
readme = "README.md"
|
6
6
|
keywords = [
|
@@ -40,12 +40,12 @@ dependencies = [
|
|
40
40
|
"anyio>=4.9.0",
|
41
41
|
"charset-normalizer>=3.4.1",
|
42
42
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
43
|
-
"html-to-markdown>=1.
|
43
|
+
"html-to-markdown>=1.3.0",
|
44
44
|
"playa-pdf>=0.4.1",
|
45
45
|
"pypdfium2==4.30.0", # pinned due to bug in 4.30.1, until v5 is stable
|
46
|
-
"python-calamine>=0.3.
|
46
|
+
"python-calamine>=0.3.2",
|
47
47
|
"python-pptx>=1.0.2",
|
48
|
-
"typing-extensions>=4.
|
48
|
+
"typing-extensions>=4.13.1; python_version<'3.12'",
|
49
49
|
]
|
50
50
|
|
51
51
|
optional-dependencies.all = [
|
@@ -82,10 +82,10 @@ dev = [
|
|
82
82
|
"mypy>=1.15.0",
|
83
83
|
"pre-commit>=4.2.0",
|
84
84
|
"pytest>=8.3.5",
|
85
|
-
"pytest-cov>=6.1.
|
85
|
+
"pytest-cov>=6.1.1",
|
86
86
|
"pytest-mock>=3.14.0",
|
87
87
|
"pytest-timeout>=2.3.1",
|
88
|
-
"ruff>=0.11.
|
88
|
+
"ruff>=0.11.4",
|
89
89
|
"trio>=0.29.0",
|
90
90
|
"uv-bump",
|
91
91
|
]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|