kreuzberg 3.1.3__tar.gz → 3.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/PKG-INFO +5 -5
  2. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_extractors/_pandoc.py +46 -6
  3. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_ocr/_easyocr.py +20 -16
  4. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_ocr/_paddleocr.py +4 -0
  5. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg.egg-info/PKG-INFO +5 -5
  6. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg.egg-info/requires.txt +4 -4
  7. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/pyproject.toml +7 -7
  8. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/LICENSE +0 -0
  9. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/README.md +0 -0
  10. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/__init__.py +0 -0
  11. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_chunker.py +0 -0
  12. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_constants.py +0 -0
  13. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_extractors/__init__.py +0 -0
  14. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_extractors/_base.py +0 -0
  15. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_extractors/_html.py +0 -0
  16. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_extractors/_image.py +0 -0
  17. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_extractors/_pdf.py +0 -0
  18. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_extractors/_presentation.py +0 -0
  19. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  20. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_gmft.py +0 -0
  21. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_mime_types.py +0 -0
  22. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_ocr/__init__.py +0 -0
  23. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_ocr/_base.py +0 -0
  24. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_ocr/_tesseract.py +0 -0
  25. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_playa.py +0 -0
  26. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_registry.py +0 -0
  27. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_types.py +0 -0
  28. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_utils/__init__.py +0 -0
  29. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_utils/_string.py +0 -0
  30. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_utils/_sync.py +0 -0
  31. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/_utils/_tmp.py +0 -0
  32. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/exceptions.py +0 -0
  33. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/extraction.py +0 -0
  34. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg/py.typed +0 -0
  35. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg.egg-info/SOURCES.txt +0 -0
  36. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg.egg-info/dependency_links.txt +0 -0
  37. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/kreuzberg.egg-info/top_level.txt +0 -0
  38. {kreuzberg-3.1.3 → kreuzberg-3.1.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.1.3
3
+ Version: 3.1.4
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -27,19 +27,19 @@ License-File: LICENSE
27
27
  Requires-Dist: anyio>=4.9.0
28
28
  Requires-Dist: charset-normalizer>=3.4.1
29
29
  Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
- Requires-Dist: html-to-markdown>=1.3.0
30
+ Requires-Dist: html-to-markdown>=1.3.2
31
31
  Requires-Dist: playa-pdf>=0.4.1
32
32
  Requires-Dist: pypdfium2==4.30.0
33
33
  Requires-Dist: python-calamine>=0.3.2
34
34
  Requires-Dist: python-pptx>=1.0.2
35
- Requires-Dist: typing-extensions>=4.13.1; python_version < "3.12"
35
+ Requires-Dist: typing-extensions>=4.13.2; python_version < "3.12"
36
36
  Provides-Extra: all
37
37
  Requires-Dist: easyocr>=1.7.2; extra == "all"
38
38
  Requires-Dist: gmft>=0.4.1; extra == "all"
39
39
  Requires-Dist: paddleocr>=2.10.0; extra == "all"
40
40
  Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
41
41
  Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
42
- Requires-Dist: setuptools>=76.0.0; extra == "all"
42
+ Requires-Dist: setuptools>=79.0.1; extra == "all"
43
43
  Provides-Extra: chunking
44
44
  Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
45
45
  Provides-Extra: easyocr
@@ -49,7 +49,7 @@ Requires-Dist: gmft>=0.4.1; extra == "gmft"
49
49
  Provides-Extra: paddleocr
50
50
  Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
51
51
  Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
52
- Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
52
+ Requires-Dist: setuptools>=79.0.1; extra == "paddleocr"
53
53
  Dynamic: license-file
54
54
 
55
55
  # Kreuzberg
@@ -227,14 +227,54 @@ class PandocExtractor(Extractor):
227
227
 
228
228
  command = ["pandoc", "--version"]
229
229
  result = await run_process(command)
230
+ stdout = result.stdout.decode()
230
231
 
231
- version_match = re.search(r"pandoc\s+v?(\d+)\.\d+(\.\d+)?", result.stdout.decode())
232
- if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
233
- raise MissingDependencyError(
234
- "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
235
- )
232
+ # Try more inclusive patterns to detect the pandoc version
233
+ # Try common formats first
234
+ version_match = re.search(
235
+ r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
236
+ )
237
+
238
+ # Try version in parentheses format
239
+ if not version_match:
240
+ version_match = re.search(r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)", stdout, re.IGNORECASE)
241
+
242
+ # Try hyphenated format
243
+ if not version_match:
244
+ version_match = re.search(r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?", stdout)
245
+
246
+ # If still no match, check for version at the beginning of the output or any line
247
+ if not version_match:
248
+ # Match version at the start of a line (like in the test case "2.9.2.1\npandoc-types 1.20")
249
+ version_match = re.search(r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?", stdout, re.MULTILINE)
250
+
251
+ # Try finding version-like patterns elsewhere in the text
252
+ if not version_match:
253
+ # Search for version-like patterns at the beginning of lines or after spaces
254
+ version_match = re.search(r"(?:^|\s)(\d+)\.(\d+)(?:\.(\d+))?(?:\s|$)", stdout)
255
+
256
+ # As a last resort, check any sequence of digits that might be a version
257
+ if not version_match:
258
+ out_lines = stdout.splitlines()
259
+ for line in out_lines:
260
+ for token in line.split():
261
+ # Match standalone version patterns like 2.11 or 2.11.4
262
+ version_pattern = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?$", token)
263
+ if version_pattern:
264
+ version_match = version_pattern
265
+ break
266
+ if version_match:
267
+ break
268
+
269
+ # If we found a version, check that the major version is at least the minimum required
270
+ if version_match and int(version_match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION:
271
+ self._checked_version = True
272
+ return
236
273
 
237
- self._checked_version = True
274
+ # If we get here, we either didn't find a version or it's too low
275
+ raise MissingDependencyError(
276
+ "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
277
+ )
238
278
 
239
279
  except FileNotFoundError as e:
240
280
  raise MissingDependencyError(
@@ -127,7 +127,8 @@ class EasyOCRConfig:
127
127
  height_ths: float = 0.5
128
128
  """Maximum difference in box height for merging."""
129
129
  language: str | list[str] = "en"
130
- """Language or languages to use for OCR."""
130
+ """Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
131
+ a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
131
132
  link_threshold: float = 0.4
132
133
  """Link confidence threshold."""
133
134
  low_text: float = 0.4
@@ -354,29 +355,32 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
354
355
 
355
356
  @staticmethod
356
357
  def _validate_language_code(language_codes: str | list[str]) -> list[str]:
357
- """Validate and normalize a provided language code.
358
+ """Validate and normalize provided language codes.
358
359
 
359
360
  Args:
360
- language_codes: The language code string.
361
+ language_codes: The language code(s), either as a string (single or comma-separated) or a list.
361
362
 
362
363
  Raises:
363
- ValidationError: If the language is not supported by EasyOCR
364
+ ValidationError: If any of the languages are not supported by EasyOCR
364
365
 
365
366
  Returns:
366
- A list with the normalized language code.
367
+ A list with the normalized language codes.
367
368
  """
368
- if not isinstance(language_codes, list):
369
- languages = [language_codes.lower()]
369
+ if isinstance(language_codes, str):
370
+ # Handle comma-separated language codes
371
+ languages = [lang.strip().lower() for lang in language_codes.split(",")]
370
372
  else:
373
+ # Handle list of language codes
371
374
  languages = [lang.lower() for lang in language_codes]
372
375
 
373
- if all(lang in EASYOCR_SUPPORTED_LANGUAGE_CODES for lang in languages):
374
- return languages
376
+ unsupported_langs = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
377
+ if unsupported_langs:
378
+ raise ValidationError(
379
+ "The provided language codes are not supported by EasyOCR",
380
+ context={
381
+ "language_code": ",".join(unsupported_langs),
382
+ "supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
383
+ },
384
+ )
375
385
 
376
- raise ValidationError(
377
- "The provided language codes are not supported by EasyOCR",
378
- context={
379
- "language_code": ",".join([lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]),
380
- "supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
381
- },
382
- )
386
+ return languages
@@ -52,6 +52,8 @@ class PaddleOCRConfig:
52
52
  """Binarization threshold for EAST output map."""
53
53
  det_max_side_len: int = 960
54
54
  """Maximum size of image long side. Images exceeding this will be proportionally resized."""
55
+ det_model_dir: str | None = None
56
+ """Directory for detection model. If None, uses default model location."""
55
57
  drop_score: float = 0.5
56
58
  """Filter recognition results by confidence score. Results below this are discarded."""
57
59
  enable_mkldnn: bool = False
@@ -82,6 +84,8 @@ class PaddleOCRConfig:
82
84
  """Recognition algorithm."""
83
85
  rec_image_shape: str = "3,32,320"
84
86
  """Image shape for recognition algorithm in format 'channels,height,width'."""
87
+ rec_model_dir: str | None = None
88
+ """Directory for recognition model. If None, uses default model location."""
85
89
  table: bool = True
86
90
  """Whether to enable table recognition."""
87
91
  use_angle_cls: bool = True
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.1.3
3
+ Version: 3.1.4
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -27,19 +27,19 @@ License-File: LICENSE
27
27
  Requires-Dist: anyio>=4.9.0
28
28
  Requires-Dist: charset-normalizer>=3.4.1
29
29
  Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
- Requires-Dist: html-to-markdown>=1.3.0
30
+ Requires-Dist: html-to-markdown>=1.3.2
31
31
  Requires-Dist: playa-pdf>=0.4.1
32
32
  Requires-Dist: pypdfium2==4.30.0
33
33
  Requires-Dist: python-calamine>=0.3.2
34
34
  Requires-Dist: python-pptx>=1.0.2
35
- Requires-Dist: typing-extensions>=4.13.1; python_version < "3.12"
35
+ Requires-Dist: typing-extensions>=4.13.2; python_version < "3.12"
36
36
  Provides-Extra: all
37
37
  Requires-Dist: easyocr>=1.7.2; extra == "all"
38
38
  Requires-Dist: gmft>=0.4.1; extra == "all"
39
39
  Requires-Dist: paddleocr>=2.10.0; extra == "all"
40
40
  Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
41
41
  Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
42
- Requires-Dist: setuptools>=76.0.0; extra == "all"
42
+ Requires-Dist: setuptools>=79.0.1; extra == "all"
43
43
  Provides-Extra: chunking
44
44
  Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
45
45
  Provides-Extra: easyocr
@@ -49,7 +49,7 @@ Requires-Dist: gmft>=0.4.1; extra == "gmft"
49
49
  Provides-Extra: paddleocr
50
50
  Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
51
51
  Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
52
- Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
52
+ Requires-Dist: setuptools>=79.0.1; extra == "paddleocr"
53
53
  Dynamic: license-file
54
54
 
55
55
  # Kreuzberg
@@ -1,6 +1,6 @@
1
1
  anyio>=4.9.0
2
2
  charset-normalizer>=3.4.1
3
- html-to-markdown>=1.3.0
3
+ html-to-markdown>=1.3.2
4
4
  playa-pdf>=0.4.1
5
5
  pypdfium2==4.30.0
6
6
  python-calamine>=0.3.2
@@ -10,7 +10,7 @@ python-pptx>=1.0.2
10
10
  exceptiongroup>=1.2.2
11
11
 
12
12
  [:python_version < "3.12"]
13
- typing-extensions>=4.13.1
13
+ typing-extensions>=4.13.2
14
14
 
15
15
  [all]
16
16
  easyocr>=1.7.2
@@ -18,7 +18,7 @@ gmft>=0.4.1
18
18
  paddleocr>=2.10.0
19
19
  paddlepaddle>=3.0.0
20
20
  semantic-text-splitter>=0.25.1
21
- setuptools>=76.0.0
21
+ setuptools>=79.0.1
22
22
 
23
23
  [chunking]
24
24
  semantic-text-splitter>=0.25.1
@@ -32,4 +32,4 @@ gmft>=0.4.1
32
32
  [paddleocr]
33
33
  paddleocr>=2.10.0
34
34
  paddlepaddle>=3.0.0
35
- setuptools>=76.0.0
35
+ setuptools>=79.0.1
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "kreuzberg"
3
- version = "3.1.3"
3
+ version = "3.1.4"
4
4
  description = "A text extraction library supporting PDFs, images, office documents and more"
5
5
  readme = "README.md"
6
6
  keywords = [
@@ -40,12 +40,12 @@ dependencies = [
40
40
  "anyio>=4.9.0",
41
41
  "charset-normalizer>=3.4.1",
42
42
  "exceptiongroup>=1.2.2; python_version<'3.11'",
43
- "html-to-markdown>=1.3.0",
43
+ "html-to-markdown>=1.3.2",
44
44
  "playa-pdf>=0.4.1",
45
45
  "pypdfium2==4.30.0", # pinned due to bug in 4.30.1, until v5 is stable
46
46
  "python-calamine>=0.3.2",
47
47
  "python-pptx>=1.0.2",
48
- "typing-extensions>=4.13.1; python_version<'3.12'",
48
+ "typing-extensions>=4.13.2; python_version<'3.12'",
49
49
  ]
50
50
 
51
51
  optional-dependencies.all = [
@@ -58,7 +58,7 @@ optional-dependencies.all = [
58
58
  "paddlepaddle>=3.0.0",
59
59
  # chunking
60
60
  "semantic-text-splitter>=0.25.1",
61
- "setuptools>=76.0.0",
61
+ "setuptools>=79.0.1",
62
62
  ]
63
63
  optional-dependencies.chunking = [
64
64
  "semantic-text-splitter>=0.25.1",
@@ -72,7 +72,7 @@ optional-dependencies.gmft = [
72
72
  optional-dependencies.paddleocr = [
73
73
  "paddleocr>=2.10.0",
74
74
  "paddlepaddle>=3.0.0",
75
- "setuptools>=76.0.0",
75
+ "setuptools>=79.0.1",
76
76
  ]
77
77
  urls.homepage = "https://github.com/Goldziher/kreuzberg"
78
78
 
@@ -85,8 +85,8 @@ dev = [
85
85
  "pytest-cov>=6.1.1",
86
86
  "pytest-mock>=3.14.0",
87
87
  "pytest-timeout>=2.3.1",
88
- "ruff>=0.11.4",
89
- "trio>=0.29.0",
88
+ "ruff>=0.11.7",
89
+ "trio>=0.30.0",
90
90
  "uv-bump",
91
91
  ]
92
92
  doc = [
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes