kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_playa.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from datetime import datetime
3
+ from datetime import datetime, timezone
4
4
  from typing import TYPE_CHECKING, Any, cast
5
5
 
6
6
  from playa import asobj, parse
@@ -25,18 +25,6 @@ BOM_CHAR = "\ufeff"
25
25
 
26
26
 
27
27
  async def extract_pdf_metadata(pdf_content: bytes, password: str = "") -> Metadata:
28
- """Extract metadata from a PDF document.
29
-
30
- Args:
31
- pdf_content: The bytes of the PDF document.
32
- password: Password for encrypted PDF files.
33
-
34
- Raises:
35
- ParsingError: If the PDF metadata could not be extracted.
36
-
37
- Returns:
38
- A dictionary of metadata extracted from the PDF.
39
- """
40
28
  try:
41
29
  document = parse(pdf_content, max_workers=1, password=password)
42
30
  metadata: Metadata = {}
@@ -115,7 +103,6 @@ def _extract_keyword_metadata(pdf_info: dict[str, Any], result: Metadata) -> Non
115
103
  if keywords := pdf_info.get("keywords"):
116
104
  if isinstance(keywords, (str, bytes)):
117
105
  kw_str = decode_text(keywords)
118
- # Combine multiple operations into a single comprehension
119
106
  result["keywords"] = [k.strip() for part in kw_str.replace(";", ",").split(",") if (k := part.strip())]
120
107
  elif isinstance(keywords, list):
121
108
  result["keywords"] = [decode_text(k) for k in keywords]
@@ -144,8 +131,10 @@ def _parse_date_string(date_str: str) -> str:
144
131
  second = date_str[12:14]
145
132
  time_part = f"T{hour}:{minute}:{second}"
146
133
  if time_part:
147
- return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y-%m-%dT%H:%M:%S").isoformat() # noqa: DTZ007
148
- return datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").isoformat() # noqa: DTZ007
134
+ dt = datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
135
+ return dt.isoformat()
136
+ dt = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").replace(tzinfo=timezone.utc)
137
+ return dt.isoformat()
149
138
  return date_str
150
139
 
151
140
 
@@ -246,7 +235,6 @@ def _collect_document_permissions(document: Document) -> list[str]:
246
235
 
247
236
 
248
237
  def _extract_structure_information(document: Document, result: Metadata) -> None:
249
- """Extract language and subtitle from document structure."""
250
238
  if document.structure:
251
239
  languages = set()
252
240
  subtitle = None
@@ -279,20 +267,6 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
279
267
 
280
268
 
281
269
  def extract_pdf_metadata_sync(pdf_content: bytes, password: str = "") -> Metadata:
282
- """Synchronous version of extract_pdf_metadata.
283
-
284
- Extract metadata from a PDF document without using async/await.
285
-
286
- Args:
287
- pdf_content: The bytes of the PDF document.
288
- password: Password for encrypted PDF files.
289
-
290
- Raises:
291
- ParsingError: If the PDF metadata could not be extracted.
292
-
293
- Returns:
294
- A dictionary of metadata extracted from the PDF.
295
- """
296
270
  try:
297
271
  document = parse(pdf_content, max_workers=1, password=password)
298
272
  metadata: Metadata = {}
kreuzberg/_registry.py CHANGED
@@ -28,14 +28,6 @@ if TYPE_CHECKING:
28
28
 
29
29
 
30
30
  class ExtractorRegistry:
31
- """Manages extractors for different MIME types and their configurations.
32
-
33
- This class provides functionality to register, unregister, and retrieve
34
- extractors based on MIME types. It supports both synchronous and asynchronous
35
- operations for managing extractors. A default set of extractors is also
36
- maintained alongside user-registered extractors.
37
- """
38
-
39
31
  _default_extractors: ClassVar[list[type[Extractor]]] = [
40
32
  PDFExtractor,
41
33
  OfficeDocumentExtractor,
@@ -59,15 +51,6 @@ class ExtractorRegistry:
59
51
  @classmethod
60
52
  @lru_cache
61
53
  def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
62
- """Gets the extractor for the mimetype.
63
-
64
- Args:
65
- mime_type: The mime type of the content.
66
- config: Extraction options object, defaults to the default object.
67
-
68
- Returns:
69
- The extractor
70
- """
71
54
  extractors: list[type[Extractor]] = [
72
55
  *cls._registered_extractors,
73
56
  *cls._default_extractors,
@@ -81,30 +64,11 @@ class ExtractorRegistry:
81
64
 
82
65
  @classmethod
83
66
  def add_extractor(cls, extractor: type[Extractor]) -> None:
84
- """Add an extractor to the registry.
85
-
86
- Note:
87
- Extractors are tried in the order they are added: first added, first tried.
88
-
89
- Args:
90
- extractor: The extractor to add.
91
-
92
- Returns:
93
- None
94
- """
95
67
  cls._registered_extractors.append(extractor)
96
68
  cls.get_extractor.cache_clear()
97
69
 
98
70
  @classmethod
99
71
  def remove_extractor(cls, extractor: type[Extractor]) -> None:
100
- """Remove an extractor from the registry.
101
-
102
- Args:
103
- extractor: The extractor to remove.
104
-
105
- Returns:
106
- None
107
- """
108
72
  try:
109
73
  cls._registered_extractors.remove(extractor)
110
74
  cls.get_extractor.cache_clear()