kreuzberg 3.15.0__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_mcp/server.py CHANGED
@@ -22,7 +22,6 @@ from kreuzberg.extraction import (
22
22
 
23
23
  mcp = FastMCP("Kreuzberg Text Extraction")
24
24
 
25
- # Security and performance limits
26
25
  MAX_BATCH_SIZE = 100
27
26
 
28
27
 
@@ -46,7 +45,6 @@ def _validate_file_path(file_path: str) -> Path:
46
45
  context={"file_path": file_path, "error": str(e)},
47
46
  ) from e
48
47
 
49
- # Check for path traversal attempts
50
48
  if ".." in file_path and not file_path.startswith("/"):
51
49
  raise ValidationError(
52
50
  "Path traversal detected in file path",
@@ -73,7 +71,6 @@ def _validate_file_path_with_context(file_path: str, index: int, total: int) ->
73
71
  try:
74
72
  return _validate_file_path(file_path)
75
73
  except ValidationError as e:
76
- # Add context about which file in the batch failed
77
74
  e.context = e.context or {}
78
75
  e.context["batch_index"] = index
79
76
  e.context["total_files"] = total
@@ -99,7 +96,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
99
96
  context={"context": context_info},
100
97
  )
101
98
 
102
- # Check for whitespace-only content
103
99
  if not content_base64.strip():
104
100
  raise ValidationError(
105
101
  "Base64 content cannot be whitespace only",
@@ -126,7 +122,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
126
122
  def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
127
123
  base_config = discover_config()
128
124
 
129
- # Extract Tesseract-specific parameters from kwargs first
130
125
  tesseract_lang = kwargs.pop("tesseract_lang", None)
131
126
  tesseract_psm = kwargs.pop("tesseract_psm", None)
132
127
  tesseract_output_format = kwargs.pop("tesseract_output_format", None)
@@ -151,7 +146,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
151
146
  }
152
147
  config_dict = config_dict | kwargs
153
148
 
154
- # Handle Tesseract OCR configuration
155
149
  ocr_backend = config_dict.get("ocr_backend")
156
150
  if ocr_backend == "tesseract" and (
157
151
  tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
@@ -174,10 +168,8 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
174
168
  tesseract_config_dict["enable_table_detection"] = True
175
169
 
176
170
  if tesseract_config_dict:
177
- # Merge with existing tesseract config if present
178
171
  existing_ocr_config = config_dict.get("ocr_config")
179
172
  if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
180
- # Convert existing config to dict, merge, and recreate
181
173
  existing_dict = existing_ocr_config.to_dict()
182
174
  merged_dict = existing_dict | tesseract_config_dict
183
175
  config_dict["ocr_config"] = TesseractConfig(**merged_dict)
@@ -206,7 +198,6 @@ def extract_document( # noqa: PLR0913
206
198
  tesseract_output_format: str | None = None,
207
199
  enable_table_detection: bool | None = None,
208
200
  ) -> dict[str, Any]:
209
- # Validate file path for security
210
201
  validated_path = _validate_file_path(file_path)
211
202
  config = _create_config_with_overrides(
212
203
  force_ocr=force_ocr,
@@ -289,7 +280,6 @@ def batch_extract_document( # noqa: PLR0913
289
280
  tesseract_output_format: str | None = None,
290
281
  enable_table_detection: bool | None = None,
291
282
  ) -> list[dict[str, Any]]:
292
- # Validate batch size
293
283
  if len(file_paths) > MAX_BATCH_SIZE:
294
284
  raise ValidationError(
295
285
  f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -302,7 +292,6 @@ def batch_extract_document( # noqa: PLR0913
302
292
  context={"file_paths": file_paths},
303
293
  )
304
294
 
305
- # Validate all file paths for security
306
295
  validated_paths = []
307
296
  for i, file_path in enumerate(file_paths):
308
297
  validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
@@ -346,7 +335,6 @@ def batch_extract_bytes( # noqa: PLR0913
346
335
  tesseract_output_format: str | None = None,
347
336
  enable_table_detection: bool | None = None,
348
337
  ) -> list[dict[str, Any]]:
349
- # Validate input
350
338
  if not content_items:
351
339
  raise ValidationError("content_items cannot be empty", context={"content_items": content_items})
352
340
 
@@ -355,7 +343,6 @@ def batch_extract_bytes( # noqa: PLR0913
355
343
  "content_items must be a list", context={"content_items_type": type(content_items).__name__}
356
344
  )
357
345
 
358
- # Validate batch size
359
346
  if len(content_items) > MAX_BATCH_SIZE:
360
347
  raise ValidationError(
361
348
  f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -379,17 +366,14 @@ def batch_extract_bytes( # noqa: PLR0913
379
366
  enable_table_detection=enable_table_detection,
380
367
  )
381
368
 
382
- # Convert list of dicts to list of tuples (bytes, mime_type)
383
369
  contents = []
384
370
  for i, item in enumerate(content_items):
385
- # Validate item structure
386
371
  if not isinstance(item, dict):
387
372
  raise ValidationError(
388
373
  f"Item at index {i} must be a dictionary",
389
374
  context={"item_index": i, "item_type": type(item).__name__, "item": item},
390
375
  )
391
376
 
392
- # Check for required keys
393
377
  if "content_base64" not in item:
394
378
  raise ValidationError(
395
379
  f"Item at index {i} is missing required key 'content_base64'",
@@ -405,11 +389,9 @@ def batch_extract_bytes( # noqa: PLR0913
405
389
  content_base64 = item["content_base64"]
406
390
  mime_type = item["mime_type"]
407
391
 
408
- # Validate base64 content
409
392
  try:
410
393
  content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
411
394
  except ValidationError as e:
412
- # Add batch-specific context
413
395
  e.context = e.context or {}
414
396
  e.context["item_index"] = i
415
397
  e.context["total_items"] = len(content_items)
@@ -426,7 +408,6 @@ def extract_simple(
426
408
  file_path: str,
427
409
  mime_type: str | None = None,
428
410
  ) -> str:
429
- # Validate file path for security
430
411
  validated_path = _validate_file_path(file_path)
431
412
  config = _create_config_with_overrides()
432
413
  result = extract_file_sync(str(validated_path), mime_type, config)
@@ -467,7 +448,6 @@ def get_supported_formats() -> str:
467
448
 
468
449
  @mcp.prompt()
469
450
  def extract_and_summarize(file_path: str) -> list[TextContent]:
470
- # Validate file path for security
471
451
  validated_path = _validate_file_path(file_path)
472
452
  result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())
473
453
 
@@ -481,7 +461,6 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
481
461
 
482
462
  @mcp.prompt()
483
463
  def extract_structured(file_path: str) -> list[TextContent]:
484
- # Validate file path for security
485
464
  validated_path = _validate_file_path(file_path)
486
465
  config = _create_config_with_overrides(
487
466
  extract_entities=True,
@@ -33,22 +33,39 @@ except ImportError: # pragma: no cover
33
33
 
34
34
  if TYPE_CHECKING:
35
35
  import easyocr
36
- import numpy as np
37
36
  import torch
37
+ else:
38
+ easyocr: Any = None
39
+ torch: Any = None
40
+
41
+ HAS_EASYOCR: bool = False
42
+
43
+
44
+ def _import_easyocr() -> tuple[Any, Any]:
45
+ global HAS_EASYOCR, easyocr, torch
46
+
47
+ # If easyocr is already set (either real module or mock), return it
48
+ if easyocr is not None:
49
+ return easyocr, torch
50
+
51
+ # If explicitly disabled for testing
52
+ if not HAS_EASYOCR and easyocr is None:
53
+ return None, None
38
54
 
39
- HAS_EASYOCR: bool
40
- if not TYPE_CHECKING:
41
55
  try:
42
- import easyocr
43
- import numpy as np
44
- import torch
56
+ import easyocr as _easyocr # noqa: PLC0415
45
57
 
58
+ try:
59
+ import torch as _torch # noqa: PLC0415
60
+ except ImportError:
61
+ _torch = None # type: ignore[assignment]
62
+
63
+ easyocr = _easyocr
64
+ torch = _torch
46
65
  HAS_EASYOCR = True
66
+ return easyocr, torch
47
67
  except ImportError:
48
- HAS_EASYOCR = False
49
- easyocr: Any = None
50
- np: Any = None
51
- torch: Any = None
68
+ return None, None
52
69
 
53
70
 
54
71
  EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
@@ -142,6 +159,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
142
159
  _reader: ClassVar[Any] = None
143
160
 
144
161
  async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
162
+ try:
163
+ import numpy as np # noqa: PLC0415
164
+ except ImportError as e:
165
+ raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
166
+
145
167
  use_cache = kwargs.pop("use_cache", True)
146
168
 
147
169
  cache_kwargs = None
@@ -292,7 +314,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
292
314
 
293
315
  @classmethod
294
316
  def _is_gpu_available(cls) -> bool:
295
- if not HAS_EASYOCR or torch is None:
317
+ # Use the module-level torch variable directly to respect patches
318
+ if torch is None:
296
319
  return False
297
320
  return bool(torch.cuda.is_available())
298
321
 
@@ -301,13 +324,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
301
324
  if cls._reader is not None:
302
325
  return
303
326
 
304
- if not HAS_EASYOCR or easyocr is None:
327
+ # Validate language first before attempting import
328
+ languages = cls._validate_language_code(kwargs.pop("language", "en"))
329
+
330
+ easyocr_module, _ = _import_easyocr()
331
+ if easyocr_module is None:
305
332
  raise MissingDependencyError.create_for_package(
306
333
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
307
334
  )
308
335
 
309
- languages = cls._validate_language_code(kwargs.pop("language", "en"))
310
-
311
336
  device_info = cls._resolve_device_config(**kwargs)
312
337
  use_gpu = device_info.device_type in ("cuda", "mps")
313
338
 
@@ -318,7 +343,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
318
343
 
319
344
  try:
320
345
  cls._reader = await run_sync(
321
- easyocr.Reader,
346
+ easyocr_module.Reader,
322
347
  languages,
323
348
  gpu=use_gpu,
324
349
  verbose=False,
@@ -382,6 +407,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
382
407
  return languages
383
408
 
384
409
  def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
410
+ try:
411
+ import numpy as np # noqa: PLC0415
412
+ except ImportError as e:
413
+ raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
414
+
385
415
  use_cache = kwargs.pop("use_cache", True)
386
416
 
387
417
  cache_kwargs = None
@@ -453,13 +483,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
453
483
  if cls._reader is not None:
454
484
  return
455
485
 
456
- if not HAS_EASYOCR or easyocr is None:
486
+ # Validate language first before attempting import
487
+ languages = cls._validate_language_code(kwargs.pop("language", "en"))
488
+
489
+ easyocr_module, _ = _import_easyocr()
490
+ if easyocr_module is None:
457
491
  raise MissingDependencyError.create_for_package(
458
492
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
459
493
  )
460
494
 
461
- languages = cls._validate_language_code(kwargs.pop("language", "en"))
462
-
463
495
  device_info = cls._resolve_device_config(**kwargs)
464
496
  use_gpu = device_info.device_type in ("cuda", "mps")
465
497
 
@@ -469,7 +501,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
469
501
  kwargs.setdefault("recog_network", "standard")
470
502
 
471
503
  try:
472
- cls._reader = easyocr.Reader(
504
+ cls._reader = easyocr_module.Reader(
473
505
  languages,
474
506
  gpu=use_gpu,
475
507
  verbose=False,
@@ -29,6 +29,7 @@ from kreuzberg._ocr._base import OCRBackend
29
29
  from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
30
30
  from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
31
31
  from kreuzberg._utils._cache import get_ocr_cache
32
+ from kreuzberg._utils._html_streaming import should_use_streaming
32
33
  from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
33
34
  from kreuzberg._utils._string import normalize_spaces
34
35
  from kreuzberg._utils._sync import run_sync
@@ -510,7 +511,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
510
511
  escape_asterisks=False,
511
512
  escape_underscores=False,
512
513
  extract_metadata=False,
513
- strip="meta title",
514
+ strip=["meta", "title"],
514
515
  )
515
516
 
516
517
  tables: list[TableData] = []
@@ -532,6 +533,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
532
533
  config_dict = config.to_dict()
533
534
  config_dict["custom_converters"] = all_converters
534
535
 
536
+ use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
537
+ config_dict["stream_processing"] = use_streaming
538
+ config_dict["chunk_size"] = chunk_size
539
+
535
540
  try:
536
541
  markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
537
542
  markdown_content = normalize_spaces(markdown_content)
@@ -676,12 +681,18 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
676
681
  escape_asterisks=False,
677
682
  escape_underscores=False,
678
683
  extract_metadata=False,
679
- strip="meta title",
684
+ strip=["meta", "title"],
680
685
  )
681
686
 
687
+ config_dict = html_config.to_dict()
688
+
689
+ use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
690
+ config_dict["stream_processing"] = use_streaming
691
+ config_dict["chunk_size"] = chunk_size
692
+
682
693
  markdown_content = html_to_markdown.convert_to_markdown(
683
694
  hocr_content,
684
- **html_config.to_dict(),
695
+ **config_dict,
685
696
  )
686
697
 
687
698
  markdown_content = normalize_spaces(markdown_content)
kreuzberg/_types.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import sys
4
- from collections.abc import Awaitable, Callable, Iterable, Mapping
4
+ from collections.abc import Awaitable, Callable, Mapping
5
5
  from dataclasses import asdict, dataclass, field
6
6
  from enum import Enum
7
7
  from pathlib import Path
@@ -591,6 +591,8 @@ class ImagePreprocessingMetadata(NamedTuple):
591
591
 
592
592
 
593
593
  class Metadata(TypedDict, total=False):
594
+ abstract: NotRequired[str]
595
+ """Document abstract or summary."""
594
596
  authors: NotRequired[list[str]]
595
597
  """List of document authors."""
596
598
  categories: NotRequired[list[str]]
@@ -677,9 +679,26 @@ class Metadata(TypedDict, total=False):
677
679
  """Error message if extraction failed."""
678
680
  error_context: NotRequired[dict[str, Any]]
679
681
  """Error context information for debugging."""
682
+ json_schema: NotRequired[dict[str, Any]]
683
+ """JSON schema information extracted from structured data."""
684
+ notes: NotRequired[list[str]]
685
+ """Notes or additional information extracted from documents."""
686
+ note: NotRequired[str]
687
+ """Single note or annotation."""
688
+ name: NotRequired[str]
689
+ """Name field from structured data."""
690
+ body: NotRequired[str]
691
+ """Body text content."""
692
+ text: NotRequired[str]
693
+ """Generic text content."""
694
+ message: NotRequired[str]
695
+ """Message or communication content."""
696
+ attributes: NotRequired[dict[str, Any]]
697
+ """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
680
698
 
681
699
 
682
700
  _VALID_METADATA_KEYS = {
701
+ "abstract",
683
702
  "authors",
684
703
  "categories",
685
704
  "citations",
@@ -722,6 +741,14 @@ _VALID_METADATA_KEYS = {
722
741
  "source_format",
723
742
  "error",
724
743
  "error_context",
744
+ "json_schema",
745
+ "notes",
746
+ "note",
747
+ "name",
748
+ "body",
749
+ "text",
750
+ "message",
751
+ "attributes",
725
752
  }
726
753
 
727
754
 
@@ -730,9 +757,29 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
730
757
  return {}
731
758
 
732
759
  normalized: Metadata = {}
760
+ attributes: dict[str, Any] = {}
761
+
733
762
  for key, value in data.items():
734
- if key in _VALID_METADATA_KEYS and value is not None:
735
- normalized[key] = value # type: ignore[literal-required]
763
+ if value is not None:
764
+ if key in _VALID_METADATA_KEYS:
765
+ normalized[key] = value # type: ignore[literal-required]
766
+ elif "." in key and key.split(".")[-1] in {
767
+ "title",
768
+ "name",
769
+ "subject",
770
+ "description",
771
+ "content",
772
+ "body",
773
+ "text",
774
+ "message",
775
+ "note",
776
+ "abstract",
777
+ "summary",
778
+ }:
779
+ attributes[key] = value
780
+
781
+ if attributes:
782
+ normalized["attributes"] = attributes
736
783
 
737
784
  return normalized
738
785
 
@@ -835,6 +882,30 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
835
882
  ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
836
883
 
837
884
 
885
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
886
+ class JSONExtractionConfig(ConfigDict):
887
+ extract_schema: bool = False
888
+ """Extract and include JSON schema information in metadata."""
889
+ custom_text_field_patterns: frozenset[str] | None = None
890
+ """Custom patterns to identify text fields beyond default keywords."""
891
+ max_depth: int = 10
892
+ """Maximum nesting depth to process in JSON structures."""
893
+ array_item_limit: int = 1000
894
+ """Maximum number of array items to process to prevent memory issues."""
895
+ include_type_info: bool = False
896
+ """Include data type information in extracted content."""
897
+ flatten_nested_objects: bool = True
898
+ """Flatten nested objects using dot notation for better text extraction."""
899
+
900
+ def __post_init__(self) -> None:
901
+ if self.max_depth <= 0:
902
+ raise ValidationError("max_depth must be positive", context={"max_depth": self.max_depth})
903
+ if self.array_item_limit <= 0:
904
+ raise ValidationError(
905
+ "array_item_limit must be positive", context={"array_item_limit": self.array_item_limit}
906
+ )
907
+
908
+
838
909
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
839
910
  class ExtractionConfig(ConfigDict):
840
911
  force_ocr: bool = False
@@ -924,6 +995,8 @@ class ExtractionConfig(ConfigDict):
924
995
  """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
925
996
  html_to_markdown_config: HTMLToMarkdownConfig | None = None
926
997
  """Configuration for HTML to Markdown conversion. If None, uses default settings."""
998
+ json_config: JSONExtractionConfig | None = None
999
+ """Configuration for enhanced JSON extraction features. If None, uses standard JSON processing."""
927
1000
  use_cache: bool = True
928
1001
  """Whether to use caching for extraction results. Set to False to disable all caching."""
929
1002
  target_dpi: int = 150
@@ -1060,70 +1133,68 @@ class ExtractionConfig(ConfigDict):
1060
1133
 
1061
1134
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
1062
1135
  class HTMLToMarkdownConfig:
1063
- stream_processing: bool = False
1064
- """Enable streaming mode for processing large HTML documents."""
1065
- chunk_size: int = 1024
1066
- """Size of chunks when stream_processing is enabled."""
1067
- chunk_callback: Callable[[str], None] | None = None
1068
- """Callback function invoked for each chunk during stream processing."""
1069
- progress_callback: Callable[[int, int], None] | None = None
1070
- """Callback function for progress updates (current, total)."""
1071
- parser: str | None = "lxml"
1072
- """BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
1073
1136
  autolinks: bool = True
1074
- """Convert URLs to clickable links automatically."""
1137
+ """Automatically convert valid URLs to Markdown links."""
1138
+ br_in_tables: bool = False
1139
+ """Use <br> tags for line breaks in table cells instead of spaces."""
1075
1140
  bullets: str = "*+-"
1076
1141
  """Characters to use for unordered list bullets."""
1077
1142
  code_language: str = ""
1078
- """Default language for code blocks."""
1143
+ """Default language identifier for fenced code blocks."""
1079
1144
  code_language_callback: Callable[[Any], str] | None = None
1080
- """Callback to determine code language dynamically."""
1081
- convert: str | Iterable[str] | None = None
1082
- """HTML tags to convert. If None, all supported tags are converted."""
1145
+ """Function to dynamically determine code block language."""
1146
+ convert: list[str] | None = None
1147
+ """List of HTML tags to convert (None = all supported tags)."""
1083
1148
  convert_as_inline: bool = False
1084
- """Convert block elements as inline elements."""
1085
- custom_converters: Mapping[Any, Any] | None = None
1086
- """Custom converters for specific HTML elements."""
1149
+ """Treat content as inline elements only."""
1150
+ custom_converters: Mapping[str, Callable[..., str]] | None = None
1151
+ """Mapping of HTML tag names to custom converter functions."""
1087
1152
  default_title: bool = False
1088
- """Use a default title if none is found."""
1153
+ """Use default titles for elements like links."""
1089
1154
  escape_asterisks: bool = True
1090
- """Escape asterisks in text to prevent unintended emphasis."""
1155
+ """Escape * characters to prevent unintended formatting."""
1091
1156
  escape_misc: bool = True
1092
- """Escape miscellaneous characters that have special meaning in Markdown."""
1157
+ """Escape miscellaneous characters to prevent Markdown conflicts."""
1093
1158
  escape_underscores: bool = True
1094
- """Escape underscores in text to prevent unintended emphasis."""
1159
+ """Escape _ characters to prevent unintended formatting."""
1095
1160
  extract_metadata: bool = True
1096
- """Extract metadata from HTML head section."""
1161
+ """Extract document metadata as comment header."""
1097
1162
  heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
1098
1163
  """Style for markdown headings."""
1099
1164
  highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
1100
1165
  """Style for highlighting text."""
1101
- keep_inline_images_in: Iterable[str] | None = None
1102
- """HTML tags where inline images should be preserved."""
1166
+ keep_inline_images_in: list[str] | None = None
1167
+ """Tags where inline images should be preserved."""
1168
+ list_indent_type: Literal["spaces", "tabs"] = "spaces"
1169
+ """Type of indentation to use for lists."""
1170
+ list_indent_width: int = 4
1171
+ """Number of spaces per indentation level (use 2 for Discord/Slack)."""
1103
1172
  newline_style: Literal["spaces", "backslash"] = "spaces"
1104
1173
  """Style for line breaks in markdown."""
1105
- strip: str | Iterable[str] | None = None
1106
- """HTML tags to strip completely from output."""
1174
+ preprocess_html: bool = False
1175
+ """Enable HTML preprocessing to clean messy HTML."""
1176
+ preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
1177
+ """Preprocessing level for cleaning HTML."""
1178
+ remove_forms: bool = True
1179
+ """Remove form elements during preprocessing."""
1180
+ remove_navigation: bool = True
1181
+ """Remove navigation elements during preprocessing."""
1182
+ strip: list[str] | None = None
1183
+ """List of HTML tags to remove from output."""
1107
1184
  strip_newlines: bool = False
1108
- """Strip newlines from the output."""
1185
+ """Remove newlines from HTML input before processing."""
1109
1186
  strong_em_symbol: Literal["*", "_"] = "*"
1110
1187
  """Symbol to use for strong/emphasis formatting."""
1111
1188
  sub_symbol: str = ""
1112
1189
  """Symbol to use for subscript text."""
1113
1190
  sup_symbol: str = ""
1114
1191
  """Symbol to use for superscript text."""
1192
+ whitespace_mode: Literal["normalized", "strict"] = "normalized"
1193
+ """Whitespace handling mode."""
1115
1194
  wrap: bool = False
1116
1195
  """Enable text wrapping."""
1117
1196
  wrap_width: int = 80
1118
- """Width for text wrapping when wrap is True."""
1119
- preprocess_html: bool = True
1120
- """Enable HTML preprocessing to clean up the input."""
1121
- preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
1122
- """Preprocessing level for cleaning HTML."""
1123
- remove_navigation: bool = True
1124
- """Remove navigation elements from HTML."""
1125
- remove_forms: bool = True
1126
- """Remove form elements from HTML."""
1197
+ """Width for text wrapping."""
1127
1198
 
1128
1199
  def to_dict(self) -> dict[str, Any]:
1129
1200
  result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
@@ -0,0 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ _STREAMING_THRESHOLD_KB = 10
4
+ _LARGE_FILE_THRESHOLD_MB = 1
5
+ _DEFAULT_CHUNK_SIZE = 2048
6
+ _LARGE_FILE_CHUNK_SIZE = 4096
7
+
8
+ _STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
9
+ _LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024
10
+
11
+
12
+ def should_use_streaming(content_size: int) -> tuple[bool, int]:
13
+ if content_size < 0:
14
+ return False, _DEFAULT_CHUNK_SIZE
15
+
16
+ if content_size > _STREAMING_THRESHOLD_BYTES:
17
+ if content_size > _LARGE_FILE_THRESHOLD_BYTES:
18
+ return True, _LARGE_FILE_CHUNK_SIZE
19
+ return True, _DEFAULT_CHUNK_SIZE
20
+ return False, _DEFAULT_CHUNK_SIZE
@@ -1,11 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import is_dataclass
4
- from typing import Any, TypeVar, cast
4
+ from typing import Any, TypeVar
5
5
 
6
6
  import msgspec
7
7
  from msgspec import MsgspecError
8
- from msgspec.msgpack import decode, encode
9
8
 
10
9
  T = TypeVar("T")
11
10
 
@@ -42,18 +41,26 @@ def encode_hook(obj: Any) -> Any:
42
41
  raise TypeError(f"Unsupported type: {type(obj)!r}")
43
42
 
44
43
 
45
- def deserialize(value: str | bytes, target_type: type[T]) -> T:
44
+ def deserialize(value: str | bytes, target_type: type[T], json: bool = False) -> T:
45
+ decoder = msgspec.json.decode if json else msgspec.msgpack.decode
46
+
47
+ if json:
48
+ data = value.encode() if isinstance(value, str) else value
49
+ else:
50
+ data = value.encode() if isinstance(value, str) else value
51
+
46
52
  try:
47
- return decode(cast("bytes", value), type=target_type, strict=False)
53
+ return decoder(data, type=target_type, strict=False)
48
54
  except MsgspecError as e:
49
55
  raise ValueError(f"Failed to deserialize to {target_type.__name__}: {e}") from e
50
56
 
51
57
 
52
- def serialize(value: Any, **kwargs: Any) -> bytes:
58
+ def serialize(value: Any, json: bool = False, **kwargs: Any) -> bytes:
53
59
  if isinstance(value, dict) and kwargs:
54
60
  value = value | kwargs
55
61
 
62
+ encoder = msgspec.json.encode if json else msgspec.msgpack.encode
56
63
  try:
57
- return encode(value, enc_hook=encode_hook)
64
+ return encoder(value, enc_hook=encode_hook)
58
65
  except (MsgspecError, TypeError) as e:
59
66
  raise ValueError(f"Failed to serialize {type(value).__name__}: {e}") from e