deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,169 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+
8
+
9
def _parse_args(argv: list[str]) -> argparse.Namespace:
    """Build the ``deepdoc-download-models`` command line and parse *argv*.

    Args:
        argv: Command-line arguments, without the program name.

    Returns:
        The parsed namespace. When ``--bundle`` was not given, ``bundle`` is
        filled with every key of ``model_store.BUNDLES``.
    """
    # model_store is imported at call time rather than at module import.
    from deepdoc.common import model_store

    known_bundles = tuple(model_store.BUNDLES.keys())

    # Default behavior: no args downloads everything using the remote provider into the default cache dirs
    # (~/.cache/deepdoc unless DEEPDOC_MODEL_HOME is set).
    option_table = (
        (
            ("--provider",),
            dict(
                default="modelscope",
                choices=("auto", "local", "modelscope"),
                help="Model provider to use (default: %(default)s).",
            ),
        ),
        (
            ("--model-home",),
            dict(
                default=None,
                help="Model cache root (default: $DEEPDOC_MODEL_HOME or ~/.cache/deepdoc).",
            ),
        ),
        (
            ("--offline",),
            dict(
                action="store_true",
                help="Disable remote downloads (also disables NLTK auto-download).",
            ),
        ),
        (
            ("--bundle",),
            dict(
                action="append",
                choices=known_bundles,
                help="Bundle(s) to download. Repeatable. Default: all bundles.",
            ),
        ),
        (
            ("--no-nltk",),
            dict(
                action="store_true",
                help="Skip downloading required NLTK resources used by the tokenizer.",
            ),
        ),
        (
            ("--nltk-data-dir",),
            dict(
                default=None,
                help="Where to store NLTK data (default: $DEEPDOC_NLTK_DATA_DIR, $NLTK_DATA, or ~/.cache/deepdoc/nltk_data).",
            ),
        ),
        (
            ("--no-tiktoken",),
            dict(
                action="store_true",
                help="Skip downloading the cached cl100k_base tiktoken file used by token_utils.",
            ),
        ),
        (
            ("--tiktoken-cache-dir",),
            dict(
                default=None,
                help="Where to store the cached tiktoken file (default: $DEEPDOC_TIKTOKEN_CACHE_DIR, $TIKTOKEN_CACHE_DIR, $DEEPDOC_MODEL_HOME/tiktoken_cache, or ~/.cache/deepdoc/tiktoken_cache).",
            ),
        ),
        (
            ("-v", "--verbose"),
            dict(
                action="count",
                default=0,
                help="Increase logging verbosity (repeatable).",
            ),
        ),
    )

    cli = argparse.ArgumentParser(
        prog="deepdoc-download-models",
        description="Download/cache all Deepdoc model bundles, NLTK data, and tiktoken assets for offline use.",
    )
    # Registered in table order, which is also the order shown by --help.
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    namespace = cli.parse_args(argv)
    if not namespace.bundle:
        # No explicit selection means "all bundles".
        namespace.bundle = list(model_store.BUNDLES.keys())
    return namespace
73
+
74
+
75
def download_all(*, provider: str = "modelscope", model_home: str | None = None, offline: bool = False) -> dict[str, str]:
    """Resolve every bundle listed in ``model_store.BUNDLES``.

    Args:
        provider: Provider name forwarded to ``model_store.resolve_bundle_dir``.
        model_home: Cache root forwarded unchanged; ``None`` leaves the choice
            to ``resolve_bundle_dir``.
        offline: Forwarded unchanged to ``resolve_bundle_dir``.

    Returns:
        A mapping from bundle name to the value returned by
        ``resolve_bundle_dir`` for that bundle, in ``BUNDLES`` iteration order.
    """
    from deepdoc.common import model_store

    return {
        name: model_store.resolve_bundle_dir(
            name,
            model_home=model_home,
            provider=provider,
            offline=offline,
        )
        for name in model_store.BUNDLES
    }
88
+
89
+
90
def main(argv: list[str] | None = None) -> int:
    """Run the download CLI and report what was resolved.

    Each selected bundle, the NLTK data and the tiktoken cache file are
    handled independently: a failure in one is recorded and the remaining
    steps still run.

    Args:
        argv: Arguments without the program name; ``sys.argv[1:]`` when ``None``.

    Returns:
        ``0`` when every step succeeded, ``1`` when at least one step failed.
    """
    args = _parse_args(list(sys.argv[1:] if argv is None else argv))

    # -v selects INFO, -vv (or more) DEBUG; the default is WARNING.
    if args.verbose >= 2:
        level = logging.DEBUG
    elif args.verbose == 1:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level, format="%(levelname)s %(message)s")

    model_home: str | None = (
        str(Path(args.model_home).expanduser().resolve()) if args.model_home else None
    )

    from deepdoc.common import model_store

    errors: list[str] = []
    bundle_dirs: dict[str, str] = {}
    tiktoken_path: str | None = None

    for name in args.bundle:
        try:
            bundle_dirs[name] = model_store.resolve_bundle_dir(
                name,
                model_home=model_home,
                provider=args.provider,
                offline=args.offline,
            )
        except Exception as exc:
            errors.append(f"{name}: {exc}")

    if not args.no_nltk:
        try:
            # Imported inside the try so an import failure is reported like any other.
            from deepdoc.depend.nltk_manager import ensure_nltk_data

            ensure_nltk_data(
                data_dir=args.nltk_data_dir,
                offline=args.offline,
            )
        except Exception as exc:
            errors.append(f"nltk: {exc}")

    if not args.no_tiktoken:
        try:
            from deepdoc.common.tiktoken_cache import configure_tiktoken_cache_env, download_cl100k_base

            downloaded = download_cl100k_base(
                cache_dir=args.tiktoken_cache_dir,
                model_home=model_home,
                offline=args.offline,
            )
            # Recorded before the env step so the path is printed even if that step fails.
            tiktoken_path = str(downloaded)
            configure_tiktoken_cache_env(
                cache_dir=args.tiktoken_cache_dir,
                model_home=model_home,
            )
        except Exception as exc:
            errors.append(f"tiktoken: {exc}")

    # Tab-separated "name<TAB>path" lines on stdout, bundles sorted by name.
    for name, directory in sorted(bundle_dirs.items()):
        print(f"{name}\t{directory}")

    if tiktoken_path:
        print(f"tiktoken\t{tiktoken_path}")

    for message in errors:
        print(f"ERROR\t{message}", file=sys.stderr)

    return 1 if errors else 0
166
+
167
+
168
+ if __name__ == "__main__":
169
+ raise SystemExit(main())
@@ -0,0 +1,15 @@
1
+ """
2
+ LLM Adapter Layer for DeepDoc
3
+
4
+ This module provides a thin adapter layer that handles LLM-related dependencies,
5
+ allowing DeepDoc to work in both FenixAOS environments and standalone configurations.
6
+ """
7
+
8
+ from .adapter import LLMAdapter, LLMType
9
+ from .vision import vision_llm_chunk
10
+
11
+ __all__ = [
12
+ "LLMAdapter",
13
+ "LLMType",
14
+ "vision_llm_chunk",
15
+ ]
@@ -0,0 +1,223 @@
1
+ """
2
+ LLM Adapter - Thin wrapper for LLM services
3
+
4
+ This module provides a unified interface for LLM services that works in both
5
+ FenixAOS environments and standalone configurations.
6
+ """
7
+
8
+ import logging
9
+ from enum import Enum
10
+ from typing import Any, Optional, Union
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Try to import from FenixAOS first, fallback to local implementations
15
+ try:
16
+ from fenixaos.core.model import LLMType as FenixLLMType
17
+ from fenixaos.core.model.chat.basic.adapter import create_base_llm
18
+ from fenixaos.core.model.model import ChatModelConfig, ImageModelConfig
19
+ FENIXAOS_AVAILABLE = True
20
+ logger.info("Using FenixAOS LLM services")
21
+ except ImportError:
22
+ FENIXAOS_AVAILABLE = False
23
+ logger.info("FenixAOS not available, using local LLM implementations")
24
+
25
+ # Local LLM implementations
26
+ try:
27
+ from ..depend.simple_cv_model import create_vision_model
28
+ LOCAL_VISION_AVAILABLE = True
29
+ except ImportError:
30
+ LOCAL_VISION_AVAILABLE = False
31
+
32
+
33
class LLMType(str, Enum):
    """Unified LLM type enumeration.

    Members also subclass ``str``, so each one compares equal to its
    lowercase value (``LLMType.CHAT == "chat"``).
    """
    CHAT = "chat"
    EMBEDDING = "embedding"
    IMAGE2TEXT = "image2text"
    SPEECH2TEXT = "speech2text"
    RERANK = "rerank"

    @classmethod
    def from_fenix(cls, fenix_type: Any) -> 'LLMType':
        """Convert a FenixAOS LLM type to the unified type.

        Args:
            fenix_type: An Enum member or any object whose string form names
                the type; matching is case-insensitive.

        Returns:
            The member whose value matches. ``CHAT`` is returned when nothing
            matches, and always when ``FENIXAOS_AVAILABLE`` is false.
        """
        if not FENIXAOS_AVAILABLE:
            return cls.CHAT

        # str() of an Enum member renders as "ClassName.MEMBER", which can
        # never equal one of our values, so compare the member's value instead.
        raw = fenix_type.value if isinstance(fenix_type, Enum) else fenix_type
        try:
            return cls(str(raw).lower())
        except ValueError:
            return cls.CHAT
55
+
56
+
57
class LLMServiceInterface:
    """Common surface every LLM service backend exposes.

    Each method here raises ``NotImplementedError``; concrete services
    override the operations they support.
    """

    def describe_with_prompt(self, image: Union[bytes, Any], prompt: Optional[str] = None) -> str:
        """Return a textual description of *image*, optionally guided by *prompt*."""
        raise NotImplementedError()

    def encode(self, texts: list) -> tuple:
        """Return embeddings for *texts*."""
        raise NotImplementedError()

    def chat(self, messages: list, **kwargs) -> str:
        """Return the completion for *messages*."""
        raise NotImplementedError()
71
+
72
+
73
class FenixAOSLLMService(LLMServiceInterface):
    """LLM service backed by a FenixAOS ``ImageModelAdapter``."""

    def __init__(self, tenant_id: Optional[str], llm_type: LLMType, llm_name: Optional[str] = None, **kwargs):
        """Store the settings and build the underlying FenixAOS service.

        Args:
            tenant_id: Stored on the instance as given.
            llm_type: Stored on the instance as given.
            llm_name: Model name; a default is used when empty.
            **kwargs: ``api_key`` and ``base_url`` are read; other keys are ignored.

        Raises:
            Exception: Whatever building the service raised, after a warning is logged.
        """
        self.tenant_id = tenant_id
        self.llm_type = llm_type
        self.llm_name = llm_name
        self._api_key = kwargs.get('api_key')
        self._base_url = kwargs.get('base_url')

        # Build the FenixAOS-backed service; failures are logged and re-raised.
        try:
            service = self._create_fenix_service()
        except Exception as e:
            logger.warning(f"Failed to create FenixAOS LLM service: {e}")
            raise
        self._service = service

    def _create_fenix_service(self):
        """Return an ``ImageModelAdapter`` configured from this instance.

        Any failure, including a failed FenixAOS import, is logged at error
        level and re-raised.
        """
        try:
            # FenixAOS components are imported only when a service is built.
            from fenixaos.core.model.image.adapter import ImageModelAdapter
            from fenixaos.core.model.model import ImageModelConfig

            model_name = self.llm_name or "gpt-4-vision-preview"
            vision_config = ImageModelConfig(
                id=f"deepdoc_vision_{self.llm_name or 'default'}",
                model_name=model_name,
                model_provider="openai",  # Default to OpenAI
                api_key=getattr(self, '_api_key', None),
                base_url=getattr(self, '_base_url', None),
            )
            return ImageModelAdapter(vision_config)
        except Exception as e:
            logger.error(f"Failed to create FenixAOS vision service: {e}")
            raise

    def describe_with_prompt(self, image: Union[bytes, Any], prompt: Optional[str] = None) -> str:
        """Delegate to the wrapped service's ``describe_with_prompt``."""
        backend = self._service
        return backend.describe_with_prompt(image, prompt)

    def encode(self, texts: list) -> tuple:
        """Delegate to the wrapped service's ``encode``."""
        backend = self._service
        return backend.encode(texts)

    def chat(self, messages: list, **kwargs) -> str:
        """Delegate to the wrapped service's ``chat``."""
        backend = self._service
        return backend.chat(messages, **kwargs)
122
+
123
+
124
class LocalLLMService(LLMServiceInterface):
    """LLM service built on the bundled local vision model.

    Only image description is backed by a model; ``encode`` and ``chat``
    are placeholders that log a warning and return fixed values.
    """

    def __init__(self, config: Optional[dict] = None, **kwargs):
        """Keep *config* and try to create the local vision model.

        Model creation failures are logged as warnings and leave the
        service without a vision model. Extra keyword arguments are accepted
        and ignored.
        """
        self.config = config or {}
        self._vision_model = None

        if not LOCAL_VISION_AVAILABLE:
            return
        try:
            self._vision_model = create_vision_model()
            logger.info("Local vision model initialized")
        except Exception as e:
            logger.warning(f"Failed to initialize local vision model: {e}")

    def describe_with_prompt(self, image: Union[bytes, Any], prompt: Optional[str] = None) -> str:
        """Describe *image* with the local model, or return a fixed notice when there is none."""
        model = self._vision_model
        if not model:
            logger.warning("No vision model available")
            return "Vision model not available"
        return model.describe_with_prompt(image, prompt)

    def encode(self, texts: list) -> tuple:
        """Placeholder: log a warning and return ``([], 0)``."""
        logger.warning("Local embedding not implemented")
        return [], 0

    def chat(self, messages: list, **kwargs) -> str:
        """Placeholder: log a warning and return a fixed notice string."""
        logger.warning("Local chat not implemented")
        return "Local chat not available"
154
+
155
+
156
class LLMAdapter:
    """Entry point that picks an LLM service and forwards calls to it."""

    def __init__(self, tenant_id: Optional[str] = None, llm_type: LLMType = LLMType.IMAGE2TEXT,
                 llm_name: Optional[str] = None, **kwargs):
        """Remember the settings and select the backing service.

        Args:
            tenant_id: Passed through to the FenixAOS service.
            llm_type: Passed through to the FenixAOS service.
            llm_name: Passed through to the FenixAOS service.
            **kwargs: Passed to whichever service is created.
        """
        self.tenant_id = tenant_id
        self.llm_type = llm_type
        self.llm_name = llm_name
        self.kwargs = kwargs

        # FenixAOS is preferred; the local service is the fallback.
        self._service = self._create_service()

    def _create_service(self) -> LLMServiceInterface:
        """Return a FenixAOS service when possible, otherwise a local one."""
        # FenixAOS is attempted whenever it is importable, regardless of tenant_id.
        if FENIXAOS_AVAILABLE:
            try:
                return FenixAOSLLMService(self.tenant_id, self.llm_type, self.llm_name, **self.kwargs)
            except Exception as e:
                logger.warning(f"FenixAOS LLM service creation failed: {e}, falling back to local")

        local_service = LocalLLMService(**self.kwargs)
        return local_service

    def describe_with_prompt(self, image: Union[bytes, Any], prompt: Optional[str] = None) -> str:
        """Forward an image-description request to the selected service."""
        service = self._service
        return service.describe_with_prompt(image, prompt)

    def encode(self, texts: list) -> tuple:
        """Forward an embedding request to the selected service."""
        service = self._service
        return service.encode(texts)

    def chat(self, messages: list, **kwargs) -> str:
        """Forward a chat request to the selected service."""
        service = self._service
        return service.chat(messages, **kwargs)

    # Compatibility methods for LLMBundle-like interface
    def bind_tools(self, toolcall_session, tools):
        """Compatibility placeholder: logs a debug message and binds nothing."""
        logger.debug("Tool binding not implemented in adapter")

    @property
    def is_tools(self) -> bool:
        """Whether tools are supported; always ``False``."""
        return False

    @property
    def max_length(self) -> int:
        """Maximum context length; a fixed default of 4096."""
        return 4096
207
+
208
+
209
def create_llm_service(tenant_id: Optional[str] = None, llm_type: LLMType = LLMType.IMAGE2TEXT,
                       llm_name: Optional[str] = None, **kwargs) -> LLMAdapter:
    """
    Build an ``LLMAdapter`` from the given settings.

    Args:
        tenant_id: Tenant ID, forwarded to the adapter
        llm_type: Type of LLM service
        llm_name: Specific model name
        **kwargs: Additional configuration, forwarded to the adapter

    Returns:
        A new LLMAdapter instance
    """
    adapter = LLMAdapter(
        tenant_id=tenant_id,
        llm_type=llm_type,
        llm_name=llm_name,
        **kwargs,
    )
    return adapter
@@ -0,0 +1,104 @@
1
+ """
2
+ Utility functions for LLM adapter
3
+ """
4
+
5
+ import re
6
+
7
+
8
def clean_markdown_block(text: str) -> str:
    """
    Strip a Markdown code-fence wrapper from text.

    A leading ```` ```markdown ```` marker (any letter case) and a trailing
    ```` ``` ```` marker are removed. When the text ends with a closing
    fence, a bare ```` ``` ```` or ```` ```md ```` opening line is removed
    as well, so a fully wrapped response is unwrapped instead of being left
    with a dangling opening fence. Fences tagged with another language are
    left alone.

    Args:
        text: Text that may be wrapped in a markdown code block

    Returns:
        str: The text without the wrapper, stripped of surrounding
        whitespace; "" for empty or None input
    """
    if not text:
        return ""

    # Original rule: only an explicit ```markdown opener is dropped.
    opening = r'^\s*```markdown\s*\n?'
    if re.search(r'```\s*$', text):
        # The text is closed by a fence, so a bare or ``md`` opening line is
        # the other half of the same wrapper. Without a closing fence the
        # opener is kept, to avoid unbalancing a code block in the content.
        opening = r'^\s*```(?:(?:markdown|md\b)\s*\n?|[ \t]*\n)'

    text = re.sub(opening, '', text, flags=re.IGNORECASE)
    text = re.sub(r'\n?\s*```\s*$', '', text)

    return text.strip()
26
+
27
+
28
def extract_image_description(text: str) -> str:
    """
    Extract the main description from vision model output.

    Markdown fence markers are removed first. Then, if the text starts with
    one of a few stock lead-ins (e.g. "The image shows") and the next word
    is an article ("a", "an", "the", any letter case), the lead-in is
    dropped. A lead-in with nothing after it, or followed by any other
    word, is left untouched.

    Args:
        text: Raw output from vision model

    Returns:
        str: Cleaned description
    """
    # Clean markdown formatting
    text = clean_markdown_block(text)

    # Common prefixes that vision models might add
    prefixes_to_remove = (
        "The image shows",
        "This image depicts",
        "The picture shows",
        "This is an image of",
        "The photo shows",
    )
    articles = ("a", "an", "the")

    lowered = text.lower()
    for prefix in prefixes_to_remove:
        if not lowered.startswith(prefix.lower()):
            continue

        remaining = text[len(prefix):].strip()
        # Compare the whole first word: a plain startswith("a") would also
        # accept "apples" or "another", and an empty remainder must not
        # replace the text with nothing.
        first_word = remaining.split(None, 1)[0].lower() if remaining else ""
        if first_word in articles:
            text = remaining
        # No prefix is a prefix of another, so at most one can match.
        break

    return text.strip()
60
+
61
+
62
def validate_image_data(image_data: bytes, max_size: int = 10 * 1024 * 1024) -> bool:
    """
    Check that a payload has an acceptable size and a known image signature.

    Args:
        image_data: Raw image bytes
        max_size: Maximum allowed size in bytes

    Returns:
        bool: True when the data is non-empty, at least 8 bytes and at most
        max_size bytes long, and starts with a JPEG, PNG, GIF, BMP or WebP
        signature; False otherwise
    """
    # Size gates: missing, oversized, or too short to carry a signature.
    if not image_data or len(image_data) > max_size or len(image_data) < 8:
        return False

    fixed_signatures = (
        b'\xff\xd8\xff',        # JPEG
        b'\x89PNG\r\n\x1a\n',   # PNG
        b'GIF87a',              # GIF
        b'GIF89a',              # GIF
        b'BM',                  # BMP
    )
    if image_data.startswith(fixed_signatures):
        return True

    # WebP: "RIFF" at offset 0 and "WEBP" at offsets 8-11.
    is_riff = image_data.startswith(b'RIFF')
    return is_riff and len(image_data) >= 12 and image_data[8:12] == b'WEBP'
@@ -0,0 +1,163 @@
1
+ """
2
+ Vision LLM utilities for DeepDoc
3
+
4
+ Enhanced vision_llm_chunk with better error handling and format support.
5
+ """
6
+
7
+ import io
8
+ import logging
9
+ from typing import Any, Callable, Optional, Union
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Try to import markdown cleaning function
14
try:
    from .utils import clean_markdown_block
except ImportError:
    def clean_markdown_block(text: str) -> str:
        """Fallback markdown cleaning function.

        Removes a leading ```` ```markdown ```` marker (any letter case) and
        a trailing ```` ``` ```` marker. When the text ends with a closing
        fence, a bare ```` ``` ```` or ```` ```md ```` opening line is
        removed too. Empty or None input yields "", matching the primary
        implementation in ``utils``.
        """
        import re

        if not text:
            return ""

        opening = r'^\s*```markdown\s*\n?'
        if re.search(r'```\s*$', text):
            # Closed by a fence: a bare or ``md`` opener is part of the same
            # wrapper. Otherwise it is kept to avoid unbalancing the content.
            opening = r'^\s*```(?:(?:markdown|md\b)\s*\n?|[ \t]*\n)'

        text = re.sub(opening, '', text, flags=re.IGNORECASE)
        text = re.sub(r'\n?\s*```\s*$', '', text)
        return text.strip()
23
+
24
+
25
def _encode_image(image: Any) -> bytes:
    """Return encoded image bytes for *image*.

    Bytes-like objects and ``io.BytesIO`` buffers are used as they are.
    Anything else must provide ``save(fp, format=...)`` and is written out
    in the first format that succeeds.

    Raises:
        ValueError: If raw input is empty, or no format could be saved.
    """
    if isinstance(image, (bytes, bytearray, memoryview, io.BytesIO)):
        # Already-encoded input: there is nothing to convert.
        data = image.getvalue() if isinstance(image, io.BytesIO) else bytes(image)
        if not data:
            raise ValueError("Image data is empty")
        return data

    buffer = io.BytesIO()
    # Formats in order of preference; later ones are fallbacks when an
    # earlier save raises.
    for fmt in ('JPEG', 'PNG', 'WEBP', 'BMP'):
        try:
            image.save(buffer, format=fmt)
        except Exception:
            # Drop any partial output before the next attempt.
            buffer.seek(0)
            buffer.truncate()
            continue
        return buffer.getvalue()

    raise ValueError("Unable to save image in any supported format")


def vision_llm_chunk(
    binary: Any,
    vision_model: Any,
    prompt: Optional[str] = None,
    callback: Optional[Callable] = None
) -> str:
    """
    Describe one image with a vision model and return the cleaned text.

    Args:
        binary: The image: an object with a ``save(fp, format=...)`` method
            (such as a PIL Image), already-encoded bytes, or a BytesIO
        vision_model: Vision model instance with describe_with_prompt method
        prompt: Optional prompt for image description
        callback: Optional ``callback(progress, message)``; invoked with
            ``-1`` and the error message when processing fails

    Returns:
        str: A newline followed by the model output with markdown fence
        markers removed, or "" when anything fails (the error is logged)
    """
    callback = callback or (lambda prog, msg: None)

    try:
        image_bytes = _encode_image(binary)

        # Call vision model
        ans = clean_markdown_block(
            vision_model.describe_with_prompt(image_bytes, prompt)
        )
        return "\n" + ans

    except Exception as e:
        error_msg = f"Vision model processing failed: {str(e)}"
        logger.error(error_msg)
        callback(-1, error_msg)
        return ""
88
+
89
+
90
def vision_llm_chunk_with_fallback(
    binary: Any,
    vision_model: Any,
    prompt: Optional[str] = None,
    callback: Optional[Callable] = None,
    fallback_text: str = "Image processing failed"
) -> str:
    """
    Run ``vision_llm_chunk`` and substitute a fallback for a blank result.

    Args:
        binary: Image binary data
        vision_model: Vision model instance
        prompt: Optional prompt
        callback: Optional callback
        fallback_text: Text returned when the result is empty or whitespace only

    Returns:
        str: The ``vision_llm_chunk`` result, or fallback_text when that
        result is blank
    """
    described = vision_llm_chunk(binary, vision_model, prompt, callback)
    if described.strip():
        return described
    return fallback_text
112
+
113
+
114
def batch_vision_llm_chunk(
    images: list,
    vision_model: Any,
    prompts: Optional[list] = None,
    callback: Optional[Callable] = None,
    max_workers: int = 3
) -> list:
    """
    Describe several images concurrently on a thread pool.

    Args:
        images: List of image binary data
        vision_model: Vision model instance
        prompts: Optional list of prompts (same length as images)
        callback: Optional callback function, passed to every call
        max_workers: Maximum number of parallel workers

    Returns:
        list: One result per image, in the order of images; "" for an image
        whose processing raised

    Raises:
        ValueError: If prompts is non-empty and its length differs from images
    """
    from concurrent.futures import ThreadPoolExecutor

    if not images:
        return []

    if prompts and len(prompts) != len(images):
        raise ValueError("prompts list must have same length as images list")

    per_image_prompts = prompts or [None] * len(images)

    def describe(job: tuple) -> str:
        position, (image, prompt) = job
        try:
            return vision_llm_chunk(image, vision_model, prompt, callback)
        except Exception as e:
            logger.error(f"Failed to process image {position}: {e}")
            return ""

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map() yields results in submission order, so no index bookkeeping is needed.
        return list(pool.map(describe, enumerate(zip(images, per_image_prompts))))