xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,67 @@
+ # xgen_doc2chunk/ocr/__init__.py
+ # OCR module package initialization
+ """
+ OCR Processing Module
+
+ This module provides OCR functionality to extract text from images
+ using various LLM Vision models.
+
+ Usage Examples:
+ ```python
+ from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR, AnthropicOCR, GeminiOCR, VllmOCR
+
+ # OCR processing with OpenAI Vision model
+ ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")
+ result = ocr.convert_image_to_text("/path/to/image.png")
+
+ # OCR processing with Anthropic Claude Vision model
+ ocr = AnthropicOCR(api_key="sk-ant-...", model="claude-sonnet-4-20250514")
+ result = ocr.convert_image_to_text("/path/to/image.png")
+
+ # OCR processing with Google Gemini Vision model
+ ocr = GeminiOCR(api_key="...", model="gemini-2.0-flash")
+ result = ocr.convert_image_to_text("/path/to/image.png")
+
+ # OCR processing with vLLM-based Vision model
+ ocr = VllmOCR(base_url="http://localhost:8000/v1", model="Qwen/Qwen2-VL-7B-Instruct")
+ result = ocr.convert_image_to_text("/path/to/image.png")
+ ```
+
+ Classes:
+     - BaseOCR: Abstract base class for OCR processing
+     - OpenAIOCR: OpenAI Vision model based OCR (ocr_engine module)
+     - AnthropicOCR: Anthropic Claude Vision model based OCR (ocr_engine module)
+     - GeminiOCR: Google Gemini Vision model based OCR (ocr_engine module)
+     - VllmOCR: vLLM-based Vision model OCR (ocr_engine module)
+ """
+
+ from xgen_doc2chunk.ocr.base import BaseOCR
+ from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR, AnthropicOCR, GeminiOCR, VllmOCR
+ from xgen_doc2chunk.ocr.ocr_processor import (
+     IMAGE_TAG_PATTERN,
+     extract_image_tags,
+     load_image_from_path,
+     convert_image_to_text_with_llm,
+     process_text_with_ocr,
+     process_text_with_ocr_progress,
+     _b64_from_file,
+     _get_mime_type,
+ )
+
+ __all__ = [
+     # Base Class
+     "BaseOCR",
+     # OCR Engines
+     "OpenAIOCR",
+     "AnthropicOCR",
+     "GeminiOCR",
+     "VllmOCR",
+     # Functions
+     "IMAGE_TAG_PATTERN",
+     "extract_image_tags",
+     "load_image_from_path",
+     "convert_image_to_text_with_llm",
+     "process_text_with_ocr",
+     "process_text_with_ocr_progress",
+ ]
+
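The package exposes both a function-style API (`process_text_with_ocr` and friends) and the engine classes. A minimal end-to-end sketch of the class-based API, assuming the default `[Image:{path}]` tag format documented in `base.py` below; the API key, model name, and image path are placeholders:

```python
# Sketch of the class-based API exported above; key, model, and path
# are placeholders, not values from this package.
from xgen_doc2chunk.ocr import OpenAIOCR

ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")

# process_text() (inherited from BaseOCR, defined below) finds image
# tags -- by default the [Image:{path}] format -- and replaces each
# tag with the OCR result for that image.
document = "Quarterly report\n\n[Image:/tmp/revenue_table.png]\n"
print(ocr.process_text(document))
```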
@@ -0,0 +1,209 @@
+ # xgen_doc2chunk/ocr/base.py
+ # Abstract base class for OCR models
+ import logging
+ import re
+ from abc import ABC, abstractmethod
+ from typing import Any, Optional, Pattern
+
+ logger = logging.getLogger("ocr-base")
+
+
+ class BaseOCR(ABC):
+     """
+     Abstract base class for OCR processing.
+
+     All OCR model implementations must inherit from this class.
+     """
+
+     # Default prompt (can be overridden in subclasses)
+     DEFAULT_PROMPT = (
+         "Extract meaningful information from this image.\n\n"
+         "**If the image contains a TABLE:**\n"
+         "- Convert to HTML table format (<table>, <tr>, <td>, <th>)\n"
+         "- Use 'rowspan' and 'colspan' attributes for merged cells\n"
+         "- Preserve all cell content exactly as shown\n"
+         "- Example:\n"
+         " <table>\n"
+         " <tr><th colspan=\"2\">Header</th></tr>\n"
+         " <tr><td rowspan=\"2\">Merged</td><td>A</td></tr>\n"
+         " <tr><td>B</td></tr>\n"
+         " </table>\n\n"
+         "**If the image contains TEXT (non-table):**\n"
+         "- Extract all text exactly as shown\n"
+         "- Keep layout, hierarchy, and structure\n\n"
+         "**If the image contains DATA (charts, graphs, diagrams):**\n"
+         "- Extract the data and its meaning\n"
+         "- Describe trends, relationships, or key insights\n\n"
+         "**If the image is decorative or has no semantic meaning:**\n"
+         "- Simply state what it is in one short sentence\n"
+         "- Example: 'A decorative geometric shape' or 'Company logo'\n"
+         "- Do NOT over-analyze decorative elements\n\n"
+         "**Rules:**\n"
+         "- Output in Korean (except HTML tags)\n"
+         "- Tables MUST use HTML format with proper rowspan/colspan\n"
+         "- Be concise - only include what is semantically meaningful\n"
+         "- No filler words or unnecessary descriptions"
+     )
+
+     # Simple prompt (used for vllm, etc.)
+     SIMPLE_PROMPT = "Describe the contents of this image."
+
+     def __init__(self, llm_client: Any, prompt: Optional[str] = None):
+         """
+         Initialize OCR model.
+
+         Args:
+             llm_client: LangChain LLM client (must support Vision models)
+             prompt: Custom prompt (uses default prompt if None)
+         """
+         self.llm_client = llm_client
+         self.prompt = prompt if prompt is not None else self.DEFAULT_PROMPT
+         self._image_pattern: Optional[Pattern[str]] = None
+
+     @property
+     @abstractmethod
+     def provider(self) -> str:
+         """Return OCR provider name (e.g., 'openai', 'anthropic')"""
+         pass
+
+     @abstractmethod
+     def build_message_content(self, b64_image: str, mime_type: str) -> list:
+         """
+         Build message content for LLM.
+
+         Args:
+             b64_image: Base64 encoded image
+             mime_type: Image MIME type
+
+         Returns:
+             Content list for LangChain HumanMessage
+         """
+         pass
+
+     def convert_image_to_text(self, image_path: str) -> Optional[str]:
+         """
+         Convert image to text.
+
+         Args:
+             image_path: Local image file path
+
+         Returns:
+             Extracted text from image or None (on failure)
+         """
+         from xgen_doc2chunk.ocr.ocr_processor import (
+             _b64_from_file,
+             _get_mime_type,
+         )
+         from langchain_core.messages import HumanMessage
+
+         try:
+             b64_image = _b64_from_file(image_path)
+             mime_type = _get_mime_type(image_path)
+
+             content = self.build_message_content(b64_image, mime_type)
+             message = HumanMessage(content=content)
+
+             response = self.llm_client.invoke([message])
+             result = response.content.strip()
+
+             # Wrap result in [Figure:...] format
+             result = f"[Figure:{result}]"
+
+             logger.info(f"[{self.provider.upper()}] Image to text conversion completed")
+             return result
+
+         except Exception as e:
+             logger.error(f"[{self.provider.upper()}] Image to text conversion failed: {e}")
+             return f"[Image conversion error: {str(e)}]"
+
+     def set_image_pattern(self, pattern: Optional[Pattern[str]] = None) -> None:
+         """
+         Set custom image pattern for tag detection.
+
+         Args:
+             pattern: Compiled regex pattern with capture group for image path.
+                 If None, uses default [Image:{path}] pattern.
+
+         Examples:
+             >>> import re
+             >>> ocr.set_image_pattern(re.compile(r"<img src='([^']+)'/>"))
+         """
+         self._image_pattern = pattern
+
+     def set_image_pattern_from_string(self, pattern_string: str) -> None:
+         """
+         Set custom image pattern from pattern string.
+
+         Args:
+             pattern_string: Regex pattern string with capture group for image path.
+
+         Examples:
+             >>> ocr.set_image_pattern_from_string(r"<img src='([^']+)'/>")
+         """
+         self._image_pattern = re.compile(pattern_string)
+
+     def process_text(self, text: str, image_pattern: Optional[Pattern[str]] = None) -> str:
+         """
+         Detect image tags in text and replace with OCR results.
+
+         Args:
+             text: Text containing image tags
+             image_pattern: Custom regex pattern for image tags.
+                 If None, uses instance pattern or default [Image:{path}] pattern.
+
+         Returns:
+             Text with image tags replaced by OCR results
+         """
+         from xgen_doc2chunk.ocr.ocr_processor import (
+             extract_image_tags,
+             load_image_from_path,
+             DEFAULT_IMAGE_TAG_PATTERN,
+         )
+
+         if not self.llm_client:
+             logger.warning(f"[{self.provider.upper()}] Skipping OCR processing: no LLM client")
+             return text
+
+         # Determine which pattern to use: parameter > instance > default
+         pattern = image_pattern or self._image_pattern or DEFAULT_IMAGE_TAG_PATTERN
+
+         image_paths = extract_image_tags(text, pattern)
+
+         if not image_paths:
+             logger.debug(f"[{self.provider.upper()}] No image tags found in text")
+             return text
+
+         logger.info(f"[{self.provider.upper()}] Detected {len(image_paths)} image tags")
+
+         result_text = text
+
+         for img_path in image_paths:
+             # Build replacement pattern using the same pattern structure
+             # Escape the path and create a pattern that matches the full tag
+             escaped_path = re.escape(img_path)
+             # Get the pattern string and replace capture group with escaped path
+             pattern_str = pattern.pattern
+             # Replace the capture group (.*), ([^...]+), etc. with the escaped path
+             tag_pattern_str = re.sub(r'\([^)]+\)', escaped_path, pattern_str, count=1)
+             tag_pattern = re.compile(tag_pattern_str)
+
+             local_path = load_image_from_path(img_path)
+
+             if local_path is None:
+                 logger.warning(f"[{self.provider.upper()}] Image load failed, keeping original tag: {img_path}")
+                 continue
+
+             ocr_result = self.convert_image_to_text(local_path)
+
+             if ocr_result is None or ocr_result.startswith("[Image conversion error:"):
+                 logger.warning(f"[{self.provider.upper()}] Image conversion failed, keeping original tag: {img_path}")
+                 continue
+
+             result_text = tag_pattern.sub(ocr_result, result_text)
+             logger.info(f"[{self.provider.upper()}] Tag replacement completed: {img_path[:50]}...")
+
+         return result_text
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}(provider='{self.provider}')"
+
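For a concrete sense of the contract `BaseOCR` imposes, here is a hedged sketch of a custom engine: only the `provider` property and `build_message_content()` are abstract, and everything else (prompting, tag scanning, replacement) is inherited. The content layout below simply mirrors the OpenAI-style `image_url` format used by the Gemini engine later in this diff; the class and provider name are hypothetical.

```python
# Sketch of the two members a concrete engine must supply; llm_client
# is any LangChain chat model with vision support.
from xgen_doc2chunk.ocr.base import BaseOCR

class MyVisionOCR(BaseOCR):
    @property
    def provider(self) -> str:
        return "my_vision"  # hypothetical provider name

    def build_message_content(self, b64_image: str, mime_type: str) -> list:
        # OpenAI-style layout, as used by GeminiOCR below; other
        # providers need their own content structure.
        return [
            {"type": "text", "text": self.prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{b64_image}"},
            },
        ]

# ocr = MyVisionOCR(llm_client=some_vision_chat_model)
```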
@@ -0,0 +1,22 @@
+ # xgen_doc2chunk/ocr/ocr_engine/__init__.py
+ # OCR engine module initialization
+ """
+ OCR Engine Module
+
+ Provides OCR engine classes for each LLM provider.
+ """
+
+ from xgen_doc2chunk.ocr.ocr_engine.openai_ocr import OpenAIOCR
+ from xgen_doc2chunk.ocr.ocr_engine.anthropic_ocr import AnthropicOCR
+ from xgen_doc2chunk.ocr.ocr_engine.gemini_ocr import GeminiOCR
+ from xgen_doc2chunk.ocr.ocr_engine.vllm_ocr import VllmOCR
+ from xgen_doc2chunk.ocr.ocr_engine.bedrock_ocr import BedrockOCR
+
+ __all__ = [
+     "OpenAIOCR",
+     "AnthropicOCR",
+     "GeminiOCR",
+     "VllmOCR",
+     "BedrockOCR",
+ ]
+
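One detail worth noting: `BedrockOCR` is exported here but not re-exported from the top-level `xgen_doc2chunk.ocr` package (its `__init__.py` above imports only the other four engines), so in 0.1.1 it has to be imported from the submodule:

```python
# BedrockOCR is only reachable via the ocr_engine submodule in 0.1.1;
# the top-level xgen_doc2chunk.ocr package does not re-export it.
from xgen_doc2chunk.ocr.ocr_engine import BedrockOCR
```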
@@ -0,0 +1,91 @@
+ # xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py
+ # OCR class using Anthropic Claude Vision model
+ import logging
+ from typing import Any, Optional
+
+ from xgen_doc2chunk.ocr.base import BaseOCR
+
+ logger = logging.getLogger("ocr-anthropic")
+
+ # Default model
+ DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-20250514"
+
+
+ class AnthropicOCR(BaseOCR):
+     """
+     OCR processing class using Anthropic Claude Vision model.
+
+     Supported models: claude-3-opus, claude-3-sonnet, claude-3-haiku, claude-sonnet-4, etc.
+
+     Example:
+         ```python
+         from xgen_doc2chunk.ocr.ocr_engine import AnthropicOCR
+
+         # Method 1: Initialize with api_key and model
+         ocr = AnthropicOCR(api_key="sk-ant-...", model="claude-sonnet-4-20250514")
+
+         # Method 2: Use existing LLM client
+         from langchain_anthropic import ChatAnthropic
+         llm = ChatAnthropic(model="claude-sonnet-4-20250514", temperature=0, api_key="sk-ant-...")
+         ocr = AnthropicOCR(llm_client=llm)
+
+         # Single image conversion
+         result = ocr.convert_image_to_text("/path/to/image.png")
+         ```
+     """
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         model: str = DEFAULT_ANTHROPIC_MODEL,
+         llm_client: Optional[Any] = None,
+         prompt: Optional[str] = None,
+         temperature: float = 0.0,
+         max_tokens: int = 4096,
+     ):
+         """
+         Initialize Anthropic OCR.
+
+         Args:
+             api_key: Anthropic API key (required if llm_client is not provided)
+             model: Model name to use (default: claude-sonnet-4-20250514)
+             llm_client: Existing LangChain Anthropic client (if provided, api_key and model are ignored)
+             prompt: Custom prompt (if None, default prompt is used)
+             temperature: Generation temperature (default: 0.0)
+             max_tokens: Maximum number of tokens (default: 4096)
+         """
+         if llm_client is None:
+             if api_key is None:
+                 raise ValueError("Either api_key or llm_client is required.")
+
+             from langchain_anthropic import ChatAnthropic
+
+             llm_client = ChatAnthropic(
+                 model=model,
+                 api_key=api_key,
+                 temperature=temperature,
+                 max_tokens=max_tokens,
+             )
+             logger.info(f"[Anthropic OCR] Client created: model={model}")
+
+         super().__init__(llm_client=llm_client, prompt=prompt)
+         self.model = model
+         logger.info("[Anthropic OCR] Initialization completed")
+
+     @property
+     def provider(self) -> str:
+         return "anthropic"
+
+     def build_message_content(self, b64_image: str, mime_type: str) -> list:
+         return [
+             {
+                 "type": "image",
+                 "source": {
+                     "type": "base64",
+                     "media_type": mime_type,
+                     "data": b64_image
+                 }
+             },
+             {"type": "text", "text": self.prompt}
+         ]
+
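A small usage sketch beyond the docstring: the `prompt` parameter replaces `BaseOCR.DEFAULT_PROMPT`, which matters when the default's HTML-table and Korean-output rules are not wanted. The key, prompt text, and image path below are placeholders.

```python
# Sketch: overriding the default prompt (which requests HTML tables
# and Korean output) with a plain transcription prompt.
from xgen_doc2chunk.ocr.ocr_engine import AnthropicOCR

ocr = AnthropicOCR(
    api_key="sk-ant-...",
    prompt="Transcribe all visible text verbatim. Output plain text only.",
)
result = ocr.convert_image_to_text("/path/to/scan.png")  # "[Figure:...]" on success
```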
@@ -0,0 +1,172 @@
+ # xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py
+ # OCR class using AWS Bedrock Vision model
+ import logging
+ import os
+ from typing import Any, Optional
+
+ from xgen_doc2chunk.ocr.base import BaseOCR
+
+ logger = logging.getLogger("ocr-bedrock")
+
+ # Default model
+ DEFAULT_BEDROCK_MODEL = "anthropic.claude-3-5-sonnet-20241022-v2:0"
+
+
+ class BedrockOCR(BaseOCR):
+     """
+     OCR processing class using AWS Bedrock Vision model.
+
+     Supports Claude and other vision-capable models available on AWS Bedrock.
+
+     Example:
+         ```python
+         from xgen_doc2chunk.ocr.ocr_engine import BedrockOCR
+
+         # Method 1: Initialize with AWS credentials
+         ocr = BedrockOCR(
+             aws_access_key_id="AKIA...",
+             aws_secret_access_key="...",
+             aws_region="us-east-1",
+             model="anthropic.claude-3-5-sonnet-20241022-v2:0"
+         )
+
+         # Method 2: Use existing AWS credentials from environment
+         # (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION)
+         ocr = BedrockOCR(model="anthropic.claude-3-5-sonnet-20241022-v2:0")
+
+         # Method 3: Use with session token (temporary credentials)
+         ocr = BedrockOCR(
+             aws_access_key_id="ASIA...",
+             aws_secret_access_key="...",
+             aws_session_token="...",
+             aws_region="ap-northeast-2"
+         )
+
+         # Method 4: Use existing LLM client
+         from langchain_aws import ChatBedrockConverse
+         llm = ChatBedrockConverse(model="anthropic.claude-3-5-sonnet-20241022-v2:0")
+         ocr = BedrockOCR(llm_client=llm)
+
+         # Single image conversion
+         result = ocr.convert_image_to_text("/path/to/image.png")
+         ```
+     """
+
+     def __init__(
+         self,
+         aws_access_key_id: Optional[str] = None,
+         aws_secret_access_key: Optional[str] = None,
+         aws_session_token: Optional[str] = None,
+         aws_region: Optional[str] = None,
+         endpoint_url: Optional[str] = None,
+         model: str = DEFAULT_BEDROCK_MODEL,
+         llm_client: Optional[Any] = None,
+         prompt: Optional[str] = None,
+         temperature: float = 0.0,
+         max_tokens: int = 4096,
+         connect_timeout: int = 60,
+         read_timeout: int = 120,
+         max_retries: int = 10,
+     ):
+         """
+         Initialize AWS Bedrock OCR.
+
+         Args:
+             aws_access_key_id: AWS access key ID (if not provided, uses environment variable)
+             aws_secret_access_key: AWS secret access key (if not provided, uses environment variable)
+             aws_session_token: AWS session token for temporary credentials (optional)
+             aws_region: AWS region (default: from environment or "ap-northeast-2")
+             endpoint_url: Custom endpoint URL (for VPC endpoints, etc.)
+             model: Model ID to use (default: anthropic.claude-3-5-sonnet-20241022-v2:0)
+             llm_client: Existing LangChain Bedrock client (if provided, other params are ignored)
+             prompt: Custom prompt (if None, default prompt is used)
+             temperature: Generation temperature (default: 0.0)
+             max_tokens: Maximum number of tokens (default: 4096)
+             connect_timeout: Connection timeout in seconds (default: 60)
+             read_timeout: Read timeout in seconds (default: 120)
+             max_retries: Maximum retry attempts (default: 10)
+         """
+         if llm_client is None:
+             from langchain_aws import ChatBedrockConverse
+             from botocore.config import Config as BotocoreConfig
+
+             # Set environment variables for boto3 auto-discovery
+             if aws_access_key_id:
+                 os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
+             if aws_secret_access_key:
+                 os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
+             if aws_session_token:
+                 os.environ["AWS_SESSION_TOKEN"] = aws_session_token
+
+             # Determine region
+             if not aws_region:
+                 aws_region = os.environ.get(
+                     "AWS_REGION",
+                     os.environ.get("AWS_DEFAULT_REGION", "ap-northeast-2")
+                 )
+
+             logger.info(f"[Bedrock OCR] Using: model={model}, region={aws_region}")
+
+             # Configure botocore with retry settings
+             bedrock_config = BotocoreConfig(
+                 retries={
+                     "max_attempts": max_retries,
+                     "mode": "adaptive",
+                 },
+                 connect_timeout=connect_timeout,
+                 read_timeout=read_timeout,
+             )
+
+             # Build kwargs for ChatBedrockConverse
+             llm_kwargs = {
+                 "model": model,
+                 "temperature": temperature,
+                 "max_tokens": max_tokens,
+                 "disable_streaming": False,
+                 "config": bedrock_config,
+             }
+
+             if aws_region:
+                 llm_kwargs["region_name"] = aws_region
+             if aws_access_key_id:
+                 llm_kwargs["aws_access_key_id"] = aws_access_key_id
+             if aws_secret_access_key:
+                 llm_kwargs["aws_secret_access_key"] = aws_secret_access_key
+             if aws_session_token:
+                 llm_kwargs["aws_session_token"] = aws_session_token
+             if endpoint_url:
+                 llm_kwargs["endpoint_url"] = endpoint_url
+
+             llm_client = ChatBedrockConverse(**llm_kwargs)
+             logger.info(f"[Bedrock OCR] Client created: model={model}, region={aws_region}")
+
+         super().__init__(llm_client=llm_client, prompt=prompt)
+         self.model = model
+         self.aws_region = aws_region
+         logger.info("[Bedrock OCR] Initialization completed")
+
+     @property
+     def provider(self) -> str:
+         return "aws_bedrock"
+
+     def build_message_content(self, b64_image: str, mime_type: str) -> list:
+         """
+         Build message content for AWS Bedrock.
+
+         AWS Bedrock uses the same format as Anthropic Claude models.
+         """
+         return [
+             {
+                 "type": "image",
+                 "source": {
+                     "type": "base64",
+                     "media_type": mime_type,
+                     "data": b64_image
+                 }
+             },
+             {"type": "text", "text": self.prompt}
+         ]
+
+
+ __all__ = ["BedrockOCR", "DEFAULT_BEDROCK_MODEL"]
+
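One behavior worth flagging in the constructor above: explicitly passed credentials are also written into `os.environ` for boto3 auto-discovery, a process-wide side effect. A minimal sketch relying instead on ambient credentials (Method 2 in the docstring); the tag path is a placeholder.

```python
# Sketch: ambient-credentials path (env vars, ~/.aws, or an instance
# role). BedrockOCR() falls back to DEFAULT_BEDROCK_MODEL and, absent
# AWS_REGION/AWS_DEFAULT_REGION, to the "ap-northeast-2" region.
from xgen_doc2chunk.ocr.ocr_engine import BedrockOCR

ocr = BedrockOCR()
result = ocr.process_text("Report\n\n[Image:/tmp/chart.png]\n")
```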
@@ -0,0 +1,91 @@
+ # xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py
+ # OCR class using Google Gemini Vision model
+ import logging
+ from typing import Any, Optional
+
+ from xgen_doc2chunk.ocr.base import BaseOCR
+
+ logger = logging.getLogger("ocr-gemini")
+
+ # Default model
+ DEFAULT_GEMINI_MODEL = "gemini-2.0-flash"
+
+
+ class GeminiOCR(BaseOCR):
+     """
+     OCR processing class using Google Gemini Vision model.
+
+     Supported models: gemini-pro-vision, gemini-1.5-pro, gemini-2.0-flash, etc.
+
+     Example:
+         ```python
+         from xgen_doc2chunk.ocr.ocr_engine import GeminiOCR
+
+         # Method 1: Initialize with api_key and model
+         ocr = GeminiOCR(api_key="...", model="gemini-2.0-flash")
+
+         # Method 2: Use existing LLM client
+         from langchain_google_genai import ChatGoogleGenerativeAI
+         llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key="...")
+         ocr = GeminiOCR(llm_client=llm)
+
+         # Single image conversion
+         result = ocr.convert_image_to_text("/path/to/image.png")
+         ```
+     """
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         model: str = DEFAULT_GEMINI_MODEL,
+         llm_client: Optional[Any] = None,
+         prompt: Optional[str] = None,
+         temperature: float = 0.0,
+         max_tokens: Optional[int] = None,
+     ):
+         """
+         Initialize Gemini OCR.
+
+         Args:
+             api_key: Google API key (required if llm_client is not provided)
+             model: Model name to use (default: gemini-2.0-flash)
+             llm_client: Existing LangChain Gemini client (if provided, api_key and model are ignored)
+             prompt: Custom prompt (if None, default prompt is used)
+             temperature: Generation temperature (default: 0.0)
+             max_tokens: Maximum number of tokens (if None, model default is used)
+         """
+         if llm_client is None:
+             if api_key is None:
+                 raise ValueError("Either api_key or llm_client is required.")
+
+             from langchain_google_genai import ChatGoogleGenerativeAI
+
+             client_kwargs = {
+                 "model": model,
+                 "google_api_key": api_key,
+                 "temperature": temperature,
+             }
+
+             if max_tokens is not None:
+                 client_kwargs["max_output_tokens"] = max_tokens
+
+             llm_client = ChatGoogleGenerativeAI(**client_kwargs)
+             logger.info(f"[Gemini OCR] Client created: model={model}")
+
+         super().__init__(llm_client=llm_client, prompt=prompt)
+         self.model = model
+         logger.info("[Gemini OCR] Initialization completed")
+
+     @property
+     def provider(self) -> str:
+         return "gemini"
+
+     def build_message_content(self, b64_image: str, mime_type: str) -> list:
+         return [
+             {"type": "text", "text": self.prompt},
+             {
+                 "type": "image_url",
+                 "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
+             }
+         ]
+
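One parameter nuance from the constructor above: `max_tokens` is forwarded to `ChatGoogleGenerativeAI` as `max_output_tokens`, and only when it is not `None`. A short sketch with placeholder key and path:

```python
# Sketch: capping output length; max_tokens becomes the client's
# max_output_tokens only when set (see __init__ above).
from xgen_doc2chunk.ocr.ocr_engine import GeminiOCR

ocr = GeminiOCR(api_key="...", model="gemini-2.0-flash", max_tokens=2048)
result = ocr.convert_image_to_text("/path/to/diagram.png")
```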