xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,100 @@
+ # xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py
+ # OCR class using OpenAI Vision model
+ import logging
+ from typing import Any, Optional
+
+ from xgen_doc2chunk.ocr.base import BaseOCR
+
+ logger = logging.getLogger("ocr-openai")
+
+ # Default model
+ DEFAULT_OPENAI_MODEL = "gpt-4o"
+
+
+ class OpenAIOCR(BaseOCR):
+     """
+     OCR processing class using OpenAI Vision model.
+
+     Supported models: gpt-4-vision-preview, gpt-4o, gpt-4o-mini, etc.
+
+     Example:
+     ```python
+     from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR
+
+     # Method 1: Initialize with api_key and model
+     ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")
+
+     # Method 2: Use existing LLM client
+     from langchain_openai import ChatOpenAI
+     llm = ChatOpenAI(model="gpt-4o", temperature=0, api_key="sk-...")
+     ocr = OpenAIOCR(llm_client=llm)
+
+     # Single image conversion
+     result = ocr.convert_image_to_text("/path/to/image.png")
+
+     # Process image tags in text
+     text = "Document content [Image:/path/to/image.png] continues..."
+     processed = ocr.process_text(text)
+     ```
+     """
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         model: str = DEFAULT_OPENAI_MODEL,
+         llm_client: Optional[Any] = None,
+         prompt: Optional[str] = None,
+         temperature: float = 0.0,
+         max_tokens: Optional[int] = None,
+         base_url: Optional[str] = None,
+     ):
+         """
+         Initialize OpenAI OCR.
+
+         Args:
+             api_key: OpenAI API key (required if llm_client is not provided)
+             model: Model name to use (default: gpt-4o)
+             llm_client: Existing LangChain OpenAI client (if provided, api_key and model are ignored)
+             prompt: Custom prompt (if None, default prompt is used)
+             temperature: Generation temperature (default: 0.0)
+             max_tokens: Maximum number of tokens (if None, model default is used)
+             base_url: OpenAI API base URL (for Azure, etc.)
+         """
+         if llm_client is None:
+             if api_key is None:
+                 raise ValueError("Either api_key or llm_client is required.")
+
+             from langchain_openai import ChatOpenAI
+
+             client_kwargs = {
+                 "model": model,
+                 "api_key": api_key,
+                 "temperature": temperature,
+             }
+
+             if max_tokens is not None:
+                 client_kwargs["max_tokens"] = max_tokens
+
+             if base_url is not None:
+                 client_kwargs["base_url"] = base_url
+
+             llm_client = ChatOpenAI(**client_kwargs)
+             logger.info(f"[OpenAI OCR] Client created: model={model}")
+
+         super().__init__(llm_client=llm_client, prompt=prompt)
+         self.model = model
+         logger.info("[OpenAI OCR] Initialization completed")
+
+     @property
+     def provider(self) -> str:
+         return "openai"
+
+     def build_message_content(self, b64_image: str, mime_type: str) -> list:
+         return [
+             {"type": "text", "text": self.prompt},
+             {
+                 "type": "image_url",
+                 "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
+             }
+         ]
+
@@ -0,0 +1,116 @@
+ # xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py
+ # OCR class using vLLM-based Vision model
+ import logging
+ from typing import Any, Optional
+
+ from xgen_doc2chunk.ocr.base import BaseOCR
+
+ logger = logging.getLogger("ocr-vllm")
+
+ # Default model (varies by user environment)
+ DEFAULT_VLLM_MODEL = "Qwen/Qwen2-VL-7B-Instruct"
+
+
+ class VllmOCR(BaseOCR):
+     """
+     OCR processing class using a vLLM-based Vision model.
+
+     Uses the OpenAI-compatible API provided by a vLLM server.
+
+     Example:
+     ```python
+     from xgen_doc2chunk.ocr.ocr_engine import VllmOCR
+
+     # Method 1: Initialize with base_url and model
+     ocr = VllmOCR(
+         base_url="http://localhost:8000/v1",
+         model="Qwen/Qwen2-VL-7B-Instruct"
+     )
+
+     # Method 2: When an api_key is required
+     ocr = VllmOCR(
+         base_url="http://your-vllm-server:8000/v1",
+         api_key="your-api-key",
+         model="Qwen/Qwen2-VL-7B-Instruct"
+     )
+
+     # Method 3: Use existing LLM client
+     from langchain_openai import ChatOpenAI
+     llm = ChatOpenAI(
+         model="Qwen/Qwen2-VL-7B-Instruct",
+         base_url="http://localhost:8000/v1",
+         api_key="EMPTY"
+     )
+     ocr = VllmOCR(llm_client=llm)
+
+     # Single image conversion
+     result = ocr.convert_image_to_text("/path/to/image.png")
+     ```
+     """
+
+     # vLLM uses a simple prompt
+     DEFAULT_PROMPT = "Describe the contents of this image."
+
+     def __init__(
+         self,
+         base_url: Optional[str] = None,
+         api_key: Optional[str] = "EMPTY",
+         model: str = DEFAULT_VLLM_MODEL,
+         llm_client: Optional[Any] = None,
+         prompt: Optional[str] = None,
+         temperature: float = 0.0,
+         max_tokens: Optional[int] = None,
+     ):
+         """
+         Initialize vLLM OCR.
+
+         Args:
+             base_url: vLLM server URL (e.g., "http://localhost:8000/v1")
+             api_key: API key (default: "EMPTY", vLLM default setting)
+             model: Model name to use (default: Qwen/Qwen2-VL-7B-Instruct)
+             llm_client: Existing LangChain client (if provided, base_url, api_key, and model are ignored)
+             prompt: Custom prompt (if None, DEFAULT_PROMPT is used)
+             temperature: Generation temperature (default: 0.0)
+             max_tokens: Maximum number of tokens (if None, model default is used)
+         """
+         # vLLM uses the simple prompt by default
+         if prompt is None:
+             prompt = self.DEFAULT_PROMPT
+
+         if llm_client is None:
+             if base_url is None:
+                 raise ValueError("Either base_url or llm_client is required.")
+
+             from langchain_openai import ChatOpenAI
+
+             client_kwargs = {
+                 "model": model,
+                 "base_url": base_url,
+                 "api_key": api_key,
+                 "temperature": temperature,
+             }
+
+             if max_tokens is not None:
+                 client_kwargs["max_tokens"] = max_tokens
+
+             llm_client = ChatOpenAI(**client_kwargs)
+             logger.info(f"[vLLM OCR] Client created: base_url={base_url}, model={model}")
+
+         super().__init__(llm_client=llm_client, prompt=prompt)
+         self.model = model
+         self.base_url = base_url
+         logger.info("[vLLM OCR] Initialization completed")
+
+     @property
+     def provider(self) -> str:
+         return "vllm"
+
+     def build_message_content(self, b64_image: str, mime_type: str) -> list:
+         return [
+             {"type": "text", "text": self.prompt},
+             {
+                 "type": "image_url",
+                 "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
+             }
+         ]
+
@@ -0,0 +1,387 @@
+ # xgen_doc2chunk/ocr/ocr_processor.py
+ # Module for loading image files and processing OCR.
+ import re
+ import base64
+ import logging
+ import os
+ from typing import Any, Callable, Dict, List, Optional, Pattern
+
+ logger = logging.getLogger("ocr-processor")
+
+ # Default image tag pattern: [Image:{path}] or [image:{path}] (case-insensitive)
+ DEFAULT_IMAGE_TAG_PATTERN = re.compile(r'\[[Ii]mage:([^\]]+)\]')
+
+ # Keep for backward compatibility
+ IMAGE_TAG_PATTERN = DEFAULT_IMAGE_TAG_PATTERN
+
+
+ def _b64_from_file(path: str) -> str:
+     """Encode file to Base64"""
+     with open(path, "rb") as f:
+         return base64.b64encode(f.read()).decode("utf-8")
+
+
+ def _get_mime_type(file_path: str) -> str:
+     """Return MIME type based on file extension"""
+     ext = os.path.splitext(file_path)[1].lower()
+     mime_map = {
+         ".jpg": "image/jpeg",
+         ".jpeg": "image/jpeg",
+         ".png": "image/png",
+         ".gif": "image/gif",
+         ".bmp": "image/bmp",
+         ".webp": "image/webp",
+         ".tiff": "image/tiff",
+         ".svg": "image/svg+xml",
+     }
+     return mime_map.get(ext, "image/jpeg")
+
+
+ def extract_image_tags(
+     text: str,
+     pattern: Optional[Pattern[str]] = None
+ ) -> List[str]:
+     """
+     Extract image tags from text.
+
+     Args:
+         text: Text containing image tags
+         pattern: Custom regex pattern with capture group for path.
+             If None, uses the default [Image:{path}] pattern.
+
+     Returns:
+         List of extracted image_path values
+     """
+     if pattern is None:
+         pattern = DEFAULT_IMAGE_TAG_PATTERN
+     matches = pattern.findall(text)
+     return matches
+
+
+ def load_image_from_path(image_path: str) -> Optional[str]:
+     """
+     Validate and return a local image file path.
+
+     Args:
+         image_path: Image file path
+
+     Returns:
+         Valid local file path or None
+     """
+     try:
+         # Convert to absolute path
+         if not os.path.isabs(image_path):
+             image_path = os.path.abspath(image_path)
+
+         # Check file existence
+         if os.path.exists(image_path) and os.path.getsize(image_path) > 0:
+             logger.info(f"[OCR] Image loaded: {image_path}")
+             return image_path
+
+         logger.warning(f"[OCR] Image file not found: {image_path}")
+         return None
+
+     except Exception as e:
+         logger.error(f"[OCR] Image load failed: {image_path}, error: {e}")
+         return None
+
+
+ def convert_image_to_text_with_llm(
+     image_path: str,
+     llm_client: Any,
+     provider: str
+ ) -> Optional[str]:
+     """
+     Convert an image to text using a VL model.
+
+     Args:
+         image_path: Local image file path
+         llm_client: LangChain LLM client
+         provider: LLM provider (openai, anthropic, gemini, vllm, aws_bedrock)
+
+     Returns:
+         Text extracted from the image, or None if the provider is unsupported
+     """
+     try:
+         from langchain_core.messages import HumanMessage
+
+         b64_image = _b64_from_file(image_path)
+         mime_type = _get_mime_type(image_path)
+
+         # vllm uses a simple prompt
+         if provider == "vllm":
+             prompt = "Describe the contents of this image."
+         else:
+             prompt = (
+                 "Extract meaningful information from this image.\n\n"
+                 "**If the image contains a TABLE:**\n"
+                 "- Convert to HTML table format (<table>, <tr>, <td>, <th>)\n"
+                 "- Use 'rowspan' and 'colspan' attributes for merged cells\n"
+                 "- Preserve all cell content exactly as shown\n"
+                 "- Example:\n"
+                 "  <table>\n"
+                 "  <tr><th colspan=\"2\">Header</th></tr>\n"
+                 "  <tr><td rowspan=\"2\">Merged</td><td>A</td></tr>\n"
+                 "  <tr><td>B</td></tr>\n"
+                 "  </table>\n\n"
+                 "**If the image contains TEXT (non-table):**\n"
+                 "- Extract all text exactly as shown\n"
+                 "- Keep layout, hierarchy, and structure\n\n"
+                 "**If the image contains DATA (charts, graphs, diagrams):**\n"
+                 "- Extract the data and its meaning\n"
+                 "- Describe trends, relationships, or key insights\n\n"
+                 "**If the image is decorative or has no semantic meaning:**\n"
+                 "- Simply state what it is in one short sentence\n"
+                 "- Example: 'A decorative geometric shape' or 'Company logo'\n"
+                 "- Do NOT over-analyze decorative elements\n\n"
+                 "**Rules:**\n"
+                 "- Output in Korean (except HTML tags)\n"
+                 "- Tables MUST use HTML format with proper rowspan/colspan\n"
+                 "- Be concise - only include what is semantically meaningful\n"
+                 "- No filler words or unnecessary descriptions"
+             )
+
+         # Build message by provider
+         if provider in ("openai", "vllm"):
+             content = [
+                 {"type": "text", "text": prompt},
+                 {
+                     "type": "image_url",
+                     "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
+                 }
+             ]
+             message = HumanMessage(content=content)
+
+         elif provider == "anthropic":
+             content = [
+                 {
+                     "type": "image",
+                     "source": {
+                         "type": "base64",
+                         "media_type": mime_type,
+                         "data": b64_image
+                     }
+                 },
+                 {"type": "text", "text": prompt}
+             ]
+             message = HumanMessage(content=content)
+
+         elif provider == "gemini":
+             content = [
+                 {"type": "text", "text": prompt},
+                 {
+                     "type": "image_url",
+                     "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
+                 }
+             ]
+             message = HumanMessage(content=content)
+
+         elif provider == "aws_bedrock":
+             # AWS Bedrock (Claude via Bedrock) - uses the same format as Anthropic
+             content = [
+                 {
+                     "type": "image",
+                     "source": {
+                         "type": "base64",
+                         "media_type": mime_type,
+                         "data": b64_image
+                     }
+                 },
+                 {"type": "text", "text": prompt}
+             ]
+             message = HumanMessage(content=content)
+
+         else:
+             return None  # Unsupported provider
+
+         response = llm_client.invoke([message])
+         result = response.content.strip()
+
+         # Wrap result in [Figure:...] format
+         result = f"[Figure:{result}]"
+
+         logger.info(f"[OCR] Image to text conversion completed: {os.path.basename(image_path)}")
+         return result
+
+     except Exception as e:
+         logger.error(f"[OCR] Image to text conversion failed: {image_path}, error: {e}")
+         return f"[Image conversion error: {str(e)}]"
+
+
+ def process_text_with_ocr(
+     text: str,
+     llm_client: Any,
+     provider: str
+ ) -> str:
+     """
+     Detect image tags in text and replace them with OCR results.
+
+     Args:
+         text: Text containing [Image:{path}] tags
+         llm_client: LangChain LLM client
+         provider: LLM provider
+
+     Returns:
+         Text with image tags replaced by OCR results
+     """
+     if not llm_client:
+         logger.warning("[OCR] Skipping OCR processing: no LLM client")
+         return text
+
+     # Extract image tags
+     image_paths = extract_image_tags(text)
+
+     if not image_paths:
+         logger.debug("[OCR] No image tags found in text")
+         return text
+
+     logger.info(f"[OCR] Detected {len(image_paths)} image tags")
+
+     result_text = text
+
+     for img_path in image_paths:
+         # Case-insensitive tag matching
+         tag_pattern = re.compile(r'\[[Ii]mage:' + re.escape(img_path) + r'\]')
+
+         # Load image from local path
+         local_path = load_image_from_path(img_path)
+
+         if local_path is None:
+             # Keep original tag on load failure
+             logger.warning(f"[OCR] Image load failed, keeping original tag: {img_path}")
+             continue
+
+         # Convert image to text using VL model
+         ocr_result = convert_image_to_text_with_llm(
+             image_path=local_path,
+             llm_client=llm_client,
+             provider=provider
+         )
+
+         # Keep original tag on OCR failure (None or error message)
+         if ocr_result is None or ocr_result.startswith("[Image conversion error:"):
+             logger.warning(f"[OCR] Image conversion failed, keeping original tag: {img_path}")
+             continue
+
+         # Replace tag with OCR result
+         result_text = tag_pattern.sub(ocr_result, result_text)
+         logger.info(f"[OCR] Tag replacement completed: {img_path[:50]}...")
+
+     return result_text
+
+
+ def process_text_with_ocr_progress(
+     text: str,
+     llm_client: Any,
+     provider: str,
+     progress_callback: Optional[Callable[[Dict[str, Any]], Any]] = None
+ ) -> str:
+     """
+     Detect image tags in text and replace them with OCR results (with progress callback support).
+
+     Args:
+         text: Text containing [Image:{path}] tags
+         llm_client: LangChain LLM client
+         provider: LLM provider
+         progress_callback: Progress callback function
+
+     Returns:
+         Text with image tags replaced by OCR results
+     """
+     if not llm_client:
+         logger.warning("[OCR] Skipping OCR processing: no LLM client")
+         return text
+
+     # Extract image tags
+     image_paths = extract_image_tags(text)
+
+     if not image_paths:
+         logger.debug("[OCR] No image tags found in text")
+         return text
+
+     total_chunks = len(image_paths)
+     logger.info(f"[OCR] Detected {total_chunks} image tags")
+
+     result_text = text
+     success_count = 0
+     failed_count = 0
+
+     for idx, img_path in enumerate(image_paths):
+         # Progress callback - processing started
+         if progress_callback:
+             progress_callback({
+                 'event': 'ocr_tag_processing',
+                 'chunk_index': idx,
+                 'total_chunks': total_chunks,
+                 'image_path': img_path
+             })
+
+         # Case-insensitive tag matching
+         tag_pattern = re.compile(r'\[[Ii]mage:' + re.escape(img_path) + r'\]')
+
+         # Load image from local path
+         local_path = load_image_from_path(img_path)
+
+         if local_path is None:
+             # Keep original tag on load failure
+             logger.warning(f"[OCR] Image load failed, keeping original tag: {img_path}")
+             failed_count += 1
+             if progress_callback:
+                 progress_callback({
+                     'event': 'ocr_chunk_processed',
+                     'chunk_index': idx,
+                     'total_chunks': total_chunks,
+                     'status': 'failed',
+                     'error': f'Load failed: {img_path}'
+                 })
+             continue
+
+         try:
+             # Convert image to text using VL model
+             ocr_result = convert_image_to_text_with_llm(
+                 image_path=local_path,
+                 llm_client=llm_client,
+                 provider=provider
+             )
+
+             # Keep original tag on OCR failure (None or error message)
+             if ocr_result is None or ocr_result.startswith("[Image conversion error:"):
+                 logger.warning(f"[OCR] Image conversion failed, keeping original tag: {img_path}")
+                 failed_count += 1
+                 if progress_callback:
+                     progress_callback({
+                         'event': 'ocr_chunk_processed',
+                         'chunk_index': idx,
+                         'total_chunks': total_chunks,
+                         'status': 'failed',
+                         'error': ocr_result or 'OCR returned None'
+                     })
+                 continue
+
+             # Replace tag with OCR result
+             result_text = tag_pattern.sub(ocr_result, result_text)
+             success_count += 1
+             logger.info(f"[OCR] Tag replacement completed: {img_path[:50]}...")
+
+             if progress_callback:
+                 progress_callback({
+                     'event': 'ocr_chunk_processed',
+                     'chunk_index': idx,
+                     'total_chunks': total_chunks,
+                     'status': 'success'
+                 })
+
+         except Exception as e:
+             logger.error(f"[OCR] Image processing error: {img_path}, error: {e}")
+             failed_count += 1
+             if progress_callback:
+                 progress_callback({
+                     'event': 'ocr_chunk_processed',
+                     'chunk_index': idx,
+                     'total_chunks': total_chunks,
+                     'status': 'failed',
+                     'error': str(e)
+                 })
+
+     return result_text
+
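For orientation, a minimal sketch of how the `ocr_processor` helpers added above can be driven directly with a LangChain client; the model name, API key, input text, and image path are illustrative assumptions, not values from the package:

```python
from langchain_openai import ChatOpenAI
from xgen_doc2chunk.ocr.ocr_processor import process_text_with_ocr_progress

# Hypothetical client and input; any [Image:{path}] tag matching the default
# pattern above is replaced in place by a [Figure:...] description.
llm = ChatOpenAI(model="gpt-4o", temperature=0, api_key="sk-...")
text = "Quarterly summary [Image:/tmp/report_chart.png] continues..."

result = process_text_with_ocr_progress(
    text,
    llm_client=llm,
    provider="openai",
    # Each callback receives a dict with 'event', 'chunk_index', 'total_chunks',
    # and (for 'ocr_chunk_processed') a 'status' of 'success' or 'failed'.
    progress_callback=lambda event: print(event["event"], event.get("status")),
)
print(result)
```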
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: xgen-doc2chunk
- Version: 0.1.0
+ Version: 0.1.2
  Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
  Project-URL: Homepage, https://github.com/master0419/doc2chunk
  Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme