xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,100 @@
# xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py
# OCR class using OpenAI Vision model
import logging
from typing import Any, Optional

from xgen_doc2chunk.ocr.base import BaseOCR

logger = logging.getLogger("ocr-openai")

# Default model
DEFAULT_OPENAI_MODEL = "gpt-4o"


class OpenAIOCR(BaseOCR):
    """
    OCR processing class using OpenAI Vision model.

    Supported models: gpt-4-vision-preview, gpt-4o, gpt-4o-mini, etc.

    Example:
        ```python
        from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR

        # Method 1: Initialize with api_key and model
        ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")

        # Method 2: Use existing LLM client
        from langchain_openai import ChatOpenAI
        llm = ChatOpenAI(model="gpt-4o", temperature=0, api_key="sk-...")
        ocr = OpenAIOCR(llm_client=llm)

        # Single image conversion
        result = ocr.convert_image_to_text("/path/to/image.png")

        # Process image tags in text
        text = "Document content [Image:/path/to/image.png] continues..."
        processed = ocr.process_text(text)
        ```
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = DEFAULT_OPENAI_MODEL,
        llm_client: Optional[Any] = None,
        prompt: Optional[str] = None,
        temperature: float = 0.0,
        max_tokens: Optional[int] = None,
        base_url: Optional[str] = None,
    ):
        """
        Initialize OpenAI OCR.

        Args:
            api_key: OpenAI API key (required if llm_client is not provided)
            model: Model name to use (default: gpt-4o)
            llm_client: Existing LangChain OpenAI client (if provided, api_key and model are ignored)
            prompt: Custom prompt (if None, default prompt is used)
            temperature: Generation temperature (default: 0.0)
            max_tokens: Maximum number of tokens (if None, model default is used)
            base_url: OpenAI API base URL (for Azure, etc.)
        """
        if llm_client is None:
            if api_key is None:
                raise ValueError("Either api_key or llm_client is required.")

            from langchain_openai import ChatOpenAI

            client_kwargs = {
                "model": model,
                "api_key": api_key,
                "temperature": temperature,
            }

            if max_tokens is not None:
                client_kwargs["max_tokens"] = max_tokens

            if base_url is not None:
                client_kwargs["base_url"] = base_url

            llm_client = ChatOpenAI(**client_kwargs)
            logger.info(f"[OpenAI OCR] Client created: model={model}")

        super().__init__(llm_client=llm_client, prompt=prompt)
        self.model = model
        logger.info("[OpenAI OCR] Initialization completed")

    @property
    def provider(self) -> str:
        return "openai"

    def build_message_content(self, b64_image: str, mime_type: str) -> list:
        return [
            {"type": "text", "text": self.prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
            }
        ]
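The content list returned by build_message_content is the standard LangChain multimodal payload. A rough sketch of the round trip, assuming BaseOCR keeps the client on self.llm_client and drives it the same way ocr_processor.py (further below) does; this is not the base class's exact code:

```python
from langchain_core.messages import HumanMessage
from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR

ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")

# build_message_content() pairs the prompt with a data-URL image part;
# invoking the client with it yields the extracted text.
content = ocr.build_message_content(b64_image="...", mime_type="image/png")
response = ocr.llm_client.invoke([HumanMessage(content=content)])
print(response.content)
```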
@@ -0,0 +1,116 @@
# xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py
# OCR class using vLLM-based Vision model
import logging
from typing import Any, Optional

from xgen_doc2chunk.ocr.base import BaseOCR

logger = logging.getLogger("ocr-vllm")

# Default model (varies by user environment)
DEFAULT_VLLM_MODEL = "Qwen/Qwen2-VL-7B-Instruct"


class VllmOCR(BaseOCR):
    """
    OCR processing class using a vLLM-based Vision model.

    Uses the OpenAI-compatible API provided by a vLLM server.

    Example:
        ```python
        from xgen_doc2chunk.ocr.ocr_engine import VllmOCR

        # Method 1: Initialize with base_url and model
        ocr = VllmOCR(
            base_url="http://localhost:8000/v1",
            model="Qwen/Qwen2-VL-7B-Instruct"
        )

        # Method 2: When an api_key is required
        ocr = VllmOCR(
            base_url="http://your-vllm-server:8000/v1",
            api_key="your-api-key",
            model="Qwen/Qwen2-VL-7B-Instruct"
        )

        # Method 3: Use existing LLM client
        from langchain_openai import ChatOpenAI
        llm = ChatOpenAI(
            model="Qwen/Qwen2-VL-7B-Instruct",
            base_url="http://localhost:8000/v1",
            api_key="EMPTY"
        )
        ocr = VllmOCR(llm_client=llm)

        # Single image conversion
        result = ocr.convert_image_to_text("/path/to/image.png")
        ```
    """

    # vLLM uses a simple prompt
    DEFAULT_PROMPT = "Describe the contents of this image."

    def __init__(
        self,
        base_url: Optional[str] = None,
        api_key: Optional[str] = "EMPTY",
        model: str = DEFAULT_VLLM_MODEL,
        llm_client: Optional[Any] = None,
        prompt: Optional[str] = None,
        temperature: float = 0.0,
        max_tokens: Optional[int] = None,
    ):
        """
        Initialize vLLM OCR.

        Args:
            base_url: vLLM server URL (e.g., "http://localhost:8000/v1")
            api_key: API key (default: "EMPTY", vLLM's default setting)
            model: Model name to use (default: Qwen/Qwen2-VL-7B-Instruct)
            llm_client: Existing LangChain client (if provided, base_url, api_key, and model are ignored)
            prompt: Custom prompt (if None, DEFAULT_PROMPT is used)
            temperature: Generation temperature (default: 0.0)
            max_tokens: Maximum number of tokens (if None, model default is used)
        """
        # vLLM uses a simple prompt by default
        if prompt is None:
            prompt = self.DEFAULT_PROMPT

        if llm_client is None:
            if base_url is None:
                raise ValueError("Either base_url or llm_client is required.")

            from langchain_openai import ChatOpenAI

            client_kwargs = {
                "model": model,
                "base_url": base_url,
                "api_key": api_key,
                "temperature": temperature,
            }

            if max_tokens is not None:
                client_kwargs["max_tokens"] = max_tokens

            llm_client = ChatOpenAI(**client_kwargs)
            logger.info(f"[vLLM OCR] Client created: base_url={base_url}, model={model}")

        super().__init__(llm_client=llm_client, prompt=prompt)
        self.model = model
        self.base_url = base_url
        logger.info("[vLLM OCR] Initialization completed")

    @property
    def provider(self) -> str:
        return "vllm"

    def build_message_content(self, b64_image: str, mime_type: str) -> list:
        return [
            {"type": "text", "text": self.prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
            }
        ]
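OpenAIOCR and VllmOCR differ only in their defaults and provider strings; the message payload is identical. That suggests how another OpenAI-compatible engine could be plugged in. A hypothetical sketch, not part of the package:

```python
from typing import Any, Optional

from xgen_doc2chunk.ocr.base import BaseOCR


class MyGatewayOCR(BaseOCR):
    """Hypothetical engine for an OpenAI-compatible gateway (illustration only)."""

    def __init__(self, llm_client: Any, prompt: Optional[str] = None):
        super().__init__(llm_client=llm_client, prompt=prompt)

    @property
    def provider(self) -> str:
        return "my-gateway"  # hypothetical provider label

    def build_message_content(self, b64_image: str, mime_type: str) -> list:
        # Same OpenAI-style multimodal payload used by OpenAIOCR and VllmOCR above
        return [
            {"type": "text", "text": self.prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
            }
        ]
```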
@@ -0,0 +1,387 @@
# xgen_doc2chunk/ocr/ocr_processor.py
# Module for loading image files and processing OCR.
import re
import base64
import logging
import os
from typing import Any, Callable, Dict, List, Optional, Pattern

logger = logging.getLogger("ocr-processor")

# Default image tag pattern: [Image:{path}] or [image:{path}]
DEFAULT_IMAGE_TAG_PATTERN = re.compile(r'\[[Ii]mage:([^\]]+)\]')

# Keep for backward compatibility
IMAGE_TAG_PATTERN = DEFAULT_IMAGE_TAG_PATTERN


def _b64_from_file(path: str) -> str:
    """Encode file to Base64"""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def _get_mime_type(file_path: str) -> str:
    """Return MIME type based on file extension"""
    ext = os.path.splitext(file_path)[1].lower()
    mime_map = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
        ".webp": "image/webp",
        ".tiff": "image/tiff",
        ".svg": "image/svg+xml",
    }
    return mime_map.get(ext, "image/jpeg")

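These two private helpers produce exactly the pieces that the engines' build_message_content() methods embed in their data URLs. A minimal sketch of how they compose (the path is a placeholder, and importing the underscore-prefixed helpers is for illustration only):

```python
from xgen_doc2chunk.ocr.ocr_processor import _b64_from_file, _get_mime_type

path = "/path/to/image.png"              # placeholder path
b64 = _b64_from_file(path)               # Base64 payload
mime = _get_mime_type(path)              # "image/png"
data_url = f"data:{mime};base64,{b64}"   # shape used by build_message_content()
```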

def extract_image_tags(
    text: str,
    pattern: Optional[Pattern[str]] = None
) -> List[str]:
    """
    Extract image tags from text.

    Args:
        text: Text containing image tags
        pattern: Custom regex pattern with capture group for path.
            If None, uses default [Image:{path}] pattern.

    Returns:
        List of extracted image_path values
    """
    if pattern is None:
        pattern = DEFAULT_IMAGE_TAG_PATTERN
    matches = pattern.findall(text)
    return matches

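Because extract_image_tags accepts any compiled pattern with a single capture group for the path, callers can adapt it to other tag formats. A small sketch; the Markdown-style pattern here is illustrative, not part of the package:

```python
import re

from xgen_doc2chunk.ocr.ocr_processor import extract_image_tags

text = "Intro [Image:/tmp/a.png] body ![alt](/tmp/b.png) end"

# Default pattern picks up [Image:...] / [image:...] tags
print(extract_image_tags(text))                      # ['/tmp/a.png']

# Hypothetical custom pattern: Markdown image links, path in group 1
md_pattern = re.compile(r'!\[[^\]]*\]\(([^)]+)\)')
print(extract_image_tags(text, pattern=md_pattern))  # ['/tmp/b.png']
```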

def load_image_from_path(image_path: str) -> Optional[str]:
    """
    Validate and return local image file path.

    Args:
        image_path: Image file path

    Returns:
        Valid local file path or None
    """
    try:
        # Convert to absolute path
        if not os.path.isabs(image_path):
            image_path = os.path.abspath(image_path)

        # Check file existence
        if os.path.exists(image_path) and os.path.getsize(image_path) > 0:
            logger.info(f"[OCR] Image loaded: {image_path}")
            return image_path

        logger.warning(f"[OCR] Image file not found: {image_path}")
        return None

    except Exception as e:
        logger.error(f"[OCR] Image load failed: {image_path}, error: {e}")
        return None


def convert_image_to_text_with_llm(
    image_path: str,
    llm_client: Any,
    provider: str
) -> Optional[str]:
    """
    Convert image to text using VL model.

    Args:
        image_path: Local image file path
        llm_client: LangChain LLM client
        provider: LLM provider (openai, anthropic, gemini, vllm, aws_bedrock)

    Returns:
        Text extracted from image, or None if the provider is unsupported
    """
    try:
        from langchain_core.messages import HumanMessage

        b64_image = _b64_from_file(image_path)
        mime_type = _get_mime_type(image_path)

        # vllm uses simple prompt
        if provider == "vllm":
            prompt = "Describe the contents of this image."
        else:
            prompt = (
                "Extract meaningful information from this image.\n\n"
                "**If the image contains a TABLE:**\n"
                "- Convert to HTML table format (<table>, <tr>, <td>, <th>)\n"
                "- Use 'rowspan' and 'colspan' attributes for merged cells\n"
                "- Preserve all cell content exactly as shown\n"
                "- Example:\n"
                "  <table>\n"
                "  <tr><th colspan=\"2\">Header</th></tr>\n"
                "  <tr><td rowspan=\"2\">Merged</td><td>A</td></tr>\n"
                "  <tr><td>B</td></tr>\n"
                "  </table>\n\n"
                "**If the image contains TEXT (non-table):**\n"
                "- Extract all text exactly as shown\n"
                "- Keep layout, hierarchy, and structure\n\n"
                "**If the image contains DATA (charts, graphs, diagrams):**\n"
                "- Extract the data and its meaning\n"
                "- Describe trends, relationships, or key insights\n\n"
                "**If the image is decorative or has no semantic meaning:**\n"
                "- Simply state what it is in one short sentence\n"
                "- Example: 'A decorative geometric shape' or 'Company logo'\n"
                "- Do NOT over-analyze decorative elements\n\n"
                "**Rules:**\n"
                "- Output in Korean (except HTML tags)\n"
                "- Tables MUST use HTML format with proper rowspan/colspan\n"
                "- Be concise - only include what is semantically meaningful\n"
                "- No filler words or unnecessary descriptions"
            )

        # Build message by provider
        if provider in ("openai", "vllm"):
            content = [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
                }
            ]
            message = HumanMessage(content=content)

        elif provider == "anthropic":
            content = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": mime_type,
                        "data": b64_image
                    }
                },
                {"type": "text", "text": prompt}
            ]
            message = HumanMessage(content=content)

        elif provider == "gemini":
            content = [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
                }
            ]
            message = HumanMessage(content=content)

        elif provider == "aws_bedrock":
            # AWS Bedrock (Claude via Bedrock) - Uses same format as Anthropic
            content = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": mime_type,
                        "data": b64_image
                    }
                },
                {"type": "text", "text": prompt}
            ]
            message = HumanMessage(content=content)

        else:
            return None  # Unsupported provider

        response = llm_client.invoke([message])
        result = response.content.strip()

        # Wrap result in [Figure:...] format
        result = f"[Figure:{result}]"

        logger.info(f"[OCR] Image to text conversion completed: {os.path.basename(image_path)}")
        return result

    except Exception as e:
        logger.error(f"[OCR] Image to text conversion failed: {image_path}, error: {e}")
        return f"[Image conversion error: {str(e)}]"

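A minimal direct call, assuming a configured ChatOpenAI client (the path and key are placeholders); note the failure contract that the callers below rely on:

```python
from langchain_openai import ChatOpenAI

from xgen_doc2chunk.ocr.ocr_processor import convert_image_to_text_with_llm

llm = ChatOpenAI(model="gpt-4o", temperature=0, api_key="sk-...")
result = convert_image_to_text_with_llm(
    image_path="/path/to/table_scan.png",
    llm_client=llm,
    provider="openai",
)
# Success: "[Figure:...]"; failure: None (unsupported provider)
# or an "[Image conversion error: ...]" string.
print(result)
```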

def process_text_with_ocr(
    text: str,
    llm_client: Any,
    provider: str
) -> str:
    """
    Detect image tags in text and replace with OCR results.

    Args:
        text: Text containing [Image:{path}] tags
        llm_client: LangChain LLM client
        provider: LLM provider

    Returns:
        Text with image tags replaced by OCR results
    """
    if not llm_client:
        logger.warning("[OCR] Skipping OCR processing: no LLM client")
        return text

    # Extract image tags
    image_paths = extract_image_tags(text)

    if not image_paths:
        logger.debug("[OCR] No image tags found in text")
        return text

    logger.info(f"[OCR] Detected {len(image_paths)} image tags")

    result_text = text

    for img_path in image_paths:
        # Case-insensitive tag matching
        tag_pattern = re.compile(r'\[[Ii]mage:' + re.escape(img_path) + r'\]')

        # Load image from local path
        local_path = load_image_from_path(img_path)

        if local_path is None:
            # Keep original tag on load failure
            logger.warning(f"[OCR] Image load failed, keeping original tag: {img_path}")
            continue

        # Convert image to text using VL model
        ocr_result = convert_image_to_text_with_llm(
            image_path=local_path,
            llm_client=llm_client,
            provider=provider
        )

        # Keep original tag on OCR failure (None or error message)
        if ocr_result is None or ocr_result.startswith("[Image conversion error:"):
            logger.warning(f"[OCR] Image conversion failed, keeping original tag: {img_path}")
            continue

        # Replace tag with OCR result
        result_text = tag_pattern.sub(ocr_result, result_text)
        logger.info(f"[OCR] Tag replacement completed: {img_path[:50]}...")

    return result_text


def process_text_with_ocr_progress(
    text: str,
    llm_client: Any,
    provider: str,
    progress_callback: Optional[Callable[[Dict[str, Any]], Any]] = None
) -> str:
    """
    Detect image tags in text and replace with OCR results (with progress callback support).

    Args:
        text: Text containing [Image:{path}] tags
        llm_client: LangChain LLM client
        provider: LLM provider
        progress_callback: Progress callback function

    Returns:
        Text with image tags replaced by OCR results
    """
    if not llm_client:
        logger.warning("[OCR] Skipping OCR processing: no LLM client")
        return text

    # Extract image tags
    image_paths = extract_image_tags(text)

    if not image_paths:
        logger.debug("[OCR] No image tags found in text")
        return text

    total_chunks = len(image_paths)
    logger.info(f"[OCR] Detected {total_chunks} image tags")

    result_text = text
    success_count = 0
    failed_count = 0

    for idx, img_path in enumerate(image_paths):
        # Progress callback - processing started
        if progress_callback:
            progress_callback({
                'event': 'ocr_tag_processing',
                'chunk_index': idx,
                'total_chunks': total_chunks,
                'image_path': img_path
            })

        # Case-insensitive tag matching
        tag_pattern = re.compile(r'\[[Ii]mage:' + re.escape(img_path) + r'\]')

        # Load image from local path
        local_path = load_image_from_path(img_path)

        if local_path is None:
            # Keep original tag on load failure
            logger.warning(f"[OCR] Image load failed, keeping original tag: {img_path}")
            failed_count += 1
            if progress_callback:
                progress_callback({
                    'event': 'ocr_chunk_processed',
                    'chunk_index': idx,
                    'total_chunks': total_chunks,
                    'status': 'failed',
                    'error': f'Load failed: {img_path}'
                })
            continue

        try:
            # Convert image to text using VL model
            ocr_result = convert_image_to_text_with_llm(
                image_path=local_path,
                llm_client=llm_client,
                provider=provider
            )

            # Keep original tag on OCR failure (None or error message)
            if ocr_result is None or ocr_result.startswith("[Image conversion error:"):
                logger.warning(f"[OCR] Image conversion failed, keeping original tag: {img_path}")
                failed_count += 1
                if progress_callback:
                    progress_callback({
                        'event': 'ocr_chunk_processed',
                        'chunk_index': idx,
                        'total_chunks': total_chunks,
                        'status': 'failed',
                        'error': ocr_result or 'OCR returned None'
                    })
                continue

            # Replace tag with OCR result
            result_text = tag_pattern.sub(ocr_result, result_text)
            success_count += 1
            logger.info(f"[OCR] Tag replacement completed: {img_path[:50]}...")

            if progress_callback:
                progress_callback({
                    'event': 'ocr_chunk_processed',
                    'chunk_index': idx,
                    'total_chunks': total_chunks,
                    'status': 'success'
                })

        except Exception as e:
            logger.error(f"[OCR] Image processing error: {img_path}, error: {e}")
            failed_count += 1
            if progress_callback:
                progress_callback({
                    'event': 'ocr_chunk_processed',
                    'chunk_index': idx,
                    'total_chunks': total_chunks,
                    'status': 'failed',
                    'error': str(e)
                })

    return result_text

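End to end, the progress variant emits one 'ocr_tag_processing' event before each image and one 'ocr_chunk_processed' event after it. A minimal sketch with a placeholder client, key, and path:

```python
from langchain_openai import ChatOpenAI

from xgen_doc2chunk.ocr.ocr_processor import process_text_with_ocr_progress

llm = ChatOpenAI(model="gpt-4o", temperature=0, api_key="sk-...")

def on_progress(event: dict) -> None:
    # Keys come from the callback payloads above:
    # event, chunk_index, total_chunks, plus image_path or status/error.
    print(f"{event['event']} {event['chunk_index'] + 1}/{event['total_chunks']}")

text = "Report intro [Image:/path/to/figure1.png] conclusion..."
result = process_text_with_ocr_progress(
    text=text,
    llm_client=llm,
    provider="openai",
    progress_callback=on_progress,
)
```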
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xgen-doc2chunk
-Version: 0.1.0
+Version: 0.1.2
 Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
 Project-URL: Homepage, https://github.com/master0419/doc2chunk
 Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme