xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,534 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/functions/chart_processor.py
|
|
2
|
+
"""
|
|
3
|
+
Chart Processor Module
|
|
4
|
+
|
|
5
|
+
Provides functionality for generating and formatting chart content in extracted text.
|
|
6
|
+
This module standardizes chart tag format across all document handlers.
|
|
7
|
+
|
|
8
|
+
=== Architecture Overview ===
|
|
9
|
+
|
|
10
|
+
1. Creation:
|
|
11
|
+
- ChartProcessor instance is created when DocumentProcessor is initialized.
|
|
12
|
+
- Created via DocumentProcessor.__init__() calling _create_chart_processor() method.
|
|
13
|
+
|
|
14
|
+
2. Propagation:
|
|
15
|
+
- The created ChartProcessor is passed to ALL handlers.
|
|
16
|
+
- In DocumentProcessor._get_handler_registry(), each handler is created with
|
|
17
|
+
chart_processor=self._chart_processor parameter.
|
|
18
|
+
|
|
19
|
+
3. Access from Handlers:
|
|
20
|
+
- Each Handler inherits from BaseHandler and can access via self.chart_processor.
|
|
21
|
+
- Use format_chart_data() to convert chart data to standardized format.
|
|
22
|
+
|
|
23
|
+
4. Output Format:
|
|
24
|
+
{chart_tag_prefix}
|
|
25
|
+
Chart Type: {type}
|
|
26
|
+
<table>...</table>
|
|
27
|
+
{chart_tag_suffix}
|
|
28
|
+
|
|
29
|
+
=== Usage Examples ===
|
|
30
|
+
|
|
31
|
+
# Custom settings at DocumentProcessor level
|
|
32
|
+
from xgen_doc2chunk.core.document_processor import DocumentProcessor
|
|
33
|
+
|
|
34
|
+
processor = DocumentProcessor(
|
|
35
|
+
chart_tag_prefix="<chart>",
|
|
36
|
+
chart_tag_suffix="</chart>"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Usage inside Handler (BaseHandler subclass)
|
|
40
|
+
class MyHandler(BaseHandler):
|
|
41
|
+
def extract_text(self, ...):
|
|
42
|
+
chart_content = self.chart_processor.format_chart_data(
|
|
43
|
+
chart_type="Bar Chart",
|
|
44
|
+
title="Sales Report",
|
|
45
|
+
categories=["Q1", "Q2", "Q3"],
|
|
46
|
+
series=[{"name": "Revenue", "values": [100, 150, 200]}]
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
=== Default Tag Format ===
|
|
50
|
+
|
|
51
|
+
[chart]
|
|
52
|
+
Chart Type: Bar Chart
|
|
53
|
+
Title: Sales Report
|
|
54
|
+
<table border='1'>
|
|
55
|
+
<tr><th>Category</th><th>Revenue</th></tr>
|
|
56
|
+
<tr><td>Q1</td><td>100</td></tr>
|
|
57
|
+
...
|
|
58
|
+
</table>
|
|
59
|
+
[/chart]
|
|
60
|
+
|
|
61
|
+
"""
|
|
62
|
+
import logging
|
|
63
|
+
import re
|
|
64
|
+
from dataclasses import dataclass
|
|
65
|
+
from typing import Any, Dict, List, Optional, Pattern, Tuple
|
|
66
|
+
|
|
67
|
+
# Module-level logger, registered under the fixed name "document-processor"
# (not __name__).
logger = logging.getLogger("document-processor")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Chart type mapping (OOXML chart type names to human-readable names).
# Keys are looked up by ChartProcessor.get_chart_type_name(); a key that is
# missing here is returned unchanged by that method.
CHART_TYPE_MAP = {
    # Bar/Column charts
    'barChart': 'Bar Chart',
    'bar3DChart': '3D Bar Chart',
    'colChart': 'Column Chart',
    'col3DChart': '3D Column Chart',

    # Line charts
    'lineChart': 'Line Chart',
    'line3DChart': '3D Line Chart',
    'stockChart': 'Stock Chart',

    # Pie charts
    'pieChart': 'Pie Chart',
    'pie3DChart': '3D Pie Chart',
    'doughnutChart': 'Doughnut Chart',
    'ofPieChart': 'Pie of Pie Chart',

    # Area charts
    'areaChart': 'Area Chart',
    'area3DChart': '3D Area Chart',

    # Scatter/Bubble charts
    'scatterChart': 'Scatter Chart',
    'bubbleChart': 'Bubble Chart',

    # Radar charts
    'radarChart': 'Radar Chart',

    # Surface charts
    'surfaceChart': 'Surface Chart',
    'surface3DChart': '3D Surface Chart',

    # Combo/Other
    'comboChart': 'Combo Chart',
    'unknownChart': 'Chart',
}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataclass
class ChartProcessorConfig:
    """
    ChartProcessor configuration.

    Attributes:
        tag_prefix: Chart tag prefix (e.g., "[chart]")
        tag_suffix: Chart tag suffix (e.g., "[/chart]")
        use_html_table: Whether to use HTML table format (True) or Markdown (False)
        include_type: Whether to include chart type in output
        include_title: Whether to include chart title in output
    """
    tag_prefix: str = "[chart]"
    tag_suffix: str = "[/chart]"
    use_html_table: bool = True
    include_type: bool = True
    include_title: bool = True


class ChartProcessor:
    """
    Chart Processor Class

    Generates and formats chart content for document text extraction.
    Provides a standardized interface for all document handlers.

    Args:
        tag_prefix: Chart tag prefix (default: "[chart]")
        tag_suffix: Chart tag suffix (default: "[/chart]")
        use_html_table: Use HTML table format (default: True)
        config: ChartProcessorConfig instance (overrides individual parameters)

    Examples:
        >>> processor = ChartProcessor()
        >>> content = processor.format_chart_data(
        ...     chart_type="Bar Chart",
        ...     title="Sales",
        ...     categories=["Q1", "Q2"],
        ...     series=[{"name": "Revenue", "values": [100, 200]}]
        ... )
        '[chart]\\nChart Type: Bar Chart\\nTitle: Sales\\n<table>...</table>\\n[/chart]'
    """

    def __init__(
        self,
        tag_prefix: Optional[str] = None,
        tag_suffix: Optional[str] = None,
        use_html_table: Optional[bool] = None,
        config: Optional[ChartProcessorConfig] = None
    ):
        """
        Initialize ChartProcessor with configuration.

        When ``config`` is given it is used as-is and the individual keyword
        arguments are ignored. Otherwise a ChartProcessorConfig is built from
        the arguments that are not None; the rest keep the dataclass defaults.
        """
        if config is not None:
            self._config = config
        else:
            overrides = {
                "tag_prefix": tag_prefix,
                "tag_suffix": tag_suffix,
                "use_html_table": use_html_table,
            }
            self._config = ChartProcessorConfig(
                **{key: value for key, value in overrides.items() if value is not None}
            )

        # Compiled lazily on first access of ``chart_pattern``.
        self._chart_pattern: Optional[Pattern] = None

    @property
    def config(self) -> ChartProcessorConfig:
        """Current configuration."""
        return self._config

    @property
    def tag_prefix(self) -> str:
        """Chart tag prefix."""
        return self._config.tag_prefix

    @property
    def tag_suffix(self) -> str:
        """Chart tag suffix."""
        return self._config.tag_suffix

    @property
    def chart_pattern(self) -> Pattern:
        """
        Compiled regex pattern for matching chart blocks.

        Group 1 captures the text between the prefix and the suffix. The
        pattern is compiled once from the tags in effect at first access and
        then cached on the instance.
        """
        if self._chart_pattern is None:
            escaped_prefix = re.escape(self._config.tag_prefix)
            escaped_suffix = re.escape(self._config.tag_suffix)
            self._chart_pattern = re.compile(
                f'{escaped_prefix}(.*?){escaped_suffix}',
                re.DOTALL | re.IGNORECASE
            )
        return self._chart_pattern

    def get_pattern_string(self) -> str:
        """
        Get regex pattern string for matching chart blocks.

        Unlike ``chart_pattern`` the returned string has no capture group and
        carries no flags; the caller chooses them when compiling.

        Returns:
            Regex pattern string for matching chart blocks
        """
        escaped_prefix = re.escape(self._config.tag_prefix)
        escaped_suffix = re.escape(self._config.tag_suffix)
        return f'{escaped_prefix}.*?{escaped_suffix}'

    def get_chart_type_name(self, ooxml_type: str) -> str:
        """
        Convert OOXML chart type to human-readable name.

        Args:
            ooxml_type: OOXML chart type (e.g., 'barChart', 'pieChart')

        Returns:
            Human-readable chart type name. An unmapped type is returned
            unchanged; an empty/None type yields 'Chart'.
        """
        return CHART_TYPE_MAP.get(ooxml_type, ooxml_type or 'Chart')

    def format_chart_data(
        self,
        chart_type: Optional[str] = None,
        title: Optional[str] = None,
        categories: Optional[List[Any]] = None,
        series: Optional[List[Dict[str, Any]]] = None,
        raw_content: Optional[str] = None
    ) -> str:
        """
        Format chart data into standardized output format.

        Creates a formatted chart block with the configured tags, containing:
        - Chart type (if given and ``include_type`` is set)
        - Chart title (if given and ``include_title`` is set)
        - Data table (HTML or Markdown, per ``use_html_table``)

        Args:
            chart_type: Chart type name (e.g., "Bar Chart", "Pie Chart")
            title: Chart title
            categories: List of category labels (x-axis values)
            series: List of series data, each containing:
                - 'name': Series name
                - 'values': List of values
            raw_content: Raw content to include; used only when ``series``
                carries no values at all

        Returns:
            Formatted chart block string

        Example:
            >>> processor = ChartProcessor()
            >>> result = processor.format_chart_data(
            ...     chart_type="Bar Chart",
            ...     title="Quarterly Sales",
            ...     categories=["Q1", "Q2", "Q3", "Q4"],
            ...     series=[
            ...         {"name": "Product A", "values": [100, 150, 200, 180]},
            ...         {"name": "Product B", "values": [80, 120, 160, 140]}
            ...     ]
            ... )
        """
        parts = [self._config.tag_prefix]

        if chart_type and self._config.include_type:
            parts.append(f"Chart Type: {chart_type}")

        if title and self._config.include_title:
            parts.append(f"Title: {title}")

        # Structured data wins over raw_content; raw_content is the fallback
        # only when no series has any values.
        if series and any(s.get('values') for s in series):
            table = self._build_data_table(categories, series)
            if table:
                parts.extend(("", table))  # empty line before the table
        elif raw_content:
            parts.extend(("", raw_content))

        parts.append(self._config.tag_suffix)
        return "\n".join(parts)

    def format_chart_fallback(
        self,
        chart_type: Optional[str] = None,
        title: Optional[str] = None,
        message: Optional[str] = None
    ) -> str:
        """
        Format a fallback chart block when data extraction fails.

        Args:
            chart_type: Chart type name
            title: Chart title
            message: Optional message about the chart

        Returns:
            Minimal chart block string. When neither type, title nor message
            is supplied, a fixed placeholder line is emitted instead.
        """
        parts = [self._config.tag_prefix]

        if chart_type:
            parts.append(f"Chart Type: {chart_type}")
        if title:
            parts.append(f"Title: {title}")
        if message:
            parts.append(message)
        elif not chart_type and not title:
            parts.append("(Chart content could not be extracted)")

        parts.append(self._config.tag_suffix)
        return "\n".join(parts)

    def _build_data_table(
        self,
        categories: Optional[List[Any]],
        series: List[Dict[str, Any]]
    ) -> str:
        """
        Build a data table from chart data in the configured format.

        Args:
            categories: Category labels
            series: Series data list

        Returns:
            HTML or Markdown table string; empty string when ``series`` is empty
        """
        if not series:
            return ""

        categories = categories or []

        if self._config.use_html_table:
            return self._build_html_table(categories, series)
        return self._build_markdown_table(categories, series)

    def _series_names(self, series: List[Dict[str, Any]]) -> List[str]:
        """Return one header label per series, falling back to 'Series N'."""
        return [
            str(entry.get('name') or f"Series {position}")
            for position, entry in enumerate(series, start=1)
        ]

    def _table_rows(
        self,
        categories: List[Any],
        series: List[Dict[str, Any]]
    ) -> List[List[str]]:
        """Return unescaped cell text per row: category first, then one cell per series."""
        # A series whose 'values' is missing or None contributes empty cells.
        columns = [entry.get('values') or [] for entry in series]
        row_count = max(
            len(categories),
            max((len(column) for column in columns), default=0)
        )

        rows: List[List[str]] = []
        for i in range(row_count):
            if i < len(categories):
                row = [str(categories[i])]
            else:
                row = [f"Item {i + 1}"]

            for column in columns:
                value = column[i] if i < len(column) else None
                if value is None:
                    row.append("")
                elif isinstance(value, float):
                    row.append(f"{value:,.2f}")
                else:
                    row.append(str(value))
            rows.append(row)
        return rows

    def _build_html_table(
        self,
        categories: List[Any],
        series: List[Dict[str, Any]]
    ) -> str:
        """Build HTML table from chart data; every cell is HTML-escaped."""
        escape = self._escape_html
        lines = ["<table border='1'>"]

        header_cells = ["<th>Category</th>"]
        header_cells.extend(
            f"<th>{escape(name)}</th>" for name in self._series_names(series)
        )
        lines.append(f"<tr>{''.join(header_cells)}</tr>")

        for row in self._table_rows(categories, series):
            cells = "".join(f"<td>{escape(cell)}</td>" for cell in row)
            lines.append(f"<tr>{cells}</tr>")

        lines.append("</table>")
        return "\n".join(lines)

    def _build_markdown_table(
        self,
        categories: List[Any],
        series: List[Dict[str, Any]]
    ) -> str:
        """Build Markdown table from chart data; pipes and newlines in cells are neutralised."""
        escape = self._escape_markdown_cell
        header = ["Category"] + [escape(name) for name in self._series_names(series)]

        lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * len(header)) + " |",
        ]
        for row in self._table_rows(categories, series):
            lines.append("| " + " | ".join(escape(cell) for cell in row) + " |")

        return "\n".join(lines)

    def _escape_markdown_cell(self, text: str) -> str:
        """Escape characters that would break a Markdown table row."""
        return (
            text
            .replace("|", "\\|")
            .replace("\r\n", " ")
            .replace("\n", " ")
            .replace("\r", " ")
        )

    def _escape_html(self, text: str) -> str:
        """Escape HTML special characters."""
        # '&' must be replaced first so the entities added below are not
        # escaped a second time.
        return (
            text
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
        )

    def has_chart_blocks(self, text: str) -> bool:
        """
        Check if text contains chart blocks.

        Args:
            text: Text to check

        Returns:
            True if chart blocks found
        """
        return bool(self.chart_pattern.search(text))

    def find_chart_blocks(self, text: str) -> List[Tuple[int, int, str]]:
        """
        Find all chart blocks in text.

        Args:
            text: Text to search

        Returns:
            List of tuples: (start_pos, end_pos, content), where the positions
            span the whole block including tags and content is the text
            between the tags
        """
        return [
            (match.start(), match.end(), match.group(1))
            for match in self.chart_pattern.finditer(text)
        ]

    def remove_chart_blocks(self, text: str) -> str:
        """
        Remove all chart blocks from text.

        Args:
            text: Text with chart blocks

        Returns:
            Text with chart blocks removed
        """
        return self.chart_pattern.sub('', text)

    def __repr__(self) -> str:
        return (
            f"ChartProcessor(tag_prefix={self._config.tag_prefix!r}, "
            f"tag_suffix={self._config.tag_suffix!r})"
        )
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
# Default instance for convenience; created on the first call to
# get_default_chart_processor() and reused afterwards.
_default_processor: Optional[ChartProcessor] = None


def get_default_chart_processor() -> ChartProcessor:
    """
    Get the default ChartProcessor instance.

    The instance is built with default settings on first use and cached in
    the module-level ``_default_processor``; later calls return that same
    object.

    Returns:
        The shared default ChartProcessor
    """
    global _default_processor
    if _default_processor is None:
        _default_processor = ChartProcessor()
    return _default_processor
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
def create_chart_processor(
    tag_prefix: Optional[str] = None,
    tag_suffix: Optional[str] = None,
    use_html_table: bool = True
) -> ChartProcessor:
    """
    Factory function to create a ChartProcessor instance.

    The arguments are forwarded unchanged to the ChartProcessor constructor;
    a tag left as None falls back to the constructor's default.

    Args:
        tag_prefix: Chart tag prefix (default: "[chart]")
        tag_suffix: Chart tag suffix (default: "[/chart]")
        use_html_table: Use HTML table format (default: True)

    Returns:
        ChartProcessor instance
    """
    return ChartProcessor(
        tag_prefix=tag_prefix,
        tag_suffix=tag_suffix,
        use_html_table=use_html_table
    )
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
# Public API of this module.
__all__ = [
    "ChartProcessorConfig",
    "ChartProcessor",
    "CHART_TYPE_MAP",
    "get_default_chart_processor",
    "create_chart_processor",
]
|
|
534
|
+
|