xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,534 @@
1
+ # xgen_doc2chunk/core/functions/chart_processor.py
2
+ """
3
+ Chart Processor Module
4
+
5
+ Provides functionality for generating and formatting chart content in extracted text.
6
+ This module standardizes chart tag format across all document handlers.
7
+
8
+ === Architecture Overview ===
9
+
10
+ 1. Creation:
11
+ - ChartProcessor instance is created when DocumentProcessor is initialized.
12
+ - Created via DocumentProcessor.__init__() calling _create_chart_processor() method.
13
+
14
+ 2. Propagation:
15
+ - The created ChartProcessor is passed to ALL handlers.
16
+ - In DocumentProcessor._get_handler_registry(), each handler is created with
17
+ chart_processor=self._chart_processor parameter.
18
+
19
+ 3. Access from Handlers:
20
+ - Each Handler inherits from BaseHandler and can access via self.chart_processor.
21
+ - Use format_chart_data() to convert chart data to standardized format.
22
+
23
+ 4. Output Format:
24
+ {chart_tag_prefix}
25
+ Chart Type: {type}
26
+ <table>...</table>
27
+ {chart_tag_suffix}
28
+
29
+ === Usage Examples ===
30
+
31
+ # Custom settings at DocumentProcessor level
32
+ from xgen_doc2chunk.core.document_processor import DocumentProcessor
33
+
34
+ processor = DocumentProcessor(
35
+ chart_tag_prefix="<chart>",
36
+ chart_tag_suffix="</chart>"
37
+ )
38
+
39
+ # Usage inside Handler (BaseHandler subclass)
40
+ class MyHandler(BaseHandler):
41
+ def extract_text(self, ...):
42
+ chart_content = self.chart_processor.format_chart_data(
43
+ chart_type="Bar Chart",
44
+ title="Sales Report",
45
+ categories=["Q1", "Q2", "Q3"],
46
+ series=[{"name": "Revenue", "values": [100, 150, 200]}]
47
+ )
48
+
49
+ === Default Tag Format ===
50
+
51
+ [chart]
52
+ Chart Type: Bar Chart
53
+ Title: Sales Report
54
+ <table border='1'>
55
+ <tr><th>Category</th><th>Revenue</th></tr>
56
+ <tr><td>Q1</td><td>100</td></tr>
57
+ ...
58
+ </table>
59
+ [/chart]
60
+
61
+ """
62
+ import logging
63
+ import re
64
+ from dataclasses import dataclass
65
+ from typing import Any, Dict, List, Optional, Pattern, Tuple
66
+
67
# Module-level logger; the name is shared with the rest of the package.
logger = logging.getLogger("document-processor")


# Lookup table translating OOXML chart element names into display names.
# Insertion order is preserved deliberately so iteration stays stable.
CHART_TYPE_MAP = {
    # --- Bar / column ---
    "barChart": "Bar Chart",
    "bar3DChart": "3D Bar Chart",
    "colChart": "Column Chart",
    "col3DChart": "3D Column Chart",
    # --- Line ---
    "lineChart": "Line Chart",
    "line3DChart": "3D Line Chart",
    "stockChart": "Stock Chart",
    # --- Pie ---
    "pieChart": "Pie Chart",
    "pie3DChart": "3D Pie Chart",
    "doughnutChart": "Doughnut Chart",
    "ofPieChart": "Pie of Pie Chart",
    # --- Area ---
    "areaChart": "Area Chart",
    "area3DChart": "3D Area Chart",
    # --- Scatter / bubble ---
    "scatterChart": "Scatter Chart",
    "bubbleChart": "Bubble Chart",
    # --- Radar ---
    "radarChart": "Radar Chart",
    # --- Surface ---
    "surfaceChart": "Surface Chart",
    "surface3DChart": "3D Surface Chart",
    # --- Combo / fallback ---
    "comboChart": "Combo Chart",
    "unknownChart": "Chart",
}
108
+
109
+
110
@dataclass
class ChartProcessorConfig:
    """
    Settings that control how ChartProcessor renders chart blocks.

    Attributes:
        tag_prefix: Text emitted before a chart block (default "[chart]").
        tag_suffix: Text emitted after a chart block (default "[/chart]").
        use_html_table: True renders the data as an HTML table,
            False renders it as a Markdown table.
        include_type: Emit the "Chart Type: ..." line when a type is given.
        include_title: Emit the "Title: ..." line when a title is given.
    """

    # Delimiters wrapped around every chart block.
    tag_prefix: str = "[chart]"
    tag_suffix: str = "[/chart]"

    # Table flavour for the data section.
    use_html_table: bool = True

    # Toggles for the optional header lines.
    include_type: bool = True
    include_title: bool = True
127
+
128
+
129
class ChartProcessor:
    """
    Generate and locate standardized chart blocks in extracted document text.

    A chart block is the configured tag prefix, optional "Chart Type:" and
    "Title:" lines, an optional data table (HTML or Markdown) or raw content,
    and the configured tag suffix, joined with newlines.

    Args:
        tag_prefix: Chart tag prefix (default: "[chart]")
        tag_suffix: Chart tag suffix (default: "[/chart]")
        use_html_table: Use HTML table format (default: True)
        config: ChartProcessorConfig instance (overrides individual parameters)

    Examples:
        >>> processor = ChartProcessor()
        >>> content = processor.format_chart_data(
        ...     chart_type="Bar Chart",
        ...     title="Sales",
        ...     categories=["Q1", "Q2"],
        ...     series=[{"name": "Revenue", "values": [100, 200]}]
        ... )
        >>> content.splitlines()[:3]
        ['[chart]', 'Chart Type: Bar Chart', 'Title: Sales']
    """

    def __init__(
        self,
        tag_prefix: Optional[str] = None,
        tag_suffix: Optional[str] = None,
        use_html_table: Optional[bool] = None,
        config: Optional["ChartProcessorConfig"] = None
    ):
        """
        Initialize ChartProcessor with configuration.

        When ``config`` is given it is used as-is and the individual
        arguments are ignored; otherwise a new config is built, with every
        argument left as ``None`` falling back to the config class default.
        """
        if config is not None:
            self._config = config
        else:
            self._config = ChartProcessorConfig(
                tag_prefix=tag_prefix if tag_prefix is not None else ChartProcessorConfig.tag_prefix,
                tag_suffix=tag_suffix if tag_suffix is not None else ChartProcessorConfig.tag_suffix,
                use_html_table=use_html_table if use_html_table is not None else ChartProcessorConfig.use_html_table,
            )

        # Compiled lazily on first access to ``chart_pattern`` and then reused.
        self._chart_pattern: Optional[Pattern] = None

    @property
    def config(self) -> "ChartProcessorConfig":
        """Current configuration."""
        return self._config

    @property
    def tag_prefix(self) -> str:
        """Chart tag prefix."""
        return self._config.tag_prefix

    @property
    def tag_suffix(self) -> str:
        """Chart tag suffix."""
        return self._config.tag_suffix

    @property
    def chart_pattern(self) -> Pattern:
        """
        Compiled regex pattern for matching chart blocks.

        The pattern is case-insensitive, lets ``.`` span newlines, and
        captures the text between prefix and suffix as group 1. It is built
        once from the tags in effect at first access and cached afterwards.
        """
        if self._chart_pattern is None:
            escaped_prefix = re.escape(self._config.tag_prefix)
            escaped_suffix = re.escape(self._config.tag_suffix)
            self._chart_pattern = re.compile(
                f'{escaped_prefix}(.*?){escaped_suffix}',
                re.DOTALL | re.IGNORECASE
            )
        return self._chart_pattern

    def get_pattern_string(self) -> str:
        """
        Get regex pattern string for matching chart blocks.

        Unlike ``chart_pattern`` this string has no capture group and carries
        no flags; callers compiling it choose their own flags.

        Returns:
            Regex pattern string for matching chart blocks
        """
        escaped_prefix = re.escape(self._config.tag_prefix)
        escaped_suffix = re.escape(self._config.tag_suffix)
        return f'{escaped_prefix}.*?{escaped_suffix}'

    def get_chart_type_name(self, ooxml_type: str) -> str:
        """
        Convert OOXML chart type to human-readable name.

        Args:
            ooxml_type: OOXML chart type (e.g., 'barChart', 'pieChart')

        Returns:
            The mapped display name; for an unmapped type the input itself,
            or 'Chart' when the input is empty/None.
        """
        return CHART_TYPE_MAP.get(ooxml_type, ooxml_type or 'Chart')

    def format_chart_data(
        self,
        chart_type: Optional[str] = None,
        title: Optional[str] = None,
        categories: Optional[List[Any]] = None,
        series: Optional[List[Dict[str, Any]]] = None,
        raw_content: Optional[str] = None
    ) -> str:
        """
        Format chart data into standardized output format.

        Creates a formatted chart block with the configured tags, containing:
        - Chart type (if given and enabled in the config)
        - Chart title (if given and enabled in the config)
        - Data table (HTML or Markdown per config), or ``raw_content`` when
          no series carries any values

        Args:
            chart_type: Chart type name (e.g., "Bar Chart", "Pie Chart")
            title: Chart title
            categories: List of category labels (x-axis values)
            series: List of series data, each containing:
                - 'name': Series name
                - 'values': List of values
            raw_content: Raw content to include (if no structured data)

        Returns:
            Formatted chart block string

        Example:
            >>> processor = ChartProcessor()
            >>> result = processor.format_chart_data(
            ...     chart_type="Bar Chart",
            ...     title="Quarterly Sales",
            ...     categories=["Q1", "Q2", "Q3", "Q4"],
            ...     series=[
            ...         {"name": "Product A", "values": [100, 150, 200, 180]},
            ...         {"name": "Product B", "values": [80, 120, 160, 140]}
            ...     ]
            ... )
        """
        parts = [self._config.tag_prefix]

        # Add chart type
        if chart_type and self._config.include_type:
            parts.append(f"Chart Type: {chart_type}")

        # Add title
        if title and self._config.include_title:
            parts.append(f"Title: {title}")

        # Add data table or raw content. Structured data wins whenever at
        # least one series has a non-empty 'values' entry.
        if series and any(s.get('values') for s in series):
            table = self._build_data_table(categories, series)
            if table:
                parts.append("")  # Empty line before table
                parts.append(table)
        elif raw_content:
            parts.append("")
            parts.append(raw_content)

        parts.append(self._config.tag_suffix)
        return "\n".join(parts)

    def format_chart_fallback(
        self,
        chart_type: Optional[str] = None,
        title: Optional[str] = None,
        message: Optional[str] = None
    ) -> str:
        """
        Format a fallback chart block when data extraction fails.

        Args:
            chart_type: Chart type name
            title: Chart title
            message: Optional message about the chart

        Returns:
            Minimal chart block string. When no type, title or message is
            given, a generic "could not be extracted" line is emitted so the
            block is never empty.
        """
        parts = [self._config.tag_prefix]

        if chart_type:
            parts.append(f"Chart Type: {chart_type}")
        if title:
            parts.append(f"Title: {title}")
        if message:
            parts.append(message)
        elif not chart_type and not title:
            parts.append("(Chart content could not be extracted)")

        parts.append(self._config.tag_suffix)
        return "\n".join(parts)

    @staticmethod
    def _series_values(entry: Dict[str, Any]) -> List[Any]:
        """Return the series' values, treating a missing or None entry as empty."""
        values = entry.get('values')
        return [] if values is None else values

    @staticmethod
    def _series_name(entry: Dict[str, Any], index: int) -> str:
        """Return the series' name, or "Series N" (1-based) when missing/empty."""
        return str(entry.get('name') or f"Series {index + 1}")

    @staticmethod
    def _format_value(val: Any) -> str:
        """Render a cell value; floats get thousands separators and 2 decimals."""
        if isinstance(val, float):
            return f"{val:,.2f}"
        return str(val)

    def _row_count(self, categories: List[Any], series: List[Dict[str, Any]]) -> int:
        """Number of data rows: the longest of the categories and all value lists."""
        return max(
            len(categories),
            max((len(self._series_values(s)) for s in series), default=0)
        )

    def _build_data_table(
        self,
        categories: Optional[List[Any]],
        series: List[Dict[str, Any]]
    ) -> str:
        """
        Build the data table in the configured format.

        Args:
            categories: Category labels (None is treated as no labels)
            series: Series data list

        Returns:
            HTML or Markdown table string, or "" when there are no series
        """
        if not series:
            return ""

        categories = categories or []

        if self._config.use_html_table:
            return self._build_html_table(categories, series)
        else:
            return self._build_markdown_table(categories, series)

    def _build_html_table(
        self,
        categories: List[Any],
        series: List[Dict[str, Any]]
    ) -> str:
        """
        Build HTML table from chart data.

        Rows beyond the supplied categories are labelled "Item N"; missing or
        None values become empty cells. Text is HTML-escaped.
        """
        rows = []
        rows.append("<table border='1'>")

        # Header row
        header_cells = ["<th>Category</th>"]
        for i, s in enumerate(series):
            header_cells.append(f"<th>{self._escape_html(self._series_name(s, i))}</th>")
        rows.append(f"<tr>{''.join(header_cells)}</tr>")

        # Data rows
        for i in range(self._row_count(categories, series)):
            cells = []

            # Category cell
            if i < len(categories):
                cat = self._escape_html(str(categories[i]))
            else:
                cat = f"Item {i+1}"
            cells.append(f"<td>{cat}</td>")

            # Value cells
            for s in series:
                values = self._series_values(s)
                if i < len(values) and values[i] is not None:
                    formatted = self._escape_html(self._format_value(values[i]))
                    cells.append(f"<td>{formatted}</td>")
                else:
                    cells.append("<td></td>")

            rows.append(f"<tr>{''.join(cells)}</tr>")

        rows.append("</table>")
        return "\n".join(rows)

    def _build_markdown_table(
        self,
        categories: List[Any],
        series: List[Dict[str, Any]]
    ) -> str:
        """
        Build Markdown table from chart data.

        Uses the same naming and padding rules as the HTML builder: unnamed
        series become "Series N", extra rows are labelled "Item N", and
        missing or None values become empty cells.
        """
        rows = []

        # Header row
        header = ["Category"] + [self._series_name(s, i) for i, s in enumerate(series)]
        rows.append("| " + " | ".join(header) + " |")
        rows.append("| " + " | ".join(["---"] * len(header)) + " |")

        # Data rows
        for i in range(self._row_count(categories, series)):
            row = []

            # Category
            if i < len(categories):
                row.append(str(categories[i]))
            else:
                row.append(f"Item {i+1}")

            # Values
            for s in series:
                values = self._series_values(s)
                if i < len(values) and values[i] is not None:
                    row.append(self._format_value(values[i]))
                else:
                    row.append("")

            rows.append("| " + " | ".join(row) + " |")

        return "\n".join(rows)

    def _escape_html(self, text: str) -> str:
        """Escape HTML special characters (&, <, > and double quote)."""
        # '&' must be replaced first so the entities added below are not
        # escaped a second time.
        return (
            text
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
        )

    def has_chart_blocks(self, text: str) -> bool:
        """
        Check if text contains chart blocks.

        Args:
            text: Text to check

        Returns:
            True if chart blocks found
        """
        return bool(self.chart_pattern.search(text))

    def find_chart_blocks(self, text: str) -> List[Tuple[int, int, str]]:
        """
        Find all chart blocks in text.

        Args:
            text: Text to search

        Returns:
            List of tuples: (start_pos, end_pos, content), where the
            positions span the whole block including its tags and content is
            the text between the tags.
        """
        return [
            (match.start(), match.end(), match.group(1))
            for match in self.chart_pattern.finditer(text)
        ]

    def remove_chart_blocks(self, text: str) -> str:
        """
        Remove all chart blocks from text.

        Args:
            text: Text with chart blocks

        Returns:
            Text with chart blocks (tags included) removed
        """
        return self.chart_pattern.sub('', text)

    def __repr__(self) -> str:
        return (
            f"ChartProcessor(tag_prefix={self._config.tag_prefix!r}, "
            f"tag_suffix={self._config.tag_suffix!r})"
        )
490
+
491
+
492
# Lazily created shared instance; stays None until first requested.
_default_processor: Optional[ChartProcessor] = None


def get_default_chart_processor() -> ChartProcessor:
    """
    Return the module-wide ChartProcessor, creating it on first use.

    The instance is built with default settings and the same object is
    returned on every subsequent call.
    """
    global _default_processor
    if _default_processor is not None:
        return _default_processor
    _default_processor = ChartProcessor()
    return _default_processor
502
+
503
+
504
def create_chart_processor(
    tag_prefix: Optional[str] = None,
    tag_suffix: Optional[str] = None,
    use_html_table: bool = True
) -> ChartProcessor:
    """
    Build a new ChartProcessor from individual settings.

    Args:
        tag_prefix: Chart tag prefix; None selects the default "[chart]"
        tag_suffix: Chart tag suffix; None selects the default "[/chart]"
        use_html_table: True for HTML tables, False for Markdown (default: True)

    Returns:
        A freshly constructed ChartProcessor
    """
    options = {
        "tag_prefix": tag_prefix,
        "tag_suffix": tag_suffix,
        "use_html_table": use_html_table,
    }
    return ChartProcessor(**options)
525
+
526
+
527
# Names exported by ``from ... import *``: the config and processor classes,
# the chart-type lookup table, and the two convenience constructors.
__all__ = [
    "ChartProcessorConfig",
    "ChartProcessor",
    "CHART_TYPE_MAP",
    "get_default_chart_processor",
    "create_chart_processor",
]
534
+