xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,85 @@
1
+ # xgen_doc2chunk/core/functions/__init__.py
2
+ """
3
+ Functions - Common Utility Functions Module
4
+
5
+ Provides common utility functions used in document processing.
6
+
7
+ Module Components:
8
+ - utils: Text cleaning, code cleaning, JSON sanitization utilities
9
+ - img_processor: Image processing and storage (ImageProcessor class)
10
+ - storage_backend: Storage backend implementations (Local, MinIO, S3)
11
+ - metadata_extractor: Document metadata extraction interface
12
+
13
+ Usage Example:
14
+ from xgen_doc2chunk.core.functions import clean_text, clean_code_text
15
+ from xgen_doc2chunk.core.functions import ImageProcessor, save_image_to_file
16
+ from xgen_doc2chunk.core.functions.storage_backend import LocalStorageBackend
17
+ from xgen_doc2chunk.core.functions.utils import sanitize_text_for_json
18
+ """
19
+
20
+ from xgen_doc2chunk.core.functions.utils import (
21
+ clean_text,
22
+ clean_code_text,
23
+ sanitize_text_for_json,
24
+ )
25
+
26
+ # Storage backend module
27
+ from xgen_doc2chunk.core.functions.storage_backend import (
28
+ StorageType,
29
+ BaseStorageBackend,
30
+ LocalStorageBackend,
31
+ MinIOStorageBackend,
32
+ S3StorageBackend,
33
+ create_storage_backend,
34
+ get_default_backend,
35
+ )
36
+
37
+ # Image processor module
38
+ from xgen_doc2chunk.core.functions.img_processor import (
39
+ ImageProcessor,
40
+ ImageProcessorConfig,
41
+ ImageFormat,
42
+ NamingStrategy,
43
+ save_image_to_file,
44
+ create_image_processor,
45
+ DEFAULT_IMAGE_CONFIG,
46
+ )
47
+
48
+ # Metadata extraction module
49
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
50
+ MetadataField,
51
+ DocumentMetadata,
52
+ MetadataFormatter,
53
+ BaseMetadataExtractor,
54
+ format_metadata,
55
+ )
56
+
57
+ __all__ = [
58
+ # Text utilities
59
+ "clean_text",
60
+ "clean_code_text",
61
+ "sanitize_text_for_json",
62
+ # Storage backends
63
+ "StorageType",
64
+ "BaseStorageBackend",
65
+ "LocalStorageBackend",
66
+ "MinIOStorageBackend",
67
+ "S3StorageBackend",
68
+ "create_storage_backend",
69
+ "get_default_backend",
70
+ # Image processor (base class for all format-specific processors)
71
+ "ImageProcessor",
72
+ "ImageProcessorConfig",
73
+ "ImageFormat",
74
+ "NamingStrategy",
75
+ "save_image_to_file",
76
+ "create_image_processor",
77
+ "DEFAULT_IMAGE_CONFIG",
78
+ # Metadata extraction
79
+ "MetadataField",
80
+ "DocumentMetadata",
81
+ "MetadataFormatter",
82
+ "BaseMetadataExtractor",
83
+ "format_metadata",
84
+ ]
85
+
@@ -0,0 +1,144 @@
1
+ """
2
+ Chart Extractor Base Module
3
+
4
+ Abstract base class for chart extraction across different file formats.
5
+ Each file handler should have its own ChartExtractor implementation.
6
+
7
+ Output format:
8
+ {chart_prefix}
9
+ Title: {chart_title}
10
+ Chart Type: {chart_type}
11
+ <table>...</table>
12
+ {chart_suffix}
13
+ """
14
+ from abc import ABC, abstractmethod
15
+ from dataclasses import dataclass
16
+ from typing import Any, Dict, List, Optional, TYPE_CHECKING
17
+
18
+ if TYPE_CHECKING:
19
+ from xgen_doc2chunk.core.functions.chart_processor import ChartProcessor
20
+
21
+
22
@dataclass
class ChartData:
    """
    Format-neutral container for the contents of a single chart.

    Format-specific extractors are expected to translate whatever they
    read into this structure so that formatting code only has to deal
    with one shape of data.

    Attributes:
        chart_type: Human-readable chart kind (e.g., "Bar Chart", "Line Chart")
        title: Chart title, or None when the chart has none
        categories: Category labels (X-axis values), or None
        series: Series entries; each is a dict carrying 'name' and 'values'
    """
    # Generic label used when the extractor does not set a specific kind.
    chart_type: str = "Chart"
    # None means "no title available".
    title: Optional[str] = None
    # None means "no category labels available".
    categories: Optional[List[str]] = None
    # None (or an empty list) means "no series extracted".
    series: Optional[List[Dict[str, Any]]] = None

    def has_data(self) -> bool:
        """
        Report whether at least one series carries a non-empty 'values' entry.

        Returns:
            True if any series dict has a truthy 'values' item; False when
            there are no series at all or every series is empty.
        """
        # `or ()` makes both None and an empty list iterate zero times.
        for entry in self.series or ():
            if entry.get('values'):
                return True
        return False
46
+
47
+
48
class BaseChartExtractor(ABC):
    """
    Common skeleton shared by all format-specific chart extractors.

    Subclasses supply `extract`, which turns a format-specific chart
    element into a ChartData; `process` then hands that data to the
    injected chart processor for formatting.

    Usage:
        class ExcelChartExtractor(BaseChartExtractor):
            def extract(self, chart_element) -> ChartData:
                # Excel-specific extraction logic
                ...
    """

    def __init__(self, chart_processor: "ChartProcessor"):
        """
        Store the processor that will format extracted chart data.

        Args:
            chart_processor: ChartProcessor instance for formatting output
        """
        self._chart_processor = chart_processor

    @property
    def chart_processor(self) -> "ChartProcessor":
        """The ChartProcessor given at construction time (read-only)."""
        return self._chart_processor

    @abstractmethod
    def extract(self, chart_element: Any) -> ChartData:
        """
        Build a ChartData from a format-specific chart element.

        Args:
            chart_element: Format-specific chart object/element

        Returns:
            ChartData with extracted information
        """

    def process(self, chart_element: Any) -> str:
        """
        Run extraction and return the formatted chart text.

        Main entry point: calls `extract`, then formats the result through
        the chart processor. Charts without usable series data, and any
        exception raised along the way, are routed to the processor's
        fallback formatter instead of propagating to the caller.

        Args:
            chart_element: Format-specific chart object/element

        Returns:
            String produced by the chart processor (formatted data or fallback)
        """
        try:
            data = self.extract(chart_element)

            # No usable series: emit the fallback with whatever is known.
            if not data.has_data():
                return self._chart_processor.format_chart_fallback(
                    chart_type=data.chart_type,
                    title=data.title,
                )

            return self._chart_processor.format_chart_data(
                chart_type=data.chart_type,
                title=data.title,
                categories=data.categories,
                series=data.series,
            )
        except Exception as exc:
            # Best-effort: surface the failure text in the output rather
            # than aborting processing of the surrounding document.
            failure_note = f"Error extracting chart: {exc!s}"
            return self._chart_processor.format_chart_fallback(
                chart_type="Unknown",
                message=failure_note,
            )
122
+
123
+
124
class NullChartExtractor(BaseChartExtractor):
    """
    No-op extractor for handlers without chart support.

    Intended for file formats such as PDF, CSV, TXT, whose handlers do not
    extract charts: `process` always yields an empty string, so no chart
    text is emitted for them.
    """

    def extract(self, chart_element: Any) -> ChartData:
        """Ignore the element and return a ChartData typed "Unsupported" with no series."""
        return ChartData(chart_type="Unsupported")

    def process(self, chart_element: Any) -> str:
        """Ignore the element and return an empty string, bypassing the chart processor."""
        return ""
138
+
139
+
140
+ __all__ = [
141
+ 'ChartData',
142
+ 'BaseChartExtractor',
143
+ 'NullChartExtractor',
144
+ ]