xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,126 @@
1
+ # xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py
2
+ """
3
+ DOCX Paragraph Processing Utility
4
+
5
+ Processes Paragraph elements in DOCX documents.
6
+ - process_paragraph_element: Process Paragraph element
7
+ - has_page_break_element: Check for page break
8
+
9
+ Image and drawing extraction is handled by DOCXImageProcessor.
10
+ """
11
+ import logging
12
+ from typing import Optional, Set, Tuple, Callable, TYPE_CHECKING
13
+
14
+ from docx import Document
15
+
16
+ from xgen_doc2chunk.core.processor.docx_helper.docx_constants import ElementType, NAMESPACES
17
+
18
+ if TYPE_CHECKING:
19
+ from xgen_doc2chunk.core.processor.docx_helper.docx_image_processor import DOCXImageProcessor
20
+
21
+ logger = logging.getLogger("document-processor")
22
+
23
+
24
def process_paragraph_element(
    para_elem,
    doc: Document,
    processed_images: Set[str],
    file_path: Optional[str] = None,
    image_processor: Optional["DOCXImageProcessor"] = None,
    chart_callback: Optional[Callable[[], str]] = None
) -> Tuple[str, bool, int, int]:
    """
    Process a single paragraph XML element.

    Walks every ``w:r`` run below the paragraph, collecting ``w:t`` text and
    delegating ``w:drawing`` / ``w:pict`` children to the image processor.
    If anything raises, the error is logged and the function falls back to
    plain text extraction from all ``w:t`` descendants.

    Args:
        para_elem: Paragraph XML element (must support ``findall`` with a
            namespace map).
        doc: python-docx Document object, forwarded to the image processor.
        processed_images: Set of processed image paths, forwarded to the
            image processor for deduplication.
        file_path: Original file path. Not used by this function; kept for
            interface compatibility.
        image_processor: DOCXImageProcessor instance. When missing (or when
            it lacks the needed method) drawings yield no content and pict
            elements yield the placeholder ``"[Unknown Image]"``.
        chart_callback: Callback forwarded to the image processor to obtain
            the next chart content.

    Returns:
        (content, has_page_break, image_count, chart_count) tuple. Never
        raises for processing errors: on failure the content is the
        best-effort text, and counters keep whatever was accumulated before
        the error.
    """
    content_parts = []
    has_page_break = False
    image_count = 0
    chart_count = 0

    try:
        # Check for page break
        has_page_break = has_page_break_element(para_elem)

        # Capability checks do not depend on the run, so evaluate them once.
        can_process_drawing = bool(
            image_processor and hasattr(image_processor, 'process_drawing_element')
        )
        can_process_pict = bool(
            image_processor and hasattr(image_processor, 'extract_from_pict')
        )

        # Traverse Run elements
        for run_elem in para_elem.findall('.//w:r', NAMESPACES):
            # Extract text
            content_parts.extend(
                t_elem.text
                for t_elem in run_elem.findall('w:t', NAMESPACES)
                if t_elem.text
            )

            # Process Drawing (image/chart/diagram) via DOCXImageProcessor
            for drawing_elem in run_elem.findall('w:drawing', NAMESPACES):
                if can_process_drawing:
                    drawing_content, drawing_type = image_processor.process_drawing_element(
                        drawing_elem, doc, processed_images, chart_callback=chart_callback
                    )
                else:
                    drawing_content, drawing_type = "", None
                # NOTE(review): counters are assumed to be incremented only
                # when the drawing produced content - confirm against the
                # original indentation.
                if drawing_content:
                    content_parts.append(drawing_content)
                    if drawing_type == ElementType.IMAGE:
                        image_count += 1
                    elif drawing_type == ElementType.CHART:
                        chart_count += 1

            # Process pict element (legacy VML image) - use DOCXImageProcessor
            for pict_elem in run_elem.findall('w:pict', NAMESPACES):
                if can_process_pict:
                    pict_content = image_processor.extract_from_pict(
                        pict_elem, doc, processed_images
                    )
                else:
                    pict_content = "[Unknown Image]"
                if pict_content:
                    content_parts.append(pict_content)
                    image_count += 1

    except Exception as e:
        logger.warning("Error processing paragraph: %s", e)
        # Fallback: simple text extraction. Deliberately best-effort, but
        # only swallow ordinary exceptions (a bare ``except`` would also
        # trap KeyboardInterrupt/SystemExit) and leave a trace in the log.
        try:
            texts = para_elem.findall('.//w:t', NAMESPACES)
            content_parts = [t.text or '' for t in texts]
        except Exception as fallback_error:
            logger.debug(
                "Fallback paragraph text extraction failed: %s", fallback_error
            )

    return ''.join(content_parts), has_page_break, image_count, chart_count
99
+
100
+
101
def has_page_break_element(element) -> bool:
    """
    Report whether an XML element contains a page break marker.

    Two markers are looked for anywhere below ``element``: an explicit
    ``w:br`` with ``w:type="page"`` and a ``w:lastRenderedPageBreak``.

    Args:
        element: XML element exposing ``findall(path, namespaces)``

    Returns:
        True if either marker is found; False otherwise, including when the
        lookup itself raises.
    """
    page_break_queries = (
        './/w:br[@w:type="page"]',      # explicit page break
        './/w:lastRenderedPageBreak',   # rendered page break
    )
    try:
        # any() stops at the first query that yields a non-empty result.
        return any(
            element.findall(query, NAMESPACES) for query in page_break_queries
        )
    except Exception:
        return False
121
+
122
+
123
+ __all__ = [
124
+ 'process_paragraph_element',
125
+ 'has_page_break_element',
126
+ ]
@@ -0,0 +1,82 @@
1
+ # xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py
2
+ """
3
+ DOCX Preprocessor - Process DOCX document after conversion.
4
+
5
+ Processing Pipeline Position:
6
+ 1. DOCXFileConverter.convert() → docx.Document
7
+ 2. DOCXPreprocessor.preprocess() → PreprocessedData (THIS STEP)
8
+ 3. DOCXMetadataExtractor.extract() → DocumentMetadata
9
+ 4. Content extraction (paragraphs, tables, images)
10
+
11
+ Current Implementation:
12
+ - Pass-through (DOCX uses python-docx Document object directly)
13
+ """
14
+ import logging
15
+ from typing import Any, Dict
16
+
17
+ from xgen_doc2chunk.core.functions.preprocessor import (
18
+ BasePreprocessor,
19
+ PreprocessedData,
20
+ )
21
+
22
+ logger = logging.getLogger("xgen_doc2chunk.docx.preprocessor")
23
+
24
+
25
class DOCXPreprocessor(BasePreprocessor):
    """
    Preprocessing stage for DOCX documents.

    This is a pass-through: the converted document object is handed on
    unchanged, and a few summary values are collected into the metadata.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Wrap the converted DOCX document in a PreprocessedData container.

        Args:
            converted_data: docx.Document object from DOCXFileConverter
            **kwargs: Additional options (not used here)

        Returns:
            PreprocessedData whose raw_content and clean_content are both
            the given document, plus the collected metadata
        """
        info: Dict[str, Any] = {}

        # Title is recorded only when core properties expose a non-empty one.
        core_props = getattr(converted_data, 'core_properties', None)
        doc_title = getattr(core_props, 'title', None)
        if doc_title:
            info['title'] = doc_title

        # Element counts are recorded for whichever collections are present.
        for meta_key, attr_name in (
            ('paragraph_count', 'paragraphs'),
            ('table_count', 'tables'),
        ):
            if hasattr(converted_data, attr_name):
                info[meta_key] = len(getattr(converted_data, attr_name))

        logger.debug("DOCX preprocessor: pass-through, metadata=%s", info)

        # clean_content is the TRUE SOURCE - it carries the docx.Document
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,
            encoding="utf-8",
            extracted_resources={},
            metadata=info,
        )

    def get_format_name(self) -> str:
        """Return the display name of this preprocessor."""
        return "DOCX Preprocessor"

    def validate(self, data: Any) -> bool:
        """Check that data exposes both 'paragraphs' and 'tables' attributes."""
        required_attrs = ('paragraphs', 'tables')
        return all(hasattr(data, attr_name) for attr_name in required_attrs)
80
+
81
+
82
+ __all__ = ['DOCXPreprocessor']