xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,92 @@
1
+ # xgen_doc2chunk/core/processor/html_helper/html_file_converter.py
2
+ """
3
+ HTMLFileConverter - HTML file format converter
4
+
5
+ Converts binary HTML data to BeautifulSoup object.
6
+ """
7
+ from typing import Any, Optional, BinaryIO
8
+
9
+ from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
10
+
11
+
12
class HTMLFileConverter(BaseFileConverter):
    """
    HTML file converter using BeautifulSoup.

    Decodes raw HTML bytes to text, trying a list of candidate encodings,
    and parses that text into a BeautifulSoup object.
    """

    # Candidate encodings tried in order when no explicit encoding works.
    # 'latin-1' maps every byte value, so it always succeeds and is the
    # effective last resort of this list.
    DEFAULT_ENCODINGS = ['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'latin-1']

    # UTF-8 byte order mark. The plain 'utf-8' codec keeps it as a leading
    # U+FEFF character, so data starting with it is decoded with 'utf-8-sig'.
    _UTF8_BOM = b'\xef\xbb\xbf'

    def __init__(self, parser: str = 'html.parser'):
        """
        Initialize HTMLFileConverter.

        Args:
            parser: BeautifulSoup parser to use
        """
        self._parser = parser
        # Encoding used by the most recent successful _decode() call.
        self._detected_encoding: Optional[str] = None

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        encoding: Optional[str] = None,
        **kwargs
    ) -> Any:
        """
        Convert binary HTML data to BeautifulSoup object.

        Args:
            file_data: Raw binary HTML data
            file_stream: Ignored
            encoding: Specific encoding to try first
            **kwargs: Additional options (unused here)

        Returns:
            BeautifulSoup object
        """
        # Imported lazily so bs4 is only required when a conversion runs.
        from bs4 import BeautifulSoup

        # Decode to text first
        text = self._decode(file_data, encoding)
        return BeautifulSoup(text, self._parser)

    def _decode(self, file_data: bytes, encoding: Optional[str] = None) -> str:
        """
        Decode bytes to string.

        Tries, in order: the explicit ``encoding`` (if given), 'utf-8-sig'
        when the data starts with a UTF-8 BOM, then DEFAULT_ENCODINGS.
        The encoding that succeeded is stored in ``_detected_encoding``.

        Args:
            file_data: Raw bytes to decode
            encoding: Preferred encoding; skipped if it fails or is unknown

        Returns:
            Decoded text
        """
        candidates = []
        if encoding:
            candidates.append(encoding)
        if file_data.startswith(self._UTF8_BOM):
            # Strip the BOM instead of leaking U+FEFF into the parsed text.
            candidates.append('utf-8-sig')
        candidates.extend(self.DEFAULT_ENCODINGS)

        for enc in candidates:
            try:
                text = file_data.decode(enc)
            except (UnicodeDecodeError, LookupError):
                # LookupError: unknown codec name (e.g. a bad caller-supplied
                # encoding). Treat it like a failed decode and move on.
                continue
            # Record the encoding only once decoding actually succeeded.
            self._detected_encoding = enc
            return text

        # Fallback: only reachable if a subclass overrides DEFAULT_ENCODINGS
        # with a list in which every candidate can fail.
        self._detected_encoding = 'utf-8'
        return file_data.decode('utf-8', errors='replace')

    def get_format_name(self) -> str:
        """Return format name."""
        return "HTML Document"

    def validate(self, file_data: bytes) -> bool:
        """
        Validate if data appears to be HTML.

        Only the first 100 bytes are inspected (case-insensitively) for a
        doctype or an opening html/head/body tag.
        """
        if not file_data:
            return False

        header = file_data[:100].lower()
        return (
            b'<!doctype' in header or
            b'<html' in header or
            b'<head' in header or
            b'<body' in header
        )
92
+
@@ -0,0 +1,74 @@
1
+ # xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py
2
+ """
3
+ HTML Preprocessor - Process HTML content after conversion.
4
+
5
+ Processing Pipeline Position:
6
+ 1. HTMLFileConverter.convert() → BeautifulSoup
7
+ 2. HTMLPreprocessor.preprocess() → PreprocessedData (THIS STEP)
8
+ 3. Content extraction
9
+
10
+ Current Implementation:
11
+ - Pass-through (HTML uses BeautifulSoup object directly)
12
+ """
13
+ import logging
14
+ from typing import Any, Dict
15
+
16
+ from xgen_doc2chunk.core.functions.preprocessor import (
17
+ BasePreprocessor,
18
+ PreprocessedData,
19
+ )
20
+
21
+ logger = logging.getLogger("xgen_doc2chunk.html.preprocessor")
22
+
23
+
24
class HTMLPreprocessor(BasePreprocessor):
    """
    Preprocessor stage for HTML content.

    Acts as a pass-through: the parsed BeautifulSoup object is handed on
    unchanged, accompanied by a few element counts as metadata.
    """

    def preprocess(self, converted_data: Any, **kwargs) -> PreprocessedData:
        """
        Wrap the converted HTML content in a PreprocessedData record.

        Args:
            converted_data: BeautifulSoup object from HTMLFileConverter
            **kwargs: Additional options (unused here)

        Returns:
            PreprocessedData whose ``extracted_resources["soup"]`` holds
            ``converted_data``; raw and clean content are empty bytes.
        """
        element_counts: Dict[str, Any] = {}

        # Counts are gathered only for objects exposing find_all().
        if hasattr(converted_data, 'find_all'):
            counted_tags = (
                ('table_count', 'table'),
                ('image_count', 'img'),
                ('link_count', 'a'),
            )
            for key, tag_name in counted_tags:
                element_counts[key] = len(converted_data.find_all(tag_name))

        logger.debug("HTML preprocessor: pass-through, metadata=%s", element_counts)

        resources = {"soup": converted_data}
        return PreprocessedData(
            raw_content=b"",
            clean_content=b"",
            encoding="utf-8",
            extracted_resources=resources,
            metadata=element_counts,
        )

    def get_format_name(self) -> str:
        """Return the name of this preprocessor."""
        return "HTML Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True when data offers both find_all and get_text."""
        required = ('find_all', 'get_text')
        return all(hasattr(data, attr) for attr in required)
72
+
73
+
74
+ __all__ = ['HTMLPreprocessor']
@@ -0,0 +1,140 @@
1
+ from bs4 import BeautifulSoup
2
+ import os
3
+ from pathlib import Path
4
+
5
def clean_html_file(html_content, output_file_path=None):
    """
    Strip styling from HTML markup, keeping text and tables.

    Args:
        html_content: HTML markup to clean (passed to BeautifulSoup).
        output_file_path: Accepted but not used by this function.

    Returns:
        The cleaned HTML as a single string, or None if any exception
        occurred (the error message is printed).
    """
    try:
        import re

        # Presentation-only attributes stripped from every tag.
        style_attrs = ('style', 'class', 'id', 'width', 'height',
                       'bgcolor', 'color', 'font-family', 'font-size',
                       'margin', 'padding', 'border', 'background',
                       'face', 'size', 'align', 'lang')

        soup = BeautifulSoup(html_content, 'html.parser')

        # 1. Drop unwanted tags together with their content.
        print("🧹 불필요한 태그 제거 중...")
        for unwanted in ('script', 'style', 'link', 'meta', 'noscript', 'iframe', 'img'):
            for node in soup.find_all(unwanted):
                node.decompose()

        # 2. Remove style-related attributes from all remaining tags.
        print("✨ 스타일 속성 제거 중...")
        for node in soup.find_all(True):
            for attr_name in style_attrs:
                if not node.has_attr(attr_name):
                    continue
                del node[attr_name]

        # 3. Expand merged table cells and fill empty ones.
        print("📊 테이블 병합 셀 처리 중...")
        for tbl in soup.find_all('table'):
            _process_table_merged_cells(tbl, soup)

        # 4. Remove tags that have neither text nor child tags,
        #    except for br / hr / img.
        print("🗑️ 빈 태그 제거 중...")
        for node in soup.find_all():
            if node.get_text(strip=True):
                continue
            if node.find_all():
                continue
            if node.name in ['br', 'hr', 'img']:
                continue
            node.decompose()

        # 5. Unwrap pure formatting tags: the tag goes, its content stays.
        for fmt_name in ('font', 'u', 'b'):
            for node in soup.find_all(fmt_name):
                node.unwrap()

        # 6. Serialize without prettify().
        markup = str(soup)

        # 7. Collapse whitespace runs to one space, then drop whitespace
        #    that sits between two tags.
        markup = re.sub(r'\s+', ' ', markup)
        markup = re.sub(r'>\s+<', '><', markup)

        # Strip attribute-less <p> and <span> wrappers from the markup.
        for token in ('<p>', '</p>', '</span>', '<span>'):
            markup = markup.replace(token, '')

        return markup

    except Exception as e:
        print(f"❌ 오류 발생: {str(e)}")
        return None
62
+
63
def _process_table_merged_cells(table, soup):
    """
    Expand merged cells of a table in place and fill empty cells with 'None'.

    The table is read into a rectangular grid in which every cell covered by
    a colspan/rowspan repeats the merged cell's text. All existing <tr>
    elements are then removed and replaced by new rows built from the grid.
    Every rebuilt cell is a <td> (header cells are not kept as <th>), and
    cells whose text is empty or '-' become the string 'None'.

    Args:
        table: Table element providing find_all() and append().
        soup: Object providing new_tag(), used to create the new rows/cells.

    Returns:
        None. The table is modified in place; a table without rows is left
        untouched.
    """
    rows = table.find_all('tr')
    if not rows:
        return

    # One grid row per <tr>, allocated up front. Appending a fresh row for
    # every <tr> while rowspans had already created that row left phantom
    # all-'None' rows at the bottom of the rebuilt table.
    grid = [[] for _ in rows]

    # Step 1: lay the existing cells out on the grid.
    for row_idx, row in enumerate(rows):
        current = grid[row_idx]
        col_idx = 0

        for cell in row.find_all(['td', 'th']):
            # Skip columns already occupied by a rowspan from a row above.
            while col_idx < len(current) and current[col_idx] is not None:
                col_idx += 1

            cell_text = cell.get_text(strip=True)
            if not cell_text or cell_text == '-':
                cell_text = 'None'

            colspan = _parse_span(cell.get('colspan', 1))
            rowspan = _parse_span(cell.get('rowspan', 1))

            for target_row_idx in range(row_idx, row_idx + rowspan):
                # A rowspan reaching past the last <tr> still adds rows.
                while len(grid) <= target_row_idx:
                    grid.append([])

                target = grid[target_row_idx]
                missing = col_idx + colspan - len(target)
                if missing > 0:
                    target.extend([None] * missing)

                # Repeat the text over the whole merged area.
                target[col_idx:col_idx + colspan] = [cell_text] * colspan

            col_idx += colspan

    # Step 2: make the grid rectangular and replace unfilled slots.
    max_cols = max(len(grid_row) for grid_row in grid)
    for grid_row in grid:
        grid_row.extend(['None'] * (max_cols - len(grid_row)))
        grid_row[:] = ['None' if value is None else value for value in grid_row]

    # Step 3: swap the old rows for rows built from the grid.
    for row in rows:
        row.decompose()

    for grid_row in grid:
        new_row = soup.new_tag('tr')
        for cell_text in grid_row:
            new_cell = soup.new_tag('td')
            new_cell.string = cell_text
            new_row.append(new_cell)
        table.append(new_row)


def _parse_span(value):
    """Return a colspan/rowspan attribute as an int >= 1 (1 when invalid)."""
    try:
        span = int(value)
    except (TypeError, ValueError):
        # Malformed attribute such as colspan="" or colspan="2px".
        return 1
    # Zero or negative spans would drop the cell; treat them as a single cell.
    return max(span, 1)