xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,89 @@
1
+ # csv_helper/__init__.py
2
+ """
3
+ CSV Helper 모듈
4
+
5
+ csv_handler.py에서 사용하는 기능적 구성요소들을 모듈화하여 제공합니다.
6
+
7
+ 모듈 구성:
8
+ - csv_constants: 상수 및 데이터 클래스 정의
9
+ - csv_metadata: 메타데이터 추출 및 포맷팅
10
+ - csv_encoding: 인코딩 감지 및 파일 읽기
11
+ - csv_parser: CSV 파싱, 구분자/헤더 감지
12
+ - csv_table: 테이블 변환 (Markdown/HTML)
13
+ """
14
+
15
+ # Constants
16
+ from xgen_doc2chunk.core.processor.csv_helper.csv_constants import (
17
+ ENCODING_CANDIDATES,
18
+ DELIMITER_CANDIDATES,
19
+ DELIMITER_NAMES,
20
+ MAX_ROWS,
21
+ MAX_COLS,
22
+ CSVMetadata,
23
+ )
24
+
25
+ # Metadata
26
+ from xgen_doc2chunk.core.processor.csv_helper.csv_metadata import (
27
+ CSVMetadataExtractor,
28
+ CSVSourceInfo,
29
+ )
30
+
31
+ # Image Processor
32
+ from xgen_doc2chunk.core.processor.csv_helper.csv_image_processor import (
33
+ CSVImageProcessor,
34
+ )
35
+
36
+ # Encoding
37
+ from xgen_doc2chunk.core.processor.csv_helper.csv_encoding import (
38
+ detect_bom,
39
+ read_file_with_encoding,
40
+ )
41
+
42
+ # Parser
43
+ from xgen_doc2chunk.core.processor.csv_helper.csv_parser import (
44
+ detect_delimiter,
45
+ parse_csv_content,
46
+ parse_csv_simple,
47
+ detect_header,
48
+ is_numeric,
49
+ )
50
+
51
+ # Table
52
+ from xgen_doc2chunk.core.processor.csv_helper.csv_table import (
53
+ has_merged_cells,
54
+ analyze_merge_info,
55
+ convert_rows_to_table,
56
+ convert_rows_to_markdown,
57
+ convert_rows_to_html,
58
+ )
59
+
60
# Names re-exported by the csv_helper package, grouped by the submodule
# they are imported from.
__all__ = [
    # Constants
    "ENCODING_CANDIDATES",
    "DELIMITER_CANDIDATES",
    "DELIMITER_NAMES",
    "MAX_ROWS",
    "MAX_COLS",
    "CSVMetadata",
    # Metadata
    "CSVMetadataExtractor",
    "CSVSourceInfo",
    # Image Processor
    "CSVImageProcessor",
    # Encoding
    "detect_bom",
    "read_file_with_encoding",
    # Parser
    "detect_delimiter",
    "parse_csv_content",
    "parse_csv_simple",
    "detect_header",
    "is_numeric",
    # Table
    "has_merged_cells",
    "analyze_merge_info",
    "convert_rows_to_table",
    "convert_rows_to_markdown",
    "convert_rows_to_html",
]
@@ -0,0 +1,63 @@
1
+ # csv_helper/csv_constants.py
2
+ """
3
+ CSV Handler 상수 및 타입 정의
4
+
5
+ CSV/TSV 파일 처리에 필요한 상수, 설정값, 데이터 클래스를 정의합니다.
6
+ """
7
+ from dataclasses import dataclass
8
+ from datetime import datetime
9
+ from typing import Optional
10
+
11
+
12
# === Encoding constants ===

# Encodings to try, in priority order.
ENCODING_CANDIDATES = [
    'utf-8',
    'utf-8-sig',   # UTF-8 with BOM
    'cp949',       # Korean (Windows)
    'euc-kr',      # Korean (legacy)
    'utf-16',
    'utf-16-le',
    'utf-16-be',
    'latin-1',     # Fallback (accepts every byte)
    'iso-8859-1',
]


# === Delimiter constants ===

# CSV delimiter candidates
DELIMITER_CANDIDATES = [",", "\t", ";", "|"]

# Delimiter name mapping (Korean for output display)
DELIMITER_NAMES = {
    ",": "쉼표 (,)",
    "\t": "탭 (\\t)",
    ";": "세미콜론 (;)",
    "|": "파이프 (|)",
}


# === Processing limit constants ===

# Maximum rows to process (memory protection)
MAX_ROWS = 100_000

# Maximum columns
MAX_COLS = 1_000
49
+
50
+
51
+ # === 데이터 클래스 ===
52
+
53
@dataclass
class CSVMetadata:
    """Metadata describing a CSV file."""

    # Required: how the file was decoded and parsed, and its dimensions.
    encoding: str
    delimiter: str
    has_header: bool
    row_count: int
    col_count: int
    file_size: int

    # Optional: file-system details, absent unless supplied by the caller.
    file_name: Optional[str] = None
    modified_time: Optional[datetime] = None
@@ -0,0 +1,104 @@
1
+ # csv_helper/csv_encoding.py
2
+ """
3
+ CSV 인코딩 감지 및 파일 읽기
4
+
5
+ 파일의 인코딩을 자동 감지하고 올바르게 디코딩합니다.
6
+ BOM 감지, chardet 라이브러리, 휴리스틱 방식을 사용합니다.
7
+ """
8
+ import logging
9
+ from typing import Optional, Tuple
10
+ import chardet
11
+
12
+ from xgen_doc2chunk.core.processor.csv_helper.csv_constants import ENCODING_CANDIDATES
13
+
14
+ logger = logging.getLogger("document-processor")
15
+
16
+
17
def detect_bom(data: bytes) -> Optional[str]:
    """
    Detect a BOM (Byte Order Mark) at the start of the data.

    Args:
        data: Binary contents of the file

    Returns:
        The encoding name indicated by the BOM, or None if no BOM is found
    """
    # Order matters: the UTF-32-LE mark starts with the same two bytes as the
    # UTF-16-LE mark, so the 4-byte signatures are tested before the 2-byte ones.
    signatures = (
        (b'\xef\xbb\xbf', 'utf-8-sig'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    for marker, encoding_name in signatures:
        if data.startswith(marker):
            return encoding_name
    return None
38
+
39
+
40
def _decode_or_none(raw_data: bytes, encoding: str) -> Optional[str]:
    """Decode raw_data with encoding; return None if the bytes are invalid or the codec name is unknown."""
    try:
        return raw_data.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # LookupError: the codec name (from the caller or from chardet) is not
        # one Python knows. Treat it like a failed decode so detection continues.
        return None


def read_file_with_encoding(
    file_path: str,
    preferred_encoding: Optional[str] = None
) -> Tuple[str, str]:
    """
    Read a file and detect its encoding automatically.

    Detection order:
    1. BOM check
    2. Preferred encoding (if given)
    3. chardet detection (used only when confidence is above 0.7)
    4. Each entry of ENCODING_CANDIDATES in order
    5. latin-1 fallback (always succeeds)

    Args:
        file_path: Path of the file to read
        preferred_encoding: Encoding to try first (None means auto-detect)

    Returns:
        (content, detected_encoding) tuple

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read as binary first
    with open(file_path, mode='rb') as f:
        raw_data = f.read()

    # 1. BOM check
    bom_encoding = detect_bom(raw_data)
    if bom_encoding:
        logger.debug(f"BOM detected: {bom_encoding}")
        content = _decode_or_none(raw_data, bom_encoding)
        if content is not None:
            # Only 'utf-8-sig' removes the mark itself; the explicit-endian
            # UTF-16/UTF-32 codecs keep it as a leading U+FEFF, which would
            # otherwise end up inside the first CSV cell.
            if content.startswith('\ufeff'):
                content = content[1:]
            return content, bom_encoding

    # 2. Preferred encoding
    if preferred_encoding:
        content = _decode_or_none(raw_data, preferred_encoding)
        if content is not None:
            return content, preferred_encoding
        logger.debug(f"Preferred encoding {preferred_encoding} failed")

    # 3. chardet (only the first 10000 bytes are analysed)
    detected = chardet.detect(raw_data[:10000])
    if detected and detected.get('encoding'):
        detected_enc = detected['encoding']
        # Guard against a missing/None confidence so the comparison cannot fail.
        confidence = detected.get('confidence') or 0
        logger.debug(f"chardet detected: {detected_enc} (confidence: {confidence})")

        if confidence > 0.7:
            content = _decode_or_none(raw_data, detected_enc)
            if content is not None:
                return content, detected_enc

    # 4. Candidate encodings, in priority order
    for enc in ENCODING_CANDIDATES:
        content = _decode_or_none(raw_data, enc)
        if content is not None:
            logger.debug(f"Successfully decoded with: {enc}")
            return content, enc

    # 5. Last resort: latin-1 (always succeeds)
    return raw_data.decode('latin-1', errors='replace'), 'latin-1'
@@ -0,0 +1,78 @@
1
+ # xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py
2
+ """
3
+ CSVFileConverter - CSV file format converter
4
+
5
+ Converts binary CSV data to text string with encoding detection.
6
+ """
7
+ from typing import Any, Optional, BinaryIO, Tuple
8
+
9
+ from xgen_doc2chunk.core.functions.file_converter import TextFileConverter
10
+
11
+
12
class CSVFileConverter(TextFileConverter):
    """
    CSV file converter.

    Converts binary CSV data to decoded text string.
    Extends TextFileConverter with BOM detection.
    """

    # BOM markers
    BOM_UTF8 = b'\xef\xbb\xbf'
    BOM_UTF32_LE = b'\xff\xfe\x00\x00'
    BOM_UTF32_BE = b'\x00\x00\xfe\xff'
    BOM_UTF16_LE = b'\xff\xfe'
    BOM_UTF16_BE = b'\xfe\xff'

    def __init__(self):
        """Initialize CSVFileConverter."""
        super().__init__(encodings=['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'iso-8859-1', 'latin-1'])
        self._delimiter: Optional[str] = None

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        encoding: Optional[str] = None,
        delimiter: Optional[str] = None,
        **kwargs
    ) -> Tuple[str, str]:
        """
        Convert binary CSV data to text string.

        A BOM, when present, decides the encoding and takes precedence over
        the ``encoding`` argument; otherwise decoding is delegated to the
        parent converter.

        Args:
            file_data: Raw binary CSV data
            file_stream: Not used by this class; forwarded to the parent converter
            encoding: Specific encoding to use (forwarded to the parent converter)
            delimiter: CSV delimiter (stored for reference only)
            **kwargs: Additional options forwarded to the parent converter

        Returns:
            Tuple of (decoded text, detected encoding)
        """
        self._delimiter = delimiter

        # Check for BOM
        bom_encoding = self._detect_bom(file_data)
        if bom_encoding:
            try:
                text = file_data.decode(bom_encoding)
            except UnicodeDecodeError:
                # The leading bytes looked like a BOM but the payload is not
                # valid in that encoding; fall through to the parent's logic
                # instead of failing the whole conversion.
                text = None
            if text is not None:
                # Only 'utf-8-sig' strips the mark itself; the explicit-endian
                # codecs leave U+FEFF at the start of the text.
                if text.startswith('\ufeff'):
                    text = text[1:]
                self._detected_encoding = bom_encoding
                return text, bom_encoding

        # Use parent's convert logic
        text = super().convert(file_data, file_stream, encoding, **kwargs)
        return text, self._detected_encoding or 'utf-8'

    def _detect_bom(self, file_data: bytes) -> Optional[str]:
        """
        Detect encoding from BOM.

        The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 marks
        because the UTF-32-LE mark begins with the UTF-16-LE mark.

        Returns:
            Encoding name indicated by the BOM, or None if there is no BOM
        """
        signatures = (
            (self.BOM_UTF8, 'utf-8-sig'),
            (self.BOM_UTF32_LE, 'utf-32-le'),
            (self.BOM_UTF32_BE, 'utf-32-be'),
            (self.BOM_UTF16_LE, 'utf-16-le'),
            (self.BOM_UTF16_BE, 'utf-16-be'),
        )
        for marker, encoding_name in signatures:
            if file_data.startswith(marker):
                return encoding_name
        return None

    def get_format_name(self) -> str:
        """Return format name, including the detected encoding ('unknown' if none is set)."""
        enc = self._detected_encoding or 'unknown'
        return f"CSV ({enc})"
78
+
@@ -0,0 +1,75 @@
1
+ # xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py
2
+ """
3
+ CSV Image Processor
4
+
5
+ Provides CSV-specific image processing that inherits from ImageProcessor.
6
+ CSV files do not contain embedded images, so this is a minimal implementation.
7
+ """
8
+ import logging
9
+ from typing import Any, Optional
10
+
11
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
12
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
13
+
14
+ logger = logging.getLogger("xgen_doc2chunk.image_processor.csv")
15
+
16
+
17
class CSVImageProcessor(ImageProcessor):
    """
    Image processor for the CSV handler.

    CSV files do not contain embedded images, so this subclass adds no
    behavior of its own: construction and image processing are both passed
    straight to ImageProcessor. It exists so that every handler exposes the
    same image-processor interface.

    Example:
        processor = CSVImageProcessor()

        # No images in CSV, but interface is consistent
        tag = processor.process_image(image_data)  # Falls back to base implementation
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize CSVImageProcessor.

        All arguments are forwarded unchanged to ImageProcessor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        base_options = {
            "directory_path": directory_path,
            "tag_prefix": tag_prefix,
            "tag_suffix": tag_suffix,
            "storage_backend": storage_backend,
        }
        super().__init__(**base_options)

    def process_image(
        self,
        image_data: bytes,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save image data via the base implementation.

        Args:
            image_data: Raw image binary data
            **kwargs: Additional options passed through to ImageProcessor

        Returns:
            Image tag string or None if processing failed
        """
        image_tag = super().process_image(image_data, **kwargs)
        return image_tag
@@ -0,0 +1,168 @@
1
+ # xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py
2
+ """
3
+ CSV Metadata Extraction Module
4
+
5
+ Provides CSVMetadataExtractor class for extracting metadata from CSV files.
6
+ Implements BaseMetadataExtractor interface.
7
+
8
+ CSV differs from regular documents - it provides file structure information as metadata:
9
+ - File name, file size, modification time
10
+ - Encoding, delimiter
11
+ - Row/column count, header information
12
+ """
13
+ import logging
14
+ import os
15
+ from dataclasses import dataclass
16
+ from datetime import datetime
17
+ from typing import Any, Dict, List, Optional
18
+
19
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
20
+ BaseMetadataExtractor,
21
+ DocumentMetadata,
22
+ MetadataFormatter,
23
+ )
24
+ from xgen_doc2chunk.core.processor.csv_helper.csv_constants import DELIMITER_NAMES
25
+
26
+ logger = logging.getLogger("document-processor")
27
+
28
+
29
def format_file_size(size_bytes: int) -> str:
    """
    Render a byte count as a human-readable size.

    Sizes below 1024 are shown as a plain byte count; larger sizes are
    scaled by powers of 1024 and shown with one decimal place.

    Args:
        size_bytes: File size in bytes

    Returns:
        Formatted file size string (e.g., "1.5 MB")
    """
    kib = 1024
    if size_bytes < kib:
        return f"{size_bytes} B"

    # Each unit applies while the size is below the next power of 1024.
    for power, unit in ((1, "KB"), (2, "MB")):
        if size_bytes < kib ** (power + 1):
            return f"{size_bytes / kib ** power:.1f} {unit}"

    # Everything from 1024**3 upwards is reported in GB.
    return f"{size_bytes / kib ** 3:.1f} GB"
47
+
48
+
49
def get_delimiter_name(delimiter: str) -> str:
    """
    Look up the display name of a delimiter.

    Args:
        delimiter: Delimiter character

    Returns:
        The DELIMITER_NAMES entry for the delimiter, or repr(delimiter)
        when the delimiter has no entry
    """
    if delimiter in DELIMITER_NAMES:
        return DELIMITER_NAMES[delimiter]
    # Unknown delimiter: repr() keeps control characters such as tabs visible.
    return repr(delimiter)
60
+
61
+
62
@dataclass
class CSVSourceInfo:
    """
    Input bundle for CSVMetadataExtractor.extract().

    Groups the file path with the results of decoding and parsing so the
    extractor receives everything in a single argument.
    """

    file_path: str          # path of the CSV file
    encoding: str           # encoding the content was decoded with
    delimiter: str          # delimiter used when parsing
    rows: List[List[str]]   # parsed rows, each a list of cell strings
    has_header: bool        # whether the first row is a header row
74
+
75
+
76
class CSVMetadataExtractor(BaseMetadataExtractor):
    """
    CSV Metadata Extractor.

    Extracts structural information about a CSV file as metadata.

    Supported fields (stored in the custom fields):
    - file_name, file_size, modified_time
    - encoding, delimiter
    - row_count, col_count, has_header, columns

    Usage:
        extractor = CSVMetadataExtractor()
        source = CSVSourceInfo(
            file_path="data.csv",
            encoding="utf-8",
            delimiter=",",
            rows=parsed_rows,
            has_header=True
        )
        metadata = extractor.extract(source)
        text = extractor.format(metadata)
    """

    # Display labels for the CSV-specific fields
    CSV_FIELD_LABELS = {
        'file_name': '파일명',
        'file_size': '파일 크기',
        'modified_time': '수정일',
        'encoding': '인코딩',
        'delimiter': '구분자',
        'row_count': '행 수',
        'col_count': '열 수',
        'has_header': '헤더 존재',
        'columns': '컬럼 목록',
    }

    # Maximum number of header names listed in the 'columns' field
    _MAX_LISTED_COLUMNS = 10

    def __init__(self, **kwargs):
        """Initialize the extractor and register the CSV field labels on its formatter."""
        super().__init__(**kwargs)
        # Add the CSV-specific labels to the formatter
        self._formatter.field_labels.update(self.CSV_FIELD_LABELS)

    def extract(self, source: CSVSourceInfo) -> DocumentMetadata:
        """
        Extract metadata from a CSV file.

        Extraction is best-effort: a failure is logged as a warning and the
        fields collected so far are still returned. File-system fields and
        structure fields are collected independently, so an unreadable path
        does not discard the structure information.

        Args:
            source: CSVSourceInfo object (file path, encoding, delimiter,
                row data, header flag)

        Returns:
            DocumentMetadata instance holding the extracted fields in ``custom``
        """
        custom_fields: Dict[str, Any] = {}

        # File information (requires access to the file on disk)
        try:
            file_stat = os.stat(source.file_path)
            custom_fields['file_name'] = os.path.basename(source.file_path)
            custom_fields['file_size'] = format_file_size(file_stat.st_size)
            custom_fields['modified_time'] = datetime.fromtimestamp(file_stat.st_mtime)
        except Exception as e:
            self.logger.warning(f"Failed to read CSV file info: {e}")

        # CSV structure information (derived from the already-parsed rows)
        try:
            rows = source.rows
            first_row = rows[0] if rows else []

            custom_fields['encoding'] = source.encoding
            custom_fields['delimiter'] = get_delimiter_name(source.delimiter)
            custom_fields['row_count'] = len(rows)
            custom_fields['col_count'] = len(first_row)
            custom_fields['has_header'] = '예' if source.has_header else '아니오'

            # Header names (only when a header row exists)
            if source.has_header and first_row:
                headers = [h.strip() for h in first_row if h.strip()]
                if headers:
                    columns = ', '.join(headers[:self._MAX_LISTED_COLUMNS])
                    # Count the remainder from the filtered header list, so
                    # blank header cells are not reported as hidden columns.
                    hidden = len(headers) - self._MAX_LISTED_COLUMNS
                    if hidden > 0:
                        columns += f' 외 {hidden}개'
                    custom_fields['columns'] = columns

            self.logger.debug(f"Extracted CSV metadata: {list(custom_fields.keys())}")

        except Exception as e:
            self.logger.warning(f"Failed to extract CSV metadata: {e}")

        # CSV has no standard fields; everything goes into the custom fields
        return DocumentMetadata(custom=custom_fields)
161
+
162
+
163
# Public names of this module: the extractor, its input container, and the
# two formatting helpers.
__all__ = [
    'CSVMetadataExtractor',
    'CSVSourceInfo',
    'format_file_size',
    'get_delimiter_name',
]