xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,89 @@
# csv_helper/__init__.py
"""
CSV Helper module.

Provides the functional components used by csv_handler.py as separate modules.

Module layout:
- csv_constants: constants and data class definitions
- csv_metadata: metadata extraction and formatting
- csv_encoding: encoding detection and file reading
- csv_parser: CSV parsing, delimiter/header detection
- csv_table: table conversion (Markdown/HTML)
"""

# Constants
from xgen_doc2chunk.core.processor.csv_helper.csv_constants import (
    ENCODING_CANDIDATES,
    DELIMITER_CANDIDATES,
    DELIMITER_NAMES,
    MAX_ROWS,
    MAX_COLS,
    CSVMetadata,
)

# Metadata
from xgen_doc2chunk.core.processor.csv_helper.csv_metadata import (
    CSVMetadataExtractor,
    CSVSourceInfo,
)

# Image Processor
from xgen_doc2chunk.core.processor.csv_helper.csv_image_processor import (
    CSVImageProcessor,
)

# Encoding
from xgen_doc2chunk.core.processor.csv_helper.csv_encoding import (
    detect_bom,
    read_file_with_encoding,
)

# Parser
from xgen_doc2chunk.core.processor.csv_helper.csv_parser import (
    detect_delimiter,
    parse_csv_content,
    parse_csv_simple,
    detect_header,
    is_numeric,
)

# Table
from xgen_doc2chunk.core.processor.csv_helper.csv_table import (
    has_merged_cells,
    analyze_merge_info,
    convert_rows_to_table,
    convert_rows_to_markdown,
    convert_rows_to_html,
)

__all__ = [
    # Constants
    "ENCODING_CANDIDATES",
    "DELIMITER_CANDIDATES",
    "DELIMITER_NAMES",
    "MAX_ROWS",
    "MAX_COLS",
    "CSVMetadata",
    # Metadata
    "CSVMetadataExtractor",
    "CSVSourceInfo",
    # Image Processor
    "CSVImageProcessor",
    # Encoding
    # Encoding
    "detect_bom",
    "read_file_with_encoding",
    # Parser
    "detect_delimiter",
    "parse_csv_content",
    "parse_csv_simple",
    "detect_header",
    "is_numeric",
    # Table
    "has_merged_cells",
    "analyze_merge_info",
    "convert_rows_to_table",
    "convert_rows_to_markdown",
    "convert_rows_to_html",
]
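Since everything above is re-exported at the package level, callers can import the helpers from csv_helper directly instead of reaching into the submodules. A minimal sketch; the file path is a placeholder:

from xgen_doc2chunk.core.processor.csv_helper import (
    DELIMITER_NAMES,
    read_file_with_encoding,
)

content, encoding = read_file_with_encoding("data.csv")  # placeholder path
print(f"decoded with {encoding}; comma label: {DELIMITER_NAMES[',']}")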
@@ -0,0 +1,63 @@
# csv_helper/csv_constants.py
"""
CSV Handler constants and type definitions.

Defines the constants, configuration values, and data classes needed to process CSV/TSV files.
"""
from dataclasses import dataclass
from datetime import datetime
from typing import Optional


# === Encoding constants ===

# Encodings to try (in priority order)
ENCODING_CANDIDATES = [
    "utf-8",
    "utf-8-sig",   # UTF-8 with BOM
    "cp949",       # Korean Windows
    "euc-kr",      # Korean legacy
    "utf-16",
    "utf-16-le",
    "utf-16-be",
    "latin-1",     # fallback (accepts any byte)
    "iso-8859-1",
]


# === Delimiter constants ===

# CSV delimiter candidates
DELIMITER_CANDIDATES = [',', '\t', ';', '|']

# Delimiter name mapping (Korean labels for output display)
DELIMITER_NAMES = {
    ',': '쉼표 (,)',
    '\t': '탭 (\\t)',
    ';': '세미콜론 (;)',
    '|': '파이프 (|)',
}


# === Processing limit constants ===

# Maximum rows to process (memory protection)
MAX_ROWS = 100000

# Maximum columns
MAX_COLS = 1000


# === Data classes ===

@dataclass
class CSVMetadata:
    """CSV file metadata."""
    encoding: str
    delimiter: str
    has_header: bool
    row_count: int
    col_count: int
    file_size: int
    file_name: Optional[str] = None
    modified_time: Optional[datetime] = None
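The CSVMetadata dataclass above is a plain container for the detection results; a minimal construction sketch with made-up values standing in for a real detection pass:

from xgen_doc2chunk.core.processor.csv_helper.csv_constants import CSVMetadata

# Hypothetical values for illustration only.
meta = CSVMetadata(
    encoding="utf-8",
    delimiter=",",
    has_header=True,
    row_count=120,
    col_count=5,
    file_size=8192,
    file_name="sales.csv",
)
print(meta.encoding, meta.row_count)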
@@ -0,0 +1,104 @@
# csv_helper/csv_encoding.py
"""
CSV encoding detection and file reading.

Auto-detects a file's encoding and decodes it correctly,
using BOM detection, the chardet library, and heuristics.
"""
import logging
from typing import Optional, Tuple
import chardet

from xgen_doc2chunk.core.processor.csv_helper.csv_constants import ENCODING_CANDIDATES

logger = logging.getLogger("document-processor")


def detect_bom(data: bytes) -> Optional[str]:
    """
    Detect a BOM (Byte Order Mark).

    Args:
        data: Binary file data

    Returns:
        Detected encoding, or None
    """
    if data.startswith(b'\xef\xbb\xbf'):
        return 'utf-8-sig'
    elif data.startswith(b'\xff\xfe\x00\x00'):
        return 'utf-32-le'
    elif data.startswith(b'\x00\x00\xfe\xff'):
        return 'utf-32-be'
    elif data.startswith(b'\xff\xfe'):
        return 'utf-16-le'
    elif data.startswith(b'\xfe\xff'):
        return 'utf-16-be'
    return None


def read_file_with_encoding(
    file_path: str,
    preferred_encoding: str = None
) -> Tuple[str, str]:
    """
    Read a file and auto-detect its encoding.

    Detection order:
    1. Check for a BOM
    2. Try the preferred encoding (if given)
    3. Use the chardet library (if possible)
    4. Try the encoding candidates in order
    5. Fall back to latin-1 (always succeeds)

    Args:
        file_path: File path
        preferred_encoding: Preferred encoding (None for auto-detection)

    Returns:
        Tuple of (content, detected_encoding)
    """
    # Read as binary first
    with open(file_path, mode='rb') as f:
        raw_data = f.read()

    # Check for a BOM
    bom_encoding = detect_bom(raw_data)
    if bom_encoding:
        logger.debug(f"BOM detected: {bom_encoding}")
        try:
            return raw_data.decode(bom_encoding), bom_encoding
        except UnicodeDecodeError:
            pass

    # Try the preferred encoding
    if preferred_encoding:
        try:
            return raw_data.decode(preferred_encoding), preferred_encoding
        except UnicodeDecodeError:
            logger.debug(f"Preferred encoding {preferred_encoding} failed")

    # Use chardet
    detected = chardet.detect(raw_data[:10000])  # analyze only the first 10KB
    if detected and detected.get('encoding'):
        detected_enc = detected['encoding']
        confidence = detected.get('confidence', 0)
        logger.debug(f"chardet detected: {detected_enc} (confidence: {confidence})")

        if confidence > 0.7:
            try:
                return raw_data.decode(detected_enc), detected_enc
            except UnicodeDecodeError:
                pass

    # Try the encoding candidates
    for enc in ENCODING_CANDIDATES:
        try:
            content = raw_data.decode(enc)
            logger.debug(f"Successfully decoded with: {enc}")
            return content, enc
        except UnicodeDecodeError:
            continue

    # Last resort: latin-1 (always succeeds)
    return raw_data.decode('latin-1', errors='replace'), 'latin-1'
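A short usage sketch of the two functions above; the byte strings and file path are made up for illustration:

from xgen_doc2chunk.core.processor.csv_helper.csv_encoding import (
    detect_bom,
    read_file_with_encoding,
)

# BOM detection operates on raw bytes, before any decoding.
assert detect_bom(b"\xef\xbb\xbfname,age\n") == "utf-8-sig"
assert detect_bom(b"name,age\n") is None

# Full read with automatic detection; "data.csv" is a placeholder path.
content, encoding = read_file_with_encoding("data.csv")
print(f"decoded with {encoding}, {len(content)} characters")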
@@ -0,0 +1,78 @@
# xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py
"""
CSVFileConverter - CSV file format converter

Converts binary CSV data to text string with encoding detection.
"""
from typing import Any, Optional, BinaryIO, Tuple

from xgen_doc2chunk.core.functions.file_converter import TextFileConverter


class CSVFileConverter(TextFileConverter):
    """
    CSV file converter.

    Converts binary CSV data to decoded text string.
    Extends TextFileConverter with BOM detection.
    """

    # BOM markers
    BOM_UTF8 = b'\xef\xbb\xbf'
    BOM_UTF16_LE = b'\xff\xfe'
    BOM_UTF16_BE = b'\xfe\xff'

    def __init__(self):
        """Initialize CSVFileConverter."""
        super().__init__(encodings=['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'iso-8859-1', 'latin-1'])
        self._delimiter: Optional[str] = None

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        encoding: Optional[str] = None,
        delimiter: Optional[str] = None,
        **kwargs
    ) -> Tuple[str, str]:
        """
        Convert binary CSV data to text string.

        Args:
            file_data: Raw binary CSV data
            file_stream: Ignored
            encoding: Specific encoding to use
            delimiter: CSV delimiter (for reference)
            **kwargs: Additional options

        Returns:
            Tuple of (decoded text, detected encoding)
        """
        self._delimiter = delimiter

        # Check for BOM
        bom_encoding = self._detect_bom(file_data)
        if bom_encoding:
            text = file_data.decode(bom_encoding)
            self._detected_encoding = bom_encoding
            return text, bom_encoding

        # Use parent's convert logic
        text = super().convert(file_data, file_stream, encoding, **kwargs)
        return text, self._detected_encoding or 'utf-8'

    def _detect_bom(self, file_data: bytes) -> Optional[str]:
        """Detect encoding from BOM."""
        if file_data.startswith(self.BOM_UTF8):
            return 'utf-8-sig'
        elif file_data.startswith(self.BOM_UTF16_LE):
            return 'utf-16-le'
        elif file_data.startswith(self.BOM_UTF16_BE):
            return 'utf-16-be'
        return None

    def get_format_name(self) -> str:
        """Return format name."""
        enc = self._detected_encoding or 'unknown'
        return f"CSV ({enc})"
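A sketch of calling the converter on in-memory bytes. Note that convert() here returns a (text, encoding) tuple, while the parent TextFileConverter.convert() (not shown in this diff) is treated by the code above as returning only the text:

from xgen_doc2chunk.core.processor.csv_helper.csv_file_converter import CSVFileConverter

converter = CSVFileConverter()

raw = b"\xef\xbb\xbfname,score\nalice,10\n"   # UTF-8 BOM followed by CSV text
text, encoding = converter.convert(raw)

print(encoding)                       # 'utf-8-sig' (from the BOM branch)
print(converter.get_format_name())    # 'CSV (utf-8-sig)'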
@@ -0,0 +1,75 @@
# xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py
"""
CSV Image Processor

Provides CSV-specific image processing that inherits from ImageProcessor.
CSV files do not contain embedded images, so this is a minimal implementation.
"""
import logging
from typing import Any, Optional

from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend

logger = logging.getLogger("xgen_doc2chunk.image_processor.csv")


class CSVImageProcessor(ImageProcessor):
    """
    CSV-specific image processor.

    Inherits from ImageProcessor and provides CSV-specific processing.
    CSV files do not contain embedded images, so this processor
    provides a consistent interface without additional functionality.

    This class exists to maintain interface consistency across all handlers.

    Example:
        processor = CSVImageProcessor()

        # No images in CSV, but interface is consistent
        tag = processor.process_image(image_data)  # Falls back to base implementation
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize CSVImageProcessor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )

    def process_image(
        self,
        image_data: bytes,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save image data.

        CSV files do not contain embedded images, so this method
        delegates to the base implementation.

        Args:
            image_data: Raw image binary data
            **kwargs: Additional options

        Returns:
            Image tag string or None if processing failed
        """
        return super().process_image(image_data, **kwargs)
@@ -0,0 +1,168 @@
# xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py
"""
CSV Metadata Extraction Module

Provides CSVMetadataExtractor class for extracting metadata from CSV files.
Implements BaseMetadataExtractor interface.

CSV differs from regular documents - it provides file structure information as metadata:
- File name, file size, modification time
- Encoding, delimiter
- Row/column count, header information
"""
import logging
import os
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional

from xgen_doc2chunk.core.functions.metadata_extractor import (
    BaseMetadataExtractor,
    DocumentMetadata,
    MetadataFormatter,
)
from xgen_doc2chunk.core.processor.csv_helper.csv_constants import DELIMITER_NAMES

logger = logging.getLogger("document-processor")


def format_file_size(size_bytes: int) -> str:
    """
    Convert file size to human-readable format.

    Args:
        size_bytes: File size in bytes

    Returns:
        Formatted file size string (e.g., "1.5 MB")
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"
    elif size_bytes < 1024 * 1024:
        return f"{size_bytes / 1024:.1f} KB"
    elif size_bytes < 1024 * 1024 * 1024:
        return f"{size_bytes / (1024 * 1024):.1f} MB"
    else:
        return f"{size_bytes / (1024 * 1024 * 1024):.1f} GB"


def get_delimiter_name(delimiter: str) -> str:
    """
    Convert delimiter to human-readable name.

    Args:
        delimiter: Delimiter character

    Returns:
        Human-readable delimiter name (e.g., "Comma (,)")
    """
    return DELIMITER_NAMES.get(delimiter, repr(delimiter))


@dataclass
class CSVSourceInfo:
    """
    Source information for CSV metadata extraction.

    Container for data passed to CSVMetadataExtractor.extract().
    """
    file_path: str
    encoding: str
    delimiter: str
    rows: List[List[str]]
    has_header: bool


class CSVMetadataExtractor(BaseMetadataExtractor):
    """
    CSV Metadata Extractor.

    Extracts a CSV file's structural information as metadata.

    Supported fields (stored in the custom fields):
    - file_name, file_size, modified_time
    - encoding, delimiter
    - row_count, col_count, has_header, columns

    Usage:
        extractor = CSVMetadataExtractor()
        source = CSVSourceInfo(
            file_path="data.csv",
            encoding="utf-8",
            delimiter=",",
            rows=parsed_rows,
            has_header=True
        )
        metadata = extractor.extract(source)
        text = extractor.format(metadata)
    """

    # CSV-specific field labels (Korean display labels)
    CSV_FIELD_LABELS = {
        'file_name': '파일명',
        'file_size': '파일 크기',
        'modified_time': '수정일',
        'encoding': '인코딩',
        'delimiter': '구분자',
        'row_count': '행 수',
        'col_count': '열 수',
        'has_header': '헤더 존재',
        'columns': '컬럼 목록',
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Configure the custom formatter for CSV
        self._formatter.field_labels.update(self.CSV_FIELD_LABELS)

    def extract(self, source: CSVSourceInfo) -> DocumentMetadata:
        """
        Extract metadata from a CSV file.

        Args:
            source: CSVSourceInfo object (file path, encoding, delimiter, row data, header flag)

        Returns:
            DocumentMetadata instance containing the extracted metadata
        """
        custom_fields: Dict[str, Any] = {}

        try:
            # File information
            file_stat = os.stat(source.file_path)
            file_name = os.path.basename(source.file_path)

            custom_fields['file_name'] = file_name
            custom_fields['file_size'] = format_file_size(file_stat.st_size)
            custom_fields['modified_time'] = datetime.fromtimestamp(file_stat.st_mtime)

            # CSV structure information
            custom_fields['encoding'] = source.encoding
            custom_fields['delimiter'] = get_delimiter_name(source.delimiter)
            custom_fields['row_count'] = len(source.rows)
            custom_fields['col_count'] = len(source.rows[0]) if source.rows else 0
            custom_fields['has_header'] = '예' if source.has_header else '아니오'  # Korean 'yes'/'no' display values

            # Header information (if present)
            if source.has_header and source.rows:
                headers = [h.strip() for h in source.rows[0] if h.strip()]
                if headers:
                    custom_fields['columns'] = ', '.join(headers[:10])  # at most 10 shown
                    if len(source.rows[0]) > 10:
                        custom_fields['columns'] += f' 외 {len(source.rows[0]) - 10}개'  # Korean for "and N more"

            self.logger.debug(f"Extracted CSV metadata: {list(custom_fields.keys())}")

        except Exception as e:
            self.logger.warning(f"Failed to extract CSV metadata: {e}")

        # CSV has no standard fields; everything goes into custom fields
        return DocumentMetadata(custom=custom_fields)


__all__ = [
    'CSVMetadataExtractor',
    'CSVSourceInfo',
    'format_file_size',
    'get_delimiter_name',
]
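An end-to-end sketch tying the helpers in this diff together. Because csv_parser's function signatures are not shown in these hunks, the rows below come from the standard-library csv module instead, and the delimiter/header values are assumed rather than detected; the file path is a placeholder.

import csv
import io

from xgen_doc2chunk.core.processor.csv_helper.csv_encoding import read_file_with_encoding
from xgen_doc2chunk.core.processor.csv_helper.csv_metadata import (
    CSVMetadataExtractor,
    CSVSourceInfo,
)

# Decode the file, then split it into rows with the stdlib csv reader.
content, encoding = read_file_with_encoding("data.csv")
rows = list(csv.reader(io.StringIO(content)))  # stand-in for csv_parser's parsing step

source = CSVSourceInfo(
    file_path="data.csv",
    encoding=encoding,
    delimiter=",",      # assumed; csv_parser.detect_delimiter would normally decide this
    rows=rows,
    has_header=True,    # assumed; csv_parser.detect_header would normally decide this
)

extractor = CSVMetadataExtractor()
metadata = extractor.extract(source)
print(extractor.format(metadata))   # format() usage mirrors the class docstring above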