xgen-doc2chunk 0.1.5__tar.gz → 0.1.52__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/PKG-INFO +1 -1
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/pyproject.toml +1 -1
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +9 -1
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +5 -5
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +73 -5
- xgen_doc2chunk-0.1.52/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +1194 -0
- xgen_doc2chunk-0.1.5/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +0 -655
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/.gitignore +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/LICENSE +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/README.md +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/chunking.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/constants.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/page_chunker.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/protected_regions.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/sheet_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/table_chunker.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/table_parser.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/text_chunker.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/document_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/chart_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/img_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/metadata_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/page_tag_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/storage_backend.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/table_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/table_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/utils.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/base_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_table.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/doc_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/doc_helpers/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_image.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/html_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/html_reprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/image_file_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/image_file_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/types.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/text_handler.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/text_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/base.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/__init__.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +0 -0
- {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_processor.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xgen-doc2chunk
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.1.52
|
|
4
4
|
Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
|
|
5
5
|
Project-URL: Homepage, https://github.com/master0419/doc2chunk
|
|
6
6
|
Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "xgen-doc2chunk"
|
|
7
|
-
version = "0.1.5"
|
|
7
|
+
version = "0.1.52"
|
|
8
8
|
description = "Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.12"
|
|
@@ -25,6 +25,9 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
|
|
|
25
25
|
)
|
|
26
26
|
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import TableDetectionEngine
|
|
27
27
|
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_cell_analysis import CellAnalysisEngine
|
|
28
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import (
|
|
29
|
+
apply_cjk_compat_mapping,
|
|
30
|
+
)
|
|
28
31
|
|
|
29
32
|
logger = logging.getLogger("document-processor")
|
|
30
33
|
|
|
@@ -873,7 +876,12 @@ def generate_html_from_cells(
|
|
|
873
876
|
content = ""
|
|
874
877
|
if col_idx < len(row_data):
|
|
875
878
|
content = row_data[col_idx]
|
|
876
|
-
|
|
879
|
+
|
|
880
|
+
# Apply CJK Compatibility character mapping to fix broken characters
|
|
881
|
+
# (e.g., 㛳→→, ㏙→(, ㏚→) etc. from Word→PDF conversion)
|
|
882
|
+
content = str(content).strip() if content else ""
|
|
883
|
+
content = apply_cjk_compat_mapping(content)
|
|
884
|
+
content = escape_html(content)
|
|
877
885
|
|
|
878
886
|
# Get span info (default to 1 if not found)
|
|
879
887
|
spans = span_map.get((row_idx, col_idx), {'rowspan': 1, 'colspan': 1})
|
|
@@ -383,11 +383,11 @@ class TableQualityValidator:
|
|
|
383
383
|
# if num_rows > 5 and col2_has_paragraphs >= 2:
|
|
384
384
|
# return False, f"col2_paragraphs({col2_has_paragraphs})"
|
|
385
385
|
|
|
386
|
-
# Pattern 3: If first column is short and second is long overall, likely body text not key-value
|
|
387
|
-
if num_rows > 10:
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
386
|
+
# # Pattern 3: If first column is short and second is long overall, likely body text not key-value
|
|
387
|
+
# if num_rows > 10:
|
|
388
|
+
# col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
|
|
389
|
+
# if col1_short_ratio >= 0.8 and col2_long_count >= 5:
|
|
390
|
+
# return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
|
|
391
391
|
|
|
392
392
|
return True, "valid"
|
|
393
393
|
|
|
@@ -3,6 +3,9 @@
|
|
|
3
3
|
PDF Text Extraction Module
|
|
4
4
|
|
|
5
5
|
Provides functions for extracting text blocks from PDF pages.
|
|
6
|
+
Includes support for:
|
|
7
|
+
- Fragmented text reconstruction (Word->PDF conversion issues)
|
|
8
|
+
- CJK Compatibility character mapping (broken character fixes)
|
|
6
9
|
"""
|
|
7
10
|
import logging
|
|
8
11
|
from typing import List, Tuple
|
|
@@ -17,6 +20,8 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import
|
|
|
17
20
|
TextQualityAnalyzer,
|
|
18
21
|
QualityAwareTextExtractor,
|
|
19
22
|
PageOCRFallbackEngine,
|
|
23
|
+
FragmentedTextReconstructor,
|
|
24
|
+
apply_cjk_compat_mapping,
|
|
20
25
|
)
|
|
21
26
|
|
|
22
27
|
logger = logging.getLogger("document-processor")
|
|
@@ -53,13 +58,76 @@ def extract_text_blocks(
|
|
|
53
58
|
analyzer = TextQualityAnalyzer(page, page_num)
|
|
54
59
|
page_analysis = analyzer.analyze_page()
|
|
55
60
|
|
|
56
|
-
# If quality is
|
|
61
|
+
# If quality is low, try text reconstruction first (before OCR)
|
|
57
62
|
if page_analysis.quality_result.needs_ocr:
|
|
63
|
+
quality_result = page_analysis.quality_result
|
|
58
64
|
logger.info(
|
|
59
|
-
f"[PDF] Page {page_num + 1}: Low text quality "
|
|
60
|
-
f"
|
|
61
|
-
f"PUA={
|
|
62
|
-
f"
|
|
65
|
+
f"[PDF] Page {page_num + 1}: Low text quality detected - "
|
|
66
|
+
f"score={quality_result.quality_score:.2f}, "
|
|
67
|
+
f"PUA={quality_result.pua_count}, "
|
|
68
|
+
f"CJK_Compat={quality_result.cjk_compat_count}, "
|
|
69
|
+
f"fragmented={quality_result.is_fragmented}"
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
# Try reconstruction for fragmented text or CJK Compat issues
|
|
73
|
+
if quality_result.is_fragmented or quality_result.cjk_compat_count > 0:
|
|
74
|
+
logger.info(
|
|
75
|
+
f"[PDF] Page {page_num + 1}: Attempting text reconstruction "
|
|
76
|
+
f"(excluding {len(table_bboxes)} table regions)"
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
# Exclude table regions from reconstruction to avoid duplication
|
|
80
|
+
reconstructor = FragmentedTextReconstructor(
|
|
81
|
+
page, page_num, exclude_bboxes=table_bboxes
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Use section-based reconstruction for proper table positioning
|
|
85
|
+
if table_bboxes:
|
|
86
|
+
sections = reconstructor.reconstruct_with_sections()
|
|
87
|
+
|
|
88
|
+
if sections:
|
|
89
|
+
result_elements = []
|
|
90
|
+
for section in sections:
|
|
91
|
+
# Apply CJK Compatibility character mapping
|
|
92
|
+
cleaned_text = apply_cjk_compat_mapping(section['text'])
|
|
93
|
+
|
|
94
|
+
if cleaned_text.strip():
|
|
95
|
+
# Create element with proper Y position for sorting
|
|
96
|
+
result_elements.append(PageElement(
|
|
97
|
+
element_type=ElementType.TEXT,
|
|
98
|
+
content=cleaned_text,
|
|
99
|
+
bbox=(0, section['y_start'], page.rect.width, section['y_end']),
|
|
100
|
+
page_num=page_num
|
|
101
|
+
))
|
|
102
|
+
|
|
103
|
+
if result_elements:
|
|
104
|
+
logger.info(
|
|
105
|
+
f"[PDF] Page {page_num + 1}: Text reconstruction successful "
|
|
106
|
+
f"({len(result_elements)} sections)"
|
|
107
|
+
)
|
|
108
|
+
return result_elements
|
|
109
|
+
else:
|
|
110
|
+
# No tables - use simple reconstruction
|
|
111
|
+
reconstructed_text = reconstructor.reconstruct()
|
|
112
|
+
|
|
113
|
+
if reconstructed_text:
|
|
114
|
+
cleaned_text = apply_cjk_compat_mapping(reconstructed_text)
|
|
115
|
+
|
|
116
|
+
logger.info(
|
|
117
|
+
f"[PDF] Page {page_num + 1}: Text reconstruction successful "
|
|
118
|
+
f"({len(cleaned_text)} chars)"
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
return [PageElement(
|
|
122
|
+
element_type=ElementType.TEXT,
|
|
123
|
+
content=cleaned_text,
|
|
124
|
+
bbox=(0, 0, page.rect.width, page.rect.height),
|
|
125
|
+
page_num=page_num
|
|
126
|
+
)]
|
|
127
|
+
|
|
128
|
+
# Fall back to OCR if reconstruction not applicable
|
|
129
|
+
logger.info(
|
|
130
|
+
f"[PDF] Page {page_num + 1}: Using OCR fallback"
|
|
63
131
|
)
|
|
64
132
|
|
|
65
133
|
extractor = QualityAwareTextExtractor(page, page_num)
|