xgen-doc2chunk 0.1.4__tar.gz → 0.1.51__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/PKG-INFO +1 -1
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/pyproject.toml +1 -1
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +9 -9
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/.gitignore +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/LICENSE +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/README.md +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/chunking/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/chunking/chunking.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/chunking/constants.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/chunking/page_chunker.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/chunking/protected_regions.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/chunking/sheet_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/chunking/table_chunker.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/chunking/table_parser.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/chunking/text_chunker.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/document_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/chart_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/img_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/metadata_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/page_tag_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/storage_backend.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/table_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/table_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/utils.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/base_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/csv_table.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/doc_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/doc_helpers/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/docx_image.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/html_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/html_reprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/image_file_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/image_file_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/types.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/text_handler.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/text_helper/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/ocr/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/ocr/base.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/ocr/ocr_engine/__init__.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +0 -0
- {xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/ocr/ocr_processor.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xgen-doc2chunk
|
|
3
|
-
Version: 0.1.4
|
|
3
|
+
Version: 0.1.51
|
|
4
4
|
Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
|
|
5
5
|
Project-URL: Homepage, https://github.com/master0419/doc2chunk
|
|
6
6
|
Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "xgen-doc2chunk"
|
|
7
|
-
version = "0.1.4"
|
|
7
|
+
version = "0.1.51"
|
|
8
8
|
description = "Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.12"
|
|
@@ -170,7 +170,7 @@ class TableQualityValidator:
|
|
|
170
170
|
if paragraph_count > 0:
|
|
171
171
|
# High probability of not being a table if paragraph-style text exists
|
|
172
172
|
paragraph_ratio = paragraph_count / max(1, filled_cells)
|
|
173
|
-
if paragraph_ratio > 0.25:
|
|
173
|
+
if paragraph_ratio > 0.60: # Relaxed from 25% to 60%
|
|
174
174
|
return False, 0.0, f"contains_paragraph_text({paragraph_count})"
|
|
175
175
|
elif paragraph_ratio > 0.1: # Relaxed from 5% to 10%
|
|
176
176
|
penalties.append(f"has_paragraph_cells({paragraph_count})")
|
|
@@ -379,15 +379,15 @@ class TableQualityValidator:
|
|
|
379
379
|
if col1_empty_ratio >= 0.6 and col2_long_ratio >= 0.3:
|
|
380
380
|
return False, f"col1_empty({col1_empty_ratio:.0%})_col2_long({col2_long_ratio:.0%})"
|
|
381
381
|
|
|
382
|
-
# Pattern 2: Many paragraph-style entries in second column
|
|
383
|
-
if num_rows > 5 and col2_has_paragraphs >= 2:
|
|
384
|
-
return False, f"col2_paragraphs({col2_has_paragraphs})"
|
|
382
|
+
# # Pattern 2: Many paragraph-style entries in second column
|
|
383
|
+
# if num_rows > 5 and col2_has_paragraphs >= 2:
|
|
384
|
+
# return False, f"col2_paragraphs({col2_has_paragraphs})"
|
|
385
385
|
|
|
386
|
-
# Pattern 3: If first column is short and second is long overall, likely body text not key-value
|
|
387
|
-
if num_rows > 10:
|
|
388
|
-
col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
|
|
389
|
-
if col1_short_ratio >= 0.8 and col2_long_count >= 5:
|
|
390
|
-
return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
|
|
386
|
+
# # Pattern 3: If first column is short and second is long overall, likely body text not key-value
|
|
387
|
+
# if num_rows > 10:
|
|
388
|
+
# col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
|
|
389
|
+
# if col1_short_ratio >= 0.8 and col2_long_count >= 5:
|
|
390
|
+
# return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
|
|
391
391
|
|
|
392
392
|
return True, "valid"
|
|
393
393
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/chart_extractor.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/chart_processor.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/file_converter.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/img_processor.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/metadata_extractor.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/page_tag_processor.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/preprocessor.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/storage_backend.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/table_extractor.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/functions/table_processor.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/base_handler.py
RENAMED
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/csv_helper/csv_table.py
RENAMED
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/doc_helpers/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_handler.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/docx_helper/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/excel_handler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/html_helper/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/html_reprocessor.py
RENAMED
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_handler.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/hwpx_helper/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/image_file_handler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/pdf_helpers/types.py
RENAMED
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py
RENAMED
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/rtf_helper/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/text_handler.py
RENAMED
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/core/processor/text_helper/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xgen_doc2chunk-0.1.4 → xgen_doc2chunk-0.1.51}/xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|