xgen-doc2chunk 0.1.5__tar.gz → 0.1.52__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/PKG-INFO +1 -1
  2. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/pyproject.toml +1 -1
  3. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +9 -1
  4. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +5 -5
  5. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +73 -5
  6. xgen_doc2chunk-0.1.52/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +1194 -0
  7. xgen_doc2chunk-0.1.5/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +0 -655
  8. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/.gitignore +0 -0
  9. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/LICENSE +0 -0
  10. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/README.md +0 -0
  11. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/__init__.py +0 -0
  12. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/__init__.py +0 -0
  13. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/chunking.py +0 -0
  14. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/constants.py +0 -0
  15. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/page_chunker.py +0 -0
  16. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/protected_regions.py +0 -0
  17. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/sheet_processor.py +0 -0
  18. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/table_chunker.py +0 -0
  19. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/table_parser.py +0 -0
  20. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/chunking/text_chunker.py +0 -0
  21. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/__init__.py +0 -0
  22. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/document_processor.py +0 -0
  23. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/__init__.py +0 -0
  24. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/chart_extractor.py +0 -0
  25. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/chart_processor.py +0 -0
  26. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/file_converter.py +0 -0
  27. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/img_processor.py +0 -0
  28. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/metadata_extractor.py +0 -0
  29. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/page_tag_processor.py +0 -0
  30. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/preprocessor.py +0 -0
  31. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/storage_backend.py +0 -0
  32. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/table_extractor.py +0 -0
  33. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/table_processor.py +0 -0
  34. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/functions/utils.py +0 -0
  35. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/__init__.py +0 -0
  36. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/base_handler.py +0 -0
  37. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_handler.py +0 -0
  38. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/__init__.py +0 -0
  39. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +0 -0
  40. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +0 -0
  41. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +0 -0
  42. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +0 -0
  43. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +0 -0
  44. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +0 -0
  45. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +0 -0
  46. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/csv_helper/csv_table.py +0 -0
  47. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/doc_handler.py +0 -0
  48. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/doc_helpers/__init__.py +0 -0
  49. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +0 -0
  50. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +0 -0
  51. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +0 -0
  52. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_handler.py +0 -0
  53. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/__init__.py +0 -0
  54. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +0 -0
  55. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +0 -0
  56. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +0 -0
  57. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_image.py +0 -0
  58. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +0 -0
  59. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +0 -0
  60. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +0 -0
  61. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +0 -0
  62. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +0 -0
  63. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +0 -0
  64. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_handler.py +0 -0
  65. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/__init__.py +0 -0
  66. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +0 -0
  67. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +0 -0
  68. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +0 -0
  69. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +0 -0
  70. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +0 -0
  71. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +0 -0
  72. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +0 -0
  73. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +0 -0
  74. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +0 -0
  75. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/html_helper/__init__.py +0 -0
  76. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +0 -0
  77. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +0 -0
  78. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/html_reprocessor.py +0 -0
  79. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_handler.py +0 -0
  80. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/__init__.py +0 -0
  81. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +0 -0
  82. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +0 -0
  83. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +0 -0
  84. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +0 -0
  85. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +0 -0
  86. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +0 -0
  87. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +0 -0
  88. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +0 -0
  89. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +0 -0
  90. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +0 -0
  91. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +0 -0
  92. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_handler.py +0 -0
  93. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +0 -0
  94. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +0 -0
  95. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +0 -0
  96. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +0 -0
  97. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +0 -0
  98. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +0 -0
  99. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +0 -0
  100. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +0 -0
  101. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +0 -0
  102. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +0 -0
  103. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/image_file_handler.py +0 -0
  104. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/image_file_helper/__init__.py +0 -0
  105. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +0 -0
  106. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +0 -0
  107. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +0 -0
  108. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_handler.py +0 -0
  109. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +0 -0
  110. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +0 -0
  111. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +0 -0
  112. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +0 -0
  113. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +0 -0
  114. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +0 -0
  115. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +0 -0
  116. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +0 -0
  117. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +0 -0
  118. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +0 -0
  119. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +0 -0
  120. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +0 -0
  121. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +0 -0
  122. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +0 -0
  123. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +0 -0
  124. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +0 -0
  125. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +0 -0
  126. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/types.py +0 -0
  127. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_handler.py +0 -0
  128. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/__init__.py +0 -0
  129. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +0 -0
  130. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +0 -0
  131. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +0 -0
  132. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +0 -0
  133. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +0 -0
  134. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +0 -0
  135. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +0 -0
  136. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +0 -0
  137. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +0 -0
  138. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +0 -0
  139. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_handler.py +0 -0
  140. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/__init__.py +0 -0
  141. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +0 -0
  142. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +0 -0
  143. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +0 -0
  144. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +0 -0
  145. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +0 -0
  146. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +0 -0
  147. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +0 -0
  148. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +0 -0
  149. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +0 -0
  150. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/text_handler.py +0 -0
  151. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/text_helper/__init__.py +0 -0
  152. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +0 -0
  153. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +0 -0
  154. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +0 -0
  155. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/__init__.py +0 -0
  156. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/base.py +0 -0
  157. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/__init__.py +0 -0
  158. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +0 -0
  159. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +0 -0
  160. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +0 -0
  161. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +0 -0
  162. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +0 -0
  163. {xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/ocr/ocr_processor.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xgen-doc2chunk
3
- Version: 0.1.5
3
+ Version: 0.1.52
4
4
  Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
5
5
  Project-URL: Homepage, https://github.com/master0419/doc2chunk
6
6
  Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "xgen-doc2chunk"
7
- version = "0.1.5"
7
+ version = "0.1.52"
8
8
  description = "Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"
@@ -25,6 +25,9 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
25
25
  )
26
26
  from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import TableDetectionEngine
27
27
  from xgen_doc2chunk.core.processor.pdf_helpers.pdf_cell_analysis import CellAnalysisEngine
28
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import (
29
+ apply_cjk_compat_mapping,
30
+ )
28
31
 
29
32
  logger = logging.getLogger("document-processor")
30
33
 
@@ -873,7 +876,12 @@ def generate_html_from_cells(
873
876
  content = ""
874
877
  if col_idx < len(row_data):
875
878
  content = row_data[col_idx]
876
- content = escape_html(str(content).strip() if content else "")
879
+
880
+ # Apply CJK Compatibility character mapping to fix broken characters
881
+ # (e.g., 㛳→→, ㏙→(, ㏚→) etc. from Word→PDF conversion)
882
+ content = str(content).strip() if content else ""
883
+ content = apply_cjk_compat_mapping(content)
884
+ content = escape_html(content)
877
885
 
878
886
  # Get span info (default to 1 if not found)
879
887
  spans = span_map.get((row_idx, col_idx), {'rowspan': 1, 'colspan': 1})
@@ -383,11 +383,11 @@ class TableQualityValidator:
383
383
  # if num_rows > 5 and col2_has_paragraphs >= 2:
384
384
  # return False, f"col2_paragraphs({col2_has_paragraphs})"
385
385
 
386
- # Pattern 3: If first column is short and second is long overall, likely body text not key-value
387
- if num_rows > 10:
388
- col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
389
- if col1_short_ratio >= 0.8 and col2_long_count >= 5:
390
- return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
386
+ # # Pattern 3: If first column is short and second is long overall, likely body text not key-value
387
+ # if num_rows > 10:
388
+ # col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
389
+ # if col1_short_ratio >= 0.8 and col2_long_count >= 5:
390
+ # return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
391
391
 
392
392
  return True, "valid"
393
393
 
@@ -3,6 +3,9 @@
3
3
  PDF Text Extraction Module
4
4
 
5
5
  Provides functions for extracting text blocks from PDF pages.
6
+ Includes support for:
7
+ - Fragmented text reconstruction (Word->PDF conversion issues)
8
+ - CJK Compatibility character mapping (broken character fixes)
6
9
  """
7
10
  import logging
8
11
  from typing import List, Tuple
@@ -17,6 +20,8 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import
17
20
  TextQualityAnalyzer,
18
21
  QualityAwareTextExtractor,
19
22
  PageOCRFallbackEngine,
23
+ FragmentedTextReconstructor,
24
+ apply_cjk_compat_mapping,
20
25
  )
21
26
 
22
27
  logger = logging.getLogger("document-processor")
@@ -53,13 +58,76 @@ def extract_text_blocks(
53
58
  analyzer = TextQualityAnalyzer(page, page_num)
54
59
  page_analysis = analyzer.analyze_page()
55
60
 
56
- # If quality is too low, use full page OCR fallback
61
+ # If quality is low, try text reconstruction first (before OCR)
57
62
  if page_analysis.quality_result.needs_ocr:
63
+ quality_result = page_analysis.quality_result
58
64
  logger.info(
59
- f"[PDF] Page {page_num + 1}: Low text quality "
60
- f"({page_analysis.quality_result.quality_score:.2f}), "
61
- f"PUA={page_analysis.quality_result.pua_count}, "
62
- f"using OCR fallback"
65
+ f"[PDF] Page {page_num + 1}: Low text quality detected - "
66
+ f"score={quality_result.quality_score:.2f}, "
67
+ f"PUA={quality_result.pua_count}, "
68
+ f"CJK_Compat={quality_result.cjk_compat_count}, "
69
+ f"fragmented={quality_result.is_fragmented}"
70
+ )
71
+
72
+ # Try reconstruction for fragmented text or CJK Compat issues
73
+ if quality_result.is_fragmented or quality_result.cjk_compat_count > 0:
74
+ logger.info(
75
+ f"[PDF] Page {page_num + 1}: Attempting text reconstruction "
76
+ f"(excluding {len(table_bboxes)} table regions)"
77
+ )
78
+
79
+ # Exclude table regions from reconstruction to avoid duplication
80
+ reconstructor = FragmentedTextReconstructor(
81
+ page, page_num, exclude_bboxes=table_bboxes
82
+ )
83
+
84
+ # Use section-based reconstruction for proper table positioning
85
+ if table_bboxes:
86
+ sections = reconstructor.reconstruct_with_sections()
87
+
88
+ if sections:
89
+ result_elements = []
90
+ for section in sections:
91
+ # Apply CJK Compatibility character mapping
92
+ cleaned_text = apply_cjk_compat_mapping(section['text'])
93
+
94
+ if cleaned_text.strip():
95
+ # Create element with proper Y position for sorting
96
+ result_elements.append(PageElement(
97
+ element_type=ElementType.TEXT,
98
+ content=cleaned_text,
99
+ bbox=(0, section['y_start'], page.rect.width, section['y_end']),
100
+ page_num=page_num
101
+ ))
102
+
103
+ if result_elements:
104
+ logger.info(
105
+ f"[PDF] Page {page_num + 1}: Text reconstruction successful "
106
+ f"({len(result_elements)} sections)"
107
+ )
108
+ return result_elements
109
+ else:
110
+ # No tables - use simple reconstruction
111
+ reconstructed_text = reconstructor.reconstruct()
112
+
113
+ if reconstructed_text:
114
+ cleaned_text = apply_cjk_compat_mapping(reconstructed_text)
115
+
116
+ logger.info(
117
+ f"[PDF] Page {page_num + 1}: Text reconstruction successful "
118
+ f"({len(cleaned_text)} chars)"
119
+ )
120
+
121
+ return [PageElement(
122
+ element_type=ElementType.TEXT,
123
+ content=cleaned_text,
124
+ bbox=(0, 0, page.rect.width, page.rect.height),
125
+ page_num=page_num
126
+ )]
127
+
128
+ # Fall back to OCR if reconstruction not applicable
129
+ logger.info(
130
+ f"[PDF] Page {page_num + 1}: Using OCR fallback"
63
131
  )
64
132
 
65
133
  extractor = QualityAwareTextExtractor(page, page_num)