xgen-doc2chunk 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/PKG-INFO +1 -1
  2. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/pyproject.toml +1 -1
  3. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +4 -4
  4. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/.gitignore +0 -0
  5. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/LICENSE +0 -0
  6. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/README.md +0 -0
  7. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/__init__.py +0 -0
  8. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/chunking/__init__.py +0 -0
  9. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/chunking/chunking.py +0 -0
  10. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/chunking/constants.py +0 -0
  11. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/chunking/page_chunker.py +0 -0
  12. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/chunking/protected_regions.py +0 -0
  13. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/chunking/sheet_processor.py +0 -0
  14. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/chunking/table_chunker.py +0 -0
  15. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/chunking/table_parser.py +0 -0
  16. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/chunking/text_chunker.py +0 -0
  17. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/__init__.py +0 -0
  18. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/document_processor.py +0 -0
  19. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/__init__.py +0 -0
  20. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/chart_extractor.py +0 -0
  21. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/chart_processor.py +0 -0
  22. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/file_converter.py +0 -0
  23. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/img_processor.py +0 -0
  24. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/metadata_extractor.py +0 -0
  25. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/page_tag_processor.py +0 -0
  26. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/preprocessor.py +0 -0
  27. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/storage_backend.py +0 -0
  28. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/table_extractor.py +0 -0
  29. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/table_processor.py +0 -0
  30. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/functions/utils.py +0 -0
  31. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/__init__.py +0 -0
  32. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/base_handler.py +0 -0
  33. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/csv_handler.py +0 -0
  34. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/csv_helper/__init__.py +0 -0
  35. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +0 -0
  36. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +0 -0
  37. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +0 -0
  38. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +0 -0
  39. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +0 -0
  40. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +0 -0
  41. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +0 -0
  42. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/csv_helper/csv_table.py +0 -0
  43. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/doc_handler.py +0 -0
  44. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/doc_helpers/__init__.py +0 -0
  45. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +0 -0
  46. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +0 -0
  47. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +0 -0
  48. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_handler.py +0 -0
  49. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/__init__.py +0 -0
  50. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +0 -0
  51. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +0 -0
  52. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +0 -0
  53. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/docx_image.py +0 -0
  54. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +0 -0
  55. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +0 -0
  56. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +0 -0
  57. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +0 -0
  58. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +0 -0
  59. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +0 -0
  60. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_handler.py +0 -0
  61. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_helper/__init__.py +0 -0
  62. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +0 -0
  63. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +0 -0
  64. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +0 -0
  65. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +0 -0
  66. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +0 -0
  67. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +0 -0
  68. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +0 -0
  69. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +0 -0
  70. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +0 -0
  71. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/html_helper/__init__.py +0 -0
  72. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +0 -0
  73. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +0 -0
  74. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/html_reprocessor.py +0 -0
  75. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_handler.py +0 -0
  76. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/__init__.py +0 -0
  77. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +0 -0
  78. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +0 -0
  79. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +0 -0
  80. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +0 -0
  81. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +0 -0
  82. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +0 -0
  83. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +0 -0
  84. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +0 -0
  85. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +0 -0
  86. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +0 -0
  87. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +0 -0
  88. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_handler.py +0 -0
  89. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +0 -0
  90. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +0 -0
  91. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +0 -0
  92. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +0 -0
  93. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +0 -0
  94. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +0 -0
  95. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +0 -0
  96. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +0 -0
  97. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +0 -0
  98. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +0 -0
  99. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/image_file_handler.py +0 -0
  100. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/image_file_helper/__init__.py +0 -0
  101. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +0 -0
  102. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +0 -0
  103. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +0 -0
  104. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_handler.py +0 -0
  105. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +0 -0
  106. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +0 -0
  107. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +0 -0
  108. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +0 -0
  109. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +0 -0
  110. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +0 -0
  111. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +0 -0
  112. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +0 -0
  113. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +0 -0
  114. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +0 -0
  115. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +0 -0
  116. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +0 -0
  117. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +0 -0
  118. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +0 -0
  119. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +0 -0
  120. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +0 -0
  121. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +0 -0
  122. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +0 -0
  123. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +0 -0
  124. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +0 -0
  125. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/pdf_helpers/types.py +0 -0
  126. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_handler.py +0 -0
  127. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/__init__.py +0 -0
  128. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +0 -0
  129. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +0 -0
  130. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +0 -0
  131. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +0 -0
  132. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +0 -0
  133. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +0 -0
  134. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +0 -0
  135. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +0 -0
  136. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +0 -0
  137. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +0 -0
  138. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_handler.py +0 -0
  139. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_helper/__init__.py +0 -0
  140. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +0 -0
  141. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +0 -0
  142. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +0 -0
  143. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +0 -0
  144. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +0 -0
  145. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +0 -0
  146. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +0 -0
  147. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +0 -0
  148. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +0 -0
  149. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/text_handler.py +0 -0
  150. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/text_helper/__init__.py +0 -0
  151. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +0 -0
  152. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +0 -0
  153. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +0 -0
  154. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/ocr/__init__.py +0 -0
  155. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/ocr/base.py +0 -0
  156. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/ocr/ocr_engine/__init__.py +0 -0
  157. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +0 -0
  158. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +0 -0
  159. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +0 -0
  160. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +0 -0
  161. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +0 -0
  162. {xgen_doc2chunk-0.1.3 → xgen_doc2chunk-0.1.5}/xgen_doc2chunk/ocr/ocr_processor.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xgen-doc2chunk
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
5
5
  Project-URL: Homepage, https://github.com/master0419/doc2chunk
6
6
  Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "xgen-doc2chunk"
7
- version = "0.1.3"
7
+ version = "0.1.5"
8
8
  description = "Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"
@@ -170,7 +170,7 @@ class TableQualityValidator:
170
170
  if paragraph_count > 0:
171
171
  # High probability of not being a table if paragraph-style text exists
172
172
  paragraph_ratio = paragraph_count / max(1, filled_cells)
173
- if paragraph_ratio > 0.25: # Relaxed from 15% to 25%
173
+ if paragraph_ratio > 0.60: # Relaxed from 25% to 60%
174
174
  return False, 0.0, f"contains_paragraph_text({paragraph_count})"
175
175
  elif paragraph_ratio > 0.1: # Relaxed from 5% to 10%
176
176
  penalties.append(f"has_paragraph_cells({paragraph_count})")
@@ -379,9 +379,9 @@ class TableQualityValidator:
379
379
  if col1_empty_ratio >= 0.6 and col2_long_ratio >= 0.3:
380
380
  return False, f"col1_empty({col1_empty_ratio:.0%})_col2_long({col2_long_ratio:.0%})"
381
381
 
382
- # Pattern 2: Many paragraph-style entries in second column
383
- if num_rows > 5 and col2_has_paragraphs >= 2:
384
- return False, f"col2_paragraphs({col2_has_paragraphs})"
382
+ # # Pattern 2: Many paragraph-style entries in second column
383
+ # if num_rows > 5 and col2_has_paragraphs >= 2:
384
+ # return False, f"col2_paragraphs({col2_has_paragraphs})"
385
385
 
386
386
  # Pattern 3: If first column is short and second is long overall, likely body text not key-value
387
387
  if num_rows > 10:
File without changes
File without changes