xgen-doc2chunk 0.1.5__py3-none-any.whl → 0.1.51__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -383,11 +383,11 @@ class TableQualityValidator:
383
383
  # if num_rows > 5 and col2_has_paragraphs >= 2:
384
384
  # return False, f"col2_paragraphs({col2_has_paragraphs})"
385
385
 
386
- # Pattern 3: If first column is short and second is long overall, likely body text not key-value
387
- if num_rows > 10:
388
- col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
389
- if col1_short_ratio >= 0.8 and col2_long_count >= 5:
390
- return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
386
+ # # Pattern 3: If first column is short and second is long overall, likely body text not key-value
387
+ # if num_rows > 10:
388
+ # col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
389
+ # if col1_short_ratio >= 0.8 and col2_long_count >= 5:
390
+ # return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
391
391
 
392
392
  return True, "valid"
393
393
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xgen-doc2chunk
3
- Version: 0.1.5
3
+ Version: 0.1.51
4
4
  Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
5
5
  Project-URL: Homepage, https://github.com/master0419/doc2chunk
6
6
  Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
@@ -115,7 +115,7 @@ xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py,sha256=qPgtMTMbaTm
115
115
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py,sha256=bwD6MVUuZJVYe3bWDsD6BpK1UZKKPsVyKOG6oHeoumw,47042
116
116
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=cqoMzSySnapXRkELtmOahpmWyBnc1TquXPz1IqRqDSk,28168
117
117
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py,sha256=v6VH-E6clI71-G2zJcT5754VFcPYqb1Qz4l3UcPeDeM,27863
118
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=7qI_kcY-scGaLPChkAeCtkQD9GAsD_NryMQw1nNMUwU,16075
118
+ xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=rI5QAdqqJfiITZxu4bAf50pD7aIjVlhkYFsc2pt4i8c,16085
119
119
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=wAnOCAQ3cTsVgMg0uVavodZHV2DAvrVkugqA0c4MhTY,4754
120
120
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py,sha256=8rCAnLvNRSVvIAbEiggXawrMOo-zWpMxwDc5Rrk19Co,22520
121
121
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py,sha256=W72HOARz7LjSzwzFTLo4-XTDQWvwBTGlqdovFyPBU7M,4724
@@ -155,7 +155,7 @@ xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py,sha256=4kIPb8u2_GSJ435GHJFXiIeQavMv
155
155
  xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py,sha256=A4V_AcC0tySYB4q-lNW7Tuhg7aTq0atj_RhMrCftKsM,2972
156
156
  xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py,sha256=ZN-3Dq1BehFmwFvxTaYmiEAdFUqujviONNDiR8c5X4A,3194
157
157
  xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py,sha256=TeQOdPCPKQW8o4IyUb-4o6v6uTVzKupr4qh9NLjIj24,3672
158
- xgen_doc2chunk-0.1.5.dist-info/METADATA,sha256=qBfTY7YCh61_spWvm_TkEaN9zLeOKKz0LdzpMD_RKgM,7623
159
- xgen_doc2chunk-0.1.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
160
- xgen_doc2chunk-0.1.5.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
161
- xgen_doc2chunk-0.1.5.dist-info/RECORD,,
158
+ xgen_doc2chunk-0.1.51.dist-info/METADATA,sha256=F-uK8guKjZrLidKLYtDeOwSJhmbeIGvaUrc6D3Kl6M0,7624
159
+ xgen_doc2chunk-0.1.51.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
160
+ xgen_doc2chunk-0.1.51.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
161
+ xgen_doc2chunk-0.1.51.dist-info/RECORD,,