PyPI - xgen-doc2chunk - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.51__py3-none-any.whl - Mend

xgen-doc2chunk 0.1.4__py3-none-any.whl → 0.1.51__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py CHANGED Viewed

@@ -170,7 +170,7 @@ class TableQualityValidator:
         if paragraph_count > 0:
             # High probability of not being a table if paragraph-style text exists
             paragraph_ratio = paragraph_count / max(1, filled_cells)
-            if paragraph_ratio > 0.25:  # Relaxed from 15% to 25%
+            if paragraph_ratio > 0.60:  # Relaxed from 25% to 60%
                 return False, 0.0, f"contains_paragraph_text({paragraph_count})"
             elif paragraph_ratio > 0.1:  # Relaxed from 5% to 10%
                 penalties.append(f"has_paragraph_cells({paragraph_count})")
@@ -379,15 +379,15 @@ class TableQualityValidator:
             if col1_empty_ratio >= 0.6 and col2_long_ratio >= 0.3:
                 return False, f"col1_empty({col1_empty_ratio:.0%})_col2_long({col2_long_ratio:.0%})"
-        # Pattern 2: Many paragraph-style entries in second column
-        if num_rows > 5 and col2_has_paragraphs >= 2:
-            return False, f"col2_paragraphs({col2_has_paragraphs})"
+        # # Pattern 2: Many paragraph-style entries in second column
+        # if num_rows > 5 and col2_has_paragraphs >= 2:
+        #     return False, f"col2_paragraphs({col2_has_paragraphs})"
-        # Pattern 3: If first column is short and second is long overall, likely body text not key-value
-        if num_rows > 10:
-            col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
-            if col1_short_ratio >= 0.8 and col2_long_count >= 5:
-                return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
+        # # Pattern 3: If first column is short and second is long overall, likely body text not key-value
+        # if num_rows > 10:
+        #     col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
+        #     if col1_short_ratio >= 0.8 and col2_long_count >= 5:
+        #         return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
         return True, "valid"

{xgen_doc2chunk-0.1.4.dist-info → xgen_doc2chunk-0.1.51.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xgen-doc2chunk
-Version: 0.1.4
+Version: 0.1.51
 Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
 Project-URL: Homepage, https://github.com/master0419/doc2chunk
 Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme

{xgen_doc2chunk-0.1.4.dist-info → xgen_doc2chunk-0.1.51.dist-info}/RECORD RENAMED Viewed

@@ -115,7 +115,7 @@ xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py,sha256=qPgtMTMbaTm
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py,sha256=bwD6MVUuZJVYe3bWDsD6BpK1UZKKPsVyKOG6oHeoumw,47042
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=cqoMzSySnapXRkELtmOahpmWyBnc1TquXPz1IqRqDSk,28168
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py,sha256=v6VH-E6clI71-G2zJcT5754VFcPYqb1Qz4l3UcPeDeM,27863
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=HXHl0tukTUHFSIWxQUcrYs8lYJ8gZnYV12HtSezWIho,16069
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=rI5QAdqqJfiITZxu4bAf50pD7aIjVlhkYFsc2pt4i8c,16085
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=wAnOCAQ3cTsVgMg0uVavodZHV2DAvrVkugqA0c4MhTY,4754
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py,sha256=8rCAnLvNRSVvIAbEiggXawrMOo-zWpMxwDc5Rrk19Co,22520
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py,sha256=W72HOARz7LjSzwzFTLo4-XTDQWvwBTGlqdovFyPBU7M,4724
@@ -155,7 +155,7 @@ xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py,sha256=4kIPb8u2_GSJ435GHJFXiIeQavMv
 xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py,sha256=A4V_AcC0tySYB4q-lNW7Tuhg7aTq0atj_RhMrCftKsM,2972
 xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py,sha256=ZN-3Dq1BehFmwFvxTaYmiEAdFUqujviONNDiR8c5X4A,3194
 xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py,sha256=TeQOdPCPKQW8o4IyUb-4o6v6uTVzKupr4qh9NLjIj24,3672
-xgen_doc2chunk-0.1.4.dist-info/METADATA,sha256=IfRE6mjY7MVBaifP6sdZXm66AMZburo5IWqrdNne-wk,7623
-xgen_doc2chunk-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-xgen_doc2chunk-0.1.4.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
-xgen_doc2chunk-0.1.4.dist-info/RECORD,,
+xgen_doc2chunk-0.1.51.dist-info/METADATA,sha256=F-uK8guKjZrLidKLYtDeOwSJhmbeIGvaUrc6D3Kl6M0,7624
+xgen_doc2chunk-0.1.51.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+xgen_doc2chunk-0.1.51.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
+xgen_doc2chunk-0.1.51.dist-info/RECORD,,

{xgen_doc2chunk-0.1.4.dist-info → xgen_doc2chunk-0.1.51.dist-info}/WHEEL RENAMED Viewed

File without changes

{xgen_doc2chunk-0.1.4.dist-info → xgen_doc2chunk-0.1.51.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

xgen-doc2chunk 0.1.4__py3-none-any.whl → 0.1.51__py3-none-any.whl

xgen-doc2chunk 0.1.4__py3-none-any.whl → 0.1.51__py3-none-any.whl