xgen-doc2chunk 0.1.5__py3-none-any.whl → 0.1.51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +5 -5
- {xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.51.dist-info}/METADATA +1 -1
- {xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.51.dist-info}/RECORD +5 -5
- {xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.51.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.51.dist-info}/licenses/LICENSE +0 -0
|
@@ -383,11 +383,11 @@ class TableQualityValidator:
|
|
|
383
383
|
# if num_rows > 5 and col2_has_paragraphs >= 2:
|
|
384
384
|
# return False, f"col2_paragraphs({col2_has_paragraphs})"
|
|
385
385
|
|
|
386
|
-
# Pattern 3: If first column is short and second is long overall, likely body text not key-value
|
|
387
|
-
if num_rows > 10:
|
|
388
|
-
col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
|
|
389
|
-
if col1_short_ratio >= 0.8 and col2_long_count >= 5:
|
|
390
|
-
return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
|
|
386
|
+
# # Pattern 3: If first column is short and second is long overall, likely body text not key-value
|
|
387
|
+
# if num_rows > 10:
|
|
388
|
+
# col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
|
|
389
|
+
# if col1_short_ratio >= 0.8 and col2_long_count >= 5:
|
|
390
|
+
# return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
|
|
391
391
|
|
|
392
392
|
return True, "valid"
|
|
393
393
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xgen-doc2chunk
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.1.51
|
|
4
4
|
Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
|
|
5
5
|
Project-URL: Homepage, https://github.com/master0419/doc2chunk
|
|
6
6
|
Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
|
|
@@ -115,7 +115,7 @@ xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py,sha256=qPgtMTMbaTm
|
|
|
115
115
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py,sha256=bwD6MVUuZJVYe3bWDsD6BpK1UZKKPsVyKOG6oHeoumw,47042
|
|
116
116
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=cqoMzSySnapXRkELtmOahpmWyBnc1TquXPz1IqRqDSk,28168
|
|
117
117
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py,sha256=v6VH-E6clI71-G2zJcT5754VFcPYqb1Qz4l3UcPeDeM,27863
|
|
118
|
-
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=
|
|
118
|
+
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=rI5QAdqqJfiITZxu4bAf50pD7aIjVlhkYFsc2pt4i8c,16085
|
|
119
119
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=wAnOCAQ3cTsVgMg0uVavodZHV2DAvrVkugqA0c4MhTY,4754
|
|
120
120
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py,sha256=8rCAnLvNRSVvIAbEiggXawrMOo-zWpMxwDc5Rrk19Co,22520
|
|
121
121
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py,sha256=W72HOARz7LjSzwzFTLo4-XTDQWvwBTGlqdovFyPBU7M,4724
|
|
@@ -155,7 +155,7 @@ xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py,sha256=4kIPb8u2_GSJ435GHJFXiIeQavMv
|
|
|
155
155
|
xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py,sha256=A4V_AcC0tySYB4q-lNW7Tuhg7aTq0atj_RhMrCftKsM,2972
|
|
156
156
|
xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py,sha256=ZN-3Dq1BehFmwFvxTaYmiEAdFUqujviONNDiR8c5X4A,3194
|
|
157
157
|
xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py,sha256=TeQOdPCPKQW8o4IyUb-4o6v6uTVzKupr4qh9NLjIj24,3672
|
|
158
|
-
xgen_doc2chunk-0.1.
|
|
159
|
-
xgen_doc2chunk-0.1.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
160
|
-
xgen_doc2chunk-0.1.5.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
|
|
161
|
-
xgen_doc2chunk-0.1.5.dist-info/RECORD,,
|
|
158
|
+
xgen_doc2chunk-0.1.51.dist-info/METADATA,sha256=F-uK8guKjZrLidKLYtDeOwSJhmbeIGvaUrc6D3Kl6M0,7624
|
|
159
|
+
xgen_doc2chunk-0.1.51.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
160
|
+
xgen_doc2chunk-0.1.51.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
|
|
161
|
+
xgen_doc2chunk-0.1.51.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|