xgen-doc2chunk 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -170,7 +170,7 @@ class TableQualityValidator:
170
170
  if paragraph_count > 0:
171
171
  # High probability of not being a table if paragraph-style text exists
172
172
  paragraph_ratio = paragraph_count / max(1, filled_cells)
173
- if paragraph_ratio > 0.25: # Relaxed from 15% to 25%
173
+ if paragraph_ratio > 0.60: # Relaxed from 25% to 60%
174
174
  return False, 0.0, f"contains_paragraph_text({paragraph_count})"
175
175
  elif paragraph_ratio > 0.1: # Relaxed from 5% to 10%
176
176
  penalties.append(f"has_paragraph_cells({paragraph_count})")
@@ -379,9 +379,9 @@ class TableQualityValidator:
379
379
  if col1_empty_ratio >= 0.6 and col2_long_ratio >= 0.3:
380
380
  return False, f"col1_empty({col1_empty_ratio:.0%})_col2_long({col2_long_ratio:.0%})"
381
381
 
382
- # Pattern 2: Many paragraph-style entries in second column
383
- if num_rows > 5 and col2_has_paragraphs >= 2:
384
- return False, f"col2_paragraphs({col2_has_paragraphs})"
382
+ # # Pattern 2: Many paragraph-style entries in second column
383
+ # if num_rows > 5 and col2_has_paragraphs >= 2:
384
+ # return False, f"col2_paragraphs({col2_has_paragraphs})"
385
385
 
386
386
  # Pattern 3: If first column is short and second is long overall, likely body text not key-value
387
387
  if num_rows > 10:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xgen-doc2chunk
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
5
5
  Project-URL: Homepage, https://github.com/master0419/doc2chunk
6
6
  Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
@@ -115,7 +115,7 @@ xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py,sha256=qPgtMTMbaTm
115
115
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py,sha256=bwD6MVUuZJVYe3bWDsD6BpK1UZKKPsVyKOG6oHeoumw,47042
116
116
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=cqoMzSySnapXRkELtmOahpmWyBnc1TquXPz1IqRqDSk,28168
117
117
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py,sha256=v6VH-E6clI71-G2zJcT5754VFcPYqb1Qz4l3UcPeDeM,27863
118
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=HXHl0tukTUHFSIWxQUcrYs8lYJ8gZnYV12HtSezWIho,16069
118
+ xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=7qI_kcY-scGaLPChkAeCtkQD9GAsD_NryMQw1nNMUwU,16075
119
119
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=wAnOCAQ3cTsVgMg0uVavodZHV2DAvrVkugqA0c4MhTY,4754
120
120
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py,sha256=8rCAnLvNRSVvIAbEiggXawrMOo-zWpMxwDc5Rrk19Co,22520
121
121
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py,sha256=W72HOARz7LjSzwzFTLo4-XTDQWvwBTGlqdovFyPBU7M,4724
@@ -155,7 +155,7 @@ xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py,sha256=4kIPb8u2_GSJ435GHJFXiIeQavMv
155
155
  xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py,sha256=A4V_AcC0tySYB4q-lNW7Tuhg7aTq0atj_RhMrCftKsM,2972
156
156
  xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py,sha256=ZN-3Dq1BehFmwFvxTaYmiEAdFUqujviONNDiR8c5X4A,3194
157
157
  xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py,sha256=TeQOdPCPKQW8o4IyUb-4o6v6uTVzKupr4qh9NLjIj24,3672
158
- xgen_doc2chunk-0.1.3.dist-info/METADATA,sha256=Rq17VLzDtXqBDAIGYe1ASoDq6MtHQRfvd5R9O_HKz3Q,7623
159
- xgen_doc2chunk-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
160
- xgen_doc2chunk-0.1.3.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
161
- xgen_doc2chunk-0.1.3.dist-info/RECORD,,
158
+ xgen_doc2chunk-0.1.5.dist-info/METADATA,sha256=qBfTY7YCh61_spWvm_TkEaN9zLeOKKz0LdzpMD_RKgM,7623
159
+ xgen_doc2chunk-0.1.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
160
+ xgen_doc2chunk-0.1.5.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
161
+ xgen_doc2chunk-0.1.5.dist-info/RECORD,,