PyPI - preocr - Version diff - 1.2.1.tar.gz → 1.2.2.tar.gz - Mend

preocr 1.2.1.tar.gz → 1.2.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

{preocr-1.2.1 → preocr-1.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: preocr
-Version: 1.2.1
+Version: 1.2.2
 Summary: A fast, CPU-only library that intelligently detects whether files need OCR processing before expensive OCR operations. Uses hybrid adaptive pipeline for 92-95% accuracy.
 Author: PreOCR Contributors
 License-Expression: Apache-2.0

{preocr-1.2.1 → preocr-1.2.2}/preocr/analysis/page_detection.py RENAMED Viewed

@@ -42,11 +42,11 @@ def analyze_pdf_pages(
         2. Adjusted based on consistency:
            - If all pages are consistent (all need OCR or all don't), confidence +0.1
            - If pages are mixed (some need OCR, some don't), confidence -0.1
         This means:
         - Uniform documents (all scanned or all digital) get higher confidence
         - Mixed documents get lower confidence, reflecting the uncertainty
         Per-page confidence:
         - Pages with text: 0.95 (high confidence)
         - Pages without text: 0.80 if completely empty, 0.60 if sparse text

{preocr-1.2.1 → preocr-1.2.2}/preocr/core/decision.py RENAMED Viewed

@@ -22,44 +22,44 @@ def calculate_ocr_score(
 ) -> float:
     """
     Calculate OCR_SCORE using pixel-aware scoring model.
-    OCR_SCORE = 0.35 * image_ratio + 0.25 * (1 - alphabet_ratio) +
+    OCR_SCORE = 0.35 * image_ratio + 0.25 * (1 - alphabet_ratio) +
                 0.2 * low_text_density + 0.2 * font_suspicion
     Args:
         text_length: Length of extracted text
         image_coverage: Image coverage percentage (0-100)
         text_coverage: Text coverage percentage (0-100)
         config: Optional Config object
     Returns:
         OCR_SCORE (0.0-1.0) where higher score indicates more likely to need OCR
     """
     if config is None:
         config = _DEFAULT_CONFIG
     # Calculate image_ratio from image_coverage (convert percentage to ratio)
     image_ratio = image_coverage / 100.0 if image_coverage > 0 else 0.0
     # Approximate alphabet_ratio (normalized text length factor)
     max_expected_text = 10000  # Reasonable max for a page
     alphabet_ratio = min(text_length / max_expected_text, 1.0) if text_length > 0 else 0.0
     # Calculate low_text_density (inverse of text_coverage, normalized)
     text_density = text_coverage / 100.0 if text_coverage > 0 else 0.0
     low_text_density = 1.0 - min(text_density, 1.0)
     # Font suspicion: higher when text_length is very low
     font_suspicion = 1.0 - min(text_length / 50.0, 1.0) if text_length < 50 else 0.0
     # Calculate OCR score
     ocr_score = (
-        0.35 * image_ratio +
-        0.25 * (1.0 - alphabet_ratio) +
-        0.20 * low_text_density +
-        0.20 * font_suspicion
+        0.35 * image_ratio
+        + 0.25 * (1.0 - alphabet_ratio)
+        + 0.20 * low_text_density
+        + 0.20 * font_suspicion
     )
     return round(ocr_score, 3)
@@ -71,24 +71,24 @@ def calculate_confidence_from_signals(
 ) -> float:
     """
     Calculate confidence score from signals using unified approach.
     Priority:
     1. Use OCR_SCORE if available (most accurate)
     2. Use layout-based calculation
     3. Fallback to text-length based
     Args:
         signals: Dictionary of signals from signals.collect_signals()
         needs_ocr: Boolean indicating if OCR is needed
         ocr_score: Optional OCR_SCORE (0.0-1.0) if already calculated
         config: Optional Config object
     Returns:
         Confidence score (0.0-1.0)
     """
     if config is None:
         config = _DEFAULT_CONFIG
     # Priority 1: Use OCR_SCORE if available (most accurate)
     if ocr_score is not None and config.use_ocr_score_confidence:
         # Calibrate OCR_SCORE to confidence range (0.50-0.95)
@@ -99,13 +99,13 @@ def calculate_confidence_from_signals(
             # Lower OCR_SCORE = higher confidence for "no OCR"
             confidence = 0.50 + ((1.0 - ocr_score) * 0.45)  # Range: 0.50-0.95
         return round(confidence, 2)
     # Priority 2: Layout-based (if layout data available)
     layout_type = signals.get("layout_type")
     if layout_type and layout_type != "unknown":
         text_coverage = float(signals.get("text_coverage", 0.0))
         image_coverage = float(signals.get("image_coverage", 0.0))
         if needs_ocr:
             # More images = higher confidence
             image_factor = min(image_coverage / 100.0, 1.0)
@@ -115,7 +115,7 @@ def calculate_confidence_from_signals(
             text_factor = min(text_coverage / 100.0, 1.0)
             confidence = 0.70 + (text_factor * 0.25)  # Range: 0.70-0.95
         return round(confidence, 2)
     # Priority 3: Text-length based fallback
     text_length = signals.get("text_length", 0)
     if needs_ocr:
@@ -129,7 +129,7 @@ def calculate_confidence_from_signals(
         # More text = higher confidence (digital)
         text_factor = min(text_length / 1000.0, 1.0)
         confidence = 0.75 + (text_factor * 0.20)  # Range: 0.75-0.95
     return round(confidence, 2)
@@ -205,21 +205,25 @@ def decide(
         is_mixed_content = signals.get("is_mixed_content", False)
         text_coverage = signals.get("text_coverage", 0.0)
         image_coverage = signals.get("image_coverage", 0.0)
         # Calculate image_ratio from image_coverage (convert percentage to ratio)
         # Also check OpenCV results if available (more accurate for scanned PDFs)
         opencv_layout = signals.get("opencv_layout", {})
         image_coverage_opencv = opencv_layout.get("image_coverage", 0.0) if opencv_layout else 0.0
         # Use OpenCV image_coverage if available (more accurate), otherwise use layout image_coverage
-        effective_image_coverage = image_coverage_opencv if image_coverage_opencv > 0 else image_coverage
+        effective_image_coverage = (
+            image_coverage_opencv if image_coverage_opencv > 0 else image_coverage
+        )
         image_ratio = effective_image_coverage / 100.0 if effective_image_coverage > 0 else 0.0
         # Calculate OCR_SCORE for unified confidence calculation
         ocr_score = None
         if layout_type and layout_type != "unknown":
-            ocr_score = calculate_ocr_score(text_length, effective_image_coverage, text_coverage, config)
+            ocr_score = calculate_ocr_score(
+                text_length, effective_image_coverage, text_coverage, config
+            )
         # 🔥 Hybrid Rule: Sweet spot for OCR detection
         # If image_ratio > 0.75 AND extracted_text_length < 30 → OCR
         # This catches scanned PDFs that are image-heavy with minimal extractable text
@@ -237,7 +241,7 @@ def decide(
                 CATEGORY_UNSTRUCTURED,
                 ReasonCode.PDF_SCANNED,
             )
         # Alternative: If text_length is very low (< 30) and we have layout data suggesting images
         # This handles cases where scanned PDFs aren't detected as images but have no text
         if text_length < 30 and layout_type and layout_type != "unknown":
@@ -276,7 +280,7 @@ def decide(
                         CATEGORY_UNSTRUCTURED,
                         ReasonCode.PDF_MIXED,
                     )
                 # If text coverage is significant, might not need full OCR
                 if text_length >= config.min_text_length and text_coverage > 10:
                     confidence = calculate_confidence_from_signals(
@@ -359,7 +363,7 @@ def decide(
                 CATEGORY_UNSTRUCTURED,
                 ReasonCode.PDF_SCANNED,
             )
         # Fallback to text-length based decision (when layout analysis not available)
         if text_length >= config.min_text_length:
             # Use unified confidence calculation (fallback mode)
@@ -472,21 +476,21 @@ def refine_with_opencv(
     image_coverage_opencv = opencv_result.get("image_coverage", 0.0)
     has_text_regions = opencv_result.get("has_text_regions", False)
     layout_type = opencv_result.get("layout_type", "unknown")
     # Calculate OCR_SCORE from OpenCV results for unified confidence
     ocr_score_opencv = calculate_ocr_score(
         text_length, image_coverage_opencv, text_coverage_opencv, config
     )
     # Update signals with OpenCV layout data for confidence calculation
     signals_with_opencv = signals.copy()
     signals_with_opencv["layout_type"] = layout_type
     signals_with_opencv["text_coverage"] = text_coverage_opencv
     signals_with_opencv["image_coverage"] = image_coverage_opencv
     # Calculate image_ratio from image_coverage (convert percentage to ratio)
     image_ratio = image_coverage_opencv / 100.0 if image_coverage_opencv > 0 else 0.0
     # 🔥 Hybrid Rule: Sweet spot for OCR detection (applied in OpenCV refinement too)
     # If image_ratio > 0.75 AND extracted_text_length < 30 → OCR
     if image_ratio > 0.75 and text_length < 30:
@@ -526,7 +530,7 @@ def refine_with_opencv(
                 CATEGORY_UNSTRUCTURED,
                 ReasonCode.PDF_MIXED,
             )
         if text_length >= config.min_text_length and text_coverage_opencv > 15:
             # Digital text document - use unified confidence calculation
             confidence = calculate_confidence_from_signals(
@@ -607,7 +611,10 @@ def refine_with_opencv(
     if (initial_needs_ocr and not has_text_regions) or (not initial_needs_ocr and has_text_regions):
         # Calculate OCR_SCORE-based confidence
         ocr_confidence = calculate_confidence_from_signals(
-            signals_with_opencv, needs_ocr=initial_needs_ocr, ocr_score=ocr_score_opencv, config=config
+            signals_with_opencv,
+            needs_ocr=initial_needs_ocr,
+            ocr_score=ocr_score_opencv,
+            config=config,
         )
         # Weighted combination: 30% initial, 70% OCR_SCORE-based (OpenCV is more accurate)
         confidence = (initial_confidence * 0.3) + (ocr_confidence * 0.7)

{preocr-1.2.1 → preocr-1.2.2}/preocr/core/detector.py RENAMED Viewed

@@ -66,20 +66,20 @@ def needs_ocr(
     Note on Confidence Scores:
         Confidence scores may vary between page_level=True and page_level=False modes:
         - **Without page_level**: Confidence is calculated based on document-level heuristics
           and OpenCV analysis (if triggered). Typical range: 0.60-0.95.
         - **With page_level=True**: Confidence is calculated as the average of per-page
           confidence scores, adjusted for consistency. For mixed documents (some pages
           need OCR, some don't), confidence may be lower due to the averaging effect.
           Typical range: 0.60-0.95, but may be lower for mixed documents.
         - **Why the difference**: Page-level analysis provides more granular information
           but averages confidence across pages. Document-level analysis uses overall
           text extraction and layout analysis, which can be more confident for uniform
           documents.
         Both modes are accurate; the difference reflects the analysis granularity.
         Use page_level=True when you need per-page decisions, otherwise use the
         default (page_level=False) for faster, document-level decisions.
@@ -187,7 +187,7 @@ def needs_ocr(
         if opencv_result:
             # Add OpenCV results to signals BEFORE refining (so hybrid rule can use it)
             collected_signals["opencv_layout"] = opencv_result
             # Refine decision based on OpenCV analysis
             needs_ocr_flag, reason, confidence, category, reason_code = decision.refine_with_opencv(
                 collected_signals,
@@ -218,14 +218,14 @@ def needs_ocr(
     if page_analysis and "pages" in page_analysis:
         page_count = page_analysis.get("page_count", 0)
         pages_list = page_analysis.get("pages", [])
         # Only add page-level data if it's valid
         if page_count > 0 and len(pages_list) > 0:
             result["pages"] = pages_list
             result["page_count"] = page_count
             result["pages_needing_ocr"] = page_analysis.get("pages_needing_ocr", 0)
             result["pages_with_text"] = page_analysis.get("pages_with_text", 0)
             # Override overall decision with page-level analysis only if data is valid
             if page_analysis.get("overall_needs_ocr") is not None:
                 # Validate that page-level analysis is complete and consistent

{preocr-1.2.1 → preocr-1.2.2}/preocr/core/extractor.py RENAMED Viewed

@@ -134,8 +134,8 @@ def extract_native_data(
     # Format output
     return format_result(
-        result,
-        output_format=output_format,
+        result,
+        output_format=output_format,
         markdown_clean=markdown_clean,
-        include_metadata=include_metadata
+        include_metadata=include_metadata,
     )

{preocr-1.2.1 → preocr-1.2.2}/preocr/extraction/base.py RENAMED Viewed

@@ -75,7 +75,7 @@ def calculate_confidence(
         text_quality: Quality of text (0.0-1.0), based on font size and clarity
         extraction_method: Method used ("pdfplumber" = 0.9, "pymupdf" = 0.8)
         element_type_certainty: How certain we are about classification (0.0-1.0)
-        bbox_accuracy: How well-defined the bbox is (0.0-1.0)
+        bbox_accuracy: How well-defined the bbox is (0.0-1.0)
     Returns:
         Confidence score between 0.0 and 1.0

{preocr-1.2.1 → preocr-1.2.2}/preocr/extraction/formatters.py RENAMED Viewed

@@ -46,17 +46,17 @@ def format_as_json(result: ExtractionResult) -> Dict[str, Any]:
 def format_as_markdown(result: ExtractionResult, clean: bool = False) -> str:
     """
     Format result as LLM-ready markdown.
     Args:
         result: ExtractionResult to format
         clean: If True, output only content without metadata (file paths, confidence scores, etc.)
                If False, include all metadata (default: False for backward compatibility)
     Returns:
         Markdown string
     """
     lines = []
     # If clean mode, skip all metadata and just output content
     if clean:
         return _format_as_clean_markdown(result)
@@ -191,14 +191,14 @@ def _format_as_clean_markdown(result: ExtractionResult) -> str:
     Perfect for LLM consumption - just the text content.
     """
     lines = []
     # Tables - just the table content
     if result.tables:
         for table in result.tables:
             table_md = _format_table_as_markdown(table)
             lines.append(table_md)
             lines.append("")
     # Forms - just field names and values
     if result.forms:
         for form in result.forms:
@@ -207,7 +207,7 @@ def _format_as_clean_markdown(result: ExtractionResult) -> str:
             elif form.value:
                 lines.append(form.value)
             lines.append("")
     # Elements (text content) - main content
     if result.elements:
         # Group by page
@@ -217,11 +217,11 @@ def _format_as_clean_markdown(result: ExtractionResult) -> str:
             if page_num not in elements_by_page:
                 elements_by_page[page_num] = []
             elements_by_page[page_num].append(elem)
         # Sort pages
         for page_num in sorted(elements_by_page.keys()):
             page_elements = elements_by_page[page_num]
             # Sort by reading order if available
             if result.reading_order:
                 page_elements.sort(
@@ -231,7 +231,7 @@ def _format_as_clean_markdown(result: ExtractionResult) -> str:
                         else 9999
                     )
                 )
             for elem in page_elements:
                 if elem.element_type == ElementType.TITLE:
                     lines.append(f"# {elem.text}")
@@ -249,7 +249,7 @@ def _format_as_clean_markdown(result: ExtractionResult) -> str:
                 elif elem.text:
                     lines.append(elem.text)
                     lines.append("")
     return "\n".join(lines).strip()

preocr 1.2.1.tar.gz → 1.2.2.tar.gz

preocr 1.2.1.tar.gz → 1.2.2.tar.gz