natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +113 -22
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
tools/bad_pdf_eval/evaluate_quality.py
ADDED
@@ -0,0 +1,198 @@
+"""Evaluate the quality of LLM enrichment suggestions.
+
+This script analyzes the code suggestions to identify:
+- Use of modern features (Guides API, extract_table)
+- Avoidance of anti-patterns (placeholder text, unnecessary TATR)
+- Practical, working code
+"""
+
+import json
+import re
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+def analyze_code_quality(code: str) -> Dict[str, Any]:
+    """Analyze a code suggestion for quality indicators."""
+
+    quality = {
+        "uses_guides": bool(
+            re.search(r"from natural_pdf\.analyzers import Guides|Guides\(|Guides\.", code)
+        ),
+        "uses_tatr": bool(re.search(r'analyze_layout\([\'"]tatr[\'"]?\)', code)),
+        "uses_extract_table": bool(re.search(r"\.extract_table\(\)", code)),
+        "has_placeholder_text": bool(
+            re.search(r'[\'"](?:AnchorText|Texts|Also|HeaderText|TableHeader)[\'"]', code)
+        ),
+        "uses_real_text": bool(
+            re.search(
+                r'[\'"](?!AnchorText|Texts|Also|HeaderText|TableHeader)[^\'"{}\[\]]+[\'"]', code
+            )
+        )
+        and not bool(
+            re.search(r'[\'"](?:AnchorText|Texts|Also|HeaderText|TableHeader)[\'"]', code)
+        ),
+        "uses_snap_to_whitespace": bool(re.search(r"snap_to_whitespace", code)),
+        "uses_parent_navigation": bool(re.search(r"\.parent\(", code)),
+        "uses_until": bool(re.search(r"until\s*=", code)),
+        "has_ocr_call": bool(re.search(r"apply_ocr\(", code)),
+        "removes_text_layer": bool(re.search(r"remove_text_layer\(|text_layer\s*=\s*False", code)),
+    }
+
+    # Calculate score based on quality indicators
+    score = 0
+    if quality["uses_guides"]:
+        score += 3  # Major positive: modern approach
+    if quality["uses_tatr"]:
+        score += 1  # Minor positive: valid for complex layouts
+    if quality["uses_extract_table"]:
+        score += 2  # Positive: using singular method
+    if quality["uses_real_text"]:
+        score += 2  # Positive: using actual anchors
+    if quality["uses_snap_to_whitespace"]:
+        score += 2  # Positive: modern Guides feature
+    if quality["uses_parent_navigation"]:
+        score += 1  # Positive: robust navigation
+    if quality["uses_until"]:
+        score += 1  # Positive when appropriate: precise region selection
+
+    quality["score"] = score
+    quality["max_score"] = 12
+
+    return quality
+
+
+def analyze_difficult_elements(elements: List[str]) -> Dict[str, int]:
+    """Count types of difficult elements identified."""
+
+    patterns = {
+        "tiny_font": r"tiny.*font|small.*font|font.*size|<\s*\d+\s*pt",
+        "rtl_language": r"arabic|hebrew|rtl|right.*to.*left",
+        "scanned": r"scanned|image.*only|no.*text.*layer",
+        "complex_layout": r"column|multi.*column|layout",
+        "handwritten": r"handwritten|hand.*written",
+        "redacted": r"redact",
+    }
+
+    counts = defaultdict(int)
+    for element in elements:
+        element_lower = element.lower()
+        for category, pattern in patterns.items():
+            if re.search(pattern, element_lower):
+                counts[category] += 1
+
+    return dict(counts)
+
+
+def evaluate_submission(submission_path: Path) -> Dict[str, Any]:
+    """Evaluate a single submission's enrichment quality."""
+
+    summary_path = submission_path / "summary.json"
+    if not summary_path.exists():
+        return None
+
+    data = json.loads(summary_path.read_text())
+
+    result = {
+        "submission_id": data.get("submission_id", submission_path.name),
+        "has_doc_enrichment": bool(data.get("code_suggestion")),
+        "doc_code_quality": None,
+        "difficult_elements_analysis": None,
+        "page_code_quality": [],
+    }
+
+    # Analyze document-level code
+    if data.get("code_suggestion"):
+        result["doc_code_quality"] = analyze_code_quality(data["code_suggestion"])
+
+    # Analyze difficult elements
+    if data.get("difficult_elements"):
+        result["difficult_elements_analysis"] = analyze_difficult_elements(
+            data["difficult_elements"]
+        )
+
+    # Analyze page-level code
+    for page in data.get("pages", []):
+        if page.get("code_suggestion"):
+            page_quality = analyze_code_quality(page["code_suggestion"])
+            page_quality["page_number"] = page.get("page_number")
+            result["page_code_quality"].append(page_quality)
+
+    return result
+
+
+def main():
+    """Analyze all submissions and generate quality report."""
+
+    eval_dir = Path("eval_results")
+    results = []
+
+    for submission_dir in eval_dir.iterdir():
+        if submission_dir.is_dir() and (submission_dir / "summary.json").exists():
+            result = evaluate_submission(submission_dir)
+            if result:
+                results.append(result)
+
+    # Aggregate statistics
+    stats = {
+        "total_submissions": len(results),
+        "with_doc_enrichment": sum(1 for r in results if r["has_doc_enrichment"]),
+        "using_guides": 0,
+        "using_tatr": 0,
+        "using_placeholders": 0,
+        "avg_quality_score": 0,
+        "difficult_elements_breakdown": defaultdict(int),
+    }
+
+    all_scores = []
+    for result in results:
+        if result["doc_code_quality"]:
+            quality = result["doc_code_quality"]
+            if quality["uses_guides"]:
+                stats["using_guides"] += 1
+            if quality["uses_tatr"]:
+                stats["using_tatr"] += 1
+            if quality["has_placeholder_text"]:
+                stats["using_placeholders"] += 1
+            all_scores.append(quality["score"])
+
+        if result["difficult_elements_analysis"]:
+            for elem_type, count in result["difficult_elements_analysis"].items():
+                stats["difficult_elements_breakdown"][elem_type] += count
+
+    if all_scores:
+        stats["avg_quality_score"] = sum(all_scores) / len(all_scores)
+
+    # Generate report
+    print("\n=== Natural PDF Evaluation Quality Report ===\n")
+    print(f"Total submissions analyzed: {stats['total_submissions']}")
+    print(f"With document enrichment: {stats['with_doc_enrichment']}")
+    print(f"\nCode Quality Metrics:")
+    print(
+        f"  Using Guides API: {stats['using_guides']} ({stats['using_guides']/stats['with_doc_enrichment']*100:.1f}%)"
+    )
+    print(
+        f"  Using TATR: {stats['using_tatr']} ({stats['using_tatr']/stats['with_doc_enrichment']*100:.1f}%)"
+    )
+    print(
+        f"  Using placeholders: {stats['using_placeholders']} ({stats['using_placeholders']/stats['with_doc_enrichment']*100:.1f}%)"
+    )
+    print(f"  Average quality score: {stats['avg_quality_score']:.1f}/12")
+
+    print(f"\nDifficult Elements Identified:")
+    for elem_type, count in sorted(
+        stats["difficult_elements_breakdown"].items(), key=lambda x: x[1], reverse=True
+    ):
+        print(f"  {elem_type}: {count}")
+
+    # Save detailed results
+    output_path = eval_dir / "quality_analysis.json"
+    with open(output_path, "w") as f:
+        json.dump({"stats": stats, "detailed_results": results}, f, indent=2)
+
+    print(f"\nDetailed results saved to: {output_path}")
+
+
+if __name__ == "__main__":
+    main()
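
A quick sanity check of the scoring rubric above (a sketch, not part of the package). It assumes the repo root is on sys.path so the new module is importable; the `Guides.from_content` keyword arguments in the scored snippet are illustrative only, and the snippet is pattern-matched, never executed:

from tools.bad_pdf_eval.evaluate_quality import analyze_code_quality

# Hypothetical code suggestion to score.
snippet = '''
from natural_pdf.analyzers import Guides
guides = Guides.from_content(page, markers=["Name", "Amount"])  # illustrative signature
guides.snap_to_whitespace()
rows = page.find('text:contains("Violations")').below(until='text:bold').extract_table()
'''

quality = analyze_code_quality(snippet)
# uses_guides (+3) + uses_extract_table (+2) + uses_real_text (+2)
# + uses_snap_to_whitespace (+2) + uses_until (+1) = 10
print(quality["score"], "/", quality["max_score"])  # 10 / 12
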
tools/bad_pdf_eval/export_enrichment_csv.py
CHANGED
@@ -11,7 +11,7 @@ import argparse
 import csv
 import json
 from pathlib import Path
-from typing import
+from typing import Dict, List
 
 ROOT = Path(__file__).resolve().parent.parent.parent  # repo root
 EVAL_DIR = ROOT / "eval_results"
@@ -29,17 +29,21 @@ def collect_records() -> List[Dict[str, str]]:
         if not tp and not cs:
             # Skip summaries without enrichment at doc level
             continue
-        records.append(
-
-
-
-
+        records.append(
+            {
+                "id": data.get("submission_id", summary_path.parent.name),
+                "thought_process": tp.replace("\n", " ").strip(),
+                "code_suggestion": cs.replace("\n", " ").strip(),
+            }
+        )
     return records
 
 
 def main():
     ap = argparse.ArgumentParser(description="Export enriched summaries to CSV.")
-    ap.add_argument(
+    ap.add_argument(
+        "--out", default=str(EVAL_DIR / "enrichment_export.csv"), help="Output CSV path"
+    )
     args = ap.parse_args()
 
     records = collect_records()
@@ -59,4 +63,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
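
The reshaped `collect_records()` above also documents the export row format: one row per submission with `id`, `thought_process`, and `code_suggestion`, newlines collapsed so each record stays on a single CSV line. A minimal sketch of that flattening (the writer itself sits outside this hunk, so `csv.DictWriter` is an assumption):

import csv

# Hypothetical record, shaped like the dict built in collect_records().
record = {
    "id": "submission_001",
    "thought_process": "Anchor on real header text,\nthen extract the table.",
    "code_suggestion": "page.find('text:contains(\"Total\")').below().extract_table()",
}

# Collapse newlines exactly as the hunk does, keeping one CSV line per record.
flat = {key: value.replace("\n", " ").strip() for key, value in record.items()}

with open("enrichment_export.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["id", "thought_process", "code_suggestion"])
    writer.writeheader()
    writer.writerow(flat)
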
tools/bad_pdf_eval/llm_enrich.py
CHANGED
@@ -8,6 +8,7 @@ Environment
 -----------
 OPENAI_API_KEY must be set or passed via --api-key.
 """
+
 from __future__ import annotations
 
 import argparse
@@ -16,16 +17,17 @@ import json
 import os
 import textwrap
 from pathlib import Path
-from typing import
+from typing import Any, Dict, List
 
 from openai import OpenAI
-from pydantic import BaseModel, Field
 from PIL import Image
+from pydantic import BaseModel, Field
 
 ROOT = Path(__file__).resolve().parent.parent.parent  # repo root
 EVAL_DIR = ROOT / "eval_results"
 CHEATSHEET_PATH = ROOT / "tools" / "bad_pdf_eval" / "LLM_NaturalPDF_CheatSheet.md"
 WORKFLOWS_PATH = ROOT / "tools" / "bad_pdf_eval" / "LLM_NaturalPDF_Workflows.md"
+DECISION_TREE_PATH = ROOT / "tools" / "bad_pdf_eval" / "extraction_decision_tree.md"
 
 
 def read_md(path: Path) -> str:
@@ -43,6 +45,7 @@ def img_to_b64_jpeg(path: Path, max_px: int = 1024) -> str:
 
     from io import BytesIO
 
+
 def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
     """Return OpenAI chat prompt messages list."""
     cheatsheet = read_md(CHEATSHEET_PATH)
@@ -52,7 +55,10 @@ def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
     if page.get("image") and Path(page["image"]).exists():
         try:
             b64 = img_to_b64_jpeg(Path(page["image"]))
-            image_section = {
+            image_section = {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"},
+            }
         except Exception:
             pass
 
@@ -93,6 +99,7 @@ def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
 def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
     cheatsheet = read_md(CHEATSHEET_PATH)
     workflows = read_md(WORKFLOWS_PATH)
+    decision_tree = read_md(DECISION_TREE_PATH)
 
     pdf_overview = [
         f"PDF: {Path(summary['pdf']).name}",
@@ -106,7 +113,10 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
         if page.get("image") and Path(page["image"]).exists():
             try:
                 b64 = img_to_b64_jpeg(Path(page["image"]))
-                image_section = {
+                image_section = {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"},
+                }
             except Exception:
                 pass
         context_json = {
@@ -117,12 +127,14 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
             "blob_sample": page.get("blobs_sample", []),
             "ocr_sample": page.get("ocr_sample", []),
         }
-        per_page_sections.append(
-
-
-
-
-
+        per_page_sections.append(
+            {
+                "page_number": page["page_number"],
+                "goal_tag": page.get("goal_tag") or "generic_extraction",
+                "image": image_section,
+                "context": context_json,
+            }
+        )
 
     sys_msg = textwrap.dedent(
         """
@@ -134,15 +146,23 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
 
     Extraction strategy:
    1. Start with the text layer: `page.extract_text()`, `page.extract_table()`, or region selectors.
-    2.
+    2. For tables, strongly prefer the Guides API over TATR:
+       • Use `Guides.from_content()` with actual column headers as markers
+       • Apply `.snap_to_whitespace()` to auto-align to natural gaps
+       • Only fall back to TATR for genuinely complex multi-table pages
+    3. Use **anchor-based region selection**: locate a stable header/label/line/rect and select the area
     between anchors via `.find()`, `.below()`, `.above()`, `.until()`, `.expand()`, etc.
-    Example: `page.find('text:contains(Violations)').below(until='text:bold')`.
-
+    Example: `page.find('text:contains("Violations")').below(until='text:bold')`.
+    4. Strongly prefer until= to find a specific ending point as opposed to a pixel-based approach,
     as this allows your code to work on potentially other similar pages of the document.
-
-
-
-
+    5. Direct region extraction often works: `region.extract_table()` without any layout model.
+
+    Recent improvements to leverage:
+    • Tiny text (<7pt) is now extracted reliably - no need to flag as difficult
+    • RTL languages (Arabic, Hebrew) work automatically with proper BiDi
+    • Use `.extract_table()` (singular) which returns TableResult with .df property
+    • Guides API can detect lines from pixels directly - no vector lines needed
+    • Can discard corrupted text layers with `PDF(..., text_layer=False)` or `page.remove_text_layer()`
 
     Handle tables, key-value forms, and free-form paragraphs with the same anchor-driven approach. Key-value
     forms might be easily extracted with .ask(...) or .extract(), feel free to mention as an option
@@ -158,30 +178,31 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
     a fluent API, and for loops are discouraged.
 
     Return ONE JSON object with exactly these keys:
-    • thought_process – concise reasoning
+    • thought_process – concise reasoning about your approach, noting if Guides would work better than TATR
     • code_suggestion – executable Python snippet using natural_pdf
-    • difficult_elements – bullet list of page features that are *hard* for any extraction engine **and that you can _prove_ from the supplied evidence** (tiny fonts
-    • test_case – short description of how this PDF/page could be turned into an automated regression test
+    • difficult_elements – bullet list of page features that are *hard* for any extraction engine **and that you can _prove_ from the supplied evidence** (exclude tiny fonts unless <5pt, exclude RTL languages). If no difficult element is evident, return an empty list. Do *not* speculate.
+    • test_case – short description of how this PDF/page could be turned into an automated regression test
 
     Code-style expectations:
     • Use **real sample text** from the page as anchors — never placeholders such as
-    "AnchorText", "Texts", or "Also".
-    fact in the *thought_process* and leave a TODO rather than shipping a placeholder.
+    "AnchorText", "Texts", or "Also". Look in the inspect/describe data for actual text.
     • When a page is flagged as *scanned_image* (or no text layer exists) your code
     MUST call `page.apply_ocr()` *before* any `.find()` or `.extract_text()` calls.
+    • If text appears as "(cid:xxx)" in the evidence, use `page.remove_text_layer()` or
+    `PDF(..., text_layer=False)` before OCR to avoid corrupted text interference.
+    • For table extraction, show Guides-based approach first, TATR only as fallback
     • Prefer `header_el.parent('table')` (up-tree navigation) over a global
-    `page.find('table')[i]` positional index — this is more robust
-
-
-
-
-    • Explicitly name the extractor (`analyze_layout('tatr')`, `analyze_layout('detectron')`)
-    instead of vague comments like "YOLO fallback".
+    `page.find('table')[i]` positional index — this is more robust to layout changes.
+    • Use `.below()` or `.above()` to select regions. Add `until=` only when you need to
+    stop before reaching the page edge (e.g., before another section). Going to page edge
+    is fine without `until`.
+    • Keep page-level suggestions consistent with document-level patterns (same extraction approach)
     """
     )
 
     messages = [
         {"role": "system", "content": sys_msg},
+        {"role": "system", "content": "DECISION TREE:\n" + decision_tree},
         {"role": "system", "content": "CHEATSHEET:\n" + cheatsheet},
         {"role": "system", "content": "WORKFLOWS:\n" + workflows},
     ]
@@ -205,10 +226,21 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
 class DocOutput(BaseModel):
     """LLM enrichment for a whole PDF (single object)."""
 
-    thought_process: str = Field(
-
-
-
+    thought_process: str = Field(
+        ...,
+        description="Overall reasoning about the PDF and extraction plan, noting whether Guides API would be better than TATR for tables",
+    )
+    code_suggestion: str = Field(
+        ...,
+        description="Python snippet using natural_pdf, preferring Guides API over TATR for table extraction",
+    )
+    difficult_elements: List[str] = Field(
+        ...,
+        description="Bullet list of page features that are genuinely hard (not tiny fonts >5pt or RTL languages)",
+    )
+    test_case: str = Field(
+        ..., description="Specific assertion that could verify the extraction worked correctly"
+    )
 
 
 def enrich_summary(summary_path: Path, api_key: str, model: str = "o3"):
@@ -225,9 +257,7 @@ def enrich_summary(summary_path: Path, api_key: str, model: str = "o3"):
     msgs = build_pdf_prompt(summary)
 
     completion = client.beta.chat.completions.parse(
-        model=model,
-        messages=msgs,
-        response_format=DocOutput
+        model=model, messages=msgs, response_format=DocOutput
     )
 
     # Expect exactly one function call in the first choice
@@ -249,10 +279,12 @@ def main():
     ap = argparse.ArgumentParser()
     ap.add_argument("--submission", help="Submission ID to enrich (folder name)")
     ap.add_argument("--model", default="o3")
-    ap.add_argument(
+    ap.add_argument(
+        "--api-key", default=os.getenv("OPENAI_API_KEY"), help="OpenAI key if not in env"
+    )
     ap.add_argument("--force", action="store_true", help="overwrite existing enrichment")
     args = ap.parse_args()
-
+
     if not args.api_key:
         raise SystemExit("OPENAI_API_KEY not provided")
 
@@ -270,4 +302,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
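
Taken together, the new prompt text steers the model toward a single extraction pattern. A minimal sketch of that pattern, using only calls the prompt itself names (`extract_text`, anchor selection with `below(until=...)`, `extract_table().df`, and `apply_ocr` for scanned pages); the file name and selector text here are placeholders:

from natural_pdf import PDF

pdf = PDF("example.pdf")  # hypothetical input
page = pdf.pages[0]

# 1. Try the text layer first.
if not page.extract_text().strip():
    # Scanned page: OCR before any .find() or .extract_text() calls.
    page.apply_ocr()

# 2. Anchor-based region selection: from a real label down to the next bold text.
region = page.find('text:contains("Violations")').below(until="text:bold")

# 3. extract_table() (singular) returns a TableResult exposing .df.
df = region.extract_table().df
print(df.head())
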