natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +188 -82
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +132 -16
  19. natural_pdf/core/pdf.py +486 -71
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +238 -111
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.32.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,289 @@
1
+ """Enhanced LLM enrichment with automatic retry for low-scoring suggestions.
2
+
3
+ Usage
4
+ -----
5
+ python -m tools.bad_pdf_eval.llm_enrich_with_retry --submission ja6EqV1 --model gpt-4o
6
+
7
+ Environment
8
+ -----------
9
+ OPENAI_API_KEY must be set or passed via --api-key.
10
+ """
11
+
12
+ import argparse
13
+ import concurrent.futures as _futures
14
+ import json
15
+ import os
16
+ from pathlib import Path
17
+ from typing import Any, Dict, Iterable, List
18
+
19
+ from openai import OpenAI
20
+ from pydantic import BaseModel, Field
21
+
22
+ # Import quality evaluation
23
+ from tools.bad_pdf_eval.evaluate_quality import analyze_code_quality
24
+
25
+ # Import existing functions and classes
26
+ from tools.bad_pdf_eval.llm_enrich import (
27
+ CHEATSHEET_PATH,
28
+ DECISION_TREE_PATH,
29
+ EVAL_DIR,
30
+ WORKFLOWS_PATH,
31
+ DocOutput,
32
+ build_pdf_prompt,
33
+ build_prompt,
34
+ img_to_b64_jpeg,
35
+ read_md,
36
+ )
37
+
38
# Module-level switch: when True, enrich_with_retry() re-processes summaries that
# already contain thought_process / code_suggestion / attempts instead of skipping
# them. main() overwrites it from the --force command-line flag.
FORCE = False
40
+
41
+
42
# Structured-output schema for retry rounds: enrich_with_retry() passes this class
# as `response_format` when asking the model for a revised suggestion, and reads
# `thought_process` / `code_suggestion` from the parsed reply.
# NOTE(review): the docstring and Field descriptions below are presumably exported
# as part of the schema sent to the model — confirm before rewording them.
class RetryOutput(BaseModel):
    """Improved version after feedback."""

    # Model's revised reasoning for this retry round.
    thought_process: str = Field(
        ..., description="Revised reasoning addressing the specific feedback"
    )
    # Revised code snippet; this is what gets re-scored by analyze_code_quality().
    code_suggestion: str = Field(
        ..., description="Improved Python snippet addressing all feedback points"
    )
51
+
52
+
53
def build_retry_prompt(
    original_code: str, quality_analysis: Dict[str, Any], context: Dict[str, Any]
) -> List[Dict[str, Any]]:
    """Assemble the chat messages asking the model to revise a low-scoring snippet.

    Args:
        original_code: The code suggestion that is being sent back for revision.
        quality_analysis: Quality report for ``original_code``. The keys read here
            are ``score``, ``uses_guides``, ``uses_real_text``, ``uses_until``,
            ``uses_tatr`` and ``uses_snap_to_whitespace``.
        context: Extra information about the task; only ``goal`` is consulted
            (a missing key is treated as an empty string).

    Returns:
        A two-element message list: a system message followed by a user message
        containing the score, the applicable feedback bullets and the original code.
    """
    qa = quality_analysis

    # Each rule pairs "does this feedback apply?" with the bullet to emit.
    # Rules are evaluated top to bottom, and each condition short-circuits,
    # so the goal is only inspected when guides are not already in use.
    rules = [
        (
            not qa["uses_guides"] and "table" in context.get("goal", "").lower(),
            "• Your code doesn't use the Guides API for table extraction. "
            "Use `Guides.from_content()` with actual column headers from the PDF, "
            "then `snap_to_whitespace()` for better results.",
        ),
        (
            not qa["uses_real_text"],
            "• Use actual text from the inspect/describe data as anchors. "
            "Look for real headers, labels, or unique text in the evidence.",
        ),
        (
            not qa["uses_until"],
            "• Use the `until=` parameter in `.below()` or `.above()` calls "
            "to define region boundaries based on content, not pixels.",
        ),
        (
            qa["uses_tatr"] and qa["score"] < 6,
            "• Consider if TATR is really necessary. Can you use Guides or "
            "direct region extraction instead?",
        ),
        (
            not qa["uses_snap_to_whitespace"] and qa["uses_guides"],
            "• Add `.snap_to_whitespace()` after creating guides to auto-align "
            "to natural gaps in the content.",
        ),
    ]
    feedback_block = "\n".join(bullet for applies, bullet in rules if applies)

    user_text = f"""
Your previous code suggestion scored {qa['score']}/12 in our quality evaluation.
Here's specific feedback to improve it:

{feedback_block}

Original code:
```python
{original_code}
```

Please provide an improved version that addresses all the feedback points.
Focus on using modern Natural PDF features and patterns.
"""

    system_message = {
        "role": "system",
        "content": "You are a Natural PDF expert. Improve the code based on specific feedback.",
    }
    user_message = {"role": "user", "content": user_text}
    return [system_message, user_message]
116
+
117
+
118
def enrich_with_retry(
    summary_path: Path,
    api_key: str,
    model: str = "gpt-4o",
    retry_threshold: int = 6,
    max_retries: int = 2,
):
    """Enrich a summary file, retrying automatically while the suggestion scores low.

    The function keeps *all* attempts (initial + retries) in an ``attempts`` list so
    we can analyse which feedback helped. The highest-scoring attempt becomes the
    primary ``thought_process`` / ``code_suggestion`` stored at the root level; on a
    tie the earlier attempt wins.

    Args:
        summary_path: Path to the ``summary.json`` to read and update in place.
        api_key: OpenAI API key used to build the client.
        model: Model name passed to the completion calls.
        retry_threshold: A retry is requested while the latest score is below this.
        max_retries: Upper bound on the number of retry requests.

    Returns:
        None. Results are written back to ``summary_path``; already-enriched
        summaries are skipped unless the module-level ``FORCE`` flag is set.
    """

    summary = json.loads(summary_path.read_text())

    # Skip if already enriched (unless forced)
    if (
        not FORCE
        and summary.get("thought_process")
        and summary.get("code_suggestion")
        and summary.get("attempts")
    ):
        print(f"[skip] {summary_path.parent.name}: already enriched with attempts")
        return

    print(f"[send] {summary_path.parent.name}: requesting initial enrichment")

    client = OpenAI(api_key=api_key)
    msgs = build_pdf_prompt(summary)

    # Initial attempt
    completion = client.beta.chat.completions.parse(
        model=model, messages=msgs, response_format=DocOutput
    )

    doc_out = completion.choices[0].message.parsed
    quality = analyze_code_quality(doc_out.code_suggestion)
    print(f"Initial quality score: {quality['score']}/12")

    attempts: List[Dict[str, Any]] = [  # keep all versions
        {
            "attempt": 0,
            "score": quality["score"],
            "thought_process": doc_out.thought_process,
            "code_suggestion": doc_out.code_suggestion,
        }
    ]

    # Track the best attempt as its own record. Previously the "best" object was
    # an alias of the object that was overwritten on every retry, so a retry that
    # scored *worse* still replaced the saved suggestion while `quality_score`
    # kept reporting the better score.
    best_attempt = attempts[0]
    # Code that the next retry prompt should critique (always the latest version).
    latest_code = doc_out.code_suggestion

    # Retry loop
    retry_count = 0
    while quality["score"] < retry_threshold and retry_count < max_retries:
        retry_count += 1
        print(f"[retry {retry_count}] Score below threshold, requesting improvement...")

        retry_msgs = build_retry_prompt(latest_code, quality, {"goal": summary.get("goal", "")})

        retry_completion = client.beta.chat.completions.parse(
            model=model, messages=retry_msgs, response_format=RetryOutput
        )
        retry_out = retry_completion.choices[0].message.parsed

        # Evaluate new version
        new_quality = analyze_code_quality(retry_out.code_suggestion)
        print(f"Retry {retry_count} quality score: {new_quality['score']}/12")

        # Record attempt details
        attempt = {
            "attempt": retry_count,
            "score": new_quality["score"],
            "thought_process": retry_out.thought_process,
            "code_suggestion": retry_out.code_suggestion,
        }
        attempts.append(attempt)

        # Update best only on a strict improvement
        if attempt["score"] > best_attempt["score"]:
            best_attempt = attempt

        # Prepare for next iteration: feedback is always based on the latest try
        quality = new_quality
        latest_code = retry_out.code_suggestion

    # Save results – keep best version at root, all attempts nested
    summary["thought_process"] = best_attempt["thought_process"]
    summary["code_suggestion"] = best_attempt["code_suggestion"]
    # Retry replies only carry reasoning + code, so these two fields come from the
    # initial reply when it has them, otherwise the existing values are kept.
    summary["difficult_elements"] = getattr(
        doc_out, "difficult_elements", summary.get("difficult_elements")
    )
    summary["test_case"] = getattr(doc_out, "test_case", summary.get("test_case"))
    summary["quality_score"] = best_attempt["score"]
    summary["retry_count"] = retry_count
    summary["attempts"] = attempts  # new key for analysis

    summary_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False))
    print(
        f"[update] Best score: {best_attempt['score']}/12 after {retry_count} retries (kept all {len(attempts)} attempts)"
    )
226
+
227
+
228
def main():
    """Command-line entry point: parse options and enrich one or all submissions.

    With ``--submission`` only that submission's ``summary.json`` under ``EVAL_DIR``
    is processed (exiting if it does not exist); otherwise every
    ``*/summary.json`` under ``EVAL_DIR`` is. A failure on one summary is printed
    and does not stop the remaining ones.
    """
    global FORCE

    parser = argparse.ArgumentParser()
    parser.add_argument("--submission", help="Submission ID to enrich")
    parser.add_argument("--model", default="gpt-4o")
    parser.add_argument("--api-key", default=os.getenv("OPENAI_API_KEY"))
    parser.add_argument("--force", action="store_true")
    parser.add_argument(
        "--retry-threshold", type=int, default=6, help="Minimum quality score before retry"
    )
    parser.add_argument(
        "--max-retries", type=int, default=2, help="Maximum number of retry attempts"
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=1,
        help="Number of parallel workers (use 1 to disable parallelism)",
    )
    opts = parser.parse_args()

    if not opts.api_key:
        raise SystemExit("OPENAI_API_KEY not provided")

    FORCE = opts.force

    # Resolve which summary files to work on.
    if opts.submission:
        single = EVAL_DIR / opts.submission / "summary.json"
        if not single.exists():
            raise SystemExit("summary.json not found")
        targets: Iterable[Path] = [single]
    else:
        targets = list(EVAL_DIR.glob("*/summary.json"))

    def _process(target: Path):
        # Report and continue: one bad summary must not abort the whole batch.
        try:
            enrich_with_retry(
                target, opts.api_key, opts.model, opts.retry_threshold, opts.max_retries
            )
        except Exception as err:
            print(f"[error] {target.parent.name}: {err}")

    if opts.workers > 1:
        # Parallel processing with thread pool (IO-bound)
        print(f"Running with {opts.workers} parallel workers…")
        with _futures.ThreadPoolExecutor(max_workers=opts.workers) as pool:
            list(pool.map(_process, targets))
        return

    # Sequential processing
    for target in targets:
        _process(target)
287
+
288
+ if __name__ == "__main__":
289
+ main()
@@ -14,4 +14,4 @@ def save_json(data: Dict[str, Any], path: Path):
14
14
 
15
15
 
16
16
  def log_section(title: str):
17
- console.rule(f"[bold cyan]{title}")
17
+ console.rule(f"[bold cyan]{title}")
@@ -1,8 +1,9 @@
1
1
  import re
2
+ import ssl
3
+ import urllib.request
2
4
  from pathlib import Path
3
5
  from typing import Optional
4
- import urllib.request
5
- import ssl
6
+
6
7
  from rich.console import Console
7
8
 
8
9
  ROOT_DIR = Path(__file__).resolve().parent.parent.parent # project root
@@ -103,7 +104,9 @@ def find_local_pdf(submission_id: str, pdf_url: Optional[str] = None) -> Optiona
103
104
  req = urllib.request.Request(pdf_url, headers={"User-Agent": "Mozilla/5.0"})
104
105
  # Disable SSL verification edge-cases the storage host sometimes triggers
105
106
  ctx = ssl.create_default_context()
106
- with urllib.request.urlopen(req, context=ctx, timeout=30) as resp, open(dest_path, "wb") as f:
107
+ with urllib.request.urlopen(req, context=ctx, timeout=30) as resp, open(
108
+ dest_path, "wb"
109
+ ) as f:
107
110
  f.write(resp.read())
108
111
  except Exception as e:
109
112
  # Fallback: try requests if available (venv usually has it)
@@ -124,4 +127,4 @@ def find_local_pdf(submission_id: str, pdf_url: Optional[str] = None) -> Optiona
124
127
  return None
125
128
 
126
129
  # None found
127
- return None
130
+ return None
@@ -1,118 +0,0 @@
1
- natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
2
- natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
3
- natural_pdf/analyzers/__init__.py,sha256=MQRctn4i5Q7u8pb8vQVHKEXUiVGpKyPZUECrlDH4AuU,673
4
- natural_pdf/analyzers/guides.py,sha256=tzyViSBDdM66mT0niwFTDIJ16UzRCZ18Iqv8wA5DYAk,90302
5
- natural_pdf/analyzers/shape_detection_mixin.py,sha256=q7gDM-z2t7bSTxjfV2aaW3533CySu1qsEpu4wb5Rp-I,62688
6
- natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
7
- natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
8
- natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
9
- natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
10
- natural_pdf/analyzers/layout/base.py,sha256=bYawhmc_0xqKG-xbxUSiazIU1om-aBox5Jh8qDqv-eM,6451
11
- natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
12
- natural_pdf/analyzers/layout/gemini.py,sha256=ldECVCQ5HNQA3Omjg2NOsTrJXslyYb0vErDncmLIiuE,10510
13
- natural_pdf/analyzers/layout/layout_analyzer.py,sha256=1v23FVCIGzkoiyRqiLZBwGZssBFKphtMossMENMuMxE,15519
14
- natural_pdf/analyzers/layout/layout_manager.py,sha256=vDXBAaNwvp68CRcEPH58MGLxx01OdVgzOh7Uv53L6fs,10319
15
- natural_pdf/analyzers/layout/layout_options.py,sha256=-Nv6bcu4_pqSCN6uNhCZ9mvoCBtRDZIUkO6kjkuLXsg,7703
16
- natural_pdf/analyzers/layout/paddle.py,sha256=tX2bI1yayAdmRhvsfZ_Ygs7zAG5e9eW-pLJkw4NUpBQ,21325
17
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh6RoezjdepTnMl90SaNIrP29Pwc,5902
18
- natural_pdf/analyzers/layout/surya.py,sha256=ugRXPIHiLoh65lfbbiXO317TbgdtQ-5kVN1nonEf4ws,9778
19
- natural_pdf/analyzers/layout/table_structure_utils.py,sha256=nISZDBd46RPYkFHxbQyIHwg9WweG4DslpoYJ31OMJYA,2768
20
- natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
21
- natural_pdf/analyzers/layout/yolo.py,sha256=ruchj28sxar0DWDALwUz1j30z0CLIEp2QAs0gLVvC4E,8346
22
- natural_pdf/classification/manager.py,sha256=pOP2LvJpTBGItvdIODnk735DXq7F2qqxN4AKmBORM3c,21775
23
- natural_pdf/classification/mixin.py,sha256=nYpmHQ4BlrealdPtIJt-_idME5o-xKLKNuAdIHzWL6c,7580
24
- natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZHWdWJzmsU,3239
25
- natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
26
- natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
27
- natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
- natural_pdf/core/element_manager.py,sha256=A6GJk9kwTzt-aSz4-SWaRHLZRbIMFFLce3CpxSyfkV4,51749
29
- natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
30
- natural_pdf/core/page.py,sha256=843_Fyk1gxZ8nqERJjjjoRD3iM4pFJy9a0zQSyMthiQ,128476
31
- natural_pdf/core/pdf.py,sha256=mC4GZjPXx_bK6RUlhLpnJnapkHDhbgJpgpcUJOvb7OE,75290
32
- natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
33
- natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
34
- natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
35
- natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
36
- natural_pdf/describe/summary.py,sha256=7FIF3zF6bzNx-gx4pCJr2XQFKiVzOEDnWsAYQ_mr9L0,7982
37
- natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
38
- natural_pdf/elements/base.py,sha256=VshU4RstdzONJFq_8UVIjT_lVOai0MwMFsSFrCN-IO8,47299
39
- natural_pdf/elements/collections.py,sha256=1E2MSg2NNcEcoRM2rumrv_CqIdO7DgbRHYEtfw35FaQ,128457
40
- natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
41
- natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
42
- natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
43
- natural_pdf/elements/region.py,sha256=8SKhzCJ6sELZxJcM2i_58YhEKU6HBvaJ7Oj6E3bOsHw,139523
44
- natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
45
- natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
46
- natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
47
- natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
48
- natural_pdf/exporters/hocr.py,sha256=wksvJvWLSxuAfhYzg_0T2_W8eqDoMgAVC-gwZ9FoO_k,19969
49
- natural_pdf/exporters/hocr_font.py,sha256=1wsGOMj6zoaRN2rxCwrv4MMLGawpNz984WgXpmWekgw,4574
50
- natural_pdf/exporters/original_pdf.py,sha256=dtvC4er6TWOfqq-n24Pejw3mlAuPd8IVyihggJtcf0s,6634
51
- natural_pdf/exporters/paddleocr.py,sha256=RBP03GCk0mLeC7tWtuti8AIUHlpOrtvbWkE2n7Ja7k8,19484
52
- natural_pdf/exporters/searchable_pdf.py,sha256=G2Tc4tpDXSYIufXJlkA8ppW_3DuzHAaweYKae33pI_c,16290
53
- natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
- natural_pdf/exporters/data/pdf.ttf,sha256=x4RUIJJaI9iO2DCmOVe4r4Wmao2vjZ_JDoQ2c7LvGlk,572
55
- natural_pdf/exporters/data/sRGB.icc,sha256=KpLUuuRQt22LCqQhk9-XTXX2Jzjs6_dPAcXnWxKpV5Y,6922
56
- natural_pdf/extraction/manager.py,sha256=sASPJZ5cWFsl8A4PyTjg2yqkyC00tRl6glfoFA6HcsM,4979
57
- natural_pdf/extraction/mixin.py,sha256=_5wGnzOCEuRWhqdSUV1Lqo9HIi56YC4MWzbBxOkOEKU,23160
58
- natural_pdf/extraction/result.py,sha256=D5DhjxLW7IvhEkvsAP7Zs2YA8K4hyuoTg681CSn5qA0,1825
59
- natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
60
- natural_pdf/flows/collections.py,sha256=iF8SsfKKb-YVIGi3m-yMRnfKgo_0n_EGhojnYK24h-Q,28493
61
- natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
62
- natural_pdf/flows/flow.py,sha256=I61BpFVDQyo6ORsmoqoYiOEP1DBRp0vgDJjm_V8frhc,10562
63
- natural_pdf/flows/region.py,sha256=s_YAT_0KsrwUs73hhU9xr_35Ufr__XNhRjHSQkxcfYU,27647
64
- natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
65
- natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,8741
66
- natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
67
- natural_pdf/ocr/engine_easyocr.py,sha256=bWz6kHUgAJfe3rqdnZBAF-IPvw3B35DlvX5KDdFUtzo,9888
68
- natural_pdf/ocr/engine_paddle.py,sha256=9tQZl1VqN6d_KEWUY_S9tfrDLiR4FCHMjgSRNwPlsu8,16152
69
- natural_pdf/ocr/engine_surya.py,sha256=lOvSbZk53VKFVxRmqcQzM_0dHVdwTkRGiDZ9AWCgL1Q,5951
70
- natural_pdf/ocr/ocr_factory.py,sha256=qjGL3hm_nTzxjwYWP0JE7dCFXZjKN8Z7f9c0oqasb9M,5262
71
- natural_pdf/ocr/ocr_manager.py,sha256=jFJI8v3coapKpERoUlP-ptwguZG_Dl4VlclD0xQ6Us8,16192
72
- natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
73
- natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
74
- natural_pdf/qa/__init__.py,sha256=2u2KJcA71g1I0HnLD-j6yvDw1moAjo9kkLhhfoYRURM,166
75
- natural_pdf/qa/document_qa.py,sha256=cli1E9NBSVtT5Qo6n7ZRd7BpstnbpZfkljX69LGTYU8,19608
76
- natural_pdf/qa/qa_result.py,sha256=_q4dlSqsjtgomcI8-pqbOT69lqQKnEMkhZNydoxEkkE,2227
77
- natural_pdf/search/__init__.py,sha256=0Xa7tT_2q57wHObFMQLQLd4gd9AV0oyS-svV6BmmdMI,4276
78
- natural_pdf/search/lancedb_search_service.py,sha256=6dz2IEZUWk3hFW28C-LF_85pWohd7Sr5k44bM0pBdm4,14472
79
- natural_pdf/search/numpy_search_service.py,sha256=MoPBlyHTDqah1IrwBzyglEyiXlF4wqaU_5mml_ngvGc,10328
80
- natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
81
- natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
82
- natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
83
- natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
84
- natural_pdf/selectors/parser.py,sha256=T9r7XZhM1cGSYQrc9amUHbFtX-zBqd9_YPK0scwCjAQ,34231
85
- natural_pdf/tables/__init__.py,sha256=y65LM2wnu81yzvOX-J_5NXiIK4vEUtHa3EM1xv-0ttQ,105
86
- natural_pdf/tables/result.py,sha256=OYc-MjnP-VRTVaY-pBt84E-d8N3AaqzwAud0hHt5sVY,3979
87
- natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
88
- natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
89
- natural_pdf/utils/bidi_mirror.py,sha256=SAe5SnL-xG5Wyo3LtkMttLdsnQqZhzAebLc7BAe6LhQ,1150
90
- natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,995
91
- natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
92
- natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
93
- natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
94
- natural_pdf/utils/packaging.py,sha256=e7U2wWvpunlAWpPFexNkD_c4dYbPp5LcKo7og4bNGvk,22411
95
- natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
96
- natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6lSjBaOk,10854
97
- natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
98
- natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
99
- natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
100
- natural_pdf-0.1.32.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
101
- optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
102
- optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
103
- optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
104
- optimization/test_cleanup_methods.py,sha256=B_zHiJr1hI8q-tdfBoFi0Jf5lj2PURjA_6teRBGoz8o,6277
105
- optimization/test_memory_fix.py,sha256=CWc0OSvFfKE0-nxqJOi_HAQc0GXUPKzkQbTeJp5UqxU,6364
106
- tools/bad_pdf_eval/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
107
- tools/bad_pdf_eval/analyser.py,sha256=sR31aVVmTXRHS8uwLZXlPefTH2_lskxtAzuZwlhsyOo,13391
108
- tools/bad_pdf_eval/collate_summaries.py,sha256=Mcmf1OvVn0S0efj5ypk0syXKSrfUf6L5dowoGvOTgjU,5047
109
- tools/bad_pdf_eval/eval_suite.py,sha256=-MK-XLqBo1025sccwYL6tnf7mZ1ZEpxu6EsTYv2ppmU,4294
110
- tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrOw1q6ARMl-EazIU,1906
111
- tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
112
- tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
113
- tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
114
- natural_pdf-0.1.32.dist-info/METADATA,sha256=CMZIo2BjeLh-b9hezQHMLehZP8brUflCQ69dLtfFyxo,6711
115
- natural_pdf-0.1.32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
116
- natural_pdf-0.1.32.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
117
- natural_pdf-0.1.32.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
118
- natural_pdf-0.1.32.dist-info/RECORD,,