natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +751 -607
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +120 -23
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
- natural_pdf-0.1.35.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,289 @@
|
|
1
|
+
"""Enhanced LLM enrichment with automatic retry for low-scoring suggestions.
|
2
|
+
|
3
|
+
Usage
|
4
|
+
-----
|
5
|
+
python -m tools.bad_pdf_eval.llm_enrich_with_retry --submission ja6EqV1 --model gpt-4o
|
6
|
+
|
7
|
+
Environment
|
8
|
+
-----------
|
9
|
+
OPENAI_API_KEY must be set or passed via --api-key.
|
10
|
+
"""
|
11
|
+
|
12
|
+
import argparse
|
13
|
+
import concurrent.futures as _futures
|
14
|
+
import json
|
15
|
+
import os
|
16
|
+
from pathlib import Path
|
17
|
+
from typing import Any, Dict, Iterable, List
|
18
|
+
|
19
|
+
from openai import OpenAI
|
20
|
+
from pydantic import BaseModel, Field
|
21
|
+
|
22
|
+
# Import quality evaluation
|
23
|
+
from tools.bad_pdf_eval.evaluate_quality import analyze_code_quality
|
24
|
+
|
25
|
+
# Import existing functions and classes
|
26
|
+
from tools.bad_pdf_eval.llm_enrich import (
|
27
|
+
CHEATSHEET_PATH,
|
28
|
+
DECISION_TREE_PATH,
|
29
|
+
EVAL_DIR,
|
30
|
+
WORKFLOWS_PATH,
|
31
|
+
DocOutput,
|
32
|
+
build_pdf_prompt,
|
33
|
+
build_prompt,
|
34
|
+
img_to_b64_jpeg,
|
35
|
+
read_md,
|
36
|
+
)
|
37
|
+
|
38
|
+
# Module-level flag, overwritten by main() from the --force CLI option.
# When true, enrich_with_retry() re-processes a summary even if it already
# holds a thought_process, a code_suggestion and an attempts list.
FORCE = False
|
40
|
+
|
41
|
+
|
42
|
+
class RetryOutput(BaseModel):
    """Improved version after feedback."""

    # Structured-output schema passed as `response_format` for the retry
    # requests issued by enrich_with_retry(). Both fields are required (`...`).
    # NOTE(review): the class docstring and the Field descriptions below are
    # presumably included in the JSON schema sent to the model - confirm
    # before rewording them, since that would change the request.

    # The model's revised reasoning for the retry.
    thought_process: str = Field(
        ..., description="Revised reasoning addressing the specific feedback"
    )
    # The revised snippet; enrich_with_retry() scores it with analyze_code_quality().
    code_suggestion: str = Field(
        ..., description="Improved Python snippet addressing all feedback points"
    )
|
51
|
+
|
52
|
+
|
53
|
+
def build_retry_prompt(
    original_code: str, quality_analysis: Dict[str, Any], context: Dict[str, Any]
) -> List[Dict[str, Any]]:
    """Build the chat messages asking the model to improve a low-scoring snippet.

    Args:
        original_code: The code suggestion that was scored.
        quality_analysis: Scoring result for ``original_code``. Must contain
            ``score`` plus the boolean flags ``uses_guides``, ``uses_real_text``,
            ``uses_until``, ``uses_tatr`` and ``uses_snap_to_whitespace``.
        context: Extra task information. Only ``goal`` is read; a missing or
            ``None`` goal is treated as an empty string.

    Returns:
        A two-element message list (system message, then the user message
        holding the score, the feedback bullets and the original code).

    Raises:
        KeyError: If ``quality_analysis`` lacks one of the keys listed above.
    """

    # The goal comes from a JSON summary and may be null; `or ""` keeps a
    # present-but-None value from crashing on `.lower()`.
    goal_text = str(context.get("goal") or "").lower()

    feedback_points = []

    # Build specific feedback based on what's missing
    if not quality_analysis["uses_guides"] and "table" in goal_text:
        feedback_points.append(
            "• Your code doesn't use the Guides API for table extraction. "
            "Use `Guides.from_content()` with actual column headers from the PDF, "
            "then `snap_to_whitespace()` for better results."
        )

    if not quality_analysis["uses_real_text"]:
        feedback_points.append(
            "• Use actual text from the inspect/describe data as anchors. "
            "Look for real headers, labels, or unique text in the evidence."
        )

    if not quality_analysis["uses_until"]:
        feedback_points.append(
            "• Use the `until=` parameter in `.below()` or `.above()` calls "
            "to define region boundaries based on content, not pixels."
        )

    # TATR is only questioned when the overall score is low.
    if quality_analysis["uses_tatr"] and quality_analysis["score"] < 6:
        feedback_points.append(
            "• Consider if TATR is really necessary. Can you use Guides or "
            "direct region extraction instead?"
        )

    # Only relevant once guides are in use at all.
    if not quality_analysis["uses_snap_to_whitespace"] and quality_analysis["uses_guides"]:
        feedback_points.append(
            "• Add `.snap_to_whitespace()` after creating guides to auto-align "
            "to natural gaps in the content."
        )

    retry_prompt = f"""
Your previous code suggestion scored {quality_analysis['score']}/12 in our quality evaluation.
Here's specific feedback to improve it:

{chr(10).join(feedback_points)}

Original code:
```python
{original_code}
```

Please provide an improved version that addresses all the feedback points.
Focus on using modern Natural PDF features and patterns.
"""

    messages = [
        {
            "role": "system",
            "content": "You are a Natural PDF expert. Improve the code based on specific feedback.",
        },
        {"role": "user", "content": retry_prompt},
    ]

    return messages
|
116
|
+
|
117
|
+
|
118
|
+
def enrich_with_retry(
    summary_path: Path,
    api_key: str,
    model: str = "gpt-4o",
    retry_threshold: int = 6,
    max_retries: int = 2,
):
    """Enrich with automatic retry for low-quality suggestions.

    The function will keep *all* attempts (initial + retries) in `attempts` list so we can
    analyse which feedback helped. The highest-scoring version becomes the primary
    `thought_process` / `code_suggestion` stored at the root level.

    Args:
        summary_path: Path to a ``summary.json``; it is read, updated and
            rewritten in place.
        api_key: Key passed to the ``OpenAI`` client.
        model: Model name used for the initial request and every retry.
        retry_threshold: Retries continue while the latest score is below this.
        max_retries: Upper bound on the number of retry requests.

    Returns:
        None. Returns early without any request when the summary is already
        enriched and the module-level ``FORCE`` flag is false.

    Raises:
        RuntimeError: If the initial request yields no parsed output.
    """

    # Explicit UTF-8: the file is written below with ensure_ascii=False, so
    # relying on the locale encoding can fail on non-UTF-8 platforms.
    summary = json.loads(summary_path.read_text(encoding="utf-8"))

    # Skip if already enriched (unless forced)
    if (
        not FORCE
        and summary.get("thought_process")
        and summary.get("code_suggestion")
        and summary.get("attempts")
    ):
        print(f"[skip] {summary_path.parent.name}: already enriched with attempts")
        return

    print(f"[send] {summary_path.parent.name}: requesting initial enrichment")

    client = OpenAI(api_key=api_key)
    msgs = build_pdf_prompt(summary)

    attempts: List[Dict[str, Any]] = []  # keep all versions

    # Initial attempt
    completion = client.beta.chat.completions.parse(
        model=model, messages=msgs, response_format=DocOutput
    )

    doc_out = completion.choices[0].message.parsed
    if doc_out is None:
        # Nothing to score or store; fail with a clear message instead of an
        # AttributeError on the next line.
        raise RuntimeError("model returned no parsed output for the initial enrichment")

    quality = analyze_code_quality(doc_out.code_suggestion)
    print(f"Initial quality score: {quality['score']}/12")
    attempts.append(
        {
            "attempt": 0,
            "score": quality["score"],
            "thought_process": doc_out.thought_process,
            "code_suggestion": doc_out.code_suggestion,
        }
    )

    # Track the best version as plain values. Previously `best_doc` was an
    # alias of `doc_out`, which the loop overwrote on every retry, so the
    # saved text was always the last attempt rather than the best one.
    best_score = quality["score"]
    best_thought_process = doc_out.thought_process
    best_code_suggestion = doc_out.code_suggestion

    # Code fed into the next retry prompt: always the most recent attempt.
    current_code = doc_out.code_suggestion

    # Retry loop
    retry_count = 0
    while quality["score"] < retry_threshold and retry_count < max_retries:
        retry_count += 1
        print(f"[retry {retry_count}] Score below threshold, requesting improvement...")

        retry_msgs = build_retry_prompt(current_code, quality, {"goal": summary.get("goal", "")})

        retry_completion = client.beta.chat.completions.parse(
            model=model, messages=retry_msgs, response_format=RetryOutput
        )
        retry_out = retry_completion.choices[0].message.parsed
        if retry_out is None:
            # Keep what we already have rather than discarding earlier attempts.
            print(f"[retry {retry_count}] No parsed output returned; keeping best attempt so far")
            break

        # Evaluate new version
        new_quality = analyze_code_quality(retry_out.code_suggestion)
        print(f"Retry {retry_count} quality score: {new_quality['score']}/12")

        # Record attempt details
        attempts.append(
            {
                "attempt": retry_count,
                "score": new_quality["score"],
                "thought_process": retry_out.thought_process,
                "code_suggestion": retry_out.code_suggestion,
            }
        )

        # Update best only on a strict improvement (ties keep the earlier one)
        if new_quality["score"] > best_score:
            best_score = new_quality["score"]
            best_thought_process = retry_out.thought_process
            best_code_suggestion = retry_out.code_suggestion

        # Prepare for next iteration
        quality = new_quality
        current_code = retry_out.code_suggestion

    # Save results – keep best version at root, all attempts nested
    summary["thought_process"] = best_thought_process
    summary["code_suggestion"] = best_code_suggestion
    # These two fields only exist on the initial output; fall back to whatever
    # the summary already had if the model object lacks them.
    summary["difficult_elements"] = getattr(
        doc_out, "difficult_elements", summary.get("difficult_elements")
    )
    summary["test_case"] = getattr(doc_out, "test_case", summary.get("test_case"))
    summary["quality_score"] = best_score
    summary["retry_count"] = retry_count
    summary["attempts"] = attempts  # new key for analysis

    summary_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8")
    print(
        f"[update] Best score: {best_score}/12 after {retry_count} retries (kept all {len(attempts)} attempts)"
    )
|
226
|
+
|
227
|
+
|
228
|
+
def main():
    """Command-line entry point.

    Parses the options, stores ``--force`` in the module-level ``FORCE`` flag,
    resolves which ``summary.json`` files to process (one submission, or every
    sub-directory of ``EVAL_DIR``) and runs ``enrich_with_retry`` on each,
    sequentially or in a thread pool. A failure on one file is printed and
    does not stop the others.
    """
    global FORCE

    parser = argparse.ArgumentParser()
    parser.add_argument("--submission", help="Submission ID to enrich")
    parser.add_argument("--model", default="gpt-4o")
    parser.add_argument("--api-key", default=os.getenv("OPENAI_API_KEY"))
    parser.add_argument("--force", action="store_true")
    parser.add_argument(
        "--retry-threshold", type=int, default=6, help="Minimum quality score before retry"
    )
    parser.add_argument(
        "--max-retries", type=int, default=2, help="Maximum number of retry attempts"
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=1,
        help="Number of parallel workers (use 1 to disable parallelism)",
    )
    opts = parser.parse_args()

    if not opts.api_key:
        raise SystemExit("OPENAI_API_KEY not provided")

    FORCE = opts.force

    # Resolve the work list: a single named submission or everything on disk.
    if opts.submission:
        single = EVAL_DIR / opts.submission / "summary.json"
        if not single.exists():
            raise SystemExit("summary.json not found")
        targets = [single]
    else:
        targets = list(EVAL_DIR.glob("*/summary.json"))

    def _run_one(path: Path):
        # Shared by both execution modes so one bad file never aborts the run.
        try:
            enrich_with_retry(
                path, opts.api_key, opts.model, opts.retry_threshold, opts.max_retries
            )
        except Exception as err:
            print(f"[error] {path.parent.name}: {err}")

    if opts.workers > 1:
        # Threads are used because the work is dominated by API requests.
        print(f"Running with {opts.workers} parallel workers…")
        with _futures.ThreadPoolExecutor(max_workers=opts.workers) as pool:
            list(pool.map(_run_one, targets))
        return

    # Sequential processing
    for path in targets:
        _run_one(path)
|
286
|
+
|
287
|
+
|
288
|
+
# Run the CLI only when this module is executed directly (e.g. via
# `python -m tools.bad_pdf_eval.llm_enrich_with_retry`), not when imported.
if __name__ == "__main__":
    main()
|
tools/bad_pdf_eval/reporter.py
CHANGED
tools/bad_pdf_eval/utils.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
import re
|
2
|
+
import ssl
|
3
|
+
import urllib.request
|
2
4
|
from pathlib import Path
|
3
5
|
from typing import Optional
|
4
|
-
|
5
|
-
import ssl
|
6
|
+
|
6
7
|
from rich.console import Console
|
7
8
|
|
8
9
|
ROOT_DIR = Path(__file__).resolve().parent.parent.parent # project root
|
@@ -103,7 +104,9 @@ def find_local_pdf(submission_id: str, pdf_url: Optional[str] = None) -> Optiona
|
|
103
104
|
req = urllib.request.Request(pdf_url, headers={"User-Agent": "Mozilla/5.0"})
|
104
105
|
# Disable SSL verification edge-cases the storage host sometimes triggers
|
105
106
|
ctx = ssl.create_default_context()
|
106
|
-
with urllib.request.urlopen(req, context=ctx, timeout=30) as resp, open(
|
107
|
+
with urllib.request.urlopen(req, context=ctx, timeout=30) as resp, open(
|
108
|
+
dest_path, "wb"
|
109
|
+
) as f:
|
107
110
|
f.write(resp.read())
|
108
111
|
except Exception as e:
|
109
112
|
# Fallback: try requests if available (venv usually has it)
|
@@ -124,4 +127,4 @@ def find_local_pdf(submission_id: str, pdf_url: Optional[str] = None) -> Optiona
|
|
124
127
|
return None
|
125
128
|
|
126
129
|
# None found
|
127
|
-
return None
|
130
|
+
return None
|
@@ -1,118 +0,0 @@
|
|
1
|
-
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
2
|
-
natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
|
3
|
-
natural_pdf/analyzers/__init__.py,sha256=MQRctn4i5Q7u8pb8vQVHKEXUiVGpKyPZUECrlDH4AuU,673
|
4
|
-
natural_pdf/analyzers/guides.py,sha256=tzyViSBDdM66mT0niwFTDIJ16UzRCZ18Iqv8wA5DYAk,90302
|
5
|
-
natural_pdf/analyzers/shape_detection_mixin.py,sha256=q7gDM-z2t7bSTxjfV2aaW3533CySu1qsEpu4wb5Rp-I,62688
|
6
|
-
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
7
|
-
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
8
|
-
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
9
|
-
natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
|
10
|
-
natural_pdf/analyzers/layout/base.py,sha256=bYawhmc_0xqKG-xbxUSiazIU1om-aBox5Jh8qDqv-eM,6451
|
11
|
-
natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
|
12
|
-
natural_pdf/analyzers/layout/gemini.py,sha256=ldECVCQ5HNQA3Omjg2NOsTrJXslyYb0vErDncmLIiuE,10510
|
13
|
-
natural_pdf/analyzers/layout/layout_analyzer.py,sha256=1v23FVCIGzkoiyRqiLZBwGZssBFKphtMossMENMuMxE,15519
|
14
|
-
natural_pdf/analyzers/layout/layout_manager.py,sha256=vDXBAaNwvp68CRcEPH58MGLxx01OdVgzOh7Uv53L6fs,10319
|
15
|
-
natural_pdf/analyzers/layout/layout_options.py,sha256=-Nv6bcu4_pqSCN6uNhCZ9mvoCBtRDZIUkO6kjkuLXsg,7703
|
16
|
-
natural_pdf/analyzers/layout/paddle.py,sha256=tX2bI1yayAdmRhvsfZ_Ygs7zAG5e9eW-pLJkw4NUpBQ,21325
|
17
|
-
natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh6RoezjdepTnMl90SaNIrP29Pwc,5902
|
18
|
-
natural_pdf/analyzers/layout/surya.py,sha256=ugRXPIHiLoh65lfbbiXO317TbgdtQ-5kVN1nonEf4ws,9778
|
19
|
-
natural_pdf/analyzers/layout/table_structure_utils.py,sha256=nISZDBd46RPYkFHxbQyIHwg9WweG4DslpoYJ31OMJYA,2768
|
20
|
-
natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
|
21
|
-
natural_pdf/analyzers/layout/yolo.py,sha256=ruchj28sxar0DWDALwUz1j30z0CLIEp2QAs0gLVvC4E,8346
|
22
|
-
natural_pdf/classification/manager.py,sha256=pOP2LvJpTBGItvdIODnk735DXq7F2qqxN4AKmBORM3c,21775
|
23
|
-
natural_pdf/classification/mixin.py,sha256=nYpmHQ4BlrealdPtIJt-_idME5o-xKLKNuAdIHzWL6c,7580
|
24
|
-
natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZHWdWJzmsU,3239
|
25
|
-
natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
|
26
|
-
natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
|
27
|
-
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
|
-
natural_pdf/core/element_manager.py,sha256=DbRzAKD3to5NpKc73Q-TXZIZkhx8zZtbi_UNu5K7AAU,52766
|
29
|
-
natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
|
30
|
-
natural_pdf/core/page.py,sha256=k4jezvsLqL07Raglc-rZmMnsVwBMo_A_OerklpBIejY,129477
|
31
|
-
natural_pdf/core/pdf.py,sha256=u0ZCPuIijNecU-AJHLvqfAYVCr9h7MgUKnlEtH6RoZI,75969
|
32
|
-
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
33
|
-
natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
|
34
|
-
natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
|
35
|
-
natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
|
36
|
-
natural_pdf/describe/summary.py,sha256=7FIF3zF6bzNx-gx4pCJr2XQFKiVzOEDnWsAYQ_mr9L0,7982
|
37
|
-
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
38
|
-
natural_pdf/elements/base.py,sha256=VshU4RstdzONJFq_8UVIjT_lVOai0MwMFsSFrCN-IO8,47299
|
39
|
-
natural_pdf/elements/collections.py,sha256=1E2MSg2NNcEcoRM2rumrv_CqIdO7DgbRHYEtfw35FaQ,128457
|
40
|
-
natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
|
41
|
-
natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
|
42
|
-
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
43
|
-
natural_pdf/elements/region.py,sha256=23J5Tv7ffAgz3IBgDXPq9Ab_lLg2Sog7elFRb6nvvZE,140541
|
44
|
-
natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
|
45
|
-
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
46
|
-
natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
|
47
|
-
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
48
|
-
natural_pdf/exporters/hocr.py,sha256=wksvJvWLSxuAfhYzg_0T2_W8eqDoMgAVC-gwZ9FoO_k,19969
|
49
|
-
natural_pdf/exporters/hocr_font.py,sha256=1wsGOMj6zoaRN2rxCwrv4MMLGawpNz984WgXpmWekgw,4574
|
50
|
-
natural_pdf/exporters/original_pdf.py,sha256=dtvC4er6TWOfqq-n24Pejw3mlAuPd8IVyihggJtcf0s,6634
|
51
|
-
natural_pdf/exporters/paddleocr.py,sha256=RBP03GCk0mLeC7tWtuti8AIUHlpOrtvbWkE2n7Ja7k8,19484
|
52
|
-
natural_pdf/exporters/searchable_pdf.py,sha256=G2Tc4tpDXSYIufXJlkA8ppW_3DuzHAaweYKae33pI_c,16290
|
53
|
-
natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
54
|
-
natural_pdf/exporters/data/pdf.ttf,sha256=x4RUIJJaI9iO2DCmOVe4r4Wmao2vjZ_JDoQ2c7LvGlk,572
|
55
|
-
natural_pdf/exporters/data/sRGB.icc,sha256=KpLUuuRQt22LCqQhk9-XTXX2Jzjs6_dPAcXnWxKpV5Y,6922
|
56
|
-
natural_pdf/extraction/manager.py,sha256=sASPJZ5cWFsl8A4PyTjg2yqkyC00tRl6glfoFA6HcsM,4979
|
57
|
-
natural_pdf/extraction/mixin.py,sha256=_5wGnzOCEuRWhqdSUV1Lqo9HIi56YC4MWzbBxOkOEKU,23160
|
58
|
-
natural_pdf/extraction/result.py,sha256=D5DhjxLW7IvhEkvsAP7Zs2YA8K4hyuoTg681CSn5qA0,1825
|
59
|
-
natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
|
60
|
-
natural_pdf/flows/collections.py,sha256=iF8SsfKKb-YVIGi3m-yMRnfKgo_0n_EGhojnYK24h-Q,28493
|
61
|
-
natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
|
62
|
-
natural_pdf/flows/flow.py,sha256=I61BpFVDQyo6ORsmoqoYiOEP1DBRp0vgDJjm_V8frhc,10562
|
63
|
-
natural_pdf/flows/region.py,sha256=s_YAT_0KsrwUs73hhU9xr_35Ufr__XNhRjHSQkxcfYU,27647
|
64
|
-
natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
|
65
|
-
natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,8741
|
66
|
-
natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
|
67
|
-
natural_pdf/ocr/engine_easyocr.py,sha256=bWz6kHUgAJfe3rqdnZBAF-IPvw3B35DlvX5KDdFUtzo,9888
|
68
|
-
natural_pdf/ocr/engine_paddle.py,sha256=9tQZl1VqN6d_KEWUY_S9tfrDLiR4FCHMjgSRNwPlsu8,16152
|
69
|
-
natural_pdf/ocr/engine_surya.py,sha256=lOvSbZk53VKFVxRmqcQzM_0dHVdwTkRGiDZ9AWCgL1Q,5951
|
70
|
-
natural_pdf/ocr/ocr_factory.py,sha256=qjGL3hm_nTzxjwYWP0JE7dCFXZjKN8Z7f9c0oqasb9M,5262
|
71
|
-
natural_pdf/ocr/ocr_manager.py,sha256=jFJI8v3coapKpERoUlP-ptwguZG_Dl4VlclD0xQ6Us8,16192
|
72
|
-
natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
|
73
|
-
natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
|
74
|
-
natural_pdf/qa/__init__.py,sha256=2u2KJcA71g1I0HnLD-j6yvDw1moAjo9kkLhhfoYRURM,166
|
75
|
-
natural_pdf/qa/document_qa.py,sha256=cli1E9NBSVtT5Qo6n7ZRd7BpstnbpZfkljX69LGTYU8,19608
|
76
|
-
natural_pdf/qa/qa_result.py,sha256=_q4dlSqsjtgomcI8-pqbOT69lqQKnEMkhZNydoxEkkE,2227
|
77
|
-
natural_pdf/search/__init__.py,sha256=0Xa7tT_2q57wHObFMQLQLd4gd9AV0oyS-svV6BmmdMI,4276
|
78
|
-
natural_pdf/search/lancedb_search_service.py,sha256=6dz2IEZUWk3hFW28C-LF_85pWohd7Sr5k44bM0pBdm4,14472
|
79
|
-
natural_pdf/search/numpy_search_service.py,sha256=MoPBlyHTDqah1IrwBzyglEyiXlF4wqaU_5mml_ngvGc,10328
|
80
|
-
natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
|
81
|
-
natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
|
82
|
-
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
83
|
-
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
84
|
-
natural_pdf/selectors/parser.py,sha256=T9r7XZhM1cGSYQrc9amUHbFtX-zBqd9_YPK0scwCjAQ,34231
|
85
|
-
natural_pdf/tables/__init__.py,sha256=y65LM2wnu81yzvOX-J_5NXiIK4vEUtHa3EM1xv-0ttQ,105
|
86
|
-
natural_pdf/tables/result.py,sha256=OYc-MjnP-VRTVaY-pBt84E-d8N3AaqzwAud0hHt5sVY,3979
|
87
|
-
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
88
|
-
natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
|
89
|
-
natural_pdf/utils/bidi_mirror.py,sha256=SAe5SnL-xG5Wyo3LtkMttLdsnQqZhzAebLc7BAe6LhQ,1150
|
90
|
-
natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,995
|
91
|
-
natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
|
92
|
-
natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
|
93
|
-
natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
|
94
|
-
natural_pdf/utils/packaging.py,sha256=e7U2wWvpunlAWpPFexNkD_c4dYbPp5LcKo7og4bNGvk,22411
|
95
|
-
natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
|
96
|
-
natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6lSjBaOk,10854
|
97
|
-
natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
|
98
|
-
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
99
|
-
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
100
|
-
natural_pdf-0.1.33.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
101
|
-
optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
|
102
|
-
optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
|
103
|
-
optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
|
104
|
-
optimization/test_cleanup_methods.py,sha256=B_zHiJr1hI8q-tdfBoFi0Jf5lj2PURjA_6teRBGoz8o,6277
|
105
|
-
optimization/test_memory_fix.py,sha256=CWc0OSvFfKE0-nxqJOi_HAQc0GXUPKzkQbTeJp5UqxU,6364
|
106
|
-
tools/bad_pdf_eval/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
|
107
|
-
tools/bad_pdf_eval/analyser.py,sha256=sR31aVVmTXRHS8uwLZXlPefTH2_lskxtAzuZwlhsyOo,13391
|
108
|
-
tools/bad_pdf_eval/collate_summaries.py,sha256=Mcmf1OvVn0S0efj5ypk0syXKSrfUf6L5dowoGvOTgjU,5047
|
109
|
-
tools/bad_pdf_eval/eval_suite.py,sha256=-MK-XLqBo1025sccwYL6tnf7mZ1ZEpxu6EsTYv2ppmU,4294
|
110
|
-
tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrOw1q6ARMl-EazIU,1906
|
111
|
-
tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
|
112
|
-
tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
|
113
|
-
tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
|
114
|
-
natural_pdf-0.1.33.dist-info/METADATA,sha256=mSAwh3vuD9aRvO_AC_XBZG5sw9SeiuidC86a7kuV--I,6711
|
115
|
-
natural_pdf-0.1.33.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
116
|
-
natural_pdf-0.1.33.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
117
|
-
natural_pdf-0.1.33.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
118
|
-
natural_pdf-0.1.33.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|