natural-pdf 0.1.32__py3-none-any.whl → 0.1.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/element_manager.py +62 -42
- natural_pdf/core/page.py +27 -2
- natural_pdf/core/pdf.py +16 -3
- natural_pdf/elements/region.py +19 -6
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/RECORD +10 -10
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/top_level.txt +0 -0
@@ -146,7 +146,7 @@ class ElementManager:
|
|
146
146
|
contained in the Page class, providing better separation of concerns.
|
147
147
|
"""
|
148
148
|
|
149
|
-
def __init__(self, page, font_attrs=None):
|
149
|
+
def __init__(self, page, font_attrs=None, load_text: bool = True):
|
150
150
|
"""
|
151
151
|
Initialize the ElementManager.
|
152
152
|
|
@@ -156,9 +156,11 @@ class ElementManager:
|
|
156
156
|
Default: ['fontname', 'size', 'bold', 'italic']
|
157
157
|
None: Only consider spatial relationships
|
158
158
|
List: Custom attributes to consider
|
159
|
+
load_text: Whether to load text elements from the PDF (default: True).
|
159
160
|
"""
|
160
161
|
self._page = page
|
161
162
|
self._elements = None # Lazy-loaded
|
163
|
+
self._load_text = load_text
|
162
164
|
# Default to splitting by fontname, size, bold, italic if not specified
|
163
165
|
# Renamed internal variable for clarity
|
164
166
|
self._word_split_attributes = (
|
@@ -175,11 +177,15 @@ class ElementManager:
|
|
175
177
|
|
176
178
|
logger.debug(f"Page {self._page.number}: Loading elements...")
|
177
179
|
|
178
|
-
# 1. Prepare character dictionaries
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
180
|
+
# 1. Prepare character dictionaries only if loading text
|
181
|
+
if self._load_text:
|
182
|
+
prepared_char_dicts = self._prepare_char_dicts()
|
183
|
+
logger.debug(
|
184
|
+
f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
|
185
|
+
)
|
186
|
+
else:
|
187
|
+
prepared_char_dicts = []
|
188
|
+
logger.debug(f"Page {self._page.number}: Skipping text loading (load_text=False)")
|
183
189
|
|
184
190
|
# -------------------------------------------------------------
|
185
191
|
# Detect strikethrough (horizontal strike-out lines) on raw
|
@@ -189,61 +195,75 @@ class ElementManager:
|
|
189
195
|
# belong to the same word.
|
190
196
|
# -------------------------------------------------------------
|
191
197
|
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
198
|
+
if self._load_text and prepared_char_dicts:
|
199
|
+
try:
|
200
|
+
self._mark_strikethrough_chars(prepared_char_dicts)
|
201
|
+
except Exception as strike_err: # pragma: no cover – strike detection must never crash loading
|
202
|
+
logger.warning(
|
203
|
+
f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
|
204
|
+
exc_info=True,
|
205
|
+
)
|
199
206
|
|
200
207
|
# -------------------------------------------------------------
|
201
208
|
# Detect underlines on raw characters (must come after strike so
|
202
209
|
# both attributes are present before word grouping).
|
203
210
|
# -------------------------------------------------------------
|
204
211
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
+
if self._load_text and prepared_char_dicts:
|
213
|
+
try:
|
214
|
+
self._mark_underline_chars(prepared_char_dicts)
|
215
|
+
except Exception as u_err: # pragma: no cover
|
216
|
+
logger.warning(
|
217
|
+
f"Page {self._page.number}: Underline detection failed – {u_err}",
|
218
|
+
exc_info=True,
|
219
|
+
)
|
212
220
|
|
213
221
|
# Detect highlights
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
222
|
+
if self._load_text and prepared_char_dicts:
|
223
|
+
try:
|
224
|
+
self._mark_highlight_chars(prepared_char_dicts)
|
225
|
+
except Exception as h_err:
|
226
|
+
logger.warning(
|
227
|
+
f"Page {self._page.number}: Highlight detection failed – {h_err}",
|
228
|
+
exc_info=True,
|
229
|
+
)
|
221
230
|
|
222
231
|
# Create a mapping from character dict to index for efficient lookup
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
232
|
+
if self._load_text:
|
233
|
+
char_to_index = {}
|
234
|
+
for idx, char_dict in enumerate(prepared_char_dicts):
|
235
|
+
key = (
|
236
|
+
char_dict.get("x0", 0),
|
237
|
+
char_dict.get("top", 0),
|
238
|
+
char_dict.get("text", ""),
|
239
|
+
)
|
240
|
+
char_to_index[key] = idx
|
241
|
+
else:
|
242
|
+
char_to_index = {}
|
231
243
|
|
232
244
|
# 2. Instantiate the custom word extractor
|
233
245
|
# Prefer page-level config over PDF-level for tolerance lookup
|
246
|
+
word_elements: List[TextElement] = []
|
247
|
+
|
248
|
+
# Get config objects (needed for auto_text_tolerance check)
|
234
249
|
page_config = getattr(self._page, "_config", {})
|
235
250
|
pdf_config = getattr(self._page._parent, "_config", {})
|
236
|
-
|
237
|
-
#
|
238
|
-
xt =
|
239
|
-
yt =
|
251
|
+
|
252
|
+
# Initialize tolerance variables
|
253
|
+
xt = None
|
254
|
+
yt = None
|
240
255
|
use_flow = pdf_config.get("use_text_flow", False)
|
256
|
+
|
257
|
+
if self._load_text and prepared_char_dicts:
|
258
|
+
# Start with any explicitly supplied tolerances (may be None)
|
259
|
+
xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
|
260
|
+
yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
|
241
261
|
|
242
262
|
# ------------------------------------------------------------------
|
243
263
|
# Auto-adaptive tolerance: scale based on median character size when
|
244
264
|
# requested and explicit values are absent.
|
245
265
|
# ------------------------------------------------------------------
|
246
|
-
if pdf_config.get("auto_text_tolerance", True):
|
266
|
+
if self._load_text and pdf_config.get("auto_text_tolerance", True):
|
247
267
|
import statistics
|
248
268
|
|
249
269
|
sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
|
@@ -323,7 +343,6 @@ class ElementManager:
|
|
323
343
|
current_line_key = line_key
|
324
344
|
lines[-1].append(char_dict)
|
325
345
|
|
326
|
-
word_elements: List[TextElement] = []
|
327
346
|
# Process each line separately with direction detection
|
328
347
|
for line_chars in lines:
|
329
348
|
if not line_chars:
|
@@ -480,7 +499,8 @@ class ElementManager:
|
|
480
499
|
except Exception:
|
481
500
|
w._obj["highlight_color"] = dominant_color
|
482
501
|
|
483
|
-
generated_words
|
502
|
+
# generated_words defaults to empty list if text loading is disabled
|
503
|
+
generated_words = word_elements if self._load_text else []
|
484
504
|
logger.debug(
|
485
505
|
f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
|
486
506
|
)
|
natural_pdf/core/page.py
CHANGED
@@ -101,7 +101,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
101
101
|
with improved selection, navigation, extraction, and question-answering capabilities.
|
102
102
|
"""
|
103
103
|
|
104
|
-
def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
|
104
|
+
def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None, load_text: bool = True):
|
105
105
|
"""
|
106
106
|
Initialize a page wrapper.
|
107
107
|
|
@@ -110,10 +110,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
110
110
|
parent: Parent PDF object
|
111
111
|
index: Index of this page in the PDF (0-based)
|
112
112
|
font_attrs: Font attributes to consider when grouping characters into words.
|
113
|
+
load_text: Whether to load text elements from the PDF (default: True).
|
113
114
|
"""
|
114
115
|
self._page = page
|
115
116
|
self._parent = parent
|
116
117
|
self._index = index
|
118
|
+
self._load_text = load_text
|
117
119
|
self._text_styles = None # Lazy-loaded text style analyzer results
|
118
120
|
self._exclusions = [] # List to store exclusion functions/regions
|
119
121
|
self._skew_angle: Optional[float] = None # Stores detected skew angle
|
@@ -136,7 +138,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
136
138
|
self._config = dict(getattr(self._parent, "_config", {}))
|
137
139
|
|
138
140
|
# Initialize ElementManager, passing font_attrs
|
139
|
-
self._element_mgr = ElementManager(self, font_attrs=font_attrs)
|
141
|
+
self._element_mgr = ElementManager(self, font_attrs=font_attrs, load_text=self._load_text)
|
140
142
|
# self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
|
141
143
|
# --- NEW --- Central registry for analysis results
|
142
144
|
self.analyses: Dict[str, Any] = {}
|
@@ -2998,6 +3000,29 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2998
3000
|
"""
|
2999
3001
|
return self.find_all('*').inspect(limit=limit)
|
3000
3002
|
|
3003
|
+
def remove_text_layer(self) -> "Page":
|
3004
|
+
"""
|
3005
|
+
Remove all text elements from this page.
|
3006
|
+
|
3007
|
+
This removes all text elements (words and characters) from the page,
|
3008
|
+
effectively clearing the text layer.
|
3009
|
+
|
3010
|
+
Returns:
|
3011
|
+
Self for method chaining
|
3012
|
+
"""
|
3013
|
+
logger.info(f"Page {self.number}: Removing all text elements...")
|
3014
|
+
|
3015
|
+
# Remove all words and chars from the element manager
|
3016
|
+
removed_words = len(self._element_mgr.words)
|
3017
|
+
removed_chars = len(self._element_mgr.chars)
|
3018
|
+
|
3019
|
+
# Clear the lists
|
3020
|
+
self._element_mgr._elements["words"] = []
|
3021
|
+
self._element_mgr._elements["chars"] = []
|
3022
|
+
|
3023
|
+
logger.info(f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters")
|
3024
|
+
return self
|
3025
|
+
|
3001
3026
|
@property
|
3002
3027
|
def lines(self) -> List[Any]:
|
3003
3028
|
"""Get all line elements on this page."""
|
natural_pdf/core/pdf.py
CHANGED
@@ -108,12 +108,13 @@ class _LazyPageList(Sequence):
|
|
108
108
|
also supported and will materialise pages on demand.
|
109
109
|
"""
|
110
110
|
|
111
|
-
def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None):
|
111
|
+
def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True):
|
112
112
|
self._parent_pdf = parent_pdf
|
113
113
|
self._plumber_pdf = plumber_pdf
|
114
114
|
self._font_attrs = font_attrs
|
115
115
|
# One slot per pdfplumber page – initially all None
|
116
116
|
self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
|
117
|
+
self._load_text = load_text
|
117
118
|
|
118
119
|
# Internal helper -----------------------------------------------------
|
119
120
|
def _create_page(self, index: int) -> "Page":
|
@@ -123,7 +124,7 @@ class _LazyPageList(Sequence):
|
|
123
124
|
from natural_pdf.core.page import Page
|
124
125
|
|
125
126
|
plumber_page = self._plumber_pdf.pages[index]
|
126
|
-
cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs)
|
127
|
+
cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs, load_text=self._load_text)
|
127
128
|
self._cache[index] = cached
|
128
129
|
return cached
|
129
130
|
|
@@ -170,6 +171,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
170
171
|
keep_spaces: bool = True,
|
171
172
|
text_tolerance: Optional[dict] = None,
|
172
173
|
auto_text_tolerance: bool = True,
|
174
|
+
text_layer: bool = True,
|
173
175
|
):
|
174
176
|
"""
|
175
177
|
Initialize the enhanced PDF object.
|
@@ -181,11 +183,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
181
183
|
keep_spaces: Whether to include spaces in word elements
|
182
184
|
text_tolerance: PDFplumber-style tolerance settings
|
183
185
|
auto_text_tolerance: Whether to automatically scale text tolerance
|
186
|
+
text_layer: Whether to keep the existing text layer from the PDF (default: True).
|
187
|
+
If False, removes all existing text elements during initialization.
|
184
188
|
"""
|
185
189
|
self._original_path_or_stream = path_or_url_or_stream
|
186
190
|
self._temp_file = None
|
187
191
|
self._resolved_path = None
|
188
192
|
self._is_stream = False
|
193
|
+
self._text_layer = text_layer
|
189
194
|
stream_to_open = None
|
190
195
|
|
191
196
|
if hasattr(path_or_url_or_stream, "read"): # Check if it's file-like
|
@@ -257,7 +262,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
257
262
|
self._manager_registry = {}
|
258
263
|
|
259
264
|
# Lazily instantiate pages only when accessed
|
260
|
-
self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs)
|
265
|
+
self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs, load_text=self._text_layer)
|
261
266
|
|
262
267
|
self._element_cache = {}
|
263
268
|
self._exclusions = []
|
@@ -267,6 +272,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
267
272
|
|
268
273
|
self._initialize_managers()
|
269
274
|
self._initialize_highlighter()
|
275
|
+
|
276
|
+
# Remove text layer if requested
|
277
|
+
if not self._text_layer:
|
278
|
+
logger.info("Removing text layer as requested (text_layer=False)")
|
279
|
+
# Text layer is not loaded when text_layer=False, so no need to remove
|
280
|
+
pass
|
281
|
+
|
270
282
|
# Analysis results accessed via self.analyses property (see below)
|
271
283
|
|
272
284
|
# --- Automatic cleanup when object is garbage-collected ---
|
@@ -1463,6 +1475,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1463
1475
|
reading_order=self._reading_order,
|
1464
1476
|
font_attrs=self._font_attrs,
|
1465
1477
|
keep_spaces=self._config.get("keep_spaces", True),
|
1478
|
+
text_layer=self._text_layer,
|
1466
1479
|
)
|
1467
1480
|
return new_pdf
|
1468
1481
|
except Exception as e:
|
natural_pdf/elements/region.py
CHANGED
@@ -2282,15 +2282,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2282
2282
|
if success:
|
2283
2283
|
removed_count += 1
|
2284
2284
|
|
2285
|
-
# Remove OCR elements overlapping this region
|
2285
|
+
# Remove ALL OCR elements overlapping this region
|
2286
|
+
# Remove elements with source=="ocr" (built-in OCR) or matching the source_label (previous custom OCR)
|
2286
2287
|
for word in list(self.page._element_mgr.words):
|
2287
|
-
|
2288
|
+
word_source = getattr(word, "source", "")
|
2289
|
+
# Match built-in OCR behavior: remove elements with source "ocr" exactly
|
2290
|
+
# Also remove elements with the same source_label to avoid duplicates
|
2291
|
+
if (word_source == "ocr" or word_source == source_label) and self.intersects(word):
|
2288
2292
|
_safe_remove(word)
|
2289
2293
|
|
2290
|
-
# Also
|
2291
|
-
for
|
2292
|
-
|
2293
|
-
|
2294
|
+
# Also remove char dicts if needed (matching built-in OCR)
|
2295
|
+
for char in list(self.page._element_mgr.chars):
|
2296
|
+
# char can be dict or TextElement; normalize
|
2297
|
+
char_src = char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
|
2298
|
+
if char_src == "ocr" or char_src == source_label:
|
2299
|
+
# Rough bbox for dicts
|
2300
|
+
if isinstance(char, dict):
|
2301
|
+
cx0, ctop, cx1, cbottom = char.get("x0", 0), char.get("top", 0), char.get("x1", 0), char.get("bottom", 0)
|
2302
|
+
else:
|
2303
|
+
cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
|
2304
|
+
# Quick overlap check
|
2305
|
+
if not (cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom):
|
2306
|
+
_safe_remove(char)
|
2294
2307
|
|
2295
2308
|
if removed_count > 0:
|
2296
2309
|
logger.info(
|
@@ -25,10 +25,10 @@ natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZH
|
|
25
25
|
natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
|
26
26
|
natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
|
-
natural_pdf/core/element_manager.py,sha256=
|
28
|
+
natural_pdf/core/element_manager.py,sha256=DbRzAKD3to5NpKc73Q-TXZIZkhx8zZtbi_UNu5K7AAU,52766
|
29
29
|
natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
|
30
|
-
natural_pdf/core/page.py,sha256=
|
31
|
-
natural_pdf/core/pdf.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=k4jezvsLqL07Raglc-rZmMnsVwBMo_A_OerklpBIejY,129477
|
31
|
+
natural_pdf/core/pdf.py,sha256=u0ZCPuIijNecU-AJHLvqfAYVCr9h7MgUKnlEtH6RoZI,75969
|
32
32
|
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
33
33
|
natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
|
34
34
|
natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
|
@@ -40,7 +40,7 @@ natural_pdf/elements/collections.py,sha256=1E2MSg2NNcEcoRM2rumrv_CqIdO7DgbRHYEtf
|
|
40
40
|
natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
|
41
41
|
natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
|
42
42
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
43
|
-
natural_pdf/elements/region.py,sha256=
|
43
|
+
natural_pdf/elements/region.py,sha256=23J5Tv7ffAgz3IBgDXPq9Ab_lLg2Sog7elFRb6nvvZE,140541
|
44
44
|
natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
|
45
45
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
46
46
|
natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
|
@@ -97,7 +97,7 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
|
|
97
97
|
natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
|
98
98
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
99
99
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
100
|
-
natural_pdf-0.1.
|
100
|
+
natural_pdf-0.1.33.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
101
101
|
optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
|
102
102
|
optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
|
103
103
|
optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
|
@@ -111,8 +111,8 @@ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrO
|
|
111
111
|
tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
|
112
112
|
tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
|
113
113
|
tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
|
114
|
-
natural_pdf-0.1.
|
115
|
-
natural_pdf-0.1.
|
116
|
-
natural_pdf-0.1.
|
117
|
-
natural_pdf-0.1.
|
118
|
-
natural_pdf-0.1.
|
114
|
+
natural_pdf-0.1.33.dist-info/METADATA,sha256=mSAwh3vuD9aRvO_AC_XBZG5sw9SeiuidC86a7kuV--I,6711
|
115
|
+
natural_pdf-0.1.33.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
116
|
+
natural_pdf-0.1.33.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
117
|
+
natural_pdf-0.1.33.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
118
|
+
natural_pdf-0.1.33.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|