natural-pdf 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +40 -0
- natural_pdf/core/highlighting_service.py +4 -4
- natural_pdf/core/page.py +16 -2
- natural_pdf/describe/base.py +11 -1
- natural_pdf/describe/summary.py +26 -0
- natural_pdf/elements/base.py +2 -2
- natural_pdf/elements/collections.py +139 -100
- natural_pdf/elements/region.py +133 -12
- natural_pdf/elements/text.py +15 -7
- natural_pdf/flows/region.py +116 -1
- natural_pdf/qa/document_qa.py +162 -105
- natural_pdf/utils/text_extraction.py +34 -14
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +18 -18
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0
natural_pdf/qa/document_qa.py
CHANGED
@@ -119,29 +119,52 @@ class DocumentQA:
|
|
119
119
|
def ask(
|
120
120
|
self,
|
121
121
|
image: Union[str, Image.Image, np.ndarray],
|
122
|
-
question: str,
|
122
|
+
question: Union[str, List[str], Tuple[str, ...]],
|
123
123
|
word_boxes: List = None,
|
124
124
|
min_confidence: float = 0.1,
|
125
125
|
debug: bool = False,
|
126
126
|
debug_output_dir: str = "output",
|
127
|
-
) -> QAResult:
|
127
|
+
) -> Union[QAResult, List[QAResult]]:
|
128
128
|
"""
|
129
|
-
Ask
|
129
|
+
Ask one or more natural-language questions about the supplied document image.
|
130
|
+
|
131
|
+
This method now accepts a single *question* (``str``) **or** an
|
132
|
+
iterable of questions (``list``/``tuple`` of ``str``). When multiple
|
133
|
+
questions are provided they are executed in a single batch through the
|
134
|
+
underlying transformers pipeline which is considerably faster than
|
135
|
+
looping and calling :py:meth:`ask` repeatedly.
|
130
136
|
|
131
137
|
Args:
|
132
|
-
image: PIL Image
|
133
|
-
question:
|
134
|
-
word_boxes: Optional pre-extracted word
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
+
image: PIL ``Image``, ``numpy`` array, or path to an image file.
|
139
|
+
question: A question string *or* a list/tuple of question strings.
|
140
|
+
word_boxes: Optional pre-extracted word-boxes in the LayoutLMv3
|
141
|
+
format ``[[text, [x0, y0, x1, y1]], …]``.
|
142
|
+
min_confidence: Minimum confidence threshold below which an answer
|
143
|
+
will be marked as ``found = False``.
|
144
|
+
debug: If ``True`` intermediate artefacts will be written to
|
145
|
+
*debug_output_dir* to aid troubleshooting.
|
146
|
+
debug_output_dir: Directory where debug artefacts should be saved.
|
138
147
|
|
139
148
|
Returns:
|
140
|
-
QAResult
|
149
|
+
• A single :class:`QAResult` when *question* is a string.
|
150
|
+
• A ``list`` of :class:`QAResult` objects (one per question) when
|
151
|
+
*question* is a list/tuple.
|
141
152
|
"""
|
142
153
|
if not self._is_initialized:
|
143
154
|
raise RuntimeError("DocumentQA is not properly initialized")
|
144
155
|
|
156
|
+
# Normalise *questions* to a list so we can treat batch and single
|
157
|
+
# uniformly. We'll remember if the caller supplied a single question
|
158
|
+
# so that we can preserve the original return type.
|
159
|
+
single_question = False
|
160
|
+
if isinstance(question, str):
|
161
|
+
questions = [question]
|
162
|
+
single_question = True
|
163
|
+
elif isinstance(question, (list, tuple)) and all(isinstance(q, str) for q in question):
|
164
|
+
questions = list(question)
|
165
|
+
else:
|
166
|
+
raise TypeError("'question' must be a string or a list/tuple of strings")
|
167
|
+
|
145
168
|
# Process the image
|
146
169
|
if isinstance(image, str):
|
147
170
|
# It's a file path
|
@@ -157,12 +180,16 @@ class DocumentQA:
|
|
157
180
|
else:
|
158
181
|
raise TypeError("Image must be a PIL Image, numpy array, or file path")
|
159
182
|
|
160
|
-
#
|
161
|
-
|
183
|
+
# ------------------------------------------------------------------
|
184
|
+
# Build the queries for the pipeline (either single dict or list).
|
185
|
+
# ------------------------------------------------------------------
|
186
|
+
def _build_query_dict(q: str):
|
187
|
+
d = {"image": image_obj, "question": q}
|
188
|
+
if word_boxes:
|
189
|
+
d["word_boxes"] = word_boxes
|
190
|
+
return d
|
162
191
|
|
163
|
-
|
164
|
-
if word_boxes:
|
165
|
-
query["word_boxes"] = word_boxes
|
192
|
+
queries = [_build_query_dict(q) for q in questions]
|
166
193
|
|
167
194
|
# Save debug information if requested
|
168
195
|
if debug:
|
@@ -198,48 +225,79 @@ class DocumentQA:
|
|
198
225
|
logger.info(f"Word boxes: {word_boxes_path}")
|
199
226
|
logger.info(f"Visualization: {vis_path}")
|
200
227
|
|
201
|
-
#
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
if debug:
|
208
|
-
result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
|
209
|
-
with open(result_path, "w") as f:
|
210
|
-
# Convert any non-serializable data
|
211
|
-
serializable_result = {
|
212
|
-
k: (
|
213
|
-
str(v)
|
214
|
-
if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
|
215
|
-
else v
|
216
|
-
)
|
217
|
-
for k, v in result.items()
|
218
|
-
}
|
219
|
-
json.dump(serializable_result, f, indent=2)
|
220
|
-
|
221
|
-
# Check confidence against threshold
|
222
|
-
if result["score"] < min_confidence:
|
223
|
-
logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
|
224
|
-
return QAResult(
|
225
|
-
answer="",
|
226
|
-
confidence=result["score"],
|
227
|
-
start=result.get("start", -1),
|
228
|
-
end=result.get("end", -1),
|
229
|
-
found=False,
|
230
|
-
)
|
231
|
-
|
232
|
-
return QAResult(
|
233
|
-
answer=result["answer"],
|
234
|
-
confidence=result["score"],
|
235
|
-
start=result.get("start", 0),
|
236
|
-
end=result.get("end", 0),
|
237
|
-
found=True,
|
228
|
+
# ------------------------------------------------------------------
|
229
|
+
# Run the queries through the pipeline (batch or single) and collect
|
230
|
+
# *only the top answer* for each, mirroring the original behaviour.
|
231
|
+
# ------------------------------------------------------------------
|
232
|
+
logger.info(
|
233
|
+
f"Running document QA pipeline with {len(queries)} question{'s' if len(queries) != 1 else ''}."
|
238
234
|
)
|
239
235
|
|
236
|
+
# When we pass a list the pipeline returns a list of per-question
|
237
|
+
# results; each per-question result is itself a list (top-k answers).
|
238
|
+
# We keep only the best answer (index 0) to maintain backwards
|
239
|
+
# compatibility.
|
240
|
+
raw_results = self.pipe(queries if len(queries) > 1 else queries[0])
|
241
|
+
|
242
|
+
# Ensure we always have a list aligned with *questions*
|
243
|
+
if len(queries) == 1:
|
244
|
+
raw_results = [raw_results]
|
245
|
+
|
246
|
+
processed_results: List[QAResult] = []
|
247
|
+
|
248
|
+
for q, res in zip(questions, raw_results):
|
249
|
+
top_res = res[0] if isinstance(res, list) else res # pipeline may or may not nest
|
250
|
+
|
251
|
+
# Save per-question result in debug mode
|
252
|
+
if debug:
|
253
|
+
# File names: debug_qa_result_0.json, …
|
254
|
+
result_path = os.path.join(debug_output_dir, f"debug_qa_result_{q[:30].replace(' ', '_')}.json")
|
255
|
+
try:
|
256
|
+
with open(result_path, "w") as f:
|
257
|
+
serializable = {
|
258
|
+
k: (
|
259
|
+
str(v)
|
260
|
+
if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
|
261
|
+
else v
|
262
|
+
)
|
263
|
+
for k, v in top_res.items()
|
264
|
+
}
|
265
|
+
json.dump(serializable, f, indent=2)
|
266
|
+
except Exception as e:
|
267
|
+
logger.warning(f"Failed to save debug QA result for question '{q}': {e}")
|
268
|
+
|
269
|
+
# Apply confidence threshold
|
270
|
+
if top_res["score"] < min_confidence:
|
271
|
+
qa_res = QAResult(
|
272
|
+
question=q,
|
273
|
+
answer="",
|
274
|
+
confidence=top_res["score"],
|
275
|
+
start=top_res.get("start", -1),
|
276
|
+
end=top_res.get("end", -1),
|
277
|
+
found=False,
|
278
|
+
)
|
279
|
+
else:
|
280
|
+
qa_res = QAResult(
|
281
|
+
question=q,
|
282
|
+
answer=top_res["answer"],
|
283
|
+
confidence=top_res["score"],
|
284
|
+
start=top_res.get("start", 0),
|
285
|
+
end=top_res.get("end", 0),
|
286
|
+
found=True,
|
287
|
+
)
|
288
|
+
|
289
|
+
processed_results.append(qa_res)
|
290
|
+
|
291
|
+
# Return appropriately typed result (single item or list)
|
292
|
+
return processed_results[0] if single_question else processed_results
|
293
|
+
|
240
294
|
def ask_pdf_page(
|
241
|
-
self,
|
242
|
-
|
295
|
+
self,
|
296
|
+
page,
|
297
|
+
question: Union[str, List[str], Tuple[str, ...]],
|
298
|
+
min_confidence: float = 0.1,
|
299
|
+
debug: bool = False,
|
300
|
+
) -> Union[QAResult, List[QAResult]]:
|
243
301
|
"""
|
244
302
|
Ask a question about a specific PDF page.
|
245
303
|
|
@@ -270,8 +328,8 @@ class DocumentQA:
|
|
270
328
|
page_image.save(temp_path)
|
271
329
|
|
272
330
|
try:
|
273
|
-
# Ask the question
|
274
|
-
|
331
|
+
# Ask the question(s)
|
332
|
+
result_obj = self.ask(
|
275
333
|
image=temp_path,
|
276
334
|
question=question,
|
277
335
|
word_boxes=word_boxes,
|
@@ -279,34 +337,35 @@ class DocumentQA:
|
|
279
337
|
debug=debug,
|
280
338
|
)
|
281
339
|
|
282
|
-
#
|
283
|
-
|
340
|
+
# Ensure we have a list for uniform processing
|
341
|
+
results = result_obj if isinstance(result_obj, list) else [result_obj]
|
284
342
|
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
end_idx = result.end
|
343
|
+
for res in results:
|
344
|
+
# Attach page reference
|
345
|
+
res.page_num = page.index
|
289
346
|
|
290
|
-
#
|
291
|
-
if
|
292
|
-
|
293
|
-
|
347
|
+
# Map answer span back to source elements
|
348
|
+
if res.found and "start" in res and "end" in res:
|
349
|
+
start_idx = res.start
|
350
|
+
end_idx = res.end
|
294
351
|
|
295
|
-
|
296
|
-
|
352
|
+
if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
|
353
|
+
matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
|
297
354
|
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
if element.text in matched_texts:
|
305
|
-
matched_texts.remove(element.text)
|
355
|
+
source_elements = []
|
356
|
+
for element in elements:
|
357
|
+
if hasattr(element, "text") and element.text in matched_texts:
|
358
|
+
source_elements.append(element)
|
359
|
+
if element.text in matched_texts:
|
360
|
+
matched_texts.remove(element.text)
|
306
361
|
|
307
|
-
|
362
|
+
res.source_elements = ElementCollection(source_elements)
|
308
363
|
|
309
|
-
|
364
|
+
# Return result(s) preserving original input type
|
365
|
+
if isinstance(question, (list, tuple)):
|
366
|
+
return results
|
367
|
+
else:
|
368
|
+
return results[0]
|
310
369
|
|
311
370
|
finally:
|
312
371
|
# Clean up temporary file
|
@@ -314,8 +373,12 @@ class DocumentQA:
|
|
314
373
|
os.remove(temp_path)
|
315
374
|
|
316
375
|
def ask_pdf_region(
|
317
|
-
self,
|
318
|
-
|
376
|
+
self,
|
377
|
+
region,
|
378
|
+
question: Union[str, List[str], Tuple[str, ...]],
|
379
|
+
min_confidence: float = 0.1,
|
380
|
+
debug: bool = False,
|
381
|
+
) -> Union[QAResult, List[QAResult]]:
|
319
382
|
"""
|
320
383
|
Ask a question about a specific region of a PDF page.
|
321
384
|
|
@@ -352,8 +415,8 @@ class DocumentQA:
|
|
352
415
|
region_image.save(temp_path)
|
353
416
|
|
354
417
|
try:
|
355
|
-
# Ask the question
|
356
|
-
|
418
|
+
# Ask the question(s)
|
419
|
+
result_obj = self.ask(
|
357
420
|
image=temp_path,
|
358
421
|
question=question,
|
359
422
|
word_boxes=word_boxes,
|
@@ -361,35 +424,29 @@ class DocumentQA:
|
|
361
424
|
debug=debug,
|
362
425
|
)
|
363
426
|
|
364
|
-
|
365
|
-
result.region = region
|
366
|
-
result.page_num = region.page.index
|
427
|
+
results = result_obj if isinstance(result_obj, list) else [result_obj]
|
367
428
|
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
end_idx = result.end
|
429
|
+
for res in results:
|
430
|
+
res.region = region
|
431
|
+
res.page_num = region.page.index
|
372
432
|
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
# Since word_boxes may have filtered out some elements, we need to map indices
|
433
|
+
if res.found and "start" in res and "end" in res:
|
434
|
+
start_idx = res.start
|
435
|
+
end_idx = res.end
|
377
436
|
|
378
|
-
|
379
|
-
|
437
|
+
if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
|
438
|
+
matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
|
380
439
|
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
if element.text in matched_texts:
|
388
|
-
matched_texts.remove(element.text)
|
440
|
+
source_elements = []
|
441
|
+
for element in elements:
|
442
|
+
if hasattr(element, "text") and element.text in matched_texts:
|
443
|
+
source_elements.append(element)
|
444
|
+
if element.text in matched_texts:
|
445
|
+
matched_texts.remove(element.text)
|
389
446
|
|
390
|
-
|
447
|
+
res.source_elements = ElementCollection(source_elements)
|
391
448
|
|
392
|
-
return
|
449
|
+
return results if isinstance(question, (list, tuple)) else results[0]
|
393
450
|
|
394
451
|
finally:
|
395
452
|
# Clean up temporary file
|
@@ -63,9 +63,9 @@ def _get_layout_kwargs(
|
|
63
63
|
else:
|
64
64
|
logger.warning(f"Ignoring unsupported layout keyword argument: '{key}'")
|
65
65
|
|
66
|
-
# 4. Ensure layout flag is present, defaulting to True
|
66
|
+
# 4. Ensure layout flag is present, defaulting to False (caller can override)
|
67
67
|
if "layout" not in layout_kwargs:
|
68
|
-
layout_kwargs["layout"] = True
|
68
|
+
layout_kwargs["layout"] = False
|
69
69
|
|
70
70
|
return layout_kwargs
|
71
71
|
|
@@ -203,24 +203,42 @@ def generate_text_layout(
|
|
203
203
|
logger.debug("generate_text_layout: No valid character dicts found after filtering.")
|
204
204
|
return ""
|
205
205
|
|
206
|
-
#
|
207
|
-
|
208
|
-
use_layout = layout_kwargs.pop("layout", True) # Extract layout flag, default True
|
206
|
+
# Make a working copy of user_kwargs so we can safely pop custom keys
|
207
|
+
incoming_kwargs = user_kwargs.copy() if user_kwargs else {}
|
209
208
|
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
209
|
+
# --- Handle custom 'strip' option ------------------------------------
|
210
|
+
# * strip=True – post-process the final string to remove leading/trailing
|
211
|
+
# whitespace (typically used when layout=False)
|
212
|
+
# * strip=False – preserve whitespace exactly as produced.
|
213
|
+
# Default behaviour depends on the layout flag (see below).
|
214
|
+
explicit_strip_flag = incoming_kwargs.pop("strip", None) # May be None
|
215
|
+
|
216
|
+
# Prepare layout arguments now that we've removed the non-pdfplumber key
|
217
|
+
layout_kwargs = _get_layout_kwargs(layout_context_bbox, incoming_kwargs)
|
218
|
+
use_layout = layout_kwargs.get("layout", False)
|
219
|
+
|
220
|
+
# Determine final strip behaviour: if caller specified override, honour it;
|
221
|
+
# otherwise default to !use_layout (True when layout=False, False when
|
222
|
+
# layout=True) per user request.
|
223
|
+
strip_result = explicit_strip_flag if explicit_strip_flag is not None else (not use_layout)
|
217
224
|
|
218
225
|
try:
|
219
|
-
# Sort chars primarily by top, then x0 before layout analysis
|
220
|
-
#
|
226
|
+
# Sort chars primarily by top, then x0 before layout analysis – required by
|
227
|
+
# pdfplumber so that grouping into lines works deterministically.
|
221
228
|
valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
|
229
|
+
|
230
|
+
# Build the text map. `layout_kwargs` still contains the caller-specified or
|
231
|
+
# default "layout" flag, which chars_to_textmap will respect.
|
222
232
|
textmap = chars_to_textmap(valid_char_dicts, **layout_kwargs)
|
223
233
|
result = textmap.as_string
|
234
|
+
|
235
|
+
# ----------------------------------------------------------------
|
236
|
+
# Optional post-processing strip
|
237
|
+
# ----------------------------------------------------------------
|
238
|
+
if strip_result and isinstance(result, str):
|
239
|
+
# Remove trailing spaces on each line then trim leading/trailing
|
240
|
+
# blank lines for a cleaner output while keeping internal newlines.
|
241
|
+
result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
|
224
242
|
except Exception as e:
|
225
243
|
# Fallback to simple join on error
|
226
244
|
logger.error(f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=False)
|
@@ -230,5 +248,7 @@ def generate_text_layout(
|
|
230
248
|
# Fallback already has sorted characters if layout was attempted
|
231
249
|
# Need to use the valid_char_dicts here too
|
232
250
|
result = "".join(c.get("text", "") for c in valid_char_dicts)
|
251
|
+
if strip_result:
|
252
|
+
result = result.strip()
|
233
253
|
|
234
254
|
return result
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.23
|
3
|
+
Version: 0.1.24
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
|
|
11
11
|
Requires-Python: >=3.9
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
License-File: LICENSE
|
14
|
+
Requires-Dist: markdown
|
14
15
|
Requires-Dist: pandas
|
15
16
|
Requires-Dist: pdfplumber
|
16
17
|
Requires-Dist: colormath2
|
@@ -1,7 +1,7 @@
|
|
1
1
|
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
2
2
|
natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
|
3
3
|
natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
|
4
|
-
natural_pdf/analyzers/shape_detection_mixin.py,sha256=
|
4
|
+
natural_pdf/analyzers/shape_detection_mixin.py,sha256=aHn4EMdbwOe8VWECPceGs5wN7gJP_kIxyAbmbNlNPSs,83634
|
5
5
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
6
6
|
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
7
7
|
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
@@ -25,21 +25,21 @@ natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiY
|
|
25
25
|
natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
|
26
26
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
27
27
|
natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
|
28
|
-
natural_pdf/core/highlighting_service.py,sha256=
|
29
|
-
natural_pdf/core/page.py,sha256=
|
28
|
+
natural_pdf/core/highlighting_service.py,sha256=DKoaxiiuQsWgtf6wSroMAIcFiqJOOF7dXhciYdQKdCw,38223
|
29
|
+
natural_pdf/core/page.py,sha256=TOtpUp5lRhDj32wv3yvRaS8kxPX6R9904OCC6uHFi84,119512
|
30
30
|
natural_pdf/core/pdf.py,sha256=qsSW4RxOJRmCnweLPMs0NhzkRfiAVdghTgnh4D_wuO4,74295
|
31
31
|
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
32
|
-
natural_pdf/describe/base.py,sha256=
|
32
|
+
natural_pdf/describe/base.py,sha256=mUvEydumXXPJ2FkWAYm1BbWrRWY81I0dMyQrEU32rmc,17256
|
33
33
|
natural_pdf/describe/elements.py,sha256=xD8wwR1z5IKat7RIwoAwQRUEL6zJTEwcOKorF4F-xPg,12717
|
34
34
|
natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
|
35
|
-
natural_pdf/describe/summary.py,sha256=
|
35
|
+
natural_pdf/describe/summary.py,sha256=h5zy9zG7t27wFnJ2hEguGSoURtN2IR4x6WBO3aXB4eo,7980
|
36
36
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
37
|
-
natural_pdf/elements/base.py,sha256=
|
38
|
-
natural_pdf/elements/collections.py,sha256=
|
37
|
+
natural_pdf/elements/base.py,sha256=iw-Ab0o7eI69npt0gAxQvA14GPWHAAhkLrJ_JeKvIos,43309
|
38
|
+
natural_pdf/elements/collections.py,sha256=JrM42VPRtDOJ9Q9KIR3SrcbamiiCHXI4nzTq2BBkeEk,124223
|
39
39
|
natural_pdf/elements/line.py,sha256=300kSFBDUBIudfeQtH_tzW9gTYRgRKUDPiTABw6J-BE,4782
|
40
40
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
41
|
-
natural_pdf/elements/region.py,sha256=
|
42
|
-
natural_pdf/elements/text.py,sha256=
|
41
|
+
natural_pdf/elements/region.py,sha256=CVncbiCk8ivn04CI7Ob93O7UY0ANVpCJwikBt-jVWgg,123698
|
42
|
+
natural_pdf/elements/text.py,sha256=yshGrvdiBZSkYhQfdi6Yz6NN0kWvmqKHSSC82D829os,11470
|
43
43
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
44
44
|
natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
|
45
45
|
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
@@ -58,7 +58,7 @@ natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU
|
|
58
58
|
natural_pdf/flows/collections.py,sha256=qGuSPFSPQF-wiYquG6STiSzg_o951MSsFEq_B44Jef8,28441
|
59
59
|
natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
|
60
60
|
natural_pdf/flows/flow.py,sha256=I61BpFVDQyo6ORsmoqoYiOEP1DBRp0vgDJjm_V8frhc,10562
|
61
|
-
natural_pdf/flows/region.py,sha256=
|
61
|
+
natural_pdf/flows/region.py,sha256=4U3S7pLEa3oCyPfS-hpD0lSXf8MWT-MdF9AsVvMJbWU,26670
|
62
62
|
natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
|
63
63
|
natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,8741
|
64
64
|
natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
|
@@ -70,7 +70,7 @@ natural_pdf/ocr/ocr_manager.py,sha256=K2gpFo3e6RB1ouXOstlEAAYd14DbjBNt5RH6J7ZdDQ
|
|
70
70
|
natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
|
71
71
|
natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
|
72
72
|
natural_pdf/qa/__init__.py,sha256=2u2KJcA71g1I0HnLD-j6yvDw1moAjo9kkLhhfoYRURM,166
|
73
|
-
natural_pdf/qa/document_qa.py,sha256=
|
73
|
+
natural_pdf/qa/document_qa.py,sha256=6-XuIEFf5BcVA_e85FBmAeXpNZgzZhTBDkNUMPAl-tc,17803
|
74
74
|
natural_pdf/qa/qa_result.py,sha256=_q4dlSqsjtgomcI8-pqbOT69lqQKnEMkhZNydoxEkkE,2227
|
75
75
|
natural_pdf/search/__init__.py,sha256=0Xa7tT_2q57wHObFMQLQLd4gd9AV0oyS-svV6BmmdMI,4276
|
76
76
|
natural_pdf/search/lancedb_search_service.py,sha256=6dz2IEZUWk3hFW28C-LF_85pWohd7Sr5k44bM0pBdm4,14472
|
@@ -88,13 +88,13 @@ natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2b
|
|
88
88
|
natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
|
89
89
|
natural_pdf/utils/packaging.py,sha256=Jshxp6S1zfcqoZmFhdd7WOpL--b6rBSz-Y9mYqELXIY,21581
|
90
90
|
natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
|
91
|
-
natural_pdf/utils/text_extraction.py,sha256=
|
91
|
+
natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6lSjBaOk,10854
|
92
92
|
natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
|
93
93
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
94
94
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
95
|
-
natural_pdf-0.1.
|
96
|
-
natural_pdf-0.1.
|
97
|
-
natural_pdf-0.1.
|
98
|
-
natural_pdf-0.1.
|
99
|
-
natural_pdf-0.1.
|
100
|
-
natural_pdf-0.1.
|
95
|
+
natural_pdf-0.1.24.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
96
|
+
natural_pdf-0.1.24.dist-info/METADATA,sha256=qcyQUXKXciLsomzdsdkQ4inSw_MJbczyj8oPq4KVGZQ,6684
|
97
|
+
natural_pdf-0.1.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
98
|
+
natural_pdf-0.1.24.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
99
|
+
natural_pdf-0.1.24.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
|
100
|
+
natural_pdf-0.1.24.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|