natural-pdf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +43 -3
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/classification/mixin.py +35 -14
- natural_pdf/classification/results.py +16 -1
- natural_pdf/cli.py +1 -0
- natural_pdf/core/highlighting_service.py +23 -0
- natural_pdf/core/page.py +32 -2
- natural_pdf/core/pdf.py +24 -4
- natural_pdf/describe/base.py +11 -1
- natural_pdf/describe/summary.py +26 -0
- natural_pdf/elements/base.py +81 -3
- natural_pdf/elements/collections.py +162 -101
- natural_pdf/elements/region.py +187 -160
- natural_pdf/elements/text.py +15 -7
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +2 -2
- natural_pdf/extraction/mixin.py +295 -11
- natural_pdf/extraction/result.py +28 -1
- natural_pdf/flows/region.py +117 -2
- natural_pdf/ocr/engine_surya.py +25 -5
- natural_pdf/qa/__init__.py +2 -1
- natural_pdf/qa/document_qa.py +166 -113
- natural_pdf/qa/qa_result.py +55 -0
- natural_pdf/selectors/parser.py +22 -0
- natural_pdf/utils/text_extraction.py +34 -14
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +22 -13
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +31 -30
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0
natural_pdf/qa/document_qa.py
CHANGED
@@ -8,6 +8,7 @@ import numpy as np
 from PIL import Image, ImageDraw
 
 from natural_pdf.elements.collections import ElementCollection
+from .qa_result import QAResult
 
 logger = logging.getLogger("natural_pdf.qa.document_qa")
 
@@ -118,34 +119,52 @@ class DocumentQA:
     def ask(
         self,
         image: Union[str, Image.Image, np.ndarray],
-        question: str,
+        question: Union[str, List[str], Tuple[str, ...]],
         word_boxes: List = None,
         min_confidence: float = 0.1,
        debug: bool = False,
         debug_output_dir: str = "output",
-    ) ->
+    ) -> Union[QAResult, List[QAResult]]:
         """
-        Ask
+        Ask one or more natural-language questions about the supplied document image.
+
+        This method now accepts a single *question* (``str``) **or** an
+        iterable of questions (``list``/``tuple`` of ``str``). When multiple
+        questions are provided they are executed in a single batch through the
+        underlying transformers pipeline which is considerably faster than
+        looping and calling :py:meth:`ask` repeatedly.
 
         Args:
-            image: PIL Image
-            question:
-            word_boxes: Optional pre-extracted word
+            image: PIL ``Image``, ``numpy`` array, or path to an image file.
+            question: A question string *or* a list/tuple of question strings.
+            word_boxes: Optional pre-extracted word-boxes in the LayoutLMv3
+                format ``[[text, [x0, y0, x1, y1]], …]``.
+            min_confidence: Minimum confidence threshold below which an answer
+                will be marked as ``found = False``.
+            debug: If ``True`` intermediate artefacts will be written to
+                *debug_output_dir* to aid troubleshooting.
+            debug_output_dir: Directory where debug artefacts should be saved.
 
         Returns:
-                "start": start word index,
-                "end": end word index
-            }
+            • A single :class:`QAResult` when *question* is a string.
+            • A ``list`` of :class:`QAResult`` objects (one per question) when
+              *question* is a list/tuple.
         """
         if not self._is_initialized:
             raise RuntimeError("DocumentQA is not properly initialized")
 
+        # Normalise *questions* to a list so we can treat batch and single
+        # uniformly. We'll remember if the caller supplied a single question
+        # so that we can preserve the original return type.
+        single_question = False
+        if isinstance(question, str):
+            questions = [question]
+            single_question = True
+        elif isinstance(question, (list, tuple)) and all(isinstance(q, str) for q in question):
+            questions = list(question)
+        else:
+            raise TypeError("'question' must be a string or a list/tuple of strings")
+
         # Process the image
         if isinstance(image, str):
             # It's a file path
@@ -161,12 +180,16 @@ class DocumentQA:
         else:
             raise TypeError("Image must be a PIL Image, numpy array, or file path")
 
+        # ------------------------------------------------------------------
+        # Build the queries for the pipeline (either single dict or list).
+        # ------------------------------------------------------------------
+        def _build_query_dict(q: str):
+            d = {"image": image_obj, "question": q}
+            if word_boxes:
+                d["word_boxes"] = word_boxes
+            return d
 
-        if word_boxes:
-            query["word_boxes"] = word_boxes
+        queries = [_build_query_dict(q) for q in questions]
 
         # Save debug information if requested
         if debug:
@@ -202,48 +225,79 @@ class DocumentQA:
             logger.info(f"Word boxes: {word_boxes_path}")
             logger.info(f"Visualization: {vis_path}")
 
+        # ------------------------------------------------------------------
+        # Run the queries through the pipeline (batch or single) and collect
+        # *only the top answer* for each, mirroring the original behaviour.
+        # ------------------------------------------------------------------
+        logger.info(
+            f"Running document QA pipeline with {len(queries)} question{'s' if len(queries) != 1 else ''}."
+        )
+
+        # When we pass a list the pipeline returns a list of per-question
+        # results; each per-question result is itself a list (top-k answers).
+        # We keep only the best answer (index 0) to maintain backwards
+        # compatibility.
+        raw_results = self.pipe(queries if len(queries) > 1 else queries[0])
+
+        # Ensure we always have a list aligned with *questions*
+        if len(queries) == 1:
+            raw_results = [raw_results]
+
+        processed_results: List[QAResult] = []
+
+        for q, res in zip(questions, raw_results):
+            top_res = res[0] if isinstance(res, list) else res  # pipeline may or may not nest
+
+            # Save per-question result in debug mode
+            if debug:
+                # File names: debug_qa_result_0.json, …
+                result_path = os.path.join(debug_output_dir, f"debug_qa_result_{q[:30].replace(' ', '_')}.json")
+                try:
+                    with open(result_path, "w") as f:
+                        serializable = {
+                            k: (
+                                str(v)
+                                if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
+                                else v
+                            )
+                            for k, v in top_res.items()
+                        }
+                        json.dump(serializable, f, indent=2)
+                except Exception as e:
+                    logger.warning(f"Failed to save debug QA result for question '{q}': {e}")
+
+            # Apply confidence threshold
+            if top_res["score"] < min_confidence:
+                qa_res = QAResult(
+                    question=q,
+                    answer="",
+                    confidence=top_res["score"],
+                    start=top_res.get("start", -1),
+                    end=top_res.get("end", -1),
+                    found=False,
+                )
+            else:
+                qa_res = QAResult(
+                    question=q,
+                    answer=top_res["answer"],
+                    confidence=top_res["score"],
+                    start=top_res.get("start", 0),
+                    end=top_res.get("end", 0),
+                    found=True,
+                )
+
+            processed_results.append(qa_res)
+
+        # Return appropriately typed result (single item or list)
+        return processed_results[0] if single_question else processed_results
 
     def ask_pdf_page(
-        self,
+        self,
+        page,
+        question: Union[str, List[str], Tuple[str, ...]],
+        min_confidence: float = 0.1,
+        debug: bool = False,
+    ) -> Union[QAResult, List[QAResult]]:
         """
         Ask a question about a specific PDF page.
 
@@ -253,7 +307,7 @@ class DocumentQA:
             min_confidence: Minimum confidence threshold for answers
 
         Returns:
-
+            QAResult instance with answer details
         """
         # Ensure we have text elements on the page
         if not page.find_all("text"):
@@ -274,8 +328,8 @@ class DocumentQA:
         page_image.save(temp_path)
 
         try:
-            # Ask the question
+            # Ask the question(s)
+            result_obj = self.ask(
                 image=temp_path,
                 question=question,
                 word_boxes=word_boxes,
@@ -283,34 +337,35 @@ class DocumentQA:
                 debug=debug,
            )
 
+            # Ensure we have a list for uniform processing
+            results = result_obj if isinstance(result_obj, list) else [result_obj]
 
-            end_idx = result["end"]
+            for res in results:
+                # Attach page reference
+                res.page_num = page.index
 
+                # Map answer span back to source elements
+                if res.found and "start" in res and "end" in res:
+                    start_idx = res.start
+                    end_idx = res.end
 
+                    if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
+                        matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
 
-                if element.text in matched_texts:
-                    matched_texts.remove(element.text)
+                        source_elements = []
+                        for element in elements:
+                            if hasattr(element, "text") and element.text in matched_texts:
+                                source_elements.append(element)
+                                if element.text in matched_texts:
+                                    matched_texts.remove(element.text)
 
+                        res.source_elements = ElementCollection(source_elements)
 
+            # Return result(s) preserving original input type
+            if isinstance(question, (list, tuple)):
+                return results
+            else:
+                return results[0]
 
         finally:
             # Clean up temporary file
@@ -318,8 +373,12 @@ class DocumentQA:
             os.remove(temp_path)
 
     def ask_pdf_region(
-        self,
+        self,
+        region,
+        question: Union[str, List[str], Tuple[str, ...]],
+        min_confidence: float = 0.1,
+        debug: bool = False,
+    ) -> Union[QAResult, List[QAResult]]:
         """
         Ask a question about a specific region of a PDF page.
 
@@ -329,7 +388,7 @@ class DocumentQA:
             min_confidence: Minimum confidence threshold for answers
 
         Returns:
-
+            QAResult instance with answer details
         """
         # Get all text elements within the region
         elements = region.find_all("text")
@@ -356,8 +415,8 @@ class DocumentQA:
         region_image.save(temp_path)
 
         try:
-            # Ask the question
+            # Ask the question(s)
+            result_obj = self.ask(
                 image=temp_path,
                 question=question,
                 word_boxes=word_boxes,
@@ -365,35 +424,29 @@ class DocumentQA:
                 debug=debug,
            )
 
-        result["region"] = region
-        result["page_num"] = region.page.index
+            results = result_obj if isinstance(result_obj, list) else [result_obj]
 
-            end_idx = result["end"]
+            for res in results:
+                res.region = region
+                res.page_num = region.page.index
 
-            # Since word_boxes may have filtered out some elements, we need to map indices
+                if res.found and "start" in res and "end" in res:
+                    start_idx = res.start
+                    end_idx = res.end
 
+                    if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
+                        matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
 
-                if element.text in matched_texts:
-                    matched_texts.remove(element.text)
+                        source_elements = []
+                        for element in elements:
+                            if hasattr(element, "text") and element.text in matched_texts:
+                                source_elements.append(element)
+                                if element.text in matched_texts:
+                                    matched_texts.remove(element.text)
 
+                        res.source_elements = ElementCollection(source_elements)
 
-            return
+            return results if isinstance(question, (list, tuple)) else results[0]
 
         finally:
             # Clean up temporary file
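To make the new batch behaviour concrete, here is a minimal sketch of how the updated `ask_pdf_page` might be called. The file name, the questions, and the bare `DocumentQA()` constructor are illustrative assumptions; only the method signatures and `QAResult` fields shown in the diff above come from the package.

```python
from natural_pdf import PDF
from natural_pdf.qa.document_qa import DocumentQA

pdf = PDF("invoice.pdf")  # hypothetical input file
page = pdf.pages[0]

qa = DocumentQA()  # assumption: the default constructor loads the bundled QA pipeline

# A single string still returns a single QAResult, as before.
single = qa.ask_pdf_page(page, "What is the invoice number?")
print(single.answer, single.confidence, single.found)

# A list of strings is pushed through the pipeline as one batch and
# returns one QAResult per question, in the same order.
results = qa.ask_pdf_page(
    page,
    ["What is the invoice number?", "What is the total amount due?"],
    min_confidence=0.2,
)
for res in results:
    print(f"{res.question!r} -> {res.answer!r} (found={res.found})")
```

Per the docstring above, the batch path runs all questions through one transformers call, so asking several questions about the same page should be noticeably cheaper than looping over single calls.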
natural_pdf/qa/qa_result.py
ADDED
@@ -0,0 +1,55 @@
+class QAResult(dict):
+    """Dictionary-like container for Document QA results with a convenient ``show`` method.
+
+    This class behaves exactly like a regular ``dict`` so existing code that
+    expects a mapping will continue to work. In addition it exposes:
+
+    • ``show()`` – delegates to the underlying ``source_elements.show`` if those
+      elements are present (added automatically by ``ask_pdf_page`` and
+      ``ask_pdf_region``). This provides a quick way to visualise where an
+      answer was found in the document.
+
+    • Attribute access (e.g. ``result.answer``) as sugar for the usual
+      ``result["answer"]``.
+    """
+
+    # ---------------------------------------------------------------------
+    # Convenience helpers
+    # ---------------------------------------------------------------------
+    def show(self, *args, **kwargs):
+        """Display the answer region by delegating to ``source_elements.show``.
+
+        Any positional or keyword arguments are forwarded to
+        ``ElementCollection.show``.
+        """
+        source = self.get("source_elements")
+        if source is None:
+            raise AttributeError(
+                "QAResult does not contain 'source_elements'; nothing to show()."
+            )
+        if not hasattr(source, "show"):
+            raise AttributeError(
+                "'source_elements' object has no 'show' method; cannot visualise."
+            )
+        return source.show(*args, **kwargs)
+
+    # ------------------------------------------------------------------
+    # Attribute <-> key delegation so ``result.answer`` works
+    # ------------------------------------------------------------------
+    def __getattr__(self, item):
+        try:
+            return self[item]
+        except KeyError as exc:
+            raise AttributeError(item) from exc
+
+    def __setattr__(self, key, value):
+        # Store all non-dunder attributes in the underlying mapping so that
+        # they remain serialisable.
+        if key.startswith("__") and key.endswith("__"):
+            super().__setattr__(key, value)
+        else:
+            self[key] = value
+
+    # Ensure ``copy`` keeps the subclass type
+    def copy(self):
+        return QAResult(self)
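Because `QAResult` is a plain `dict` subclass, its added behaviour can be sketched in isolation; the values below are invented purely for illustration.

```python
from natural_pdf.qa.qa_result import QAResult

res = QAResult(question="What is the total?", answer="$1,280.00", confidence=0.93, found=True)

# Dict behaviour is unchanged, so existing code that expects a mapping keeps working.
print(res["answer"])         # $1,280.00
print(sorted(res.keys()))    # ['answer', 'confidence', 'found', 'question']

# Attribute access is sugar for the same keys...
print(res.answer, res.confidence)

# ...and attribute assignment writes through to the mapping, keeping it serialisable.
res.page_num = 0
assert res["page_num"] == 0

# copy() preserves the subclass type.
assert isinstance(res.copy(), QAResult)

# show() only works on results returned by ask_pdf_page/ask_pdf_region, which
# attach source_elements; calling it here would raise AttributeError.
```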
natural_pdf/selectors/parser.py
CHANGED
@@ -698,6 +698,28 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
 
             filter_lambda = contains_check
 
+        # --- Handle :startswith and :starts-with (alias) --- #
+        elif name in ("starts-with", "startswith") and args is not None:
+            filter_name = f"pseudo-class :{name}({args!r})"
+
+            def startswith_check(element, arg=args):
+                if not hasattr(element, "text") or not element.text:
+                    return False
+                return str(element.text).startswith(str(arg))
+
+            filter_lambda = startswith_check
+
+        # --- Handle :endswith and :ends-with (alias) --- #
+        elif name in ("ends-with", "endswith") and args is not None:
+            filter_name = f"pseudo-class :{name}({args!r})"
+
+            def endswith_check(element, arg=args):
+                if not hasattr(element, "text") or not element.text:
+                    return False
+                return str(element.text).endswith(str(arg))
+
+            filter_lambda = endswith_check
+
         elif name == "starts-with" and args is not None:
             filter_lambda = (
                 lambda el, arg=args: hasattr(el, "text")
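A short sketch of how the new pseudo-classes might be used from the selector API. It assumes the same quoted-argument syntax as the existing `:contains()` pseudo-class and a hypothetical `report.pdf`; the alias spellings come straight from the branch conditions above.

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # hypothetical file
page = pdf.pages[0]

# Text elements whose text starts with "Total"; :starts-with(...) is the alias.
totals = page.find_all('text:startswith("Total")')

# Text elements whose text ends with "%"; :ends-with(...) also works.
percentages = page.find_all('text:endswith("%")')

print(len(totals), len(percentages))
```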
natural_pdf/utils/text_extraction.py
CHANGED
@@ -63,9 +63,9 @@ def _get_layout_kwargs(
         else:
             logger.warning(f"Ignoring unsupported layout keyword argument: '{key}'")
 
-    # 4. Ensure layout flag is present, defaulting to
+    # 4. Ensure layout flag is present, defaulting to False (caller can override)
     if "layout" not in layout_kwargs:
-        layout_kwargs["layout"] =
+        layout_kwargs["layout"] = False
 
     return layout_kwargs
 
@@ -203,24 +203,42 @@ def generate_text_layout(
         logger.debug("generate_text_layout: No valid character dicts found after filtering.")
         return ""
 
-    use_layout = layout_kwargs.pop("layout", True)  # Extract layout flag, default True
+    # Make a working copy of user_kwargs so we can safely pop custom keys
+    incoming_kwargs = user_kwargs.copy() if user_kwargs else {}
 
+    # --- Handle custom 'strip' option ------------------------------------
+    # * strip=True  – post-process the final string to remove leading/trailing
+    #   whitespace (typically used when layout=False)
+    # * strip=False – preserve whitespace exactly as produced.
+    # Default behaviour depends on the layout flag (see below).
+    explicit_strip_flag = incoming_kwargs.pop("strip", None)  # May be None
+
+    # Prepare layout arguments now that we've removed the non-pdfplumber key
+    layout_kwargs = _get_layout_kwargs(layout_context_bbox, incoming_kwargs)
+    use_layout = layout_kwargs.get("layout", False)
+
+    # Determine final strip behaviour: if caller specified override, honour it;
+    # otherwise default to !use_layout (True when layout=False, False when
+    # layout=True) per user request.
+    strip_result = explicit_strip_flag if explicit_strip_flag is not None else (not use_layout)
 
     try:
-        # Sort chars primarily by top, then x0 before layout analysis
-        #
+        # Sort chars primarily by top, then x0 before layout analysis – required by
+        # pdfplumber so that grouping into lines works deterministically.
         valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
+
+        # Build the text map. `layout_kwargs` still contains the caller-specified or
+        # default "layout" flag, which chars_to_textmap will respect.
         textmap = chars_to_textmap(valid_char_dicts, **layout_kwargs)
         result = textmap.as_string
+
+        # ----------------------------------------------------------------
+        # Optional post-processing strip
+        # ----------------------------------------------------------------
+        if strip_result and isinstance(result, str):
+            # Remove trailing spaces on each line then trim leading/trailing
+            # blank lines for a cleaner output while keeping internal newlines.
+            result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
     except Exception as e:
         # Fallback to simple join on error
         logger.error(f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=False)
@@ -230,5 +248,7 @@ def generate_text_layout(
     # Fallback already has sorted characters if layout was attempted
     # Need to use the valid_char_dicts here too
     result = "".join(c.get("text", "") for c in valid_char_dicts)
+    if strip_result:
+        result = result.strip()
 
     return result
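The practical effect is that text generated without layout mode is now stripped of leading/trailing whitespace by default, while an explicit `strip` flag overrides that choice. A sketch of what this could look like from the page-level API, assuming `extract_text` forwards `layout` and `strip` down to `generate_text_layout` (the diff above only shows the lower-level helper):

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # hypothetical file
page = pdf.pages[0]

# Default: layout mode off, so the result is stripped of surrounding
# whitespace and of trailing spaces on each line.
plain = page.extract_text()

# Layout mode on: spacing is preserved and nothing is stripped by default.
layout_text = page.extract_text(layout=True)

# Explicit override: layout off but raw whitespace kept as produced.
raw = page.extract_text(strip=False)

print(repr(plain[:80]))
```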
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.22
+Version: 0.1.24
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: markdown
 Requires-Dist: pandas
 Requires-Dist: pdfplumber
 Requires-Dist: colormath2
@@ -22,12 +23,6 @@ Requires-Dist: tqdm
 Requires-Dist: pydantic
 Requires-Dist: jenkspy
 Requires-Dist: scipy
-Requires-Dist: torch
-Requires-Dist: torchvision
-Requires-Dist: transformers[sentencepiece]
-Requires-Dist: huggingface_hub>=0.29.3
-Requires-Dist: sentence-transformers
-Requires-Dist: timm
 Requires-Dist: ipywidgets>=7.0.0
 Provides-Extra: test
 Requires-Dist: pytest; extra == "test"
@@ -57,6 +52,7 @@ Requires-Dist: natural-pdf[test]; extra == "all"
 Requires-Dist: natural-pdf[search]; extra == "all"
 Requires-Dist: natural-pdf[favorites]; extra == "all"
 Requires-Dist: natural-pdf[export-extras]; extra == "all"
+Requires-Dist: natural-pdf[ai]; extra == "all"
 Provides-Extra: deskew
 Requires-Dist: deskew>=1.5; extra == "deskew"
 Requires-Dist: img2pdf; extra == "deskew"
@@ -68,6 +64,15 @@ Requires-Dist: pikepdf; extra == "ocr-export"
 Provides-Extra: export-extras
 Requires-Dist: jupytext; extra == "export-extras"
 Requires-Dist: nbformat; extra == "export-extras"
+Provides-Extra: ai
+Requires-Dist: sentence-transformers; extra == "ai"
+Requires-Dist: torch; extra == "ai"
+Requires-Dist: torchvision; extra == "ai"
+Requires-Dist: transformers[sentencepiece]; extra == "ai"
+Requires-Dist: huggingface_hub>=0.29.3; extra == "ai"
+Requires-Dist: timm; extra == "ai"
+Requires-Dist: doclayout_yolo; extra == "ai"
+Requires-Dist: easyocr; extra == "ai"
 Dynamic: license-file
 
 # Natural PDF
@@ -87,25 +92,29 @@ Natural PDF lets you find and extract content from PDFs using simple code that m
 pip install natural-pdf
 ```
 
-Need OCR engines, layout models, or other heavy add-ons? Install the **core** once, then use the helper
+Need OCR engines, layout models, or other heavy add-ons? Install the **core** once, then use the helper `npdf` command to pull in exactly what you need:
 
 ```bash
-#
-npdf install
+# Everything you need for classification, document-QA, semantic search, etc.
+npdf install ai
 
 # Surya OCR and the YOLO Doc-Layout detector in one go
 npdf install surya yolo
 
+# add PaddleOCR (+paddlex) after the fact
+npdf install paddle
+
 # see what's already on your machine
 npdf list
 ```
 
-classic
+Lightweight extras such as `deskew` or `search` can still be added with
+classic `pip install`:
 
 ```bash
 pip install "natural-pdf[deskew]"
 pip install "natural-pdf[search]"
+pip install "natural-pdf[ai]"
 ```
 
 More details in the [installation guide](https://jsoma.github.io/natural-pdf/installation/).
@@ -116,7 +125,7 @@ More details in the [installation guide](https://jsoma.github.io/natural-pdf/ins
 from natural_pdf import PDF
 
 # Open a PDF
-pdf = PDF('
+pdf = PDF('https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf')
 page = pdf.pages[0]
 
 # Extract all of the text on the page