natural-pdf 0.1.23__py3-none-any.whl → 0.1.26.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import json
2
2
  import logging
3
3
  import os
4
4
  import tempfile
5
+ import warnings
5
6
  from typing import Any, Dict, List, Optional, Tuple, Union
6
7
 
7
8
  import numpy as np
@@ -119,29 +120,52 @@ class DocumentQA:
119
120
  def ask(
120
121
  self,
121
122
  image: Union[str, Image.Image, np.ndarray],
122
- question: str,
123
+ question: Union[str, List[str], Tuple[str, ...]],
123
124
  word_boxes: List = None,
124
125
  min_confidence: float = 0.1,
125
126
  debug: bool = False,
126
127
  debug_output_dir: str = "output",
127
- ) -> QAResult:
128
+ ) -> Union[QAResult, List[QAResult]]:
128
129
  """
129
- Ask a question about document content.
130
+ Ask one or more natural-language questions about the supplied document image.
131
+
132
+ This method now accepts a single *question* (``str``) **or** an
133
+ iterable of questions (``list``/``tuple`` of ``str``). When multiple
134
+ questions are provided they are executed in a single batch through the
135
+ underlying transformers pipeline which is considerably faster than
136
+ looping and calling :py:meth:`ask` repeatedly.
130
137
 
131
138
  Args:
132
- image: PIL Image, numpy array, or path to image file
133
- question: Question to ask about the document
134
- word_boxes: Optional pre-extracted word boxes [[text, [x0, y0, x1, y1]], ...]
135
- min_confidence: Minimum confidence threshold for answers
136
- debug: Whether to save debug information
137
- debug_output_dir: Directory to save debug files
139
+ image: PIL ``Image``, ``numpy`` array, or path to an image file.
140
+ question: A question string *or* a list/tuple of question strings.
141
+ word_boxes: Optional pre-extracted word-boxes in the LayoutLMv3
142
+ format ``[[text, [x0, y0, x1, y1]], …]``.
143
+ min_confidence: Minimum confidence threshold below which an answer
144
+ will be marked as ``found = False``.
145
+ debug: If ``True`` intermediate artefacts will be written to
146
+ *debug_output_dir* to aid troubleshooting.
147
+ debug_output_dir: Directory where debug artefacts should be saved.
138
148
 
139
149
  Returns:
140
- QAResult instance with answer details
150
+ • A single :class:`QAResult` when *question* is a string.
151
+ • A ``list`` of :class:`QAResult` objects (one per question) when
152
+ *question* is a list/tuple.
141
153
  """
142
154
  if not self._is_initialized:
143
155
  raise RuntimeError("DocumentQA is not properly initialized")
144
156
 
157
+ # Normalise *questions* to a list so we can treat batch and single
158
+ # uniformly. We'll remember if the caller supplied a single question
159
+ # so that we can preserve the original return type.
160
+ single_question = False
161
+ if isinstance(question, str):
162
+ questions = [question]
163
+ single_question = True
164
+ elif isinstance(question, (list, tuple)) and all(isinstance(q, str) for q in question):
165
+ questions = list(question)
166
+ else:
167
+ raise TypeError("'question' must be a string or a list/tuple of strings")
168
+
145
169
  # Process the image
146
170
  if isinstance(image, str):
147
171
  # It's a file path
@@ -157,12 +181,16 @@ class DocumentQA:
157
181
  else:
158
182
  raise TypeError("Image must be a PIL Image, numpy array, or file path")
159
183
 
160
- # Prepare the query
161
- query = {"image": image_obj, "question": question}
184
+ # ------------------------------------------------------------------
185
+ # Build the queries for the pipeline (either single dict or list).
186
+ # ------------------------------------------------------------------
187
+ def _build_query_dict(q: str):
188
+ d = {"image": image_obj, "question": q}
189
+ if word_boxes:
190
+ d["word_boxes"] = word_boxes
191
+ return d
162
192
 
163
- # Add word boxes if provided
164
- if word_boxes:
165
- query["word_boxes"] = word_boxes
193
+ queries = [_build_query_dict(q) for q in questions]
166
194
 
167
195
  # Save debug information if requested
168
196
  if debug:
@@ -198,48 +226,79 @@ class DocumentQA:
198
226
  logger.info(f"Word boxes: {word_boxes_path}")
199
227
  logger.info(f"Visualization: {vis_path}")
200
228
 
201
- # Run the query through the pipeline
202
- logger.info(f"Running document QA pipeline with question: {question}")
203
- result = self.pipe(query)[0]
204
- logger.info(f"Raw result: {result}")
205
-
206
- # Save the result if debugging
207
- if debug:
208
- result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
209
- with open(result_path, "w") as f:
210
- # Convert any non-serializable data
211
- serializable_result = {
212
- k: (
213
- str(v)
214
- if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
215
- else v
216
- )
217
- for k, v in result.items()
218
- }
219
- json.dump(serializable_result, f, indent=2)
220
-
221
- # Check confidence against threshold
222
- if result["score"] < min_confidence:
223
- logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
224
- return QAResult(
225
- answer="",
226
- confidence=result["score"],
227
- start=result.get("start", -1),
228
- end=result.get("end", -1),
229
- found=False,
230
- )
231
-
232
- return QAResult(
233
- answer=result["answer"],
234
- confidence=result["score"],
235
- start=result.get("start", 0),
236
- end=result.get("end", 0),
237
- found=True,
229
+ # ------------------------------------------------------------------
230
+ # Run the queries through the pipeline (batch or single) and collect
231
+ # *only the top answer* for each, mirroring the original behaviour.
232
+ # ------------------------------------------------------------------
233
+ logger.info(
234
+ f"Running document QA pipeline with {len(queries)} question{'s' if len(queries) != 1 else ''}."
238
235
  )
239
236
 
237
+ # When we pass a list the pipeline returns a list of per-question
238
+ # results; each per-question result is itself a list (top-k answers).
239
+ # We keep only the best answer (index 0) to maintain backwards
240
+ # compatibility.
241
+ raw_results = self.pipe(queries if len(queries) > 1 else queries[0])
242
+
243
+ # Ensure we always have a list aligned with *questions*
244
+ if len(queries) == 1:
245
+ raw_results = [raw_results]
246
+
247
+ processed_results: List[QAResult] = []
248
+
249
+ for q, res in zip(questions, raw_results):
250
+ top_res = res[0] if isinstance(res, list) else res # pipeline may or may not nest
251
+
252
+ # Save per-question result in debug mode
253
+ if debug:
254
+ # File names are derived from the question text: debug_qa_result_<first 30 chars, spaces replaced by underscores>.json
255
+ result_path = os.path.join(debug_output_dir, f"debug_qa_result_{q[:30].replace(' ', '_')}.json")
256
+ try:
257
+ with open(result_path, "w") as f:
258
+ serializable = {
259
+ k: (
260
+ str(v)
261
+ if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
262
+ else v
263
+ )
264
+ for k, v in top_res.items()
265
+ }
266
+ json.dump(serializable, f, indent=2)
267
+ except Exception as e:
268
+ logger.warning(f"Failed to save debug QA result for question '{q}': {e}")
269
+
270
+ # Apply confidence threshold
271
+ if top_res["score"] < min_confidence:
272
+ qa_res = QAResult(
273
+ question=q,
274
+ answer="",
275
+ confidence=top_res["score"],
276
+ start=top_res.get("start", -1),
277
+ end=top_res.get("end", -1),
278
+ found=False,
279
+ )
280
+ else:
281
+ qa_res = QAResult(
282
+ question=q,
283
+ answer=top_res["answer"],
284
+ confidence=top_res["score"],
285
+ start=top_res.get("start", 0),
286
+ end=top_res.get("end", 0),
287
+ found=True,
288
+ )
289
+
290
+ processed_results.append(qa_res)
291
+
292
+ # Return appropriately typed result (single item or list)
293
+ return processed_results[0] if single_question else processed_results
294
+
240
295
  def ask_pdf_page(
241
- self, page, question: str, min_confidence: float = 0.1, debug: bool = False
242
- ) -> QAResult:
296
+ self,
297
+ page,
298
+ question: Union[str, List[str], Tuple[str, ...]],
299
+ min_confidence: float = 0.1,
300
+ debug: bool = False,
301
+ ) -> Union[QAResult, List[QAResult]]:
243
302
  """
244
303
  Ask a question about a specific PDF page.
245
304
 
@@ -252,13 +311,39 @@ class DocumentQA:
252
311
  QAResult instance with answer details
253
312
  """
254
313
  # Ensure we have text elements on the page
255
- if not page.find_all("text"):
256
- # Apply OCR if no text is available
257
- logger.info(f"No text elements found on page {page.index}, applying OCR")
258
- page.apply_ocr()
314
+ elements = page.find_all("text")
315
+ if not elements:
316
+ # Warn that no text was found and recommend OCR
317
+ warnings.warn(
318
+ f"No text elements found on page {page.index}. "
319
+ "Consider applying OCR first using page.apply_ocr() to extract text from images.",
320
+ UserWarning
321
+ )
322
+
323
+ # Return appropriate "not found" result(s)
324
+ if isinstance(question, (list, tuple)):
325
+ return [
326
+ QAResult(
327
+ question=q,
328
+ answer="",
329
+ confidence=0.0,
330
+ start=-1,
331
+ end=-1,
332
+ found=False,
333
+ )
334
+ for q in question
335
+ ]
336
+ else:
337
+ return QAResult(
338
+ question=question,
339
+ answer="",
340
+ confidence=0.0,
341
+ start=-1,
342
+ end=-1,
343
+ found=False,
344
+ )
259
345
 
260
346
  # Extract word boxes
261
- elements = page.find_all("text")
262
347
  word_boxes = self._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
263
348
 
264
349
  # Generate a high-resolution image of the page
@@ -270,8 +355,8 @@ class DocumentQA:
270
355
  page_image.save(temp_path)
271
356
 
272
357
  try:
273
- # Ask the question
274
- result = self.ask(
358
+ # Ask the question(s)
359
+ result_obj = self.ask(
275
360
  image=temp_path,
276
361
  question=question,
277
362
  word_boxes=word_boxes,
@@ -279,34 +364,35 @@ class DocumentQA:
279
364
  debug=debug,
280
365
  )
281
366
 
282
- # Add page reference to the result
283
- result.page_num = page.index
367
+ # Ensure we have a list for uniform processing
368
+ results = result_obj if isinstance(result_obj, list) else [result_obj]
284
369
 
285
- # Add element references if possible
286
- if result.found and "start" in result and "end" in result:
287
- start_idx = result.start
288
- end_idx = result.end
370
+ for res in results:
371
+ # Attach page reference
372
+ res.page_num = page.index
289
373
 
290
- # Make sure we have valid indices and elements to work with
291
- if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
292
- # Find the actual source elements in the original list
293
- # Since word_boxes may have filtered out some elements, we need to map indices
374
+ # Map answer span back to source elements
375
+ if res.found and "start" in res and "end" in res:
376
+ start_idx = res.start
377
+ end_idx = res.end
294
378
 
295
- # Get the text from result word boxes
296
- matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
379
+ if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
380
+ matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
297
381
 
298
- # Find corresponding elements in the full element list
299
- source_elements = []
300
- for element in elements:
301
- if hasattr(element, "text") and element.text in matched_texts:
302
- source_elements.append(element)
303
- # Remove from matched texts to avoid duplicates
304
- if element.text in matched_texts:
305
- matched_texts.remove(element.text)
382
+ source_elements = []
383
+ for element in elements:
384
+ if hasattr(element, "text") and element.text in matched_texts:
385
+ source_elements.append(element)
386
+ if element.text in matched_texts:
387
+ matched_texts.remove(element.text)
306
388
 
307
- result.source_elements = ElementCollection(source_elements)
389
+ res.source_elements = ElementCollection(source_elements)
308
390
 
309
- return result
391
+ # Return result(s) preserving original input type
392
+ if isinstance(question, (list, tuple)):
393
+ return results
394
+ else:
395
+ return results[0]
310
396
 
311
397
  finally:
312
398
  # Clean up temporary file
@@ -314,8 +400,12 @@ class DocumentQA:
314
400
  os.remove(temp_path)
315
401
 
316
402
  def ask_pdf_region(
317
- self, region, question: str, min_confidence: float = 0.1, debug: bool = False
318
- ) -> QAResult:
403
+ self,
404
+ region,
405
+ question: Union[str, List[str], Tuple[str, ...]],
406
+ min_confidence: float = 0.1,
407
+ debug: bool = False,
408
+ ) -> Union[QAResult, List[QAResult]]:
319
409
  """
320
410
  Ask a question about a specific region of a PDF page.
321
411
 
@@ -330,10 +420,37 @@ class DocumentQA:
330
420
  # Get all text elements within the region
331
421
  elements = region.find_all("text")
332
422
 
333
- # Apply OCR if needed
423
+ # Check if we have text elements
334
424
  if not elements:
335
- logger.info(f"No text elements found in region, applying OCR")
336
- elements = region.apply_ocr()
425
+ # Warn that no text was found and recommend OCR
426
+ warnings.warn(
427
+ f"No text elements found in region on page {region.page.index}. "
428
+ "Consider applying OCR first using region.apply_ocr() to extract text from images.",
429
+ UserWarning
430
+ )
431
+
432
+ # Return appropriate "not found" result(s)
433
+ if isinstance(question, (list, tuple)):
434
+ return [
435
+ QAResult(
436
+ question=q,
437
+ answer="",
438
+ confidence=0.0,
439
+ start=-1,
440
+ end=-1,
441
+ found=False,
442
+ )
443
+ for q in question
444
+ ]
445
+ else:
446
+ return QAResult(
447
+ question=question,
448
+ answer="",
449
+ confidence=0.0,
450
+ start=-1,
451
+ end=-1,
452
+ found=False,
453
+ )
337
454
 
338
455
  # Extract word boxes adjusted for the cropped region
339
456
  x0, top = int(region.x0), int(region.top)
@@ -352,8 +469,8 @@ class DocumentQA:
352
469
  region_image.save(temp_path)
353
470
 
354
471
  try:
355
- # Ask the question
356
- result = self.ask(
472
+ # Ask the question(s)
473
+ result_obj = self.ask(
357
474
  image=temp_path,
358
475
  question=question,
359
476
  word_boxes=word_boxes,
@@ -361,35 +478,29 @@ class DocumentQA:
361
478
  debug=debug,
362
479
  )
363
480
 
364
- # Add region reference to the result
365
- result.region = region
366
- result.page_num = region.page.index
481
+ results = result_obj if isinstance(result_obj, list) else [result_obj]
367
482
 
368
- # Add element references if possible
369
- if result.found and "start" in result and "end" in result:
370
- start_idx = result.start
371
- end_idx = result.end
483
+ for res in results:
484
+ res.region = region
485
+ res.page_num = region.page.index
372
486
 
373
- # Make sure we have valid indices and elements to work with
374
- if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
375
- # Find the actual source elements in the original list
376
- # Since word_boxes may have filtered out some elements, we need to map indices
487
+ if res.found and "start" in res and "end" in res:
488
+ start_idx = res.start
489
+ end_idx = res.end
377
490
 
378
- # Get the text from result word boxes
379
- matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
491
+ if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
492
+ matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
380
493
 
381
- # Find corresponding elements in the full element list
382
- source_elements = []
383
- for element in elements:
384
- if hasattr(element, "text") and element.text in matched_texts:
385
- source_elements.append(element)
386
- # Remove from matched texts to avoid duplicates
387
- if element.text in matched_texts:
388
- matched_texts.remove(element.text)
494
+ source_elements = []
495
+ for element in elements:
496
+ if hasattr(element, "text") and element.text in matched_texts:
497
+ source_elements.append(element)
498
+ if element.text in matched_texts:
499
+ matched_texts.remove(element.text)
389
500
 
390
- result.source_elements = ElementCollection(source_elements)
501
+ res.source_elements = ElementCollection(source_elements)
391
502
 
392
- return result
503
+ return results if isinstance(question, (list, tuple)) else results[0]
393
504
 
394
505
  finally:
395
506
  # Clean up temporary file
@@ -36,7 +36,7 @@ def create_correction_task_package(
36
36
  output_zip_path: str,
37
37
  overwrite: bool = False,
38
38
  suggest=None,
39
- resolution: int = 150,
39
+ resolution: int = 300,
40
40
  ) -> None:
41
41
  """
42
42
  Creates a zip package containing data for an OCR correction task.
@@ -160,8 +160,22 @@ def create_correction_task_package(
160
160
 
161
161
  # 3. Prepare region data for manifest
162
162
  page_regions_data = []
163
- # Calculate scaling factor from PDF coordinates (72 DPI) to image pixels
164
- coord_scale_factor = resolution / 72.0
163
+ # Calculate scaling factor *from PDF points* to *actual image pixels*.
164
+ # We prefer using the rendered image dimensions rather than the nominal
165
+ # resolution value, because the image might have been resized (e.g. via
166
+ # global `natural_pdf.options.image.width`). This guarantees that the
167
+ # bounding boxes we write to the manifest always align with the exact
168
+ # pixel grid of the exported image.
169
+
170
+ try:
171
+ scale_x = img.width / float(page.width) if page.width else 1.0
172
+ scale_y = img.height / float(page.height) if page.height else 1.0
173
+ except Exception as e:
174
+ logger.warning(
175
+ f"Could not compute per-axis scale factors for page {page.number}: {e}. "
176
+ "Falling back to resolution-based scaling."
177
+ )
178
+ scale_x = scale_y = resolution / 72.0
165
179
 
166
180
  i = -1
167
181
  for elem in tqdm(ocr_elements):
@@ -176,12 +190,12 @@ def create_correction_task_package(
176
190
  continue
177
191
  region_id = f"r_{page.index}_{i}" # ID unique within page
178
192
 
179
- # Scale coordinates to match the 300 DPI image
193
+ # Scale coordinates to match the **actual** image dimensions.
180
194
  scaled_bbox = [
181
- elem.x0 * coord_scale_factor,
182
- elem.top * coord_scale_factor,
183
- elem.x1 * coord_scale_factor,
184
- elem.bottom * coord_scale_factor,
195
+ elem.x0 * scale_x,
196
+ elem.top * scale_y,
197
+ elem.x1 * scale_x,
198
+ elem.bottom * scale_y,
185
199
  ]
186
200
 
187
201
  corrected = elem.text
@@ -191,7 +205,7 @@ def create_correction_task_package(
191
205
 
192
206
  page_regions_data.append(
193
207
  {
194
- "resolution": resolution,
208
+ "resolution": scale_x * 72.0,
195
209
  "id": region_id,
196
210
  "bbox": scaled_bbox,
197
211
  "ocr_text": elem.text,
@@ -63,9 +63,9 @@ def _get_layout_kwargs(
63
63
  else:
64
64
  logger.warning(f"Ignoring unsupported layout keyword argument: '{key}'")
65
65
 
66
- # 4. Ensure layout flag is present, defaulting to True
66
+ # 4. Ensure layout flag is present, defaulting to False (caller can override)
67
67
  if "layout" not in layout_kwargs:
68
- layout_kwargs["layout"] = True
68
+ layout_kwargs["layout"] = False
69
69
 
70
70
  return layout_kwargs
71
71
 
@@ -203,24 +203,42 @@ def generate_text_layout(
203
203
  logger.debug("generate_text_layout: No valid character dicts found after filtering.")
204
204
  return ""
205
205
 
206
- # Prepare layout arguments
207
- layout_kwargs = _get_layout_kwargs(layout_context_bbox, user_kwargs)
208
- use_layout = layout_kwargs.pop("layout", True) # Extract layout flag, default True
206
+ # Make a working copy of user_kwargs so we can safely pop custom keys
207
+ incoming_kwargs = user_kwargs.copy() if user_kwargs else {}
209
208
 
210
- if not use_layout:
211
- # Simple join if layout=False
212
- logger.debug("generate_text_layout: Using simple join (layout=False requested).")
213
- # Sort before joining if layout is off
214
- valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
215
- result = "".join(c.get("text", "") for c in valid_char_dicts) # Use valid chars
216
- return result
209
+ # --- Handle custom 'strip' option ------------------------------------
210
+ # * strip=True – post-process the final string to remove leading/trailing
211
+ # whitespace (typically used when layout=False)
212
+ # * strip=False – preserve whitespace exactly as produced.
213
+ # Default behaviour depends on the layout flag (see below).
214
+ explicit_strip_flag = incoming_kwargs.pop("strip", None) # May be None
215
+
216
+ # Prepare layout arguments now that we've removed the non-pdfplumber key
217
+ layout_kwargs = _get_layout_kwargs(layout_context_bbox, incoming_kwargs)
218
+ use_layout = layout_kwargs.get("layout", False)
219
+
220
+ # Determine final strip behaviour: if caller specified override, honour it;
221
+ # otherwise default to !use_layout (True when layout=False, False when
222
+ # layout=True) per user request.
223
+ strip_result = explicit_strip_flag if explicit_strip_flag is not None else (not use_layout)
217
224
 
218
225
  try:
219
- # Sort chars primarily by top, then x0 before layout analysis
220
- # This helps pdfplumber group lines correctly
226
+ # Sort chars primarily by top, then x0 before layout analysis – required by
227
+ # pdfplumber so that grouping into lines works deterministically.
221
228
  valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
229
+
230
+ # Build the text map. `layout_kwargs` still contains the caller-specified or
231
+ # default "layout" flag, which chars_to_textmap will respect.
222
232
  textmap = chars_to_textmap(valid_char_dicts, **layout_kwargs)
223
233
  result = textmap.as_string
234
+
235
+ # ----------------------------------------------------------------
236
+ # Optional post-processing strip
237
+ # ----------------------------------------------------------------
238
+ if strip_result and isinstance(result, str):
239
+ # Remove trailing spaces on each line then trim leading/trailing
240
+ # blank lines for a cleaner output while keeping internal newlines.
241
+ result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
224
242
  except Exception as e:
225
243
  # Fallback to simple join on error
226
244
  logger.error(f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=False)
@@ -230,5 +248,7 @@ def generate_text_layout(
230
248
  # Fallback already has sorted characters if layout was attempted
231
249
  # Need to use the valid_char_dicts here too
232
250
  result = "".join(c.get("text", "") for c in valid_char_dicts)
251
+ if strip_result:
252
+ result = result.strip()
233
253
 
234
254
  return result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.23
3
+ Version: 0.1.26.dev0
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
11
11
  Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
+ Requires-Dist: markdown
14
15
  Requires-Dist: pandas
15
16
  Requires-Dist: pdfplumber
16
17
  Requires-Dist: colormath2