natural-pdf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. natural_pdf/analyzers/shape_detection_mixin.py +43 -3
  2. natural_pdf/classification/manager.py +1 -1
  3. natural_pdf/classification/mixin.py +35 -14
  4. natural_pdf/classification/results.py +16 -1
  5. natural_pdf/cli.py +1 -0
  6. natural_pdf/core/highlighting_service.py +23 -0
  7. natural_pdf/core/page.py +32 -2
  8. natural_pdf/core/pdf.py +24 -4
  9. natural_pdf/describe/base.py +11 -1
  10. natural_pdf/describe/summary.py +26 -0
  11. natural_pdf/elements/base.py +81 -3
  12. natural_pdf/elements/collections.py +162 -101
  13. natural_pdf/elements/region.py +187 -160
  14. natural_pdf/elements/text.py +15 -7
  15. natural_pdf/exporters/paddleocr.py +1 -1
  16. natural_pdf/extraction/manager.py +2 -2
  17. natural_pdf/extraction/mixin.py +295 -11
  18. natural_pdf/extraction/result.py +28 -1
  19. natural_pdf/flows/region.py +117 -2
  20. natural_pdf/ocr/engine_surya.py +25 -5
  21. natural_pdf/qa/__init__.py +2 -1
  22. natural_pdf/qa/document_qa.py +166 -113
  23. natural_pdf/qa/qa_result.py +55 -0
  24. natural_pdf/selectors/parser.py +22 -0
  25. natural_pdf/utils/text_extraction.py +34 -14
  26. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +22 -13
  27. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +31 -30
  28. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
  29. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
  30. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,7 @@ import numpy as np
8
8
  from PIL import Image, ImageDraw
9
9
 
10
10
  from natural_pdf.elements.collections import ElementCollection
11
+ from .qa_result import QAResult
11
12
 
12
13
  logger = logging.getLogger("natural_pdf.qa.document_qa")
13
14
 
@@ -118,34 +119,52 @@ class DocumentQA:
118
119
  def ask(
119
120
  self,
120
121
  image: Union[str, Image.Image, np.ndarray],
121
- question: str,
122
+ question: Union[str, List[str], Tuple[str, ...]],
122
123
  word_boxes: List = None,
123
124
  min_confidence: float = 0.1,
124
125
  debug: bool = False,
125
126
  debug_output_dir: str = "output",
126
- ) -> Dict[str, Any]:
127
+ ) -> Union[QAResult, List[QAResult]]:
127
128
  """
128
- Ask a question about document content.
129
+ Ask one or more natural-language questions about the supplied document image.
130
+
131
+ This method now accepts a single *question* (``str``) **or** an
132
+ iterable of questions (``list``/``tuple`` of ``str``). When multiple
133
+ questions are provided they are executed in a single batch through the
134
+ underlying transformers pipeline which is considerably faster than
135
+ looping and calling :py:meth:`ask` repeatedly.
129
136
 
130
137
  Args:
131
- image: PIL Image, numpy array, or path to image file
132
- question: Question to ask about the document
133
- word_boxes: Optional pre-extracted word boxes [[text, [x0, y0, x1, y1]], ...]
134
- min_confidence: Minimum confidence threshold for answers
135
- debug: Whether to save debug information
136
- debug_output_dir: Directory to save debug files
138
+ image: PIL ``Image``, ``numpy`` array, or path to an image file.
139
+ question: A question string *or* a list/tuple of question strings.
140
+ word_boxes: Optional pre-extracted word-boxes in the LayoutLMv3
141
+ format ``[[text, [x0, y0, x1, y1]], …]``.
142
+ min_confidence: Minimum confidence threshold below which an answer
143
+ will be marked as ``found = False``.
144
+ debug: If ``True`` intermediate artefacts will be written to
145
+ *debug_output_dir* to aid troubleshooting.
146
+ debug_output_dir: Directory where debug artefacts should be saved.
137
147
 
138
148
  Returns:
139
- Dictionary with answer details: {
140
- "answer": extracted text,
141
- "confidence": confidence score,
142
- "start": start word index,
143
- "end": end word index
144
- }
149
+ • A single :class:`QAResult` when *question* is a string.
150
+ • A ``list`` of :class:`QAResult` objects (one per question) when
151
+ *question* is a list/tuple.
145
152
  """
146
153
  if not self._is_initialized:
147
154
  raise RuntimeError("DocumentQA is not properly initialized")
148
155
 
156
+ # Normalise *questions* to a list so we can treat batch and single
157
+ # uniformly. We'll remember if the caller supplied a single question
158
+ # so that we can preserve the original return type.
159
+ single_question = False
160
+ if isinstance(question, str):
161
+ questions = [question]
162
+ single_question = True
163
+ elif isinstance(question, (list, tuple)) and all(isinstance(q, str) for q in question):
164
+ questions = list(question)
165
+ else:
166
+ raise TypeError("'question' must be a string or a list/tuple of strings")
167
+
149
168
  # Process the image
150
169
  if isinstance(image, str):
151
170
  # It's a file path
@@ -161,12 +180,16 @@ class DocumentQA:
161
180
  else:
162
181
  raise TypeError("Image must be a PIL Image, numpy array, or file path")
163
182
 
164
- # Prepare the query
165
- query = {"image": image_obj, "question": question}
183
+ # ------------------------------------------------------------------
184
+ # Build the queries for the pipeline (either single dict or list).
185
+ # ------------------------------------------------------------------
186
+ def _build_query_dict(q: str):
187
+ d = {"image": image_obj, "question": q}
188
+ if word_boxes:
189
+ d["word_boxes"] = word_boxes
190
+ return d
166
191
 
167
- # Add word boxes if provided
168
- if word_boxes:
169
- query["word_boxes"] = word_boxes
192
+ queries = [_build_query_dict(q) for q in questions]
170
193
 
171
194
  # Save debug information if requested
172
195
  if debug:
@@ -202,48 +225,79 @@ class DocumentQA:
202
225
  logger.info(f"Word boxes: {word_boxes_path}")
203
226
  logger.info(f"Visualization: {vis_path}")
204
227
 
205
- # Run the query through the pipeline
206
- logger.info(f"Running document QA pipeline with question: {question}")
207
- result = self.pipe(query)[0]
208
- logger.info(f"Raw result: {result}")
209
-
210
- # Save the result if debugging
211
- if debug:
212
- result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
213
- with open(result_path, "w") as f:
214
- # Convert any non-serializable data
215
- serializable_result = {
216
- k: (
217
- str(v)
218
- if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
219
- else v
220
- )
221
- for k, v in result.items()
222
- }
223
- json.dump(serializable_result, f, indent=2)
224
-
225
- # Check confidence against threshold
226
- if result["score"] < min_confidence:
227
- logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
228
- return {
229
- "answer": "",
230
- "confidence": result["score"],
231
- "start": result.get("start", -1),
232
- "end": result.get("end", -1),
233
- "found": False,
234
- }
235
-
236
- return {
237
- "answer": result["answer"],
238
- "confidence": result["score"],
239
- "start": result.get("start", 0),
240
- "end": result.get("end", 0),
241
- "found": True,
242
- }
228
+ # ------------------------------------------------------------------
229
+ # Run the queries through the pipeline (batch or single) and collect
230
+ # *only the top answer* for each, mirroring the original behaviour.
231
+ # ------------------------------------------------------------------
232
+ logger.info(
233
+ f"Running document QA pipeline with {len(queries)} question{'s' if len(queries) != 1 else ''}."
234
+ )
235
+
236
+ # When we pass a list the pipeline returns a list of per-question
237
+ # results; each per-question result is itself a list (top-k answers).
238
+ # We keep only the best answer (index 0) to maintain backwards
239
+ # compatibility.
240
+ raw_results = self.pipe(queries if len(queries) > 1 else queries[0])
241
+
242
+ # Ensure we always have a list aligned with *questions*
243
+ if len(queries) == 1:
244
+ raw_results = [raw_results]
245
+
246
+ processed_results: List[QAResult] = []
247
+
248
+ for q, res in zip(questions, raw_results):
249
+ top_res = res[0] if isinstance(res, list) else res # pipeline may or may not nest
250
+
251
+ # Save per-question result in debug mode
252
+ if debug:
253
+ # File name is derived from the first 30 characters of the question (spaces replaced by underscores).
254
+ result_path = os.path.join(debug_output_dir, f"debug_qa_result_{q[:30].replace(' ', '_')}.json")
255
+ try:
256
+ with open(result_path, "w") as f:
257
+ serializable = {
258
+ k: (
259
+ str(v)
260
+ if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
261
+ else v
262
+ )
263
+ for k, v in top_res.items()
264
+ }
265
+ json.dump(serializable, f, indent=2)
266
+ except Exception as e:
267
+ logger.warning(f"Failed to save debug QA result for question '{q}': {e}")
268
+
269
+ # Apply confidence threshold
270
+ if top_res["score"] < min_confidence:
271
+ qa_res = QAResult(
272
+ question=q,
273
+ answer="",
274
+ confidence=top_res["score"],
275
+ start=top_res.get("start", -1),
276
+ end=top_res.get("end", -1),
277
+ found=False,
278
+ )
279
+ else:
280
+ qa_res = QAResult(
281
+ question=q,
282
+ answer=top_res["answer"],
283
+ confidence=top_res["score"],
284
+ start=top_res.get("start", 0),
285
+ end=top_res.get("end", 0),
286
+ found=True,
287
+ )
288
+
289
+ processed_results.append(qa_res)
290
+
291
+ # Return appropriately typed result (single item or list)
292
+ return processed_results[0] if single_question else processed_results
243
293
 
244
294
  def ask_pdf_page(
245
- self, page, question: str, min_confidence: float = 0.1, debug: bool = False
246
- ) -> Dict[str, Any]:
295
+ self,
296
+ page,
297
+ question: Union[str, List[str], Tuple[str, ...]],
298
+ min_confidence: float = 0.1,
299
+ debug: bool = False,
300
+ ) -> Union[QAResult, List[QAResult]]:
247
301
  """
248
302
  Ask a question about a specific PDF page.
249
303
 
@@ -253,7 +307,7 @@ class DocumentQA:
253
307
  min_confidence: Minimum confidence threshold for answers
254
308
 
255
309
  Returns:
256
- Dictionary with answer details
310
+ QAResult instance with answer details (a list of QAResult when *question* is a list/tuple)
257
311
  """
258
312
  # Ensure we have text elements on the page
259
313
  if not page.find_all("text"):
@@ -274,8 +328,8 @@ class DocumentQA:
274
328
  page_image.save(temp_path)
275
329
 
276
330
  try:
277
- # Ask the question
278
- result = self.ask(
331
+ # Ask the question(s)
332
+ result_obj = self.ask(
279
333
  image=temp_path,
280
334
  question=question,
281
335
  word_boxes=word_boxes,
@@ -283,34 +337,35 @@ class DocumentQA:
283
337
  debug=debug,
284
338
  )
285
339
 
286
- # Add page reference to the result
287
- result["page_num"] = page.index
340
+ # Ensure we have a list for uniform processing
341
+ results = result_obj if isinstance(result_obj, list) else [result_obj]
288
342
 
289
- # Add element references if possible
290
- if result.get("found", False) and "start" in result and "end" in result:
291
- start_idx = result["start"]
292
- end_idx = result["end"]
343
+ for res in results:
344
+ # Attach page reference
345
+ res.page_num = page.index
293
346
 
294
- # Make sure we have valid indices and elements to work with
295
- if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
296
- # Find the actual source elements in the original list
297
- # Since word_boxes may have filtered out some elements, we need to map indices
347
+ # Map answer span back to source elements
348
+ if res.found and "start" in res and "end" in res:
349
+ start_idx = res.start
350
+ end_idx = res.end
298
351
 
299
- # Get the text from result word boxes
300
- matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
352
+ if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
353
+ matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
301
354
 
302
- # Find corresponding elements in the full element list
303
- source_elements = []
304
- for element in elements:
305
- if hasattr(element, "text") and element.text in matched_texts:
306
- source_elements.append(element)
307
- # Remove from matched texts to avoid duplicates
308
- if element.text in matched_texts:
309
- matched_texts.remove(element.text)
355
+ source_elements = []
356
+ for element in elements:
357
+ if hasattr(element, "text") and element.text in matched_texts:
358
+ source_elements.append(element)
359
+ if element.text in matched_texts:
360
+ matched_texts.remove(element.text)
310
361
 
311
- result["source_elements"] = ElementCollection(source_elements)
362
+ res.source_elements = ElementCollection(source_elements)
312
363
 
313
- return result
364
+ # Return result(s) preserving original input type
365
+ if isinstance(question, (list, tuple)):
366
+ return results
367
+ else:
368
+ return results[0]
314
369
 
315
370
  finally:
316
371
  # Clean up temporary file
@@ -318,8 +373,12 @@ class DocumentQA:
318
373
  os.remove(temp_path)
319
374
 
320
375
  def ask_pdf_region(
321
- self, region, question: str, min_confidence: float = 0.1, debug: bool = False
322
- ) -> Dict[str, Any]:
376
+ self,
377
+ region,
378
+ question: Union[str, List[str], Tuple[str, ...]],
379
+ min_confidence: float = 0.1,
380
+ debug: bool = False,
381
+ ) -> Union[QAResult, List[QAResult]]:
323
382
  """
324
383
  Ask a question about a specific region of a PDF page.
325
384
 
@@ -329,7 +388,7 @@ class DocumentQA:
329
388
  min_confidence: Minimum confidence threshold for answers
330
389
 
331
390
  Returns:
332
- Dictionary with answer details
391
+ QAResult instance with answer details (a list of QAResult when *question* is a list/tuple)
333
392
  """
334
393
  # Get all text elements within the region
335
394
  elements = region.find_all("text")
@@ -356,8 +415,8 @@ class DocumentQA:
356
415
  region_image.save(temp_path)
357
416
 
358
417
  try:
359
- # Ask the question
360
- result = self.ask(
418
+ # Ask the question(s)
419
+ result_obj = self.ask(
361
420
  image=temp_path,
362
421
  question=question,
363
422
  word_boxes=word_boxes,
@@ -365,35 +424,29 @@ class DocumentQA:
365
424
  debug=debug,
366
425
  )
367
426
 
368
- # Add region reference to the result
369
- result["region"] = region
370
- result["page_num"] = region.page.index
427
+ results = result_obj if isinstance(result_obj, list) else [result_obj]
371
428
 
372
- # Add element references if possible
373
- if result.get("found", False) and "start" in result and "end" in result:
374
- start_idx = result["start"]
375
- end_idx = result["end"]
429
+ for res in results:
430
+ res.region = region
431
+ res.page_num = region.page.index
376
432
 
377
- # Make sure we have valid indices and elements to work with
378
- if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
379
- # Find the actual source elements in the original list
380
- # Since word_boxes may have filtered out some elements, we need to map indices
433
+ if res.found and "start" in res and "end" in res:
434
+ start_idx = res.start
435
+ end_idx = res.end
381
436
 
382
- # Get the text from result word boxes
383
- matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
437
+ if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
438
+ matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
384
439
 
385
- # Find corresponding elements in the full element list
386
- source_elements = []
387
- for element in elements:
388
- if hasattr(element, "text") and element.text in matched_texts:
389
- source_elements.append(element)
390
- # Remove from matched texts to avoid duplicates
391
- if element.text in matched_texts:
392
- matched_texts.remove(element.text)
440
+ source_elements = []
441
+ for element in elements:
442
+ if hasattr(element, "text") and element.text in matched_texts:
443
+ source_elements.append(element)
444
+ if element.text in matched_texts:
445
+ matched_texts.remove(element.text)
393
446
 
394
- result["source_elements"] = ElementCollection(source_elements)
447
+ res.source_elements = ElementCollection(source_elements)
395
448
 
396
- return result
449
+ return results if isinstance(question, (list, tuple)) else results[0]
397
450
 
398
451
  finally:
399
452
  # Clean up temporary file
@@ -0,0 +1,55 @@
1
class QAResult(dict):
    """Dictionary-like container for a Document QA result.

    The class is a plain ``dict`` subclass, so code that expects a mapping
    (``result["answer"]``, ``"start" in result``, ``json.dump`` …) keeps
    working. On top of the mapping interface it offers:

    • ``show()`` – delegates to ``source_elements.show`` when a
      ``"source_elements"`` entry is present, giving a quick way to visualise
      where an answer was found in the document.

    • Attribute access (``result.answer``), assignment
      (``result.page_num = 3``) and deletion (``del result.page_num``) as
      sugar for the equivalent item operations.

    Note: ``__getattr__`` is only consulted when normal attribute lookup
    fails, so keys that share a name with a ``dict`` method (``items``,
    ``keys``, ``copy`` …) must be read with item syntax.
    """

    # ---------------------------------------------------------------------
    # Convenience helpers
    # ---------------------------------------------------------------------
    def show(self, *args, **kwargs):
        """Display the answer region by delegating to ``source_elements.show``.

        Args:
            *args: Positional arguments forwarded unchanged to
                ``source_elements.show``.
            **kwargs: Keyword arguments forwarded unchanged to
                ``source_elements.show``.

        Returns:
            Whatever ``source_elements.show`` returns.

        Raises:
            AttributeError: If there is no ``"source_elements"`` entry (or it
                is ``None``), or if that object has no ``show`` method.
        """
        source = self.get("source_elements")
        if source is None:
            raise AttributeError(
                "QAResult does not contain 'source_elements'; nothing to show()."
            )
        if not hasattr(source, "show"):
            raise AttributeError(
                "'source_elements' object has no 'show' method; cannot visualise."
            )
        return source.show(*args, **kwargs)

    # ------------------------------------------------------------------
    # Attribute <-> key delegation so ``result.answer`` works
    # ------------------------------------------------------------------
    def __getattr__(self, item):
        """Return ``self[item]``; raise ``AttributeError`` if the key is absent."""
        try:
            return self[item]
        except KeyError as exc:
            # Translate to AttributeError so hasattr()/getattr(default) work.
            raise AttributeError(item) from exc

    def __setattr__(self, key, value):
        """Store non-dunder attributes in the mapping so they stay serialisable."""
        if key.startswith("__") and key.endswith("__"):
            super().__setattr__(key, value)
        else:
            self[key] = value

    def __delattr__(self, key):
        """Remove ``self[key]``, mirroring :meth:`__setattr__`.

        Raises:
            AttributeError: If *key* is not present in the mapping.
        """
        if key.startswith("__") and key.endswith("__"):
            super().__delattr__(key)
            return
        try:
            del self[key]
        except KeyError as exc:
            raise AttributeError(key) from exc

    def copy(self):
        """Return a shallow copy that keeps the (sub)class type of ``self``."""
        # type(self) rather than a hard-coded QAResult so subclasses survive.
        return type(self)(self)
@@ -698,6 +698,28 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
698
698
 
699
699
  filter_lambda = contains_check
700
700
 
701
+ # --- Handle :startswith and :starts-with (alias) --- #
702
+ elif name in ("starts-with", "startswith") and args is not None:
703
+ filter_name = f"pseudo-class :{name}({args!r})"
704
+
705
+ def startswith_check(element, arg=args):
706
+ if not hasattr(element, "text") or not element.text:
707
+ return False
708
+ return str(element.text).startswith(str(arg))
709
+
710
+ filter_lambda = startswith_check
711
+
712
+ # --- Handle :endswith and :ends-with (alias) --- #
713
+ elif name in ("ends-with", "endswith") and args is not None:
714
+ filter_name = f"pseudo-class :{name}({args!r})"
715
+
716
+ def endswith_check(element, arg=args):
717
+ if not hasattr(element, "text") or not element.text:
718
+ return False
719
+ return str(element.text).endswith(str(arg))
720
+
721
+ filter_lambda = endswith_check
722
+
701
723
  elif name == "starts-with" and args is not None:
702
724
  filter_lambda = (
703
725
  lambda el, arg=args: hasattr(el, "text")
@@ -63,9 +63,9 @@ def _get_layout_kwargs(
63
63
  else:
64
64
  logger.warning(f"Ignoring unsupported layout keyword argument: '{key}'")
65
65
 
66
- # 4. Ensure layout flag is present, defaulting to True
66
+ # 4. Ensure layout flag is present, defaulting to False (caller can override)
67
67
  if "layout" not in layout_kwargs:
68
- layout_kwargs["layout"] = True
68
+ layout_kwargs["layout"] = False
69
69
 
70
70
  return layout_kwargs
71
71
 
@@ -203,24 +203,42 @@ def generate_text_layout(
203
203
  logger.debug("generate_text_layout: No valid character dicts found after filtering.")
204
204
  return ""
205
205
 
206
- # Prepare layout arguments
207
- layout_kwargs = _get_layout_kwargs(layout_context_bbox, user_kwargs)
208
- use_layout = layout_kwargs.pop("layout", True) # Extract layout flag, default True
206
+ # Make a working copy of user_kwargs so we can safely pop custom keys
207
+ incoming_kwargs = user_kwargs.copy() if user_kwargs else {}
209
208
 
210
- if not use_layout:
211
- # Simple join if layout=False
212
- logger.debug("generate_text_layout: Using simple join (layout=False requested).")
213
- # Sort before joining if layout is off
214
- valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
215
- result = "".join(c.get("text", "") for c in valid_char_dicts) # Use valid chars
216
- return result
209
+ # --- Handle custom 'strip' option ------------------------------------
210
+ # * strip=True – post-process the final string to remove leading/trailing
211
+ # whitespace (typically used when layout=False)
212
+ # * strip=False – preserve whitespace exactly as produced.
213
+ # Default behaviour depends on the layout flag (see below).
214
+ explicit_strip_flag = incoming_kwargs.pop("strip", None) # May be None
215
+
216
+ # Prepare layout arguments now that we've removed the non-pdfplumber key
217
+ layout_kwargs = _get_layout_kwargs(layout_context_bbox, incoming_kwargs)
218
+ use_layout = layout_kwargs.get("layout", False)
219
+
220
+ # Determine final strip behaviour: if caller specified override, honour it;
221
+ # otherwise default to !use_layout (True when layout=False, False when
222
+ # layout=True) per user request.
223
+ strip_result = explicit_strip_flag if explicit_strip_flag is not None else (not use_layout)
217
224
 
218
225
  try:
219
- # Sort chars primarily by top, then x0 before layout analysis
220
- # This helps pdfplumber group lines correctly
226
+ # Sort chars primarily by top, then x0 before layout analysis – required by
227
+ # pdfplumber so that grouping into lines works deterministically.
221
228
  valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
229
+
230
+ # Build the text map. `layout_kwargs` still contains the caller-specified or
231
+ # default "layout" flag, which chars_to_textmap will respect.
222
232
  textmap = chars_to_textmap(valid_char_dicts, **layout_kwargs)
223
233
  result = textmap.as_string
234
+
235
+ # ----------------------------------------------------------------
236
+ # Optional post-processing strip
237
+ # ----------------------------------------------------------------
238
+ if strip_result and isinstance(result, str):
239
+ # Remove trailing spaces on each line then trim leading/trailing
240
+ # blank lines for a cleaner output while keeping internal newlines.
241
+ result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
224
242
  except Exception as e:
225
243
  # Fallback to simple join on error
226
244
  logger.error(f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=False)
@@ -230,5 +248,7 @@ def generate_text_layout(
230
248
  # Fallback already has sorted characters if layout was attempted
231
249
  # Need to use the valid_char_dicts here too
232
250
  result = "".join(c.get("text", "") for c in valid_char_dicts)
251
+ if strip_result:
252
+ result = result.strip()
233
253
 
234
254
  return result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.22
3
+ Version: 0.1.24
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
11
11
  Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
+ Requires-Dist: markdown
14
15
  Requires-Dist: pandas
15
16
  Requires-Dist: pdfplumber
16
17
  Requires-Dist: colormath2
@@ -22,12 +23,6 @@ Requires-Dist: tqdm
22
23
  Requires-Dist: pydantic
23
24
  Requires-Dist: jenkspy
24
25
  Requires-Dist: scipy
25
- Requires-Dist: torch
26
- Requires-Dist: torchvision
27
- Requires-Dist: transformers[sentencepiece]
28
- Requires-Dist: huggingface_hub>=0.29.3
29
- Requires-Dist: sentence-transformers
30
- Requires-Dist: timm
31
26
  Requires-Dist: ipywidgets>=7.0.0
32
27
  Provides-Extra: test
33
28
  Requires-Dist: pytest; extra == "test"
@@ -57,6 +52,7 @@ Requires-Dist: natural-pdf[test]; extra == "all"
57
52
  Requires-Dist: natural-pdf[search]; extra == "all"
58
53
  Requires-Dist: natural-pdf[favorites]; extra == "all"
59
54
  Requires-Dist: natural-pdf[export-extras]; extra == "all"
55
+ Requires-Dist: natural-pdf[ai]; extra == "all"
60
56
  Provides-Extra: deskew
61
57
  Requires-Dist: deskew>=1.5; extra == "deskew"
62
58
  Requires-Dist: img2pdf; extra == "deskew"
@@ -68,6 +64,15 @@ Requires-Dist: pikepdf; extra == "ocr-export"
68
64
  Provides-Extra: export-extras
69
65
  Requires-Dist: jupytext; extra == "export-extras"
70
66
  Requires-Dist: nbformat; extra == "export-extras"
67
+ Provides-Extra: ai
68
+ Requires-Dist: sentence-transformers; extra == "ai"
69
+ Requires-Dist: torch; extra == "ai"
70
+ Requires-Dist: torchvision; extra == "ai"
71
+ Requires-Dist: transformers[sentencepiece]; extra == "ai"
72
+ Requires-Dist: huggingface_hub>=0.29.3; extra == "ai"
73
+ Requires-Dist: timm; extra == "ai"
74
+ Requires-Dist: doclayout_yolo; extra == "ai"
75
+ Requires-Dist: easyocr; extra == "ai"
71
76
  Dynamic: license-file
72
77
 
73
78
  # Natural PDF
@@ -87,25 +92,29 @@ Natural PDF lets you find and extract content from PDFs using simple code that m
87
92
  pip install natural-pdf
88
93
  ```
89
94
 
90
- Need OCR engines, layout models, or other heavy add-ons? Install the **core** once, then use the helper CLI to pull in exactly what you need:
95
+ Need OCR engines, layout models, or other heavy add-ons? Install the **core** once, then use the helper `npdf` command to pull in exactly what you need:
91
96
 
92
97
  ```bash
93
- # add PaddleOCR (+paddlex) after the fact
94
- npdf install paddle
98
+ # Everything you need for classification, document-QA, semantic search, etc.
99
+ npdf install ai
95
100
 
96
101
  # Surya OCR and the YOLO Doc-Layout detector in one go
97
102
  npdf install surya yolo
98
103
 
104
+ # add PaddleOCR (+paddlex) after the fact
105
+ npdf install paddle
106
+
99
107
  # see what's already on your machine
100
108
  npdf list
101
109
  ```
102
110
 
103
- Light-weight extras such as `deskew` or `search` can still be added with
104
- classic PEP-508 markers if you prefer:
111
+ Lightweight extras such as `deskew` or `search` can still be added with
112
+ classic `pip install`:
105
113
 
106
114
  ```bash
107
115
  pip install "natural-pdf[deskew]"
108
116
  pip install "natural-pdf[search]"
117
+ pip install "natural-pdf[ai]"
109
118
  ```
110
119
 
111
120
  More details in the [installation guide](https://jsoma.github.io/natural-pdf/installation/).
@@ -116,7 +125,7 @@ More details in the [installation guide](https://jsoma.github.io/natural-pdf/ins
116
125
  from natural_pdf import PDF
117
126
 
118
127
  # Open a PDF
119
- pdf = PDF('document.pdf')
128
+ pdf = PDF('https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf')
120
129
  page = pdf.pages[0]
121
130
 
122
131
  # Extract all of the text on the page