natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +751 -607
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +120 -23
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.35.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,7 @@
1
1
  # ocr_options.py
2
- import logging
3
2
  from dataclasses import dataclass, field
4
3
  from typing import Any, Dict, List, Optional, Tuple, Union
5
4
 
6
- # Configure logging
7
- # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
8
- # logger = logging.getLogger(__name__)
9
- # Assume logger is configured elsewhere or remove if not needed globally
10
5
 
11
6
 
12
7
  # --- Base Options ---
@@ -58,8 +53,6 @@ class EasyOCROptions(BaseOCROptions):
58
53
  add_margin: float = 0.1
59
54
  output_format: str = "standard"
60
55
 
61
- # def __post_init__(self):
62
- # logger.debug(f"Initialized EasyOCROptions: {self}")
63
56
 
64
57
 
65
58
  # --- PaddleOCR Specific Options ---
@@ -95,8 +88,8 @@ class PaddleOCROptions(BaseOCROptions):
95
88
 
96
89
  # Detection parameters (can be overridden at predict time)
97
90
  # https://github.com/PaddlePaddle/PaddleOCR/issues/15424
98
- text_det_limit_side_len: Optional[int] = 736 # WAITING FOR FIX
99
- text_det_limit_type: Optional[str] = 'max' # WAITING FOR FIX
91
+ text_det_limit_side_len: Optional[int] = 736 # WAITING FOR FIX
92
+ text_det_limit_type: Optional[str] = "max" # WAITING FOR FIX
100
93
  text_det_thresh: Optional[float] = None
101
94
  text_det_box_thresh: Optional[float] = None
102
95
  text_det_unclip_ratio: Optional[float] = None
@@ -113,7 +106,7 @@ class PaddleOCROptions(BaseOCROptions):
113
106
  enable_hpi: Optional[bool] = None
114
107
  use_tensorrt: Optional[bool] = None
115
108
  precision: Optional[str] = None
116
- enable_mkldnn: Optional[bool] = False # https://github.com/PaddlePaddle/PaddleOCR/issues/15294
109
+ enable_mkldnn: Optional[bool] = False # https://github.com/PaddlePaddle/PaddleOCR/issues/15294
117
110
  # mkldnn_cache_capacity: Optional[int] = None
118
111
  cpu_threads: Optional[int] = None
119
112
  paddlex_config: Optional[str] = None
@@ -9,6 +9,7 @@ import numpy as np
9
9
  from PIL import Image, ImageDraw
10
10
 
11
11
  from natural_pdf.elements.collections import ElementCollection
12
+
12
13
  from .qa_result import QAResult
13
14
 
14
15
  logger = logging.getLogger("natural_pdf.qa.document_qa")
@@ -252,13 +253,17 @@ class DocumentQA:
252
253
  # Save per-question result in debug mode
253
254
  if debug:
254
255
  # File names: debug_qa_result_0.json, …
255
- result_path = os.path.join(debug_output_dir, f"debug_qa_result_{q[:30].replace(' ', '_')}.json")
256
+ result_path = os.path.join(
257
+ debug_output_dir, f"debug_qa_result_{q[:30].replace(' ', '_')}.json"
258
+ )
256
259
  try:
257
260
  with open(result_path, "w") as f:
258
261
  serializable = {
259
262
  k: (
260
263
  str(v)
261
- if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
264
+ if not isinstance(
265
+ v, (str, int, float, bool, list, dict, type(None))
266
+ )
262
267
  else v
263
268
  )
264
269
  for k, v in top_res.items()
@@ -317,9 +322,9 @@ class DocumentQA:
317
322
  warnings.warn(
318
323
  f"No text elements found on page {page.index}. "
319
324
  "Consider applying OCR first using page.apply_ocr() to extract text from images.",
320
- UserWarning
325
+ UserWarning,
321
326
  )
322
-
327
+
323
328
  # Return appropriate "not found" result(s)
324
329
  if isinstance(question, (list, tuple)):
325
330
  return [
@@ -376,7 +381,11 @@ class DocumentQA:
376
381
  start_idx = res.start
377
382
  end_idx = res.end
378
383
 
379
- if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
384
+ if (
385
+ elements
386
+ and 0 <= start_idx < len(word_boxes)
387
+ and 0 <= end_idx < len(word_boxes)
388
+ ):
380
389
  matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
381
390
 
382
391
  source_elements = []
@@ -426,9 +435,9 @@ class DocumentQA:
426
435
  warnings.warn(
427
436
  f"No text elements found in region on page {region.page.index}. "
428
437
  "Consider applying OCR first using region.apply_ocr() to extract text from images.",
429
- UserWarning
438
+ UserWarning,
430
439
  )
431
-
440
+
432
441
  # Return appropriate "not found" result(s)
433
442
  if isinstance(question, (list, tuple)):
434
443
  return [
@@ -488,7 +497,11 @@ class DocumentQA:
488
497
  start_idx = res.start
489
498
  end_idx = res.end
490
499
 
491
- if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
500
+ if (
501
+ elements
502
+ and 0 <= start_idx < len(word_boxes)
503
+ and 0 <= end_idx < len(word_boxes)
504
+ ):
492
505
  matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
493
506
 
494
507
  source_elements = []
@@ -24,13 +24,9 @@ class QAResult(dict):
24
24
  """
25
25
  source = self.get("source_elements")
26
26
  if source is None:
27
- raise AttributeError(
28
- "QAResult does not contain 'source_elements'; nothing to show()."
29
- )
27
+ raise AttributeError("QAResult does not contain 'source_elements'; nothing to show().")
30
28
  if not hasattr(source, "show"):
31
- raise AttributeError(
32
- "'source_elements' object has no 'show' method; cannot visualise."
33
- )
29
+ raise AttributeError("'source_elements' object has no 'show' method; cannot visualise.")
34
30
  return source.show(*args, **kwargs)
35
31
 
36
32
  # ------------------------------------------------------------------
@@ -52,4 +48,4 @@ class QAResult(dict):
52
48
 
53
49
  # Ensure ``copy`` keeps the subclass type
54
50
  def copy(self):
55
- return QAResult(self)
51
+ return QAResult(self)
@@ -18,9 +18,9 @@ SEARCH_DEPENDENCIES_AVAILABLE = False
18
18
 
19
19
  try:
20
20
  import numpy as np
21
+
21
22
  # Lazy import for sentence_transformers to avoid heavy loading at module level
22
23
  # import sentence_transformers
23
-
24
24
  # Basic search dependencies are available
25
25
  SEARCH_DEPENDENCIES_AVAILABLE = True
26
26
 
@@ -51,6 +51,7 @@ def _check_sentence_transformers():
51
51
  """Lazy check for sentence_transformers availability."""
52
52
  try:
53
53
  import sentence_transformers
54
+
54
55
  return True
55
56
  except ImportError:
56
57
  return False
@@ -63,7 +64,7 @@ def check_search_availability():
63
64
  "Search functionality requires 'lancedb' and pyarrow. "
64
65
  "Install with: pip install natural-pdf[search] (or pip install lancedb pyarrow)"
65
66
  )
66
-
67
+
67
68
  # Lazy check for sentence_transformers when actually needed
68
69
  if not _check_sentence_transformers():
69
70
  raise ImportError(
@@ -7,15 +7,13 @@ from typing import Any, Dict, Iterable, List, Optional, Union
7
7
 
8
8
  import lancedb
9
9
  import pyarrow as pa
10
+
11
+ from .search_options import BaseSearchOptions
12
+ from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
13
+
10
14
  # Lazy import for SentenceTransformer to avoid heavy loading at module level
11
15
  # from sentence_transformers import SentenceTransformer
12
16
 
13
- from .search_options import BaseSearchOptions
14
- from .search_service_protocol import (
15
- Indexable,
16
- IndexConfigurationError,
17
- SearchServiceProtocol,
18
- )
19
17
 
20
18
  logger = logging.getLogger(__name__)
21
19
 
@@ -26,6 +24,7 @@ DEFAULT_LANCEDB_PERSIST_PATH = "./lancedb_data"
26
24
  def _get_sentence_transformer(model_name: str):
27
25
  """Lazy import and instantiation of SentenceTransformer."""
28
26
  from sentence_transformers import SentenceTransformer
27
+
29
28
  return SentenceTransformer(model_name)
30
29
 
31
30
 
@@ -6,8 +6,6 @@ from pathlib import Path
6
6
  from typing import Any, Dict, Iterable, List, Optional, Union
7
7
 
8
8
  import numpy as np
9
- # Lazy import for SentenceTransformer to avoid heavy loading at module level
10
- # from sentence_transformers import SentenceTransformer
11
9
 
12
10
  from .search_options import BaseSearchOptions
13
11
  from .search_service_protocol import (
@@ -17,6 +15,10 @@ from .search_service_protocol import (
17
15
  SearchServiceProtocol,
18
16
  )
19
17
 
18
+ # Lazy import for SentenceTransformer to avoid heavy loading at module level
19
+ # from sentence_transformers import SentenceTransformer
20
+
21
+
20
22
  logger = logging.getLogger(__name__)
21
23
 
22
24
  DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
@@ -25,6 +27,7 @@ DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
25
27
  def _get_sentence_transformer(model_name: str):
26
28
  """Lazy import and instantiation of SentenceTransformer."""
27
29
  from sentence_transformers import SentenceTransformer
30
+
28
31
  return SentenceTransformer(model_name)
29
32
 
30
33
 
@@ -1,5 +1,29 @@
1
- """
2
- CSS-like selector parser for natural-pdf.
1
+ """CSS-like selector parser for natural-pdf.
2
+
3
+ This module implements a sophisticated selector parsing system that enables
4
+ jQuery-style element selection in PDF documents. It supports complex CSS-like
5
+ selectors with extensions for PDF-specific attributes and spatial relationships.
6
+
7
+ The parser handles:
8
+ - Basic element selectors (text, rect, line, image)
9
+ - Attribute selectors with comparisons ([size>12], [color="red"])
10
+ - Pseudo-selectors for text content (:contains(), :regex())
11
+ - Spatial relationship selectors (:above(), :below(), :near())
12
+ - Color matching with Delta E distance calculations
13
+ - Logical operators (AND, OR) and grouping
14
+ - Complex nested expressions with proper precedence
15
+
16
+ Key features:
17
+ - Safe value parsing without eval() for security
18
+ - Color parsing from multiple formats (hex, RGB, names, CSS functions)
19
+ - Font and style attribute matching
20
+ - Coordinate and dimension-based selections
21
+ - Performance-optimized filtering functions
22
+
23
+ This enables powerful document navigation like:
24
+ - page.find('text[size>12]:bold:contains("Summary")')
25
+ - page.find_all('rect[color~="red"]:above(text:contains("Total"))')
26
+ - page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
3
27
  """
4
28
 
5
29
  import ast
@@ -16,14 +40,35 @@ logger = logging.getLogger(__name__)
16
40
 
17
41
 
18
42
  def safe_parse_value(value_str: str) -> Any:
19
- """
20
- Safely parse a value string without using eval().
43
+ """Safely parse a value string without using eval().
44
+
45
+ Parses various value formats commonly found in PDF attributes while maintaining
46
+ security by avoiding eval(). Supports numbers, tuples, lists, booleans, and
47
+ quoted strings with proper type conversion.
21
48
 
22
49
  Args:
23
- value_str: String representation of a value (number, tuple, string, etc.)
50
+ value_str: String representation of a value. Can be a number ("12"),
51
+ tuple ("(1.0, 0.5, 0.2)"), list ("[1, 2, 3]"), quoted string
52
+ ('"Arial"'), boolean ("True"), or plain string ("Arial").
24
53
 
25
54
  Returns:
26
- Parsed value
55
+ Parsed value with appropriate Python type. Numbers become int/float,
56
+ tuples/lists maintain structure, quoted strings are unquoted, and
57
+ unrecognized values are returned as strings.
58
+
59
+ Example:
60
+ ```python
61
+ safe_parse_value("12") # -> 12
62
+ safe_parse_value("12.5") # -> 12.5
63
+ safe_parse_value("(1,0,0)") # -> (1, 0, 0)
64
+ safe_parse_value('"Arial"') # -> "Arial"
65
+ safe_parse_value("True") # -> True
66
+ safe_parse_value("plain_text") # -> "plain_text"
67
+ ```
68
+
69
+ Note:
70
+ This function deliberately avoids eval() for security reasons and uses
71
+ ast.literal_eval() for safe parsing of Python literals.
27
72
  """
28
73
  # Strip quotes first if it's a quoted string
29
74
  value_str = value_str.strip()
@@ -1,5 +1,5 @@
1
1
  # new file
2
2
  # Re-export for convenient import
3
3
  from .result import TableResult
4
-
5
- __all__ = ["TableResult"]
4
+
5
+ __all__ = ["TableResult"]
@@ -1,8 +1,9 @@
1
1
  """Sequence wrapper for table data with convenient DataFrame helpers."""
2
+
2
3
  from __future__ import annotations
3
4
 
4
5
  from collections.abc import Sequence
5
- from typing import Any, List, Iterator, Optional, Union
6
+ from typing import Any, Iterator, List, Optional, Union
6
7
 
7
8
 
8
9
  class TableResult(Sequence):
@@ -12,9 +13,7 @@ class TableResult(Sequence):
12
13
  list of cell values) but offers an easy hand-off to *pandas*.
13
14
  """
14
15
 
15
- _IMMUTABLE_MESSAGE = (
16
- "TableResult is read-only; convert to list(result) if you need to mutate"
17
- )
16
+ _IMMUTABLE_MESSAGE = "TableResult is read-only; convert to list(result) if you need to mutate"
18
17
 
19
18
  def __init__(self, rows: Optional[List[List[Any]]] = None) -> None:
20
19
  # Normalise to list of list so that Sequence operations work as expected
@@ -81,7 +80,9 @@ class TableResult(Sequence):
81
80
 
82
81
  df = pd.DataFrame(body, columns=hdr)
83
82
  if index_col is not None and not df.empty:
84
- df.set_index(df.columns[index_col] if isinstance(index_col, int) else index_col, inplace=True)
83
+ df.set_index(
84
+ df.columns[index_col] if isinstance(index_col, int) else index_col, inplace=True
85
+ )
85
86
 
86
87
  if kwargs:
87
88
  df = pd.DataFrame(df, **kwargs)
@@ -98,4 +99,4 @@ class TableResult(Sequence):
98
99
  # Nice repr in notebooks
99
100
  def __repr__(self) -> str: # noqa: D401 (simple)
100
101
  preview = "…" if len(self._rows) > 5 else ""
101
- return f"TableResult(rows={len(self._rows)}{preview})"
102
+ return f"TableResult(rows={len(self._rows)}{preview})"
@@ -6,6 +6,7 @@ replaces each bracket/parenthesis character with its Unicode-defined pair.
6
6
  For everyday PDFs the six ASCII pairs are enough, but the mapping can be
7
7
  extended easily from Unicode's BidiBrackets.txt.
8
8
  """
9
+
9
10
  from typing import Dict
10
11
 
11
12
  # Minimal mapping – ( ) [ ] { }
@@ -33,4 +34,4 @@ def mirror_brackets(text: str) -> str: # pragma: no cover
33
34
  append = out_chars.append
34
35
  for ch in text:
35
36
  append(_ASCII_MIRROR.get(ord(ch), ch))
36
- return "".join(out_chars)
37
+ return "".join(out_chars)
@@ -178,8 +178,9 @@ def _complex_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any
178
178
  Returns:
179
179
  List of elements in reading order
180
180
  """
181
- # TODO: Implement complex layout analysis
182
- # For now, fall back to column-aware reading order
181
+ # TODO: Implement complex layout analysis for sophisticated document structures
182
+ # Would include: multi-column detection, figure/caption relationships, sidebars
183
+ # For now, fall back to column-aware reading order which handles most cases
183
184
  return _column_reading_order(elements)
184
185
 
185
186
 
@@ -237,11 +237,11 @@ def merge_images_with_legend(
237
237
  def render_plain_page(page, resolution):
238
238
  """
239
239
  Render a page to PIL Image using the specified resolution.
240
-
240
+
241
241
  Args:
242
242
  page: Page object to render
243
243
  resolution: DPI resolution for rendering
244
-
244
+
245
245
  Returns:
246
246
  PIL Image of the rendered page
247
247
  """
@@ -252,7 +252,7 @@ def render_plain_page(page, resolution):
252
252
  # Convert resolution (DPI) to scale factor for pypdfium2
253
253
  # PDF standard is 72 DPI, so scale = resolution / 72
254
254
  scale_factor = resolution / 72.0
255
-
255
+
256
256
  bitmap = pdf_page.render(
257
257
  scale=scale_factor,
258
258
  )
@@ -1,7 +1,6 @@
1
1
  # natural_pdf/widgets/viewer.py
2
2
 
3
3
  import logging
4
- import os
5
4
 
6
5
  from natural_pdf.utils.visualization import render_plain_page
7
6
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.33
3
+ Version: 0.1.35
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -0,0 +1,121 @@
1
+ natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
2
+ natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
3
+ natural_pdf/analyzers/__init__.py,sha256=IPu_PMKFviDeEIeiC8_2KdeqH7z8OQ6q2v980hkByFY,672
4
+ natural_pdf/analyzers/guides.py,sha256=5Lqc51trtqmLvjxLjDS__mgeyviRrjV-CIIT69RmEt4,92327
5
+ natural_pdf/analyzers/shape_detection_mixin.py,sha256=Ef1o73QYVXQ2QcQMM_W9XRwY6vaIQHgxzD7etJ6LbiM,62820
6
+ natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
7
+ natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
8
+ natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
9
+ natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
10
+ natural_pdf/analyzers/layout/base.py,sha256=F5xPOJcI65N4nxwm0szvhtbDD6lVMqWDut8PSkTCobU,8349
11
+ natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
12
+ natural_pdf/analyzers/layout/gemini.py,sha256=ldECVCQ5HNQA3Omjg2NOsTrJXslyYb0vErDncmLIiuE,10510
13
+ natural_pdf/analyzers/layout/layout_analyzer.py,sha256=Ff7OfjMyGIWkfPKl6dHgkFyb-iru83_VDk0gmvyHbbg,15549
14
+ natural_pdf/analyzers/layout/layout_manager.py,sha256=ivMfTPjS14dyISu2o2Q6K48jkftvAOD04aCOtInkZGo,10267
15
+ natural_pdf/analyzers/layout/layout_options.py,sha256=2JENtBMHhP3hP0zpFI5-UP3-t1y49E7oLZnjd9d1eB0,7704
16
+ natural_pdf/analyzers/layout/paddle.py,sha256=44GG1sbaYTgvmtnrckNaCbDaNyw_D7FLLiSvzKP2cbk,23048
17
+ natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh6RoezjdepTnMl90SaNIrP29Pwc,5902
18
+ natural_pdf/analyzers/layout/surya.py,sha256=ugRXPIHiLoh65lfbbiXO317TbgdtQ-5kVN1nonEf4ws,9778
19
+ natural_pdf/analyzers/layout/table_structure_utils.py,sha256=_sugFWvVpRK3EimOCrikTDAalGnSaWqiqFbtJw8t-lg,2770
20
+ natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
21
+ natural_pdf/analyzers/layout/yolo.py,sha256=2Iz2-WsMy--ftkZQ8j5PGqp_1fTD7Mskl2kNnMUuwCU,8286
22
+ natural_pdf/classification/manager.py,sha256=wyENltPSeWpJNjqzU91-ydJTnACZ_LC1q-ox_tRhMIM,22172
23
+ natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGvLokS2w,9416
24
+ natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
25
+ natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
26
+ natural_pdf/collections/pdf_collection.py,sha256=sDVEbFMNME_2OaHIsCoR_W7V1cAATNw4ZRqKWa6nbqA,30131
27
+ natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
+ natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
29
+ natural_pdf/core/highlighting_service.py,sha256=2tBrrEq6d6hz5f6Yf7z5TysJdlTyuHTURBnQxokJnDM,40645
30
+ natural_pdf/core/page.py,sha256=Jw5SDshnHesqoC4yhtKEokeV08wMHuWZyWs5kDMOAjo,133204
31
+ natural_pdf/core/pdf.py,sha256=9t8Ks-AZp3yjH_lRkFZAyIkjUQoCTRbmXK7vSi1e4UE,92415
32
+ natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
33
+ natural_pdf/describe/base.py,sha256=CLhZXYQO6SOPUVWLt6VwZ7MK48t_6wgPMyFMLtTCKRc,18166
34
+ natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
35
+ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
36
+ natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
37
+ natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
38
+ natural_pdf/elements/base.py,sha256=-ZAcc8lb2aSWTKcprwKTvnR6hsDGDm7T8a1Y9V38E_A,52042
39
+ natural_pdf/elements/collections.py,sha256=7i279l8kpgzRyvjRr13n1BeqbC5ufwYx7lu_WmfXWTE,131199
40
+ natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
41
+ natural_pdf/elements/line.py,sha256=mHSeV-ZABY-Cc_K_NpFL53OGtTWlexYDlMvZc8_Vrx8,3845
42
+ natural_pdf/elements/rect.py,sha256=QuQg0Qo7XYQKBac-3Ss0n0ELV6icdPcrygWM2VWzeX8,3325
43
+ natural_pdf/elements/region.py,sha256=EqwtZJ2qgMyykuLVv2zO51oKJoSU4Hl7UA_mqTqRzmQ,143419
44
+ natural_pdf/elements/text.py,sha256=409RqADe0FYG_i99n6Dy0hl_fWTtBHRCzCq7BP0eAL8,18854
45
+ natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
46
+ natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
47
+ natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
48
+ natural_pdf/exporters/hocr.py,sha256=wksvJvWLSxuAfhYzg_0T2_W8eqDoMgAVC-gwZ9FoO_k,19969
49
+ natural_pdf/exporters/hocr_font.py,sha256=1wsGOMj6zoaRN2rxCwrv4MMLGawpNz984WgXpmWekgw,4574
50
+ natural_pdf/exporters/original_pdf.py,sha256=KYW0f9_zdouZq_ZwNGvYnu6WHqv7JWrrEAdPCVmhRV4,6782
51
+ natural_pdf/exporters/paddleocr.py,sha256=RBP03GCk0mLeC7tWtuti8AIUHlpOrtvbWkE2n7Ja7k8,19484
52
+ natural_pdf/exporters/searchable_pdf.py,sha256=G2Tc4tpDXSYIufXJlkA8ppW_3DuzHAaweYKae33pI_c,16290
53
+ natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
+ natural_pdf/exporters/data/pdf.ttf,sha256=x4RUIJJaI9iO2DCmOVe4r4Wmao2vjZ_JDoQ2c7LvGlk,572
55
+ natural_pdf/exporters/data/sRGB.icc,sha256=KpLUuuRQt22LCqQhk9-XTXX2Jzjs6_dPAcXnWxKpV5Y,6922
56
+ natural_pdf/extraction/manager.py,sha256=sASPJZ5cWFsl8A4PyTjg2yqkyC00tRl6glfoFA6HcsM,4979
57
+ natural_pdf/extraction/mixin.py,sha256=z0HNRs4x4RoioNjzg3slDeqoHbiPug0HB37bUHehqMY,25066
58
+ natural_pdf/extraction/result.py,sha256=PDaCCN2LQBbHsZy0_lrQ0ROeMsnmH1WRoXWOjk9M2o4,1825
59
+ natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
60
+ natural_pdf/flows/collections.py,sha256=iF8SsfKKb-YVIGi3m-yMRnfKgo_0n_EGhojnYK24h-Q,28493
61
+ natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
62
+ natural_pdf/flows/flow.py,sha256=ukkUqXsZmEw-QJEiVqEBLC8ktfBG2Bw56_RR1OEsd24,12802
63
+ natural_pdf/flows/region.py,sha256=nB634NCuC2BzBHuXAn8Ynf5lwZnR5mWb3RD36iEaPYY,27659
64
+ natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
65
+ natural_pdf/ocr/engine.py,sha256=SwNlWydtHbrIghV5JD_j5B4-rnjCMYIWUIEARag-zHw,11839
66
+ natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
67
+ natural_pdf/ocr/engine_easyocr.py,sha256=bWz6kHUgAJfe3rqdnZBAF-IPvw3B35DlvX5KDdFUtzo,9888
68
+ natural_pdf/ocr/engine_paddle.py,sha256=OmZlXVh2SSgNePqb6sMo2Mg5boX7REA4MUY25O7hKgU,16144
69
+ natural_pdf/ocr/engine_surya.py,sha256=lOvSbZk53VKFVxRmqcQzM_0dHVdwTkRGiDZ9AWCgL1Q,5951
70
+ natural_pdf/ocr/ocr_factory.py,sha256=Ix-p1SrV6dchq6YcbbCTf2BPBHSGwu9KBnwnZ_ohOuw,5282
71
+ natural_pdf/ocr/ocr_manager.py,sha256=U8EVzNgeRQxxAbMpCEZhkF7nr_R8Fcvtp28oeV_D-Ms,16229
72
+ natural_pdf/ocr/ocr_options.py,sha256=_BgLjIih6mY3k-AgkdXu9UDD8bykmQX2fpf37tAOhYQ,5146
73
+ natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
74
+ natural_pdf/qa/__init__.py,sha256=2u2KJcA71g1I0HnLD-j6yvDw1moAjo9kkLhhfoYRURM,166
75
+ natural_pdf/qa/document_qa.py,sha256=EduwpmUs8Oz35GrCfLw3312F_ngxIpWZLM8KNvasdrM,19887
76
+ natural_pdf/qa/qa_result.py,sha256=8_jL5MJAHR4LcjGVe5lVsFizxWieF6VI86DWaqetYxs,2167
77
+ natural_pdf/search/__init__.py,sha256=araouqM-l_m0VlluKf6i9BybAsHnfCuh39M0-xEI3jA,4273
78
+ natural_pdf/search/lancedb_search_service.py,sha256=dfz5IiMIcAc3KFzkBDF6Ab_JDLpLHqW6DO1JDkPPu1k,14458
79
+ natural_pdf/search/numpy_search_service.py,sha256=GwPwnX_wxBPFHe-bKS5upMRZLHj8PjLQ2d84lZygzHg,10331
80
+ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
81
+ natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
82
+ natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
83
+ natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
84
+ natural_pdf/selectors/parser.py,sha256=Flxjo_ZODBLQM8DQlQGqZTTQDyea3zUTzO9L2dtVabM,36402
85
+ natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
86
+ natural_pdf/tables/result.py,sha256=hrGIWDkImpdxsGzugcQKU-qrTgHwwfOigJDFdYl8aUc,3994
87
+ natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
88
+ natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
89
+ natural_pdf/utils/bidi_mirror.py,sha256=jJEES0xDrMfo5Me8kHMxHv4COS51PitnYi2EvKv3HCE,1151
90
+ natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,995
91
+ natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
92
+ natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
93
+ natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
94
+ natural_pdf/utils/packaging.py,sha256=e7U2wWvpunlAWpPFexNkD_c4dYbPp5LcKo7og4bNGvk,22411
95
+ natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
96
+ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6lSjBaOk,10854
97
+ natural_pdf/utils/visualization.py,sha256=olDkWtuVzP0NxRg0CP0DL-eXNCY7Bs-SH-2Xn-cjbo0,9370
98
+ natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
99
+ natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
100
+ natural_pdf-0.1.35.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
101
+ optimization/memory_comparison.py,sha256=F90D_5WhliSGAct_lyx93xd4q4F-jeo8QpGyDr8tmNw,6543
102
+ optimization/pdf_analyzer.py,sha256=xf6h-FNlqCpsm8NriXcs_bQZOB8eQkxgGGKVRL_jgCM,19347
103
+ optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
104
+ optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
105
+ optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
106
+ tools/bad_pdf_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
107
+ tools/bad_pdf_eval/analyser.py,sha256=bKUT3muP3ESE5i1D8sGyAS5tMzFMcq-i-xD_ZeUxYhY,13692
108
+ tools/bad_pdf_eval/collate_summaries.py,sha256=L_YsdiqmwGIHYWTVJqo6gyazyn3GIQgpfGGKk8uwckk,5159
109
+ tools/bad_pdf_eval/compile_attempts_markdown.py,sha256=ArFDZaSa9dz0ez0lsNlbUSK4hbvB3___DlfwqPEAZpY,4359
110
+ tools/bad_pdf_eval/eval_suite.py,sha256=zcapsGwO-VJ2OupJnPYKbrkzvzdGdoh2DZPK19bfkQg,4450
111
+ tools/bad_pdf_eval/evaluate_quality.py,sha256=-LR_shgxPVbaEZyWSVYKXTp2LNNVSdIwrlN5rllqntg,7149
112
+ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=1hd1iaTinmT8K1rlaHFV_ZvvbyuLEAnIbmKZUtRWv8o,1958
113
+ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1tP5Q,13313
114
+ tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
115
+ tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
116
+ tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
117
+ natural_pdf-0.1.35.dist-info/METADATA,sha256=SVdCwYrjweXrrmU8m2korCIMJENbN9zDasRCi2pkb8E,6711
118
+ natural_pdf-0.1.35.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
119
+ natural_pdf-0.1.35.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
120
+ natural_pdf-0.1.35.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
121
+ natural_pdf-0.1.35.dist-info/RECORD,,