natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/ocr/ocr_options.py CHANGED
@@ -67,9 +67,10 @@ class EasyOCROptions(BaseOCROptions):
67
67
  class PaddleOCROptions(BaseOCROptions):
68
68
  """Specific options for the PaddleOCR engine."""
69
69
 
70
- use_angle_cls: bool = True
70
+ # General
71
71
  use_gpu: Optional[bool] = None
72
- gpu_mem: int = 500
72
+ gpu_mem: int = 8000 # Default from Paddle documentation
73
+ gpu_mem: int = 8000 # Default from Paddle documentation
73
74
  ir_optim: bool = True
74
75
  use_tensorrt: bool = False
75
76
  min_subgraph_size: int = 15
@@ -77,19 +78,73 @@ class PaddleOCROptions(BaseOCROptions):
77
78
  enable_mkldnn: bool = False
78
79
  cpu_threads: int = 10
79
80
  use_fp16: bool = False
81
+ show_log: bool = False
82
+ use_onnx: bool = False
83
+ use_zero_copy_run: bool = False
84
+
85
+ # Detection
86
+ det: bool = True
87
+ det_algorithm: str = "DB"
88
+ show_log: bool = False
89
+ use_onnx: bool = False
90
+ use_zero_copy_run: bool = False
91
+
92
+ # Detection
93
+ det: bool = True
94
+ det_algorithm: str = "DB"
80
95
  det_model_dir: Optional[str] = None
96
+ det_limit_side_len: int = 960 # Corresponds to det_max_side_len
97
+ # DB specific
98
+ det_db_thresh: float = 0.3
99
+ det_db_box_thresh: float = 0.5
100
+ det_db_unclip_ratio: float = 2.0
101
+ # EAST specific
102
+ det_east_score_thresh: float = 0.8
103
+ det_east_cover_thresh: float = 0.1
104
+ det_east_nms_thresh: float = 0.2
105
+
106
+ # Recognition
107
+ rec: bool = True
108
+ rec_algorithm: str = "CRNN"
109
+ det_limit_side_len: int = 960 # Corresponds to det_max_side_len
110
+ # DB specific
111
+ det_db_thresh: float = 0.3
112
+ det_db_box_thresh: float = 0.5
113
+ det_db_unclip_ratio: float = 2.0
114
+ # EAST specific
115
+ det_east_score_thresh: float = 0.8
116
+ det_east_cover_thresh: float = 0.1
117
+ det_east_nms_thresh: float = 0.2
118
+
119
+ # Recognition
120
+ rec: bool = True
121
+ rec_algorithm: str = "CRNN"
81
122
  rec_model_dir: Optional[str] = None
82
- cls_model_dir: Optional[str] = None
83
- det_limit_side_len: int = 960
84
- rec_batch_num: int = 6
123
+ rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
124
+ rec_batch_num: int = 30 # Default from Paddle documentation
125
+ rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
126
+ rec_batch_num: int = 30 # Default from Paddle documentation
85
127
  max_text_length: int = 25
128
+ rec_char_dict_path: Optional[str] = None # Path to char dictionary file
129
+ rec_char_dict_path: Optional[str] = None # Path to char dictionary file
86
130
  use_space_char: bool = True
87
131
  drop_score: float = 0.5
88
- show_log: bool = False
89
- use_onnx: bool = False
90
- det: bool = True
91
- rec: bool = True
92
- cls: Optional[bool] = None
132
+
133
+ # Classification
134
+ cls: Optional[bool] = None # Often inferred from use_angle_cls
135
+ use_angle_cls: bool = False # Default from Paddle documentation
136
+ cls_model_dir: Optional[str] = None
137
+ cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
138
+ label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
139
+ cls_batch_num: int = 30
140
+
141
+ # Classification
142
+ cls: Optional[bool] = None # Often inferred from use_angle_cls
143
+ use_angle_cls: bool = False # Default from Paddle documentation
144
+ cls_model_dir: Optional[str] = None
145
+ cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
146
+ label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
147
+ cls_batch_num: int = 30
93
148
 
94
149
  def __post_init__(self):
95
150
  pass
@@ -111,5 +166,33 @@ class SuryaOCROptions(BaseOCROptions):
111
166
  pass
112
167
 
113
168
 
169
+ # --- Doctr Specific Options ---
170
+ @dataclass
171
+ class DoctrOCROptions(BaseOCROptions):
172
+ """Specific options for the doctr engine."""
173
+
174
+ # OCR predictor options
175
+ det_arch: str = "db_resnet50"
176
+ reco_arch: str = "crnn_vgg16_bn"
177
+ pretrained: bool = True
178
+ assume_straight_pages: bool = True # Faster if pages are straight
179
+ export_as_straight_boxes: bool = False # Output straight boxes even if rotated text is detected
180
+
181
+ # Additional options from standalone predictors
182
+ # Detection predictor options
183
+ symmetric_pad: bool = True
184
+ preserve_aspect_ratio: bool = True
185
+ batch_size: int = 1
186
+
187
+ # Postprocessing parameters
188
+ bin_thresh: Optional[float] = None # Default is usually 0.3
189
+ box_thresh: Optional[float] = None # Default is usually 0.1
190
+
191
+ # Options for orientation predictors
192
+ use_orientation_predictor: bool = False # Whether to use page orientation predictor
193
+
194
+
114
195
  # --- Union type for type hinting ---
115
- OCROptions = Union[EasyOCROptions, PaddleOCROptions, SuryaOCROptions, BaseOCROptions]
196
+ OCROptions = Union[
197
+ EasyOCROptions, PaddleOCROptions, SuryaOCROptions, DoctrOCROptions, BaseOCROptions
198
+ ]
natural_pdf/ocr/utils.py CHANGED
@@ -1,13 +1,18 @@
1
- import io
2
1
  import base64
2
+ import io
3
3
  import logging
4
- from typing import TYPE_CHECKING, Callable, Iterable, Optional, Any
5
- from natural_pdf.elements.text import TextElement
4
+ from typing import TYPE_CHECKING, Any, Callable, Iterable, Optional
5
+
6
6
  from tqdm.auto import tqdm
7
7
 
8
+ from natural_pdf.elements.text import TextElement
9
+
8
10
  if TYPE_CHECKING:
9
11
  from natural_pdf.elements.base import Element
10
12
 
13
+ # Import the global PDF render lock from dedicated locks module
14
+ from natural_pdf.utils.locks import pdf_render_lock
15
+
11
16
  logger = logging.getLogger(__name__)
12
17
 
13
18
 
@@ -72,7 +77,7 @@ def direct_ocr_llm(
72
77
  client,
73
78
  model="",
74
79
  resolution=150,
75
- prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc.",
80
+ prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc. If you cannot see any text, return an empty string.",
76
81
  padding=2,
77
82
  ) -> str:
78
83
  """Convenience method to directly OCR a region of the page."""
@@ -83,7 +88,15 @@ def direct_ocr_llm(
83
88
  region = element
84
89
 
85
90
  buffered = io.BytesIO()
86
- region_img = region.to_image(resolution=resolution, include_highlights=False)
91
+ # Use the global PDF render lock when rendering images
92
+ with pdf_render_lock:
93
+ region_img = region.to_image(resolution=resolution, include_highlights=False)
94
+
95
+ # Handle cases where image creation might fail (e.g., zero-dim region)
96
+ if region_img is None:
97
+ logger.warning(f"Could not generate image for region {region.bbox}, skipping OCR.")
98
+ return "" # Return empty string if image creation failed
99
+
87
100
  region_img.save(buffered, format="PNG")
88
101
  base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
89
102
 
@@ -107,7 +120,7 @@ def direct_ocr_llm(
107
120
  ],
108
121
  )
109
122
 
110
- corrected = response.choices[0].message.content
123
+ corrected = response.choices[0].message.content.strip()
111
124
  logger.debug(f"Corrected {region.extract_text()} to {corrected}")
112
125
 
113
126
  return corrected
natural_pdf/qa/document_qa.py CHANGED
@@ -58,10 +58,6 @@ class DocumentQA:
58
58
  import torch
59
59
  from transformers import pipeline
60
60
 
61
- # Determine device
62
- if device is None:
63
- device = "cuda" if torch.cuda.is_available() else "cpu"
64
-
65
61
  logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
66
62
 
67
63
  # Initialize the pipeline
natural_pdf/search/__init__.py CHANGED
@@ -24,63 +24,54 @@ from .search_service_protocol import Indexable, IndexConfigurationError, SearchS
24
24
 
25
25
  logger = logging.getLogger(__name__)
26
26
 
27
- # --- Factory Function ---
28
-
29
27
 
28
+ # Factory Function
30
29
  def get_search_service(
31
- collection_name: str, # Add collection_name as a required argument
32
- persist: bool = False, # Default to In-Memory
33
- # Configuration for the service itself
34
- default_persist_path: Optional[str] = None,
30
+ collection_name: str,
31
+ persist: bool = False,
32
+ uri: Optional[str] = None,
35
33
  default_embedding_model: Optional[str] = None,
36
- # Potential future args: cache_services=True? service_type='haystack'?
37
34
  ) -> SearchServiceProtocol:
38
35
  """
39
36
  Factory function to get an instance of the configured search service.
40
37
 
41
- A service instance is tied to a specific collection name.
38
+ A service instance is tied to a specific index name (collection/table).
42
39
 
43
40
  Currently, only returns HaystackSearchService but is structured for future extension.
44
41
 
45
42
  Args:
46
- collection_name: The name of the collection this service instance will manage.
43
+ collection_name: The logical name for the index this service instance manages
44
+ (used as table_name for LanceDB).
47
45
  persist: If True, creates a service instance configured for persistent
48
- storage (ChromaDB). If False (default), uses In-Memory.
49
- default_persist_path: Override the default path for persistent storage.
46
+ storage (currently LanceDB). If False (default), uses InMemory.
47
+ uri: Override the default path/URI for persistent storage.
50
48
  default_embedding_model: Override the default embedding model used by the service.
51
49
  **kwargs: Reserved for future configuration options.
52
50
 
53
51
  Returns:
54
- An instance conforming to the SearchServiceProtocol for the specified collection.
52
+ An instance conforming to the SearchServiceProtocol for the specified collection/table.
55
53
  """
56
54
  logger.debug(
57
- f"Calling get_search_service factory for collection '{collection_name}' (persist={persist})..."
55
+ f"Calling get_search_service factory for index '{collection_name}' (persist={persist}, uri={uri})..."
58
56
  )
59
57
 
60
- # For now, we only have one implementation
61
58
  # Collect arguments relevant to HaystackSearchService.__init__
62
59
  service_args = {}
63
- service_args["collection_name"] = collection_name # Pass collection_name
64
- service_args["persist"] = persist # Pass persist flag to service constructor
65
- if default_persist_path is not None:
66
- service_args["default_persist_path"] = default_persist_path
60
+ service_args["table_name"] = collection_name
61
+ service_args["persist"] = persist
62
+ if uri is not None:
63
+ service_args["uri"] = uri
67
64
  if default_embedding_model is not None:
68
- service_args["default_embedding_model"] = default_embedding_model
65
+ service_args["embedding_model"] = default_embedding_model
69
66
 
70
- # TODO: Implement caching/registry if needed to return the same instance
71
- # for the same configuration instead of always creating a new one.
72
- # cache_key = tuple(sorted(service_args.items()))
73
- # if cache_key in _service_instance_cache:
74
- # return _service_instance_cache[cache_key]
67
+ # Cache logic commented out as before
75
68
 
76
69
  try:
77
70
  service_instance = HaystackSearchService(**service_args)
78
- # _service_instance_cache[cache_key] = service_instance
79
- logger.info(
80
- f"Created new HaystackSearchService instance for collection '{collection_name}'."
81
- )
71
+ logger.info(f"Created new HaystackSearchService instance for index '{collection_name}'.")
82
72
  return service_instance
83
73
  except ImportError as e:
74
+ # Error message remains valid
84
75
  logger.error(
85
76
  f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
86
77
  )
@@ -92,9 +83,4 @@ def get_search_service(
92
83
  raise RuntimeError("Could not create Search Service instance.") from e
93
84
 
94
85
 
95
- # --- Optional: Define a default instance for extreme ease of use? ---
96
- # try:
97
- # default_search_service = get_search_service()
98
- # except Exception:
99
- # default_search_service = None
100
- # logger.warning("Could not create default search service instance on import.")
86
+ # Default instance commented out as before