natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +241 -158
  13. natural_pdf/classification/mixin.py +52 -38
  14. natural_pdf/classification/results.py +71 -45
  15. natural_pdf/collections/mixins.py +85 -20
  16. natural_pdf/collections/pdf_collection.py +245 -100
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +694 -195
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +610 -134
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.10.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
@@ -69,7 +69,8 @@ class PaddleOCROptions(BaseOCROptions):
69
69
 
70
70
  # General
71
71
  use_gpu: Optional[bool] = None
72
- gpu_mem: int = 8000 # Default from Paddle documentation
72
+ gpu_mem: int = 8000 # Default from Paddle documentation
73
+ gpu_mem: int = 8000 # Default from Paddle documentation
73
74
  ir_optim: bool = True
74
75
  use_tensorrt: bool = False
75
76
  min_subgraph_size: int = 15
@@ -81,11 +82,31 @@ class PaddleOCROptions(BaseOCROptions):
81
82
  use_onnx: bool = False
82
83
  use_zero_copy_run: bool = False
83
84
 
85
+ # Detection
86
+ det: bool = True
87
+ det_algorithm: str = "DB"
88
+ show_log: bool = False
89
+ use_onnx: bool = False
90
+ use_zero_copy_run: bool = False
91
+
84
92
  # Detection
85
93
  det: bool = True
86
94
  det_algorithm: str = "DB"
87
95
  det_model_dir: Optional[str] = None
88
- det_limit_side_len: int = 960 # Corresponds to det_max_side_len
96
+ det_limit_side_len: int = 960 # Corresponds to det_max_side_len
97
+ # DB specific
98
+ det_db_thresh: float = 0.3
99
+ det_db_box_thresh: float = 0.5
100
+ det_db_unclip_ratio: float = 2.0
101
+ # EAST specific
102
+ det_east_score_thresh: float = 0.8
103
+ det_east_cover_thresh: float = 0.1
104
+ det_east_nms_thresh: float = 0.2
105
+
106
+ # Recognition
107
+ rec: bool = True
108
+ rec_algorithm: str = "CRNN"
109
+ det_limit_side_len: int = 960 # Corresponds to det_max_side_len
89
110
  # DB specific
90
111
  det_db_thresh: float = 0.3
91
112
  det_db_box_thresh: float = 0.5
@@ -99,19 +120,30 @@ class PaddleOCROptions(BaseOCROptions):
99
120
  rec: bool = True
100
121
  rec_algorithm: str = "CRNN"
101
122
  rec_model_dir: Optional[str] = None
102
- rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
103
- rec_batch_num: int = 30 # Default from Paddle documentation
123
+ rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
124
+ rec_batch_num: int = 30 # Default from Paddle documentation
125
+ rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
126
+ rec_batch_num: int = 30 # Default from Paddle documentation
104
127
  max_text_length: int = 25
105
- rec_char_dict_path: Optional[str] = None # Path to char dictionary file
128
+ rec_char_dict_path: Optional[str] = None # Path to char dictionary file
129
+ rec_char_dict_path: Optional[str] = None # Path to char dictionary file
106
130
  use_space_char: bool = True
107
131
  drop_score: float = 0.5
108
132
 
109
133
  # Classification
110
- cls: Optional[bool] = None # Often inferred from use_angle_cls
111
- use_angle_cls: bool = False # Default from Paddle documentation
134
+ cls: Optional[bool] = None # Often inferred from use_angle_cls
135
+ use_angle_cls: bool = False # Default from Paddle documentation
136
+ cls_model_dir: Optional[str] = None
137
+ cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
138
+ label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
139
+ cls_batch_num: int = 30
140
+
141
+ # Classification
142
+ cls: Optional[bool] = None # Often inferred from use_angle_cls
143
+ use_angle_cls: bool = False # Default from Paddle documentation
112
144
  cls_model_dir: Optional[str] = None
113
- cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
114
- label_list: List[str] = field(default_factory=lambda: ['0', '180']) # Default from Paddle doc
145
+ cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
146
+ label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
115
147
  cls_batch_num: int = 30
116
148
 
117
149
  def __post_init__(self):
@@ -134,5 +166,33 @@ class SuryaOCROptions(BaseOCROptions):
134
166
  pass
135
167
 
136
168
 
169
+ # --- Doctr Specific Options ---
170
+ @dataclass
171
+ class DoctrOCROptions(BaseOCROptions):
172
+ """Specific options for the doctr engine."""
173
+
174
+ # OCR predictor options
175
+ det_arch: str = "db_resnet50"
176
+ reco_arch: str = "crnn_vgg16_bn"
177
+ pretrained: bool = True
178
+ assume_straight_pages: bool = True # Faster if pages are straight
179
+ export_as_straight_boxes: bool = False # Output straight boxes even if rotated text is detected
180
+
181
+ # Additional options from standalone predictors
182
+ # Detection predictor options
183
+ symmetric_pad: bool = True
184
+ preserve_aspect_ratio: bool = True
185
+ batch_size: int = 1
186
+
187
+ # Postprocessing parameters
188
+ bin_thresh: Optional[float] = None # Default is usually 0.3
189
+ box_thresh: Optional[float] = None # Default is usually 0.1
190
+
191
+ # Options for orientation predictors
192
+ use_orientation_predictor: bool = False # Whether to use page orientation predictor
193
+
194
+
137
195
  # --- Union type for type hinting ---
138
- OCROptions = Union[EasyOCROptions, PaddleOCROptions, SuryaOCROptions, BaseOCROptions]
196
+ OCROptions = Union[
197
+ EasyOCROptions, PaddleOCROptions, SuryaOCROptions, DoctrOCROptions, BaseOCROptions
198
+ ]
natural_pdf/ocr/utils.py CHANGED
@@ -1,10 +1,12 @@
1
- import io
2
1
  import base64
2
+ import io
3
3
  import logging
4
- from typing import TYPE_CHECKING, Callable, Iterable, Optional, Any
5
- from natural_pdf.elements.text import TextElement
4
+ from typing import TYPE_CHECKING, Any, Callable, Iterable, Optional
5
+
6
6
  from tqdm.auto import tqdm
7
7
 
8
+ from natural_pdf.elements.text import TextElement
9
+
8
10
  if TYPE_CHECKING:
9
11
  from natural_pdf.elements.base import Element
10
12
 
@@ -93,7 +95,7 @@ def direct_ocr_llm(
93
95
  # Handle cases where image creation might fail (e.g., zero-dim region)
94
96
  if region_img is None:
95
97
  logger.warning(f"Could not generate image for region {region.bbox}, skipping OCR.")
96
- return "" # Return empty string if image creation failed
98
+ return "" # Return empty string if image creation failed
97
99
 
98
100
  region_img.save(buffered, format="PNG")
99
101
  base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
@@ -24,63 +24,54 @@ from .search_service_protocol import Indexable, IndexConfigurationError, SearchS
24
24
 
25
25
  logger = logging.getLogger(__name__)
26
26
 
27
- # --- Factory Function ---
28
-
29
27
 
28
+ # Factory Function
30
29
  def get_search_service(
31
- collection_name: str, # Add collection_name as a required argument
32
- persist: bool = False, # Default to In-Memory
33
- # Configuration for the service itself
34
- default_persist_path: Optional[str] = None,
30
+ collection_name: str,
31
+ persist: bool = False,
32
+ uri: Optional[str] = None,
35
33
  default_embedding_model: Optional[str] = None,
36
- # Potential future args: cache_services=True? service_type='haystack'?
37
34
  ) -> SearchServiceProtocol:
38
35
  """
39
36
  Factory function to get an instance of the configured search service.
40
37
 
41
- A service instance is tied to a specific collection name.
38
+ A service instance is tied to a specific index name (collection/table).
42
39
 
43
40
  Currently, only returns HaystackSearchService but is structured for future extension.
44
41
 
45
42
  Args:
46
- collection_name: The name of the collection this service instance will manage.
43
+ collection_name: The logical name for the index this service instance manages
44
+ (used as table_name for LanceDB).
47
45
  persist: If True, creates a service instance configured for persistent
48
- storage (ChromaDB). If False (default), uses In-Memory.
49
- default_persist_path: Override the default path for persistent storage.
46
+ storage (currently LanceDB). If False (default), uses InMemory.
47
+ uri: Override the default path/URI for persistent storage.
50
48
  default_embedding_model: Override the default embedding model used by the service.
51
49
  **kwargs: Reserved for future configuration options.
52
50
 
53
51
  Returns:
54
- An instance conforming to the SearchServiceProtocol for the specified collection.
52
+ An instance conforming to the SearchServiceProtocol for the specified collection/table.
55
53
  """
56
54
  logger.debug(
57
- f"Calling get_search_service factory for collection '{collection_name}' (persist={persist})..."
55
+ f"Calling get_search_service factory for index '{collection_name}' (persist={persist}, uri={uri})..."
58
56
  )
59
57
 
60
- # For now, we only have one implementation
61
58
  # Collect arguments relevant to HaystackSearchService.__init__
62
59
  service_args = {}
63
- service_args["collection_name"] = collection_name # Pass collection_name
64
- service_args["persist"] = persist # Pass persist flag to service constructor
65
- if default_persist_path is not None:
66
- service_args["default_persist_path"] = default_persist_path
60
+ service_args["table_name"] = collection_name
61
+ service_args["persist"] = persist
62
+ if uri is not None:
63
+ service_args["uri"] = uri
67
64
  if default_embedding_model is not None:
68
- service_args["default_embedding_model"] = default_embedding_model
65
+ service_args["embedding_model"] = default_embedding_model
69
66
 
70
- # TODO: Implement caching/registry if needed to return the same instance
71
- # for the same configuration instead of always creating a new one.
72
- # cache_key = tuple(sorted(service_args.items()))
73
- # if cache_key in _service_instance_cache:
74
- # return _service_instance_cache[cache_key]
67
+ # Cache logic commented out as before
75
68
 
76
69
  try:
77
70
  service_instance = HaystackSearchService(**service_args)
78
- # _service_instance_cache[cache_key] = service_instance
79
- logger.info(
80
- f"Created new HaystackSearchService instance for collection '{collection_name}'."
81
- )
71
+ logger.info(f"Created new HaystackSearchService instance for index '{collection_name}'.")
82
72
  return service_instance
83
73
  except ImportError as e:
74
+ # Error message remains valid
84
75
  logger.error(
85
76
  f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
86
77
  )
@@ -92,9 +83,4 @@ def get_search_service(
92
83
  raise RuntimeError("Could not create Search Service instance.") from e
93
84
 
94
85
 
95
- # --- Optional: Define a default instance for extreme ease of use? ---
96
- # try:
97
- # default_search_service = get_search_service()
98
- # except Exception:
99
- # default_search_service = None
100
- # logger.warning("Could not create default search service instance on import.")
86
+ # Default instance commented out as before