natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -2,13 +2,15 @@
2
2
  Base Element class for natural-pdf.
3
3
  """
4
4
 
5
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
5
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, overload
6
6
 
7
7
  from PIL import Image
8
8
 
9
+ # Import selector parsing functions
10
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
11
+
9
12
  if TYPE_CHECKING:
10
13
  from natural_pdf.core.page import Page
11
- from natural_pdf.elements.base import Element
12
14
  from natural_pdf.elements.collections import ElementCollection
13
15
  from natural_pdf.elements.region import Region
14
16
 
@@ -135,19 +137,11 @@ class DirectionalMixin:
135
137
  # Adjust cross boundaries if cross_size is 'element'
136
138
  if cross_size == "element":
137
139
  if is_horizontal: # Adjust y0, y1
138
- target_y0 = (
139
- target.top if include_endpoint else target.bottom
140
- ) # Use opposite boundary if excluding
141
- target_y1 = target.bottom if include_endpoint else target.top
142
- y0 = min(y0, target_y0)
143
- y1 = max(y1, target_y1)
140
+ y0 = min(y0, self.y0)
141
+ y1 = max(y1, self.y1)
144
142
  else: # Adjust x0, x1
145
- target_x0 = (
146
- target.x0 if include_endpoint else target.x1
147
- ) # Use opposite boundary if excluding
148
- target_x1 = target.x1 if include_endpoint else target.x0
149
- x0 = min(x0, target_x0)
150
- x1 = max(x1, target_x1)
143
+ x0 = min(x0, self.x0)
144
+ x1 = max(x1, self.x1)
151
145
 
152
146
  # 4. Finalize bbox coordinates
153
147
  if is_horizontal:
@@ -525,7 +519,7 @@ class Element(DirectionalMixin):
525
519
  selector: Optional selector to filter by
526
520
  limit: Maximum number of elements to search through (default: 10)
527
521
  apply_exclusions: Whether to apply exclusion regions (default: True)
528
- **kwargs: Additional parameters
522
+ **kwargs: Additional parameters for selector filtering (e.g., regex, case)
529
523
 
530
524
  Returns:
531
525
  Next element or None if not found
@@ -548,13 +542,19 @@ class Element(DirectionalMixin):
548
542
  # Limit search range for performance
549
543
  candidates = candidates[:limit] if limit else candidates
550
544
 
551
- # Find matching elements
552
- from natural_pdf.elements.collections import ElementCollection
545
+ # Parse the selector and create a filter function
546
+ parsed_selector = parse_selector(selector)
547
+ # Pass relevant kwargs (like regex, case) to the filter function builder
548
+ filter_func = selector_to_filter_func(parsed_selector, **kwargs)
549
+
550
+ # Iterate and return the first match
551
+ for candidate in candidates:
552
+ if filter_func(candidate):
553
+ return candidate
554
+ return None # No match found
553
555
 
554
- matches = ElementCollection(candidates).find_all(selector, **kwargs)
555
- return matches[0] if matches else None
556
+ # No selector, just return the next element if it exists
556
557
  elif idx + 1 < len(all_elements):
557
- # No selector, just return the next element
558
558
  return all_elements[idx + 1]
559
559
 
560
560
  return None
@@ -573,7 +573,7 @@ class Element(DirectionalMixin):
573
573
  selector: Optional selector to filter by
574
574
  limit: Maximum number of elements to search through (default: 10)
575
575
  apply_exclusions: Whether to apply exclusion regions (default: True)
576
- **kwargs: Additional parameters
576
+ **kwargs: Additional parameters for selector filtering (e.g., regex, case)
577
577
 
578
578
  Returns:
579
579
  Previous element or None if not found
@@ -598,13 +598,19 @@ class Element(DirectionalMixin):
598
598
  # Limit search range for performance
599
599
  candidates = candidates[:limit] if limit else candidates
600
600
 
601
- # Find matching elements using ElementCollection
602
- from natural_pdf.elements.collections import ElementCollection
601
+ # Parse the selector and create a filter function
602
+ parsed_selector = parse_selector(selector)
603
+ # Pass relevant kwargs (like regex, case) to the filter function builder
604
+ filter_func = selector_to_filter_func(parsed_selector, **kwargs)
605
+
606
+ # Iterate and return the first match (from reversed list)
607
+ for candidate in candidates:
608
+ if filter_func(candidate):
609
+ return candidate
610
+ return None # No match found
603
611
 
604
- matches = ElementCollection(candidates).find_all(selector, **kwargs)
605
- return matches[0] if matches else None # find_all returns a collection
612
+ # No selector, just return the previous element if it exists
606
613
  elif idx > 0:
607
- # No selector, just return the previous element
608
614
  return all_elements[idx - 1]
609
615
 
610
616
  return None
@@ -887,40 +893,128 @@ class Element(DirectionalMixin):
887
893
  """String representation of the element."""
888
894
  return f"<{self.__class__.__name__} bbox={self.bbox}>"
889
895
 
890
- def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional["Element"]:
896
+ @overload
897
+ def find(
898
+ self,
899
+ *,
900
+ text: str,
901
+ apply_exclusions: bool = True,
902
+ regex: bool = False,
903
+ case: bool = True,
904
+ **kwargs,
905
+ ) -> Optional["Element"]: ...
906
+
907
+ @overload
908
+ def find(
909
+ self,
910
+ selector: str,
911
+ *,
912
+ apply_exclusions: bool = True,
913
+ regex: bool = False,
914
+ case: bool = True,
915
+ **kwargs,
916
+ ) -> Optional["Element"]: ...
917
+
918
+ def find(
919
+ self,
920
+ selector: Optional[str] = None,
921
+ *,
922
+ text: Optional[str] = None,
923
+ apply_exclusions: bool = True,
924
+ regex: bool = False,
925
+ case: bool = True,
926
+ **kwargs,
927
+ ) -> Optional["Element"]:
891
928
  """
892
- Find first element within this element's bounds matching the selector.
929
+ Find first element within this element's bounds matching the selector OR text.
893
930
  Creates a temporary region to perform the search.
894
931
 
932
+ Provide EITHER `selector` OR `text`, but not both.
933
+
895
934
  Args:
896
- selector: CSS-like selector string
897
- apply_exclusions: Whether to apply exclusion regions
898
- **kwargs: Additional parameters for element filtering
935
+ selector: CSS-like selector string.
936
+ text: Text content to search for (equivalent to 'text:contains(...)').
937
+ apply_exclusions: Whether to apply exclusion regions (default: True).
938
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
939
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
940
+ **kwargs: Additional parameters for element filtering.
899
941
 
900
942
  Returns:
901
- First matching element or None
943
+ First matching element or None.
902
944
  """
903
945
  from natural_pdf.elements.region import Region
904
946
 
905
947
  # Create a temporary region from this element's bounds
906
948
  temp_region = Region(self.page, self.bbox)
907
- return temp_region.find(selector, apply_exclusions=apply_exclusions, **kwargs)
949
+ # Delegate to the region's find method
950
+ return temp_region.find(
951
+ selector=selector,
952
+ text=text,
953
+ apply_exclusions=apply_exclusions,
954
+ regex=regex,
955
+ case=case,
956
+ **kwargs,
957
+ )
958
+
959
+ @overload
960
+ def find_all(
961
+ self,
962
+ *,
963
+ text: str,
964
+ apply_exclusions: bool = True,
965
+ regex: bool = False,
966
+ case: bool = True,
967
+ **kwargs,
968
+ ) -> "ElementCollection": ...
969
+
970
+ @overload
971
+ def find_all(
972
+ self,
973
+ selector: str,
974
+ *,
975
+ apply_exclusions: bool = True,
976
+ regex: bool = False,
977
+ case: bool = True,
978
+ **kwargs,
979
+ ) -> "ElementCollection": ...
908
980
 
909
- def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> "ElementCollection":
981
+ def find_all(
982
+ self,
983
+ selector: Optional[str] = None,
984
+ *,
985
+ text: Optional[str] = None,
986
+ apply_exclusions: bool = True,
987
+ regex: bool = False,
988
+ case: bool = True,
989
+ **kwargs,
990
+ ) -> "ElementCollection":
910
991
  """
911
- Find all elements within this element's bounds matching the selector.
992
+ Find all elements within this element's bounds matching the selector OR text.
912
993
  Creates a temporary region to perform the search.
913
994
 
995
+ Provide EITHER `selector` OR `text`, but not both.
996
+
914
997
  Args:
915
- selector: CSS-like selector string
916
- apply_exclusions: Whether to apply exclusion regions
917
- **kwargs: Additional parameters for element filtering
998
+ selector: CSS-like selector string.
999
+ text: Text content to search for (equivalent to 'text:contains(...)').
1000
+ apply_exclusions: Whether to apply exclusion regions (default: True).
1001
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1002
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
1003
+ **kwargs: Additional parameters for element filtering.
918
1004
 
919
1005
  Returns:
920
- ElementCollection with matching elements
1006
+ ElementCollection with matching elements.
921
1007
  """
922
1008
  from natural_pdf.elements.region import Region
923
1009
 
924
1010
  # Create a temporary region from this element's bounds
925
1011
  temp_region = Region(self.page, self.bbox)
926
- return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1012
+ # Delegate to the region's find_all method
1013
+ return temp_region.find_all(
1014
+ selector=selector,
1015
+ text=text,
1016
+ apply_exclusions=apply_exclusions,
1017
+ regex=regex,
1018
+ case=case,
1019
+ **kwargs,
1020
+ )