natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,149 +0,0 @@
1
- # Finding Specific Elements
2
-
3
- Extracting all the text is useful, but often you need specific pieces of information. `natural-pdf` lets you find elements using selectors, similar to CSS.
4
-
5
- Let's find the "Site" and "Date" information from our `01-practice.pdf`:
6
-
7
- ```python
8
- #%pip install "natural-pdf[all]"
9
- ```
10
-
11
-
12
- ```python
13
- from natural_pdf import PDF
14
-
15
- # Load a PDF
16
- pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
17
-
18
- # Get the first page (index 0)
19
- page = pdf.pages[0]
20
-
21
- # Find the text element containing "Site:"
22
- # The ':contains()' pseudo-class looks for text content.
23
- site_label = page.find('text:contains("Site:")')
24
-
25
- # Find the text element containing "Date:"
26
- date_label = page.find('text:contains("Date:")')
27
-
28
- # Visualize the found elements
29
- site_label.highlight(color="red", label="Site Label")
30
- date_label.highlight(color="blue", label="Date Label")
31
-
32
- # Access the text content directly
33
- {
34
- "Site Label": site_label.text,
35
- "Date Label": date_label.text
36
- }
37
-
38
- # Display the page image to see the visualized elements
39
- page.to_image()
40
- ```
41
-
42
- ## Finding Elements by Color
43
-
44
- You can find elements based on their color:
45
-
46
- ```python
47
- # Find text elements that are red
48
- red_text = page.find('text[color~=red]')
49
- red_text.highlight(color="red", label="Red Text")
50
- print(f"Found red text: {red_text.text}")
51
-
52
- # Find elements with specific RGB colors
53
- blue_text = page.find('text[color=rgb(0,0,255)]')
54
- ```
55
-
56
- ## Finding Lines and Shapes
57
-
58
- Find lines and rectangles based on their properties:
59
-
60
- ```python
61
- # Find horizontal lines
62
- horizontal_lines = page.find_all('line[horizontal]')
63
-
64
- # Find thick lines (width >= 2)
65
- thick_lines = page.find_all('line[width>=2]')
66
-
67
- # Find rectangles
68
- rectangles = page.find_all('rect')
69
-
70
- # Visualize what we found
71
- page.clear_highlights()
72
- horizontal_lines.highlight(color="blue", label="Horizontal Lines")
73
- thick_lines.highlight(color="red", label="Thick Lines")
74
- rectangles.highlight(color="green", label="Rectangles")
75
- page.to_image()
76
- ```
77
-
78
- ## Finding Elements by Font Properties
79
-
80
- ```python
81
- # Find text with specific font properties
82
- bold_text = page.find_all('text[style~=bold]')
83
- large_text = page.find_all('text[size>=12]')
84
-
85
- # Find text with specific font names
86
- helvetica_text = page.find_all('text[fontname~=Helvetica]')
87
- ```
88
-
89
- ## Spatial Navigation
90
-
91
- You can find elements based on their position relative to other elements:
92
-
93
- ```python
94
- # Find text above a specific element
95
- above_text = page.find('line[width=2]').above().extract_text()
96
-
97
- # Find text below a specific element
98
- below_text = page.find('text:contains("Summary")').below().extract_text()
99
-
100
- # Find text to the right of a specific element
101
- nearby_text = page.find('text:contains("Site")').right(width=200).extract_text()
102
- ```
103
-
104
- ## Combining Selectors
105
-
106
- You can combine multiple conditions to find exactly what you need:
107
-
108
- ```python
109
- # Find large, bold text that contains specific words
110
- important_text = page.find_all('text[size>=12][style~=bold]:contains("Critical")')
111
-
112
- # Find red text inside a rectangle
113
- highlighted_text = page.find('rect').find_all('text[color~=red]')
114
- ```
115
-
116
- <div class="admonition note">
117
- <p class="admonition-title">Handling Missing Elements</p>
118
-
119
- In these examples, we know certain elements exist in the PDF. In real-world scenarios, `page.find()` might not find a match, in which case it returns `None`. Production code should check for this:
120
-
121
- ```py
122
- site_label = page.find('text:contains("Site:")')
123
- if site_label:
124
- # Found it! Proceed...
125
- site_label.highlight(color="red", label="Site Label")
126
- site_label.text # Display or use the text
127
- else:
128
- # Didn't find it, handle appropriately...
129
- "Warning: 'Site:' label not found."
130
- ```
131
- </div>
132
-
133
- <div class="admonition tip">
134
- <p class="admonition-title">Visual Debugging</p>
135
-
136
- When working with complex selectors, it's helpful to visualize what you're finding:
137
-
138
- ```py
139
- # Clear any existing highlights
140
- page.clear_highlights()
141
-
142
- # Find and highlight elements
143
- elements = page.find_all('text[color~=red]')
144
- elements.highlight(color="red", label="Red Text")
145
-
146
- # Display the page to see what was found
147
- page.to_image(width=800)
148
- ```
149
- </div>