natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1888 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "f4fa5be5",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Semantic Search Across Multiple Documents\n",
9
+ "\n",
10
+ "When working with a collection of PDFs, you might need to find information relevant to a specific query across all documents, not just within a single one. This tutorial demonstrates how to perform semantic search over a `PDFCollection`."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "25274d94",
17
+ "metadata": {
18
+ "execution": {
19
+ "iopub.execute_input": "2025-04-16T14:59:23.384091Z",
20
+ "iopub.status.busy": "2025-04-16T14:59:23.383945Z",
21
+ "iopub.status.idle": "2025-04-16T14:59:23.386424Z",
22
+ "shell.execute_reply": "2025-04-16T14:59:23.386138Z"
23
+ }
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "#%pip install \"natural-pdf[all]\"\n",
28
+ "#%pip install \"natural-pdf[search]\" # Ensure search dependencies are installed"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 2,
34
+ "id": "78286236",
35
+ "metadata": {
36
+ "execution": {
37
+ "iopub.execute_input": "2025-04-16T14:59:23.387957Z",
38
+ "iopub.status.busy": "2025-04-16T14:59:23.387832Z",
39
+ "iopub.status.idle": "2025-04-16T14:59:33.969693Z",
40
+ "shell.execute_reply": "2025-04-16T14:59:33.969249Z"
41
+ }
42
+ },
43
+ "outputs": [
44
+ {
45
+ "name": "stderr",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "natural_pdf.collections.pdf_collection - INFO - Initializing 2 PDF objects...\n"
49
+ ]
50
+ },
51
+ {
52
+ "name": "stderr",
53
+ "output_type": "stream",
54
+ "text": [
55
+ "\u001b[2m2025-04-16T14:59:29.336679Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mInitializing 2 PDF objects... \u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m145\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mInitializing 2 PDF objects...\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.collections.pdf_collection\u001b[0m\n"
56
+ ]
57
+ },
58
+ {
59
+ "name": "stderr",
60
+ "output_type": "stream",
61
+ "text": [
62
+ "[2025-04-16 17:59:29,336] [ INFO] pdf_collection.py:145 - Initializing 2 PDF objects...\n"
63
+ ]
64
+ },
65
+ {
66
+ "name": "stderr",
67
+ "output_type": "stream",
68
+ "text": [
69
+ "\r",
70
+ "Loading PDFs: 0%| | 0/2 [00:00<?, ?it/s]"
71
+ ]
72
+ },
73
+ {
74
+ "name": "stderr",
75
+ "output_type": "stream",
76
+ "text": [
77
+ "natural_pdf.core.pdf - INFO - Downloading PDF from URL: https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\n"
78
+ ]
79
+ },
80
+ {
81
+ "name": "stderr",
82
+ "output_type": "stream",
83
+ "text": [
84
+ "\u001b[2m2025-04-16T14:59:29.353564Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mDownloading PDF from URL: https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m80\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mDownloading PDF from URL: https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.pdf\u001b[0m\n"
85
+ ]
86
+ },
87
+ {
88
+ "name": "stderr",
89
+ "output_type": "stream",
90
+ "text": [
91
+ "[2025-04-16 17:59:29,353] [ INFO] pdf.py:80 - Downloading PDF from URL: https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\n"
92
+ ]
93
+ },
94
+ {
95
+ "name": "stderr",
96
+ "output_type": "stream",
97
+ "text": [
98
+ "natural_pdf.core.pdf - INFO - PDF downloaded to temporary file: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpwq_3f750.pdf\n"
99
+ ]
100
+ },
101
+ {
102
+ "name": "stderr",
103
+ "output_type": "stream",
104
+ "text": [
105
+ "\u001b[2m2025-04-16T14:59:29.636697Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPDF downloaded to temporary file: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpwq_3f750.pdf\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m93\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mPDF downloaded to temporary file: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpwq_3f750.pdf\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.pdf\u001b[0m\n"
106
+ ]
107
+ },
108
+ {
109
+ "name": "stderr",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "[2025-04-16 17:59:29,636] [ INFO] pdf.py:93 - PDF downloaded to temporary file: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpwq_3f750.pdf\n"
113
+ ]
114
+ },
115
+ {
116
+ "name": "stderr",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "natural_pdf.core.pdf - INFO - Initializing PDF from /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpwq_3f750.pdf\n"
120
+ ]
121
+ },
122
+ {
123
+ "name": "stderr",
124
+ "output_type": "stream",
125
+ "text": [
126
+ "\u001b[2m2025-04-16T14:59:29.637422Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mInitializing PDF from /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpwq_3f750.pdf\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m106\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mInitializing PDF from /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpwq_3f750.pdf\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.pdf\u001b[0m\n"
127
+ ]
128
+ },
129
+ {
130
+ "name": "stderr",
131
+ "output_type": "stream",
132
+ "text": [
133
+ "[2025-04-16 17:59:29,637] [ INFO] pdf.py:106 - Initializing PDF from /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpwq_3f750.pdf\n"
134
+ ]
135
+ },
136
+ {
137
+ "name": "stderr",
138
+ "output_type": "stream",
139
+ "text": [
140
+ "natural_pdf.ocr.ocr_manager - INFO - OCRManager initialized.\n"
141
+ ]
142
+ },
143
+ {
144
+ "name": "stderr",
145
+ "output_type": "stream",
146
+ "text": [
147
+ "\u001b[2m2025-04-16T14:59:29.638958Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mOCRManager initialized. \u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m38\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mOCRManager initialized.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.ocr.ocr_manager\u001b[0m\n"
148
+ ]
149
+ },
150
+ {
151
+ "name": "stderr",
152
+ "output_type": "stream",
153
+ "text": [
154
+ "[2025-04-16 17:59:29,638] [ INFO] ocr_manager.py:38 - OCRManager initialized.\n"
155
+ ]
156
+ },
157
+ {
158
+ "name": "stderr",
159
+ "output_type": "stream",
160
+ "text": [
161
+ "natural_pdf.analyzers.layout.layout_manager - INFO - LayoutManager initialized. Available engines: ['yolo', 'tatr', 'paddle', 'surya', 'docling']\n"
162
+ ]
163
+ },
164
+ {
165
+ "name": "stderr",
166
+ "output_type": "stream",
167
+ "text": [
168
+ "\u001b[2m2025-04-16T14:59:29.639737Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mLayoutManager initialized. Available engines: ['yolo', 'tatr', 'paddle', 'surya', 'docling']\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m68\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mLayoutManager initialized. Available engines: ['yolo', 'tatr', 'paddle', 'surya', 'docling']\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.layout_manager\u001b[0m\n"
169
+ ]
170
+ },
171
+ {
172
+ "name": "stderr",
173
+ "output_type": "stream",
174
+ "text": [
175
+ "[2025-04-16 17:59:29,639] [ INFO] layout_manager.py:68 - LayoutManager initialized. Available engines: ['yolo', 'tatr', 'paddle', 'surya', 'docling']\n"
176
+ ]
177
+ },
178
+ {
179
+ "name": "stderr",
180
+ "output_type": "stream",
181
+ "text": [
182
+ "natural_pdf.core.highlighting_service - INFO - HighlightingService initialized with ColorManager.\n"
183
+ ]
184
+ },
185
+ {
186
+ "name": "stderr",
187
+ "output_type": "stream",
188
+ "text": [
189
+ "\u001b[2m2025-04-16T14:59:29.640333Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mHighlightingService initialized with ColorManager.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m286\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mHighlightingService initialized with ColorManager.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.highlighting_service\u001b[0m\n"
190
+ ]
191
+ },
192
+ {
193
+ "name": "stderr",
194
+ "output_type": "stream",
195
+ "text": [
196
+ "[2025-04-16 17:59:29,640] [ INFO] highlighting_service.py:286 - HighlightingService initialized with ColorManager.\n"
197
+ ]
198
+ },
199
+ {
200
+ "name": "stderr",
201
+ "output_type": "stream",
202
+ "text": [
203
+ "natural_pdf.core.pdf - INFO - Initialized HighlightingService.\n"
204
+ ]
205
+ },
206
+ {
207
+ "name": "stderr",
208
+ "output_type": "stream",
209
+ "text": [
210
+ "\u001b[2m2025-04-16T14:59:29.642175Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mInitialized HighlightingService.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m141\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mInitialized HighlightingService.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.pdf\u001b[0m\n"
211
+ ]
212
+ },
213
+ {
214
+ "name": "stderr",
215
+ "output_type": "stream",
216
+ "text": [
217
+ "[2025-04-16 17:59:29,641] [ INFO] pdf.py:141 - Initialized HighlightingService.\n"
218
+ ]
219
+ },
220
+ {
221
+ "name": "stderr",
222
+ "output_type": "stream",
223
+ "text": [
224
+ "natural_pdf.core.pdf - INFO - PDF 'https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf' initialized with 1 pages.\n"
225
+ ]
226
+ },
227
+ {
228
+ "name": "stderr",
229
+ "output_type": "stream",
230
+ "text": [
231
+ "\u001b[2m2025-04-16T14:59:29.643262Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPDF 'https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf' initialized with 1 pages.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m142\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mPDF 'https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf' initialized with 1 pages.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.pdf\u001b[0m\n"
232
+ ]
233
+ },
234
+ {
235
+ "name": "stderr",
236
+ "output_type": "stream",
237
+ "text": [
238
+ "[2025-04-16 17:59:29,643] [ INFO] pdf.py:142 - PDF 'https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf' initialized with 1 pages.\n"
239
+ ]
240
+ },
241
+ {
242
+ "name": "stderr",
243
+ "output_type": "stream",
244
+ "text": [
245
+ "\r",
246
+ "Loading PDFs: 50%|███████████████████▌ | 1/2 [00:00<00:00, 3.44it/s]"
247
+ ]
248
+ },
249
+ {
250
+ "name": "stderr",
251
+ "output_type": "stream",
252
+ "text": [
253
+ "natural_pdf.core.pdf - INFO - Downloading PDF from URL: https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf\n"
254
+ ]
255
+ },
256
+ {
257
+ "name": "stderr",
258
+ "output_type": "stream",
259
+ "text": [
260
+ "\u001b[2m2025-04-16T14:59:29.644068Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mDownloading PDF from URL: https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m80\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mDownloading PDF from URL: https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.pdf\u001b[0m\n"
261
+ ]
262
+ },
263
+ {
264
+ "name": "stderr",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "[2025-04-16 17:59:29,643] [ INFO] pdf.py:80 - Downloading PDF from URL: https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf\n"
268
+ ]
269
+ },
270
+ {
271
+ "name": "stderr",
272
+ "output_type": "stream",
273
+ "text": [
274
+ "natural_pdf.core.pdf - INFO - PDF downloaded to temporary file: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\n"
275
+ ]
276
+ },
277
+ {
278
+ "name": "stderr",
279
+ "output_type": "stream",
280
+ "text": [
281
+ "\u001b[2m2025-04-16T14:59:33.955839Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPDF downloaded to temporary file: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m93\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mPDF downloaded to temporary file: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.pdf\u001b[0m\n"
282
+ ]
283
+ },
284
+ {
285
+ "name": "stderr",
286
+ "output_type": "stream",
287
+ "text": [
288
+ "[2025-04-16 17:59:33,952] [ INFO] pdf.py:93 - PDF downloaded to temporary file: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\n"
289
+ ]
290
+ },
291
+ {
292
+ "name": "stderr",
293
+ "output_type": "stream",
294
+ "text": [
295
+ "natural_pdf.core.pdf - INFO - Initializing PDF from /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\n"
296
+ ]
297
+ },
298
+ {
299
+ "name": "stderr",
300
+ "output_type": "stream",
301
+ "text": [
302
+ "\u001b[2m2025-04-16T14:59:33.957471Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mInitializing PDF from /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m106\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mInitializing PDF from /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.pdf\u001b[0m\n"
303
+ ]
304
+ },
305
+ {
306
+ "name": "stderr",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "[2025-04-16 17:59:33,957] [ INFO] pdf.py:106 - Initializing PDF from /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\n"
310
+ ]
311
+ },
312
+ {
313
+ "name": "stderr",
314
+ "output_type": "stream",
315
+ "text": [
316
+ "natural_pdf.ocr.ocr_manager - INFO - OCRManager initialized.\n"
317
+ ]
318
+ },
319
+ {
320
+ "name": "stderr",
321
+ "output_type": "stream",
322
+ "text": [
323
+ "\u001b[2m2025-04-16T14:59:33.959364Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mOCRManager initialized. \u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m38\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mOCRManager initialized.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.ocr.ocr_manager\u001b[0m\n"
324
+ ]
325
+ },
326
+ {
327
+ "name": "stderr",
328
+ "output_type": "stream",
329
+ "text": [
330
+ "[2025-04-16 17:59:33,958] [ INFO] ocr_manager.py:38 - OCRManager initialized.\n"
331
+ ]
332
+ },
333
+ {
334
+ "name": "stderr",
335
+ "output_type": "stream",
336
+ "text": [
337
+ "natural_pdf.analyzers.layout.layout_manager - INFO - LayoutManager initialized. Available engines: ['yolo', 'tatr', 'paddle', 'surya', 'docling']\n"
338
+ ]
339
+ },
340
+ {
341
+ "name": "stderr",
342
+ "output_type": "stream",
343
+ "text": [
344
+ "\u001b[2m2025-04-16T14:59:33.960339Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mLayoutManager initialized. Available engines: ['yolo', 'tatr', 'paddle', 'surya', 'docling']\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m68\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mLayoutManager initialized. Available engines: ['yolo', 'tatr', 'paddle', 'surya', 'docling']\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.layout_manager\u001b[0m\n"
345
+ ]
346
+ },
347
+ {
348
+ "name": "stderr",
349
+ "output_type": "stream",
350
+ "text": [
351
+ "[2025-04-16 17:59:33,960] [ INFO] layout_manager.py:68 - LayoutManager initialized. Available engines: ['yolo', 'tatr', 'paddle', 'surya', 'docling']\n"
352
+ ]
353
+ },
354
+ {
355
+ "name": "stderr",
356
+ "output_type": "stream",
357
+ "text": [
358
+ "natural_pdf.core.highlighting_service - INFO - HighlightingService initialized with ColorManager.\n"
359
+ ]
360
+ },
361
+ {
362
+ "name": "stderr",
363
+ "output_type": "stream",
364
+ "text": [
365
+ "\u001b[2m2025-04-16T14:59:33.961122Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mHighlightingService initialized with ColorManager.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m286\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mHighlightingService initialized with ColorManager.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.highlighting_service\u001b[0m\n"
366
+ ]
367
+ },
368
+ {
369
+ "name": "stderr",
370
+ "output_type": "stream",
371
+ "text": [
372
+ "[2025-04-16 17:59:33,960] [ INFO] highlighting_service.py:286 - HighlightingService initialized with ColorManager.\n"
373
+ ]
374
+ },
375
+ {
376
+ "name": "stderr",
377
+ "output_type": "stream",
378
+ "text": [
379
+ "natural_pdf.core.pdf - INFO - Initialized HighlightingService.\n"
380
+ ]
381
+ },
382
+ {
383
+ "name": "stderr",
384
+ "output_type": "stream",
385
+ "text": [
386
+ "\u001b[2m2025-04-16T14:59:33.964681Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mInitialized HighlightingService.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m141\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mInitialized HighlightingService.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.pdf\u001b[0m\n"
387
+ ]
388
+ },
389
+ {
390
+ "name": "stderr",
391
+ "output_type": "stream",
392
+ "text": [
393
+ "[2025-04-16 17:59:33,964] [ INFO] pdf.py:141 - Initialized HighlightingService.\n"
394
+ ]
395
+ },
396
+ {
397
+ "name": "stderr",
398
+ "output_type": "stream",
399
+ "text": [
400
+ "natural_pdf.core.pdf - INFO - PDF 'https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf' initialized with 5 pages.\n"
401
+ ]
402
+ },
403
+ {
404
+ "name": "stderr",
405
+ "output_type": "stream",
406
+ "text": [
407
+ "\u001b[2m2025-04-16T14:59:33.965372Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPDF 'https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf' initialized with 5 pages.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m142\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mPDF 'https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf' initialized with 5 pages.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.core.pdf\u001b[0m\n"
408
+ ]
409
+ },
410
+ {
411
+ "name": "stderr",
412
+ "output_type": "stream",
413
+ "text": [
414
+ "[2025-04-16 17:59:33,965] [ INFO] pdf.py:142 - PDF 'https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf' initialized with 5 pages.\n"
415
+ ]
416
+ },
417
+ {
418
+ "name": "stderr",
419
+ "output_type": "stream",
420
+ "text": [
421
+ "\r",
422
+ "Loading PDFs: 100%|███████████████████████████████████████| 2/2 [00:04<00:00, 2.66s/it]"
423
+ ]
424
+ },
425
+ {
426
+ "name": "stderr",
427
+ "output_type": "stream",
428
+ "text": [
429
+ "\r",
430
+ "Loading PDFs: 100%|███████████████████████████████████████| 2/2 [00:04<00:00, 2.31s/it]"
431
+ ]
432
+ },
433
+ {
434
+ "name": "stderr",
435
+ "output_type": "stream",
436
+ "text": [
437
+ "\n",
438
+ "natural_pdf.collections.pdf_collection - INFO - Successfully initialized 2 PDFs. Failed: 0\n"
439
+ ]
440
+ },
441
+ {
442
+ "name": "stderr",
443
+ "output_type": "stream",
444
+ "text": [
445
+ "\u001b[2m2025-04-16T14:59:33.966755Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mSuccessfully initialized 2 PDFs. Failed: 0\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m154\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mSuccessfully initialized 2 PDFs. Failed: 0\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.collections.pdf_collection\u001b[0m\n"
446
+ ]
447
+ },
448
+ {
449
+ "name": "stderr",
450
+ "output_type": "stream",
451
+ "text": [
452
+ "[2025-04-16 17:59:33,966] [ INFO] pdf_collection.py:154 - Successfully initialized 2 PDFs. Failed: 0\n"
453
+ ]
454
+ },
455
+ {
456
+ "name": "stdout",
457
+ "output_type": "stream",
458
+ "text": [
459
+ "Created collection with 2 PDFs.\n"
460
+ ]
461
+ }
462
+ ],
463
+ "source": [
464
+ "import logging\n",
465
+ "import natural_pdf\n",
466
+ "\n",
467
+ "# Optional: Configure logging to see progress\n",
468
+ "natural_pdf.configure_logging(level=logging.INFO)\n",
469
+ "\n",
470
+ "# Define the paths to your PDF files\n",
471
+ "pdf_paths = [\n",
472
+ " \"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\",\n",
473
+ " \"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf\"\n",
474
+ " # Add more PDF paths as needed\n",
475
+ "]\n",
476
+ "\n",
477
+ "# Create a PDFCollection\n",
478
+ "collection = natural_pdf.PDFCollection(pdf_paths)\n",
479
+ "print(f\"Created collection with {len(collection.pdfs)} PDFs.\")"
480
+ ]
481
+ },
482
+ {
483
+ "cell_type": "markdown",
484
+ "id": "57c16881",
485
+ "metadata": {},
486
+ "source": [
487
+ "## Initializing the Search Index\n",
488
+ "\n",
489
+ "Before performing a search, you need to initialize the search capabilities for the collection. This involves processing the documents and building an index."
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": 3,
495
+ "id": "53b5decd",
496
+ "metadata": {
497
+ "execution": {
498
+ "iopub.execute_input": "2025-04-16T14:59:33.973935Z",
499
+ "iopub.status.busy": "2025-04-16T14:59:33.972405Z",
500
+ "iopub.status.idle": "2025-04-16T14:59:37.674880Z",
501
+ "shell.execute_reply": "2025-04-16T14:59:37.674549Z"
502
+ }
503
+ },
504
+ "outputs": [
505
+ {
506
+ "name": "stderr",
507
+ "output_type": "stream",
508
+ "text": [
509
+ "natural_pdf.search.searchable_mixin - INFO - Using default collection name 'default_collection' for in-memory service.\n"
510
+ ]
511
+ },
512
+ {
513
+ "name": "stderr",
514
+ "output_type": "stream",
515
+ "text": [
516
+ "\u001b[2m2025-04-16T14:59:33.975593Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mUsing default collection name 'default_collection' for in-memory service.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m104\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mUsing default collection name 'default_collection' for in-memory service.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.searchable_mixin\u001b[0m\n"
517
+ ]
518
+ },
519
+ {
520
+ "name": "stderr",
521
+ "output_type": "stream",
522
+ "text": [
523
+ "[2025-04-16 17:59:33,974] [ INFO] searchable_mixin.py:104 - Using default collection name 'default_collection' for in-memory service.\n"
524
+ ]
525
+ },
526
+ {
527
+ "name": "stderr",
528
+ "output_type": "stream",
529
+ "text": [
530
+ "natural_pdf.search.searchable_mixin - INFO - Creating new SearchService: name='default_collection', persist=False, model=default\n"
531
+ ]
532
+ },
533
+ {
534
+ "name": "stderr",
535
+ "output_type": "stream",
536
+ "text": [
537
+ "\u001b[2m2025-04-16T14:59:33.976579Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCreating new SearchService: name='default_collection', persist=False, model=default\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m106\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mCreating new SearchService: name='default_collection', persist=False, model=default\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.searchable_mixin\u001b[0m\n"
538
+ ]
539
+ },
540
+ {
541
+ "name": "stderr",
542
+ "output_type": "stream",
543
+ "text": [
544
+ "[2025-04-16 17:59:33,976] [ INFO] searchable_mixin.py:106 - Creating new SearchService: name='default_collection', persist=False, model=default\n"
545
+ ]
546
+ },
547
+ {
548
+ "name": "stderr",
549
+ "output_type": "stream",
550
+ "text": [
551
+ "natural_pdf.search.haystack_search_service - INFO - HaystackSearchService initialized for collection='default_collection' (persist=False, model='sentence-transformers/all-MiniLM-L6-v2'). Default path: './natural_pdf_index'\n"
552
+ ]
553
+ },
554
+ {
555
+ "name": "stderr",
556
+ "output_type": "stream",
557
+ "text": [
558
+ "\u001b[2m2025-04-16T14:59:33.977956Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mHaystackSearchService initialized for collection='default_collection' (persist=False, model='sentence-transformers/all-MiniLM-L6-v2'). Default path: './natural_pdf_index'\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m106\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mHaystackSearchService initialized for collection='default_collection' (persist=False, model='sentence-transformers/all-MiniLM-L6-v2'). Default path: './natural_pdf_index'\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
559
+ ]
560
+ },
561
+ {
562
+ "name": "stderr",
563
+ "output_type": "stream",
564
+ "text": [
565
+ "[2025-04-16 17:59:33,977] [ INFO] haystack_search_service.py:106 - HaystackSearchService initialized for collection='default_collection' (persist=False, model='sentence-transformers/all-MiniLM-L6-v2'). Default path: './natural_pdf_index'\n"
566
+ ]
567
+ },
568
+ {
569
+ "name": "stderr",
570
+ "output_type": "stream",
571
+ "text": [
572
+ "natural_pdf.search - INFO - Created new HaystackSearchService instance for collection 'default_collection'.\n"
573
+ ]
574
+ },
575
+ {
576
+ "name": "stderr",
577
+ "output_type": "stream",
578
+ "text": [
579
+ "\u001b[2m2025-04-16T14:59:33.978879Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCreated new HaystackSearchService instance for collection 'default_collection'.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m80\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mCreated new HaystackSearchService instance for collection 'default_collection'.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search\u001b[0m\n"
580
+ ]
581
+ },
582
+ {
583
+ "name": "stderr",
584
+ "output_type": "stream",
585
+ "text": [
586
+ "[2025-04-16 17:59:33,978] [ INFO] __init__.py:80 - Created new HaystackSearchService instance for collection 'default_collection'.\n"
587
+ ]
588
+ },
589
+ {
590
+ "name": "stderr",
591
+ "output_type": "stream",
592
+ "text": [
593
+ "natural_pdf.search.searchable_mixin - INFO - index=True: Proceeding to index collection immediately after search initialization.\n"
594
+ ]
595
+ },
596
+ {
597
+ "name": "stderr",
598
+ "output_type": "stream",
599
+ "text": [
600
+ "\u001b[2m2025-04-16T14:59:33.979919Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mindex=True: Proceeding to index collection immediately after search initialization.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m141\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mindex=True: Proceeding to index collection immediately after search initialization.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.searchable_mixin\u001b[0m\n"
601
+ ]
602
+ },
603
+ {
604
+ "name": "stderr",
605
+ "output_type": "stream",
606
+ "text": [
607
+ "[2025-04-16 17:59:33,979] [ INFO] searchable_mixin.py:141 - index=True: Proceeding to index collection immediately after search initialization.\n"
608
+ ]
609
+ },
610
+ {
611
+ "name": "stderr",
612
+ "output_type": "stream",
613
+ "text": [
614
+ "natural_pdf.search.searchable_mixin - INFO - Starting internal indexing process into SearchService collection 'default_collection'...\n"
615
+ ]
616
+ },
617
+ {
618
+ "name": "stderr",
619
+ "output_type": "stream",
620
+ "text": [
621
+ "\u001b[2m2025-04-16T14:59:33.980745Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mStarting internal indexing process into SearchService collection 'default_collection'...\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m152\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mStarting internal indexing process into SearchService collection 'default_collection'...\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.searchable_mixin\u001b[0m\n"
622
+ ]
623
+ },
624
+ {
625
+ "name": "stderr",
626
+ "output_type": "stream",
627
+ "text": [
628
+ "[2025-04-16 17:59:33,980] [ INFO] searchable_mixin.py:152 - Starting internal indexing process into SearchService collection 'default_collection'...\n"
629
+ ]
630
+ },
631
+ {
632
+ "name": "stderr",
633
+ "output_type": "stream",
634
+ "text": [
635
+ "natural_pdf.search.searchable_mixin - INFO - Prepared 6 indexable items for indexing.\n"
636
+ ]
637
+ },
638
+ {
639
+ "name": "stderr",
640
+ "output_type": "stream",
641
+ "text": [
642
+ "\u001b[2m2025-04-16T14:59:33.981554Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPrepared 6 indexable items for indexing.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m165\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mPrepared 6 indexable items for indexing.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.searchable_mixin\u001b[0m\n"
643
+ ]
644
+ },
645
+ {
646
+ "name": "stderr",
647
+ "output_type": "stream",
648
+ "text": [
649
+ "[2025-04-16 17:59:33,981] [ INFO] searchable_mixin.py:165 - Prepared 6 indexable items for indexing.\n"
650
+ ]
651
+ },
652
+ {
653
+ "name": "stderr",
654
+ "output_type": "stream",
655
+ "text": [
656
+ "natural_pdf.search.haystack_search_service - INFO - Index request for collection='default_collection', docs=6, model='sentence-transformers/all-MiniLM-L6-v2', force=False, persist=False\n"
657
+ ]
658
+ },
659
+ {
660
+ "name": "stderr",
661
+ "output_type": "stream",
662
+ "text": [
663
+ "\u001b[2m2025-04-16T14:59:33.982471Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mIndex request for collection='default_collection', docs=6, model='sentence-transformers/all-MiniLM-L6-v2', force=False, persist=False\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m210\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mIndex request for collection='default_collection', docs=6, model='sentence-transformers/all-MiniLM-L6-v2', force=False, persist=False\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
664
+ ]
665
+ },
666
+ {
667
+ "name": "stderr",
668
+ "output_type": "stream",
669
+ "text": [
670
+ "[2025-04-16 17:59:33,982] [ INFO] haystack_search_service.py:210 - Index request for collection='default_collection', docs=6, model='sentence-transformers/all-MiniLM-L6-v2', force=False, persist=False\n"
671
+ ]
672
+ },
673
+ {
674
+ "name": "stderr",
675
+ "output_type": "stream",
676
+ "text": [
677
+ "natural_pdf.search.haystack_search_service - INFO - Created SentenceTransformersDocumentEmbedder. Model: sentence-transformers/all-MiniLM-L6-v2, Device: ComponentDevice(_single_device=Device(type=<DeviceType.MPS: 'mps'>, id=None), _multiple_devices=None)\n"
678
+ ]
679
+ },
680
+ {
681
+ "name": "stderr",
682
+ "output_type": "stream",
683
+ "text": [
684
+ "\u001b[2m2025-04-16T14:59:36.828108Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCreated SentenceTransformersDocumentEmbedder. Model: sentence-transformers/all-MiniLM-L6-v2, Device: ComponentDevice(_single_device=Device(type=<DeviceType.MPS: 'mps'>, id=None), _multiple_devices=None)\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m146\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mCreated SentenceTransformersDocumentEmbedder. Model: sentence-transformers/all-MiniLM-L6-v2, Device: ComponentDevice(_single_device=Device(type=<DeviceType.MPS: 'mps'>, id=None), _multiple_devices=None)\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
685
+ ]
686
+ },
687
+ {
688
+ "name": "stderr",
689
+ "output_type": "stream",
690
+ "text": [
691
+ "[2025-04-16 17:59:36,827] [ INFO] haystack_search_service.py:146 - Created SentenceTransformersDocumentEmbedder. Model: sentence-transformers/all-MiniLM-L6-v2, Device: ComponentDevice(_single_device=Device(type=<DeviceType.MPS: 'mps'>, id=None), _multiple_devices=None)\n"
692
+ ]
693
+ },
694
+ {
695
+ "name": "stderr",
696
+ "output_type": "stream",
697
+ "text": [
698
+ "natural_pdf.search.haystack_search_service - INFO - Preparing Haystack Documents from 6 indexable items...\n"
699
+ ]
700
+ },
701
+ {
702
+ "name": "stderr",
703
+ "output_type": "stream",
704
+ "text": [
705
+ "\u001b[2m2025-04-16T14:59:36.828818Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPreparing Haystack Documents from 6 indexable items...\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m241\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mPreparing Haystack Documents from 6 indexable items...\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
706
+ ]
707
+ },
708
+ {
709
+ "name": "stderr",
710
+ "output_type": "stream",
711
+ "text": [
712
+ "[2025-04-16 17:59:36,828] [ INFO] haystack_search_service.py:241 - Preparing Haystack Documents from 6 indexable items...\n"
713
+ ]
714
+ },
715
+ {
716
+ "name": "stderr",
717
+ "output_type": "stream",
718
+ "text": [
719
+ "natural_pdf.search.haystack_search_service - INFO - Embedding 6 documents using 'sentence-transformers/all-MiniLM-L6-v2'...\n"
720
+ ]
721
+ },
722
+ {
723
+ "name": "stderr",
724
+ "output_type": "stream",
725
+ "text": [
726
+ "\u001b[2m2025-04-16T14:59:37.420176Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mEmbedding 6 documents using 'sentence-transformers/all-MiniLM-L6-v2'...\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m281\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mEmbedding 6 documents using 'sentence-transformers/all-MiniLM-L6-v2'...\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
727
+ ]
728
+ },
729
+ {
730
+ "name": "stderr",
731
+ "output_type": "stream",
732
+ "text": [
733
+ "[2025-04-16 17:59:37,419] [ INFO] haystack_search_service.py:281 - Embedding 6 documents using 'sentence-transformers/all-MiniLM-L6-v2'...\n"
734
+ ]
735
+ },
736
+ {
737
+ "data": {
738
+ "application/vnd.jupyter.widget-view+json": {
739
+ "model_id": "c680fa26e0ff4501b6ef8eacd599090e",
740
+ "version_major": 2,
741
+ "version_minor": 0
742
+ },
743
+ "text/plain": [
744
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
745
+ ]
746
+ },
747
+ "metadata": {},
748
+ "output_type": "display_data"
749
+ },
750
+ {
751
+ "name": "stderr",
752
+ "output_type": "stream",
753
+ "text": [
754
+ "natural_pdf.search.haystack_search_service - INFO - Successfully embedded 6 documents.\n"
755
+ ]
756
+ },
757
+ {
758
+ "name": "stderr",
759
+ "output_type": "stream",
760
+ "text": [
761
+ "\u001b[2m2025-04-16T14:59:37.669019Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mSuccessfully embedded 6 documents.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m286\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mSuccessfully embedded 6 documents.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
762
+ ]
763
+ },
764
+ {
765
+ "name": "stderr",
766
+ "output_type": "stream",
767
+ "text": [
768
+ "[2025-04-16 17:59:37,668] [ INFO] haystack_search_service.py:286 - Successfully embedded 6 documents.\n"
769
+ ]
770
+ },
771
+ {
772
+ "name": "stderr",
773
+ "output_type": "stream",
774
+ "text": [
775
+ "natural_pdf.search.haystack_search_service - INFO - Writing 6 embedded documents to store 'default_collection'...\n"
776
+ ]
777
+ },
778
+ {
779
+ "name": "stderr",
780
+ "output_type": "stream",
781
+ "text": [
782
+ "\u001b[2m2025-04-16T14:59:37.669875Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mWriting 6 embedded documents to store 'default_collection'...\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m302\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mWriting 6 embedded documents to store 'default_collection'...\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
783
+ ]
784
+ },
785
+ {
786
+ "name": "stderr",
787
+ "output_type": "stream",
788
+ "text": [
789
+ "[2025-04-16 17:59:37,669] [ INFO] haystack_search_service.py:302 - Writing 6 embedded documents to store 'default_collection'...\n"
790
+ ]
791
+ },
792
+ {
793
+ "name": "stderr",
794
+ "output_type": "stream",
795
+ "text": [
796
+ "natural_pdf.search.haystack_search_service - INFO - Successfully wrote 6 documents to store 'default_collection'.\n"
797
+ ]
798
+ },
799
+ {
800
+ "name": "stderr",
801
+ "output_type": "stream",
802
+ "text": [
803
+ "\u001b[2m2025-04-16T14:59:37.671847Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mSuccessfully wrote 6 documents to store 'default_collection'.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m308\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mSuccessfully wrote 6 documents to store 'default_collection'.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
804
+ ]
805
+ },
806
+ {
807
+ "name": "stderr",
808
+ "output_type": "stream",
809
+ "text": [
810
+ "[2025-04-16 17:59:37,671] [ INFO] haystack_search_service.py:308 - Successfully wrote 6 documents to store 'default_collection'.\n"
811
+ ]
812
+ },
813
+ {
814
+ "name": "stderr",
815
+ "output_type": "stream",
816
+ "text": [
817
+ "natural_pdf.search.haystack_search_service - INFO - Store 'default_collection' document count after write: 6\n"
818
+ ]
819
+ },
820
+ {
821
+ "name": "stderr",
822
+ "output_type": "stream",
823
+ "text": [
824
+ "\u001b[2m2025-04-16T14:59:37.672454Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mStore 'default_collection' document count after write: 6\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m310\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mStore 'default_collection' document count after write: 6\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
825
+ ]
826
+ },
827
+ {
828
+ "name": "stderr",
829
+ "output_type": "stream",
830
+ "text": [
831
+ "[2025-04-16 17:59:37,672] [ INFO] haystack_search_service.py:310 - Store 'default_collection' document count after write: 6\n"
832
+ ]
833
+ },
834
+ {
835
+ "name": "stderr",
836
+ "output_type": "stream",
837
+ "text": [
838
+ "natural_pdf.search.searchable_mixin - INFO - Successfully completed indexing into SearchService collection 'default_collection'.\n"
839
+ ]
840
+ },
841
+ {
842
+ "name": "stderr",
843
+ "output_type": "stream",
844
+ "text": [
845
+ "\u001b[2m2025-04-16T14:59:37.673044Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mSuccessfully completed indexing into SearchService collection 'default_collection'.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m173\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mSuccessfully completed indexing into SearchService collection 'default_collection'.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.searchable_mixin\u001b[0m\n"
846
+ ]
847
+ },
848
+ {
849
+ "name": "stderr",
850
+ "output_type": "stream",
851
+ "text": [
852
+ "[2025-04-16 17:59:37,672] [ INFO] searchable_mixin.py:173 - Successfully completed indexing into SearchService collection 'default_collection'.\n"
853
+ ]
854
+ },
855
+ {
856
+ "name": "stdout",
857
+ "output_type": "stream",
858
+ "text": [
859
+ "Search index initialized.\n"
860
+ ]
861
+ }
862
+ ],
863
+ "source": [
864
+ "# Initialize search. 'index=True' builds the index immediately.\n",
865
+ "# This might take some time depending on the number and size of PDFs.\n",
866
+ "collection.init_search(index=True) \n",
867
+ "print(\"Search index initialized.\")"
868
+ ]
869
+ },
870
+ {
871
+ "cell_type": "markdown",
872
+ "id": "86416bd8",
873
+ "metadata": {},
874
+ "source": [
875
+ "## Performing a Semantic Search\n",
876
+ "\n",
877
+ "Once the index is ready, you can use the `find_relevant()` method to search for content semantically related to your query."
878
+ ]
879
+ },
880
+ {
881
+ "cell_type": "code",
882
+ "execution_count": 4,
883
+ "id": "c4f6ed33",
884
+ "metadata": {
885
+ "execution": {
886
+ "iopub.execute_input": "2025-04-16T14:59:37.677294Z",
887
+ "iopub.status.busy": "2025-04-16T14:59:37.677184Z",
888
+ "iopub.status.idle": "2025-04-16T14:59:37.865461Z",
889
+ "shell.execute_reply": "2025-04-16T14:59:37.865159Z"
890
+ }
891
+ },
892
+ "outputs": [
893
+ {
894
+ "name": "stderr",
895
+ "output_type": "stream",
896
+ "text": [
897
+ "natural_pdf.search.searchable_mixin - INFO - Searching collection 'default_collection' via HaystackSearchService...\n"
898
+ ]
899
+ },
900
+ {
901
+ "name": "stderr",
902
+ "output_type": "stream",
903
+ "text": [
904
+ "\u001b[2m2025-04-16T14:59:37.678388Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mSearching collection 'default_collection' via HaystackSearchService...\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m244\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mSearching collection 'default_collection' via HaystackSearchService...\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.searchable_mixin\u001b[0m\n"
905
+ ]
906
+ },
907
+ {
908
+ "name": "stderr",
909
+ "output_type": "stream",
910
+ "text": [
911
+ "[2025-04-16 17:59:37,677] [ INFO] searchable_mixin.py:244 - Searching collection 'default_collection' via HaystackSearchService...\n"
912
+ ]
913
+ },
914
+ {
915
+ "name": "stderr",
916
+ "output_type": "stream",
917
+ "text": [
918
+ "natural_pdf.search.haystack_search_service - INFO - Search request for collection='default_collection', query_type=str, options=TextSearchOptions(top_k=10, retriever_top_k=20, filters=None, use_reranker=True, reranker_instance=None, reranker_model=None, reranker_api_key=None)\n"
919
+ ]
920
+ },
921
+ {
922
+ "name": "stderr",
923
+ "output_type": "stream",
924
+ "text": [
925
+ "\u001b[2m2025-04-16T14:59:37.679015Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mSearch request for collection='default_collection', query_type=str, options=TextSearchOptions(top_k=10, retriever_top_k=20, filters=None, use_reranker=True, reranker_instance=None, reranker_model=None, reranker_api_key=None)\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m318\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mSearch request for collection='default_collection', query_type=str, options=TextSearchOptions(top_k=10, retriever_top_k=20, filters=None, use_reranker=True, reranker_instance=None, reranker_model=None, reranker_api_key=None)\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
926
+ ]
927
+ },
928
+ {
929
+ "name": "stderr",
930
+ "output_type": "stream",
931
+ "text": [
932
+ "[2025-04-16 17:59:37,678] [ INFO] haystack_search_service.py:318 - Search request for collection='default_collection', query_type=str, options=TextSearchOptions(top_k=10, retriever_top_k=20, filters=None, use_reranker=True, reranker_instance=None, reranker_model=None, reranker_api_key=None)\n"
933
+ ]
934
+ },
935
+ {
936
+ "name": "stderr",
937
+ "output_type": "stream",
938
+ "text": [
939
+ "natural_pdf.search.haystack_search_service - INFO - Created SentenceTransformersTextEmbedder. Model: sentence-transformers/all-MiniLM-L6-v2, Device: ComponentDevice(_single_device=Device(type=<DeviceType.MPS: 'mps'>, id=None), _multiple_devices=None)\n"
940
+ ]
941
+ },
942
+ {
943
+ "name": "stderr",
944
+ "output_type": "stream",
945
+ "text": [
946
+ "\u001b[2m2025-04-16T14:59:37.679709Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCreated SentenceTransformersTextEmbedder. Model: sentence-transformers/all-MiniLM-L6-v2, Device: ComponentDevice(_single_device=Device(type=<DeviceType.MPS: 'mps'>, id=None), _multiple_devices=None)\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m164\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mCreated SentenceTransformersTextEmbedder. Model: sentence-transformers/all-MiniLM-L6-v2, Device: ComponentDevice(_single_device=Device(type=<DeviceType.MPS: 'mps'>, id=None), _multiple_devices=None)\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
947
+ ]
948
+ },
949
+ {
950
+ "name": "stderr",
951
+ "output_type": "stream",
952
+ "text": [
953
+ "[2025-04-16 17:59:37,679] [ INFO] haystack_search_service.py:164 - Created SentenceTransformersTextEmbedder. Model: sentence-transformers/all-MiniLM-L6-v2, Device: ComponentDevice(_single_device=Device(type=<DeviceType.MPS: 'mps'>, id=None), _multiple_devices=None)\n"
954
+ ]
955
+ },
956
+ {
957
+ "data": {
958
+ "application/vnd.jupyter.widget-view+json": {
959
+ "model_id": "d256c819166642f68442fa041de1ba67",
960
+ "version_major": 2,
961
+ "version_minor": 0
962
+ },
963
+ "text/plain": [
964
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
965
+ ]
966
+ },
967
+ "metadata": {},
968
+ "output_type": "display_data"
969
+ },
970
+ {
971
+ "name": "stderr",
972
+ "output_type": "stream",
973
+ "text": [
974
+ "natural_pdf.search.haystack_search_service - INFO - Running retrieval pipeline for collection 'default_collection'...\n"
975
+ ]
976
+ },
977
+ {
978
+ "name": "stderr",
979
+ "output_type": "stream",
980
+ "text": [
981
+ "\u001b[2m2025-04-16T14:59:37.859154Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mRunning retrieval pipeline for collection 'default_collection'...\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m401\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mRunning retrieval pipeline for collection 'default_collection'...\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
982
+ ]
983
+ },
984
+ {
985
+ "name": "stderr",
986
+ "output_type": "stream",
987
+ "text": [
988
+ "[2025-04-16 17:59:37,858] [ INFO] haystack_search_service.py:401 - Running retrieval pipeline for collection 'default_collection'...\n"
989
+ ]
990
+ },
991
+ {
992
+ "name": "stderr",
993
+ "output_type": "stream",
994
+ "text": [
995
+ "natural_pdf.search.haystack_search_service - INFO - Retrieved 6 documents.\n"
996
+ ]
997
+ },
998
+ {
999
+ "name": "stderr",
1000
+ "output_type": "stream",
1001
+ "text": [
1002
+ "\u001b[2m2025-04-16T14:59:37.863278Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mRetrieved 6 documents. \u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m410\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mRetrieved 6 documents.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.haystack_search_service\u001b[0m\n"
1003
+ ]
1004
+ },
1005
+ {
1006
+ "name": "stderr",
1007
+ "output_type": "stream",
1008
+ "text": [
1009
+ "[2025-04-16 17:59:37,862] [ INFO] haystack_search_service.py:410 - Retrieved 6 documents.\n"
1010
+ ]
1011
+ },
1012
+ {
1013
+ "name": "stderr",
1014
+ "output_type": "stream",
1015
+ "text": [
1016
+ "natural_pdf.search.searchable_mixin - INFO - SearchService returned 6 results from collection 'default_collection'.\n"
1017
+ ]
1018
+ },
1019
+ {
1020
+ "name": "stderr",
1021
+ "output_type": "stream",
1022
+ "text": [
1023
+ "\u001b[2m2025-04-16T14:59:37.863850Z\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mSearchService returned 6 results from collection 'default_collection'.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m266\u001b[0m \u001b[36mmessage\u001b[0m=\u001b[35mSearchService returned 6 results from collection 'default_collection'.\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.search.searchable_mixin\u001b[0m\n"
1024
+ ]
1025
+ },
1026
+ {
1027
+ "name": "stderr",
1028
+ "output_type": "stream",
1029
+ "text": [
1030
+ "[2025-04-16 17:59:37,863] [ INFO] searchable_mixin.py:266 - SearchService returned 6 results from collection 'default_collection'.\n"
1031
+ ]
1032
+ },
1033
+ {
1034
+ "name": "stdout",
1035
+ "output_type": "stream",
1036
+ "text": [
1037
+ "Found 6 results for 'american president':\n"
1038
+ ]
1039
+ }
1040
+ ],
1041
+ "source": [
1042
+ "# Perform a search query\n",
1043
+ "query = \"american president\"\n",
1044
+ "results = collection.find_relevant(query)\n",
1045
+ "\n",
1046
+ "print(f\"Found {len(results)} results for '{query}':\")"
1047
+ ]
1048
+ },
1049
+ {
1050
+ "cell_type": "markdown",
1051
+ "id": "899900f2",
1052
+ "metadata": {},
1053
+ "source": [
1054
+ "## Understanding Search Results\n",
1055
+ "\n",
1056
+ "The `find_relevant()` method returns a list of dictionaries, each representing a relevant text chunk found in one of the PDFs. Each result includes:\n",
1057
+ "\n",
1058
+ "* `pdf_path`: The path to the PDF document where the result was found.\n",
1059
+ "* `page_number`: The page number within the PDF.\n",
1060
+ "* `score`: A relevance score (higher means more relevant).\n",
1061
+ "* `content_snippet`: A snippet of the text chunk that matched the query."
1062
+ ]
1063
+ },
1064
+ {
1065
+ "cell_type": "code",
1066
+ "execution_count": 5,
1067
+ "id": "846312a5",
1068
+ "metadata": {
1069
+ "execution": {
1070
+ "iopub.execute_input": "2025-04-16T14:59:37.867689Z",
1071
+ "iopub.status.busy": "2025-04-16T14:59:37.867564Z",
1072
+ "iopub.status.idle": "2025-04-16T14:59:37.870008Z",
1073
+ "shell.execute_reply": "2025-04-16T14:59:37.869701Z"
1074
+ }
1075
+ },
1076
+ "outputs": [
1077
+ {
1078
+ "name": "stdout",
1079
+ "output_type": "stream",
1080
+ "text": [
1081
+ " 1. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\n",
1082
+ " Page: 2 (Score: 0.0708)\n",
1083
+ " Snippet: \n",
1084
+ " \n",
1085
+ " Library Weeding Log ...\n",
1086
+ " 2. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\n",
1087
+ " Page: 5 (Score: 0.0669)\n",
1088
+ " Snippet: \n",
1089
+ " \n",
1090
+ " Library Weeding Log ...\n",
1091
+ " 3. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpwq_3f750.pdf\n",
1092
+ " Page: 1 (Score: -0.0040)\n",
1093
+ " Snippet: \n",
1094
+ " \n",
1095
+ " ...\n",
1096
+ " 4. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\n",
1097
+ " Page: 4 (Score: -0.0245)\n",
1098
+ " Snippet: \n",
1099
+ " \n",
1100
+ " Library Weeding Log ...\n",
1101
+ " 5. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\n",
1102
+ " Page: 3 (Score: -0.0445)\n",
1103
+ " Snippet: \n",
1104
+ " \n",
1105
+ " Library Weeding Log ...\n",
1106
+ " 6. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpofyt17qz.pdf\n",
1107
+ " Page: 1 (Score: -0.0473)\n",
1108
+ " Snippet: \n",
1109
+ " \n",
1110
+ " Library Weeding Log ...\n"
1111
+ ]
1112
+ }
1113
+ ],
1114
+ "source": [
1115
+ "# Process and display the results\n",
1116
+ "if results:\n",
1117
+ " for i, result in enumerate(results):\n",
1118
+ " print(f\" {i+1}. PDF: {result['pdf_path']}\")\n",
1119
+ " print(f\" Page: {result['page_number']} (Score: {result['score']:.4f})\")\n",
1120
+ " # Display a snippet of the content\n",
1121
+ " snippet = result.get('content_snippet', '')\n",
1122
+ " print(f\" Snippet: {snippet}...\") \n",
1123
+ "else:\n",
1124
+ " print(\" No relevant results found.\")\n",
1125
+ "\n",
1126
+ "# You can access the full content if needed via the result object, \n",
1127
+ "# though 'content_snippet' is usually sufficient for display."
1128
+ ]
1129
+ },
1130
+ {
1131
+ "cell_type": "markdown",
1132
+ "id": "0f9261b5",
1133
+ "metadata": {},
1134
+ "source": [
1135
+ "Semantic search allows you to efficiently query large sets of documents to find the most relevant information without needing exact keyword matches, leveraging the meaning and context of your query. "
1136
+ ]
1137
+ }
1138
+ ],
1139
+ "metadata": {
1140
+ "jupytext": {
1141
+ "cell_metadata_filter": "-all",
1142
+ "main_language": "python",
1143
+ "notebook_metadata_filter": "-all"
1144
+ },
1145
+ "language_info": {
1146
+ "codemirror_mode": {
1147
+ "name": "ipython",
1148
+ "version": 3
1149
+ },
1150
+ "file_extension": ".py",
1151
+ "mimetype": "text/x-python",
1152
+ "name": "python",
1153
+ "nbconvert_exporter": "python",
1154
+ "pygments_lexer": "ipython3",
1155
+ "version": "3.10.13"
1156
+ },
1157
+ "widgets": {
1158
+ "application/vnd.jupyter.widget-state+json": {
1159
+ "state": {
1160
+ "141f2a3636e04c3f9bfde6ffa25253ea": {
1161
+ "model_module": "@jupyter-widgets/base",
1162
+ "model_module_version": "2.0.0",
1163
+ "model_name": "LayoutModel",
1164
+ "state": {
1165
+ "_model_module": "@jupyter-widgets/base",
1166
+ "_model_module_version": "2.0.0",
1167
+ "_model_name": "LayoutModel",
1168
+ "_view_count": null,
1169
+ "_view_module": "@jupyter-widgets/base",
1170
+ "_view_module_version": "2.0.0",
1171
+ "_view_name": "LayoutView",
1172
+ "align_content": null,
1173
+ "align_items": null,
1174
+ "align_self": null,
1175
+ "border_bottom": null,
1176
+ "border_left": null,
1177
+ "border_right": null,
1178
+ "border_top": null,
1179
+ "bottom": null,
1180
+ "display": null,
1181
+ "flex": null,
1182
+ "flex_flow": null,
1183
+ "grid_area": null,
1184
+ "grid_auto_columns": null,
1185
+ "grid_auto_flow": null,
1186
+ "grid_auto_rows": null,
1187
+ "grid_column": null,
1188
+ "grid_gap": null,
1189
+ "grid_row": null,
1190
+ "grid_template_areas": null,
1191
+ "grid_template_columns": null,
1192
+ "grid_template_rows": null,
1193
+ "height": null,
1194
+ "justify_content": null,
1195
+ "justify_items": null,
1196
+ "left": null,
1197
+ "margin": null,
1198
+ "max_height": null,
1199
+ "max_width": null,
1200
+ "min_height": null,
1201
+ "min_width": null,
1202
+ "object_fit": null,
1203
+ "object_position": null,
1204
+ "order": null,
1205
+ "overflow": null,
1206
+ "padding": null,
1207
+ "right": null,
1208
+ "top": null,
1209
+ "visibility": null,
1210
+ "width": null
1211
+ }
1212
+ },
1213
+ "2a420c66cd7c43229bd55cf377d1e00d": {
1214
+ "model_module": "@jupyter-widgets/controls",
1215
+ "model_module_version": "2.0.0",
1216
+ "model_name": "ProgressStyleModel",
1217
+ "state": {
1218
+ "_model_module": "@jupyter-widgets/controls",
1219
+ "_model_module_version": "2.0.0",
1220
+ "_model_name": "ProgressStyleModel",
1221
+ "_view_count": null,
1222
+ "_view_module": "@jupyter-widgets/base",
1223
+ "_view_module_version": "2.0.0",
1224
+ "_view_name": "StyleView",
1225
+ "bar_color": null,
1226
+ "description_width": ""
1227
+ }
1228
+ },
1229
+ "33c5442a4c63404899f3bfd8e5ee84c4": {
1230
+ "model_module": "@jupyter-widgets/controls",
1231
+ "model_module_version": "2.0.0",
1232
+ "model_name": "HTMLModel",
1233
+ "state": {
1234
+ "_dom_classes": [],
1235
+ "_model_module": "@jupyter-widgets/controls",
1236
+ "_model_module_version": "2.0.0",
1237
+ "_model_name": "HTMLModel",
1238
+ "_view_count": null,
1239
+ "_view_module": "@jupyter-widgets/controls",
1240
+ "_view_module_version": "2.0.0",
1241
+ "_view_name": "HTMLView",
1242
+ "description": "",
1243
+ "description_allow_html": false,
1244
+ "layout": "IPY_MODEL_141f2a3636e04c3f9bfde6ffa25253ea",
1245
+ "placeholder": "​",
1246
+ "style": "IPY_MODEL_4046efa4d5604c58a344822710b99068",
1247
+ "tabbable": null,
1248
+ "tooltip": null,
1249
+ "value": " 1/1 [00:00&lt;00:00,  5.67it/s]"
1250
+ }
1251
+ },
1252
+ "4046efa4d5604c58a344822710b99068": {
1253
+ "model_module": "@jupyter-widgets/controls",
1254
+ "model_module_version": "2.0.0",
1255
+ "model_name": "HTMLStyleModel",
1256
+ "state": {
1257
+ "_model_module": "@jupyter-widgets/controls",
1258
+ "_model_module_version": "2.0.0",
1259
+ "_model_name": "HTMLStyleModel",
1260
+ "_view_count": null,
1261
+ "_view_module": "@jupyter-widgets/base",
1262
+ "_view_module_version": "2.0.0",
1263
+ "_view_name": "StyleView",
1264
+ "background": null,
1265
+ "description_width": "",
1266
+ "font_size": null,
1267
+ "text_color": null
1268
+ }
1269
+ },
1270
+ "41578f932e1e4a549a30f80e2a598676": {
1271
+ "model_module": "@jupyter-widgets/controls",
1272
+ "model_module_version": "2.0.0",
1273
+ "model_name": "HTMLModel",
1274
+ "state": {
1275
+ "_dom_classes": [],
1276
+ "_model_module": "@jupyter-widgets/controls",
1277
+ "_model_module_version": "2.0.0",
1278
+ "_model_name": "HTMLModel",
1279
+ "_view_count": null,
1280
+ "_view_module": "@jupyter-widgets/controls",
1281
+ "_view_module_version": "2.0.0",
1282
+ "_view_name": "HTMLView",
1283
+ "description": "",
1284
+ "description_allow_html": false,
1285
+ "layout": "IPY_MODEL_f21b692c698847d8b9e4a514be59eb4f",
1286
+ "placeholder": "​",
1287
+ "style": "IPY_MODEL_4dd085e5a6c8425b932640ba38cf3c2c",
1288
+ "tabbable": null,
1289
+ "tooltip": null,
1290
+ "value": "Batches: 100%"
1291
+ }
1292
+ },
1293
+ "4569f5281db341da993745bf202d0a05": {
1294
+ "model_module": "@jupyter-widgets/base",
1295
+ "model_module_version": "2.0.0",
1296
+ "model_name": "LayoutModel",
1297
+ "state": {
1298
+ "_model_module": "@jupyter-widgets/base",
1299
+ "_model_module_version": "2.0.0",
1300
+ "_model_name": "LayoutModel",
1301
+ "_view_count": null,
1302
+ "_view_module": "@jupyter-widgets/base",
1303
+ "_view_module_version": "2.0.0",
1304
+ "_view_name": "LayoutView",
1305
+ "align_content": null,
1306
+ "align_items": null,
1307
+ "align_self": null,
1308
+ "border_bottom": null,
1309
+ "border_left": null,
1310
+ "border_right": null,
1311
+ "border_top": null,
1312
+ "bottom": null,
1313
+ "display": null,
1314
+ "flex": null,
1315
+ "flex_flow": null,
1316
+ "grid_area": null,
1317
+ "grid_auto_columns": null,
1318
+ "grid_auto_flow": null,
1319
+ "grid_auto_rows": null,
1320
+ "grid_column": null,
1321
+ "grid_gap": null,
1322
+ "grid_row": null,
1323
+ "grid_template_areas": null,
1324
+ "grid_template_columns": null,
1325
+ "grid_template_rows": null,
1326
+ "height": null,
1327
+ "justify_content": null,
1328
+ "justify_items": null,
1329
+ "left": null,
1330
+ "margin": null,
1331
+ "max_height": null,
1332
+ "max_width": null,
1333
+ "min_height": null,
1334
+ "min_width": null,
1335
+ "object_fit": null,
1336
+ "object_position": null,
1337
+ "order": null,
1338
+ "overflow": null,
1339
+ "padding": null,
1340
+ "right": null,
1341
+ "top": null,
1342
+ "visibility": null,
1343
+ "width": null
1344
+ }
1345
+ },
1346
+ "4dd085e5a6c8425b932640ba38cf3c2c": {
1347
+ "model_module": "@jupyter-widgets/controls",
1348
+ "model_module_version": "2.0.0",
1349
+ "model_name": "HTMLStyleModel",
1350
+ "state": {
1351
+ "_model_module": "@jupyter-widgets/controls",
1352
+ "_model_module_version": "2.0.0",
1353
+ "_model_name": "HTMLStyleModel",
1354
+ "_view_count": null,
1355
+ "_view_module": "@jupyter-widgets/base",
1356
+ "_view_module_version": "2.0.0",
1357
+ "_view_name": "StyleView",
1358
+ "background": null,
1359
+ "description_width": "",
1360
+ "font_size": null,
1361
+ "text_color": null
1362
+ }
1363
+ },
1364
+ "570e89d1cf884bec858eda7795166c72": {
1365
+ "model_module": "@jupyter-widgets/base",
1366
+ "model_module_version": "2.0.0",
1367
+ "model_name": "LayoutModel",
1368
+ "state": {
1369
+ "_model_module": "@jupyter-widgets/base",
1370
+ "_model_module_version": "2.0.0",
1371
+ "_model_name": "LayoutModel",
1372
+ "_view_count": null,
1373
+ "_view_module": "@jupyter-widgets/base",
1374
+ "_view_module_version": "2.0.0",
1375
+ "_view_name": "LayoutView",
1376
+ "align_content": null,
1377
+ "align_items": null,
1378
+ "align_self": null,
1379
+ "border_bottom": null,
1380
+ "border_left": null,
1381
+ "border_right": null,
1382
+ "border_top": null,
1383
+ "bottom": null,
1384
+ "display": null,
1385
+ "flex": null,
1386
+ "flex_flow": null,
1387
+ "grid_area": null,
1388
+ "grid_auto_columns": null,
1389
+ "grid_auto_flow": null,
1390
+ "grid_auto_rows": null,
1391
+ "grid_column": null,
1392
+ "grid_gap": null,
1393
+ "grid_row": null,
1394
+ "grid_template_areas": null,
1395
+ "grid_template_columns": null,
1396
+ "grid_template_rows": null,
1397
+ "height": null,
1398
+ "justify_content": null,
1399
+ "justify_items": null,
1400
+ "left": null,
1401
+ "margin": null,
1402
+ "max_height": null,
1403
+ "max_width": null,
1404
+ "min_height": null,
1405
+ "min_width": null,
1406
+ "object_fit": null,
1407
+ "object_position": null,
1408
+ "order": null,
1409
+ "overflow": null,
1410
+ "padding": null,
1411
+ "right": null,
1412
+ "top": null,
1413
+ "visibility": null,
1414
+ "width": null
1415
+ }
1416
+ },
1417
+ "58d4efb95b3b4c11a7f27a3e92067ca3": {
1418
+ "model_module": "@jupyter-widgets/base",
1419
+ "model_module_version": "2.0.0",
1420
+ "model_name": "LayoutModel",
1421
+ "state": {
1422
+ "_model_module": "@jupyter-widgets/base",
1423
+ "_model_module_version": "2.0.0",
1424
+ "_model_name": "LayoutModel",
1425
+ "_view_count": null,
1426
+ "_view_module": "@jupyter-widgets/base",
1427
+ "_view_module_version": "2.0.0",
1428
+ "_view_name": "LayoutView",
1429
+ "align_content": null,
1430
+ "align_items": null,
1431
+ "align_self": null,
1432
+ "border_bottom": null,
1433
+ "border_left": null,
1434
+ "border_right": null,
1435
+ "border_top": null,
1436
+ "bottom": null,
1437
+ "display": null,
1438
+ "flex": null,
1439
+ "flex_flow": null,
1440
+ "grid_area": null,
1441
+ "grid_auto_columns": null,
1442
+ "grid_auto_flow": null,
1443
+ "grid_auto_rows": null,
1444
+ "grid_column": null,
1445
+ "grid_gap": null,
1446
+ "grid_row": null,
1447
+ "grid_template_areas": null,
1448
+ "grid_template_columns": null,
1449
+ "grid_template_rows": null,
1450
+ "height": null,
1451
+ "justify_content": null,
1452
+ "justify_items": null,
1453
+ "left": null,
1454
+ "margin": null,
1455
+ "max_height": null,
1456
+ "max_width": null,
1457
+ "min_height": null,
1458
+ "min_width": null,
1459
+ "object_fit": null,
1460
+ "object_position": null,
1461
+ "order": null,
1462
+ "overflow": null,
1463
+ "padding": null,
1464
+ "right": null,
1465
+ "top": null,
1466
+ "visibility": null,
1467
+ "width": null
1468
+ }
1469
+ },
1470
+ "58e6754f29874f6d9afd0f2fcaa6477c": {
1471
+ "model_module": "@jupyter-widgets/base",
1472
+ "model_module_version": "2.0.0",
1473
+ "model_name": "LayoutModel",
1474
+ "state": {
1475
+ "_model_module": "@jupyter-widgets/base",
1476
+ "_model_module_version": "2.0.0",
1477
+ "_model_name": "LayoutModel",
1478
+ "_view_count": null,
1479
+ "_view_module": "@jupyter-widgets/base",
1480
+ "_view_module_version": "2.0.0",
1481
+ "_view_name": "LayoutView",
1482
+ "align_content": null,
1483
+ "align_items": null,
1484
+ "align_self": null,
1485
+ "border_bottom": null,
1486
+ "border_left": null,
1487
+ "border_right": null,
1488
+ "border_top": null,
1489
+ "bottom": null,
1490
+ "display": null,
1491
+ "flex": null,
1492
+ "flex_flow": null,
1493
+ "grid_area": null,
1494
+ "grid_auto_columns": null,
1495
+ "grid_auto_flow": null,
1496
+ "grid_auto_rows": null,
1497
+ "grid_column": null,
1498
+ "grid_gap": null,
1499
+ "grid_row": null,
1500
+ "grid_template_areas": null,
1501
+ "grid_template_columns": null,
1502
+ "grid_template_rows": null,
1503
+ "height": null,
1504
+ "justify_content": null,
1505
+ "justify_items": null,
1506
+ "left": null,
1507
+ "margin": null,
1508
+ "max_height": null,
1509
+ "max_width": null,
1510
+ "min_height": null,
1511
+ "min_width": null,
1512
+ "object_fit": null,
1513
+ "object_position": null,
1514
+ "order": null,
1515
+ "overflow": null,
1516
+ "padding": null,
1517
+ "right": null,
1518
+ "top": null,
1519
+ "visibility": null,
1520
+ "width": null
1521
+ }
1522
+ },
1523
+ "7699c3d8986d4a6e8d8bb4217053059e": {
1524
+ "model_module": "@jupyter-widgets/controls",
1525
+ "model_module_version": "2.0.0",
1526
+ "model_name": "HTMLStyleModel",
1527
+ "state": {
1528
+ "_model_module": "@jupyter-widgets/controls",
1529
+ "_model_module_version": "2.0.0",
1530
+ "_model_name": "HTMLStyleModel",
1531
+ "_view_count": null,
1532
+ "_view_module": "@jupyter-widgets/base",
1533
+ "_view_module_version": "2.0.0",
1534
+ "_view_name": "StyleView",
1535
+ "background": null,
1536
+ "description_width": "",
1537
+ "font_size": null,
1538
+ "text_color": null
1539
+ }
1540
+ },
1541
+ "7bd2a232a56d480ca596b415ef3ed717": {
1542
+ "model_module": "@jupyter-widgets/controls",
1543
+ "model_module_version": "2.0.0",
1544
+ "model_name": "HTMLModel",
1545
+ "state": {
1546
+ "_dom_classes": [],
1547
+ "_model_module": "@jupyter-widgets/controls",
1548
+ "_model_module_version": "2.0.0",
1549
+ "_model_name": "HTMLModel",
1550
+ "_view_count": null,
1551
+ "_view_module": "@jupyter-widgets/controls",
1552
+ "_view_module_version": "2.0.0",
1553
+ "_view_name": "HTMLView",
1554
+ "description": "",
1555
+ "description_allow_html": false,
1556
+ "layout": "IPY_MODEL_570e89d1cf884bec858eda7795166c72",
1557
+ "placeholder": "​",
1558
+ "style": "IPY_MODEL_e119ef47c621423185cd7d198c61d218",
1559
+ "tabbable": null,
1560
+ "tooltip": null,
1561
+ "value": " 1/1 [00:00&lt;00:00,  4.09it/s]"
1562
+ }
1563
+ },
1564
+ "7d81b94c96664753b5c7f2eda5405f17": {
1565
+ "model_module": "@jupyter-widgets/controls",
1566
+ "model_module_version": "2.0.0",
1567
+ "model_name": "FloatProgressModel",
1568
+ "state": {
1569
+ "_dom_classes": [],
1570
+ "_model_module": "@jupyter-widgets/controls",
1571
+ "_model_module_version": "2.0.0",
1572
+ "_model_name": "FloatProgressModel",
1573
+ "_view_count": null,
1574
+ "_view_module": "@jupyter-widgets/controls",
1575
+ "_view_module_version": "2.0.0",
1576
+ "_view_name": "ProgressView",
1577
+ "bar_style": "success",
1578
+ "description": "",
1579
+ "description_allow_html": false,
1580
+ "layout": "IPY_MODEL_bba7faa7f1874023b8b3f44e0980a1dc",
1581
+ "max": 1.0,
1582
+ "min": 0.0,
1583
+ "orientation": "horizontal",
1584
+ "style": "IPY_MODEL_2a420c66cd7c43229bd55cf377d1e00d",
1585
+ "tabbable": null,
1586
+ "tooltip": null,
1587
+ "value": 1.0
1588
+ }
1589
+ },
1590
+ "81bee5ad297641bda75ce7076242489d": {
1591
+ "model_module": "@jupyter-widgets/controls",
1592
+ "model_module_version": "2.0.0",
1593
+ "model_name": "ProgressStyleModel",
1594
+ "state": {
1595
+ "_model_module": "@jupyter-widgets/controls",
1596
+ "_model_module_version": "2.0.0",
1597
+ "_model_name": "ProgressStyleModel",
1598
+ "_view_count": null,
1599
+ "_view_module": "@jupyter-widgets/base",
1600
+ "_view_module_version": "2.0.0",
1601
+ "_view_name": "StyleView",
1602
+ "bar_color": null,
1603
+ "description_width": ""
1604
+ }
1605
+ },
1606
+ "8a00c11016924cebbb3cbc2c8a06e4e5": {
1607
+ "model_module": "@jupyter-widgets/controls",
1608
+ "model_module_version": "2.0.0",
1609
+ "model_name": "HTMLModel",
1610
+ "state": {
1611
+ "_dom_classes": [],
1612
+ "_model_module": "@jupyter-widgets/controls",
1613
+ "_model_module_version": "2.0.0",
1614
+ "_model_name": "HTMLModel",
1615
+ "_view_count": null,
1616
+ "_view_module": "@jupyter-widgets/controls",
1617
+ "_view_module_version": "2.0.0",
1618
+ "_view_name": "HTMLView",
1619
+ "description": "",
1620
+ "description_allow_html": false,
1621
+ "layout": "IPY_MODEL_58e6754f29874f6d9afd0f2fcaa6477c",
1622
+ "placeholder": "​",
1623
+ "style": "IPY_MODEL_7699c3d8986d4a6e8d8bb4217053059e",
1624
+ "tabbable": null,
1625
+ "tooltip": null,
1626
+ "value": "Batches: 100%"
1627
+ }
1628
+ },
1629
+ "bba7faa7f1874023b8b3f44e0980a1dc": {
1630
+ "model_module": "@jupyter-widgets/base",
1631
+ "model_module_version": "2.0.0",
1632
+ "model_name": "LayoutModel",
1633
+ "state": {
1634
+ "_model_module": "@jupyter-widgets/base",
1635
+ "_model_module_version": "2.0.0",
1636
+ "_model_name": "LayoutModel",
1637
+ "_view_count": null,
1638
+ "_view_module": "@jupyter-widgets/base",
1639
+ "_view_module_version": "2.0.0",
1640
+ "_view_name": "LayoutView",
1641
+ "align_content": null,
1642
+ "align_items": null,
1643
+ "align_self": null,
1644
+ "border_bottom": null,
1645
+ "border_left": null,
1646
+ "border_right": null,
1647
+ "border_top": null,
1648
+ "bottom": null,
1649
+ "display": null,
1650
+ "flex": null,
1651
+ "flex_flow": null,
1652
+ "grid_area": null,
1653
+ "grid_auto_columns": null,
1654
+ "grid_auto_flow": null,
1655
+ "grid_auto_rows": null,
1656
+ "grid_column": null,
1657
+ "grid_gap": null,
1658
+ "grid_row": null,
1659
+ "grid_template_areas": null,
1660
+ "grid_template_columns": null,
1661
+ "grid_template_rows": null,
1662
+ "height": null,
1663
+ "justify_content": null,
1664
+ "justify_items": null,
1665
+ "left": null,
1666
+ "margin": null,
1667
+ "max_height": null,
1668
+ "max_width": null,
1669
+ "min_height": null,
1670
+ "min_width": null,
1671
+ "object_fit": null,
1672
+ "object_position": null,
1673
+ "order": null,
1674
+ "overflow": null,
1675
+ "padding": null,
1676
+ "right": null,
1677
+ "top": null,
1678
+ "visibility": null,
1679
+ "width": null
1680
+ }
1681
+ },
1682
+ "c680fa26e0ff4501b6ef8eacd599090e": {
1683
+ "model_module": "@jupyter-widgets/controls",
1684
+ "model_module_version": "2.0.0",
1685
+ "model_name": "HBoxModel",
1686
+ "state": {
1687
+ "_dom_classes": [],
1688
+ "_model_module": "@jupyter-widgets/controls",
1689
+ "_model_module_version": "2.0.0",
1690
+ "_model_name": "HBoxModel",
1691
+ "_view_count": null,
1692
+ "_view_module": "@jupyter-widgets/controls",
1693
+ "_view_module_version": "2.0.0",
1694
+ "_view_name": "HBoxView",
1695
+ "box_style": "",
1696
+ "children": [
1697
+ "IPY_MODEL_8a00c11016924cebbb3cbc2c8a06e4e5",
1698
+ "IPY_MODEL_7d81b94c96664753b5c7f2eda5405f17",
1699
+ "IPY_MODEL_7bd2a232a56d480ca596b415ef3ed717"
1700
+ ],
1701
+ "layout": "IPY_MODEL_4569f5281db341da993745bf202d0a05",
1702
+ "tabbable": null,
1703
+ "tooltip": null
1704
+ }
1705
+ },
1706
+ "d256c819166642f68442fa041de1ba67": {
1707
+ "model_module": "@jupyter-widgets/controls",
1708
+ "model_module_version": "2.0.0",
1709
+ "model_name": "HBoxModel",
1710
+ "state": {
1711
+ "_dom_classes": [],
1712
+ "_model_module": "@jupyter-widgets/controls",
1713
+ "_model_module_version": "2.0.0",
1714
+ "_model_name": "HBoxModel",
1715
+ "_view_count": null,
1716
+ "_view_module": "@jupyter-widgets/controls",
1717
+ "_view_module_version": "2.0.0",
1718
+ "_view_name": "HBoxView",
1719
+ "box_style": "",
1720
+ "children": [
1721
+ "IPY_MODEL_41578f932e1e4a549a30f80e2a598676",
1722
+ "IPY_MODEL_dbbfdddf99314100a558b197a6d393a4",
1723
+ "IPY_MODEL_33c5442a4c63404899f3bfd8e5ee84c4"
1724
+ ],
1725
+ "layout": "IPY_MODEL_58d4efb95b3b4c11a7f27a3e92067ca3",
1726
+ "tabbable": null,
1727
+ "tooltip": null
1728
+ }
1729
+ },
1730
+ "dbbfdddf99314100a558b197a6d393a4": {
1731
+ "model_module": "@jupyter-widgets/controls",
1732
+ "model_module_version": "2.0.0",
1733
+ "model_name": "FloatProgressModel",
1734
+ "state": {
1735
+ "_dom_classes": [],
1736
+ "_model_module": "@jupyter-widgets/controls",
1737
+ "_model_module_version": "2.0.0",
1738
+ "_model_name": "FloatProgressModel",
1739
+ "_view_count": null,
1740
+ "_view_module": "@jupyter-widgets/controls",
1741
+ "_view_module_version": "2.0.0",
1742
+ "_view_name": "ProgressView",
1743
+ "bar_style": "success",
1744
+ "description": "",
1745
+ "description_allow_html": false,
1746
+ "layout": "IPY_MODEL_fc757532ed1d453882911701c1c95190",
1747
+ "max": 1.0,
1748
+ "min": 0.0,
1749
+ "orientation": "horizontal",
1750
+ "style": "IPY_MODEL_81bee5ad297641bda75ce7076242489d",
1751
+ "tabbable": null,
1752
+ "tooltip": null,
1753
+ "value": 1.0
1754
+ }
1755
+ },
1756
+ "e119ef47c621423185cd7d198c61d218": {
1757
+ "model_module": "@jupyter-widgets/controls",
1758
+ "model_module_version": "2.0.0",
1759
+ "model_name": "HTMLStyleModel",
1760
+ "state": {
1761
+ "_model_module": "@jupyter-widgets/controls",
1762
+ "_model_module_version": "2.0.0",
1763
+ "_model_name": "HTMLStyleModel",
1764
+ "_view_count": null,
1765
+ "_view_module": "@jupyter-widgets/base",
1766
+ "_view_module_version": "2.0.0",
1767
+ "_view_name": "StyleView",
1768
+ "background": null,
1769
+ "description_width": "",
1770
+ "font_size": null,
1771
+ "text_color": null
1772
+ }
1773
+ },
1774
+ "f21b692c698847d8b9e4a514be59eb4f": {
1775
+ "model_module": "@jupyter-widgets/base",
1776
+ "model_module_version": "2.0.0",
1777
+ "model_name": "LayoutModel",
1778
+ "state": {
1779
+ "_model_module": "@jupyter-widgets/base",
1780
+ "_model_module_version": "2.0.0",
1781
+ "_model_name": "LayoutModel",
1782
+ "_view_count": null,
1783
+ "_view_module": "@jupyter-widgets/base",
1784
+ "_view_module_version": "2.0.0",
1785
+ "_view_name": "LayoutView",
1786
+ "align_content": null,
1787
+ "align_items": null,
1788
+ "align_self": null,
1789
+ "border_bottom": null,
1790
+ "border_left": null,
1791
+ "border_right": null,
1792
+ "border_top": null,
1793
+ "bottom": null,
1794
+ "display": null,
1795
+ "flex": null,
1796
+ "flex_flow": null,
1797
+ "grid_area": null,
1798
+ "grid_auto_columns": null,
1799
+ "grid_auto_flow": null,
1800
+ "grid_auto_rows": null,
1801
+ "grid_column": null,
1802
+ "grid_gap": null,
1803
+ "grid_row": null,
1804
+ "grid_template_areas": null,
1805
+ "grid_template_columns": null,
1806
+ "grid_template_rows": null,
1807
+ "height": null,
1808
+ "justify_content": null,
1809
+ "justify_items": null,
1810
+ "left": null,
1811
+ "margin": null,
1812
+ "max_height": null,
1813
+ "max_width": null,
1814
+ "min_height": null,
1815
+ "min_width": null,
1816
+ "object_fit": null,
1817
+ "object_position": null,
1818
+ "order": null,
1819
+ "overflow": null,
1820
+ "padding": null,
1821
+ "right": null,
1822
+ "top": null,
1823
+ "visibility": null,
1824
+ "width": null
1825
+ }
1826
+ },
1827
+ "fc757532ed1d453882911701c1c95190": {
1828
+ "model_module": "@jupyter-widgets/base",
1829
+ "model_module_version": "2.0.0",
1830
+ "model_name": "LayoutModel",
1831
+ "state": {
1832
+ "_model_module": "@jupyter-widgets/base",
1833
+ "_model_module_version": "2.0.0",
1834
+ "_model_name": "LayoutModel",
1835
+ "_view_count": null,
1836
+ "_view_module": "@jupyter-widgets/base",
1837
+ "_view_module_version": "2.0.0",
1838
+ "_view_name": "LayoutView",
1839
+ "align_content": null,
1840
+ "align_items": null,
1841
+ "align_self": null,
1842
+ "border_bottom": null,
1843
+ "border_left": null,
1844
+ "border_right": null,
1845
+ "border_top": null,
1846
+ "bottom": null,
1847
+ "display": null,
1848
+ "flex": null,
1849
+ "flex_flow": null,
1850
+ "grid_area": null,
1851
+ "grid_auto_columns": null,
1852
+ "grid_auto_flow": null,
1853
+ "grid_auto_rows": null,
1854
+ "grid_column": null,
1855
+ "grid_gap": null,
1856
+ "grid_row": null,
1857
+ "grid_template_areas": null,
1858
+ "grid_template_columns": null,
1859
+ "grid_template_rows": null,
1860
+ "height": null,
1861
+ "justify_content": null,
1862
+ "justify_items": null,
1863
+ "left": null,
1864
+ "margin": null,
1865
+ "max_height": null,
1866
+ "max_width": null,
1867
+ "min_height": null,
1868
+ "min_width": null,
1869
+ "object_fit": null,
1870
+ "object_position": null,
1871
+ "order": null,
1872
+ "overflow": null,
1873
+ "padding": null,
1874
+ "right": null,
1875
+ "top": null,
1876
+ "visibility": null,
1877
+ "width": null
1878
+ }
1879
+ }
1880
+ },
1881
+ "version_major": 2,
1882
+ "version_minor": 0
1883
+ }
1884
+ }
1885
+ },
1886
+ "nbformat": 4,
1887
+ "nbformat_minor": 5
1888
+ }