natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +241 -158
  13. natural_pdf/classification/mixin.py +52 -38
  14. natural_pdf/classification/results.py +71 -45
  15. natural_pdf/collections/mixins.py +85 -20
  16. natural_pdf/collections/pdf_collection.py +245 -100
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +694 -195
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +610 -134
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.10.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
@@ -1,1718 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "0b803780",
6
- "metadata": {},
7
- "source": [
8
- "# Semantic Search Across Multiple Documents\n",
9
- "\n",
10
- "When working with a collection of PDFs, you might need to find information relevant to a specific query across all documents, not just within a single one. This tutorial demonstrates how to perform semantic search over a `PDFCollection`."
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": 1,
16
- "id": "664b4cd6",
17
- "metadata": {
18
- "execution": {
19
- "iopub.execute_input": "2025-04-27T16:34:55.126546Z",
20
- "iopub.status.busy": "2025-04-27T16:34:55.125930Z",
21
- "iopub.status.idle": "2025-04-27T16:34:55.132105Z",
22
- "shell.execute_reply": "2025-04-27T16:34:55.131378Z"
23
- }
24
- },
25
- "outputs": [],
26
- "source": [
27
- "#%pip install \"natural-pdf[all]\"\n",
28
- "#%pip install \"natural-pdf[search]\" # Ensure search dependencies are installed"
29
- ]
30
- },
31
- {
32
- "cell_type": "code",
33
- "execution_count": 2,
34
- "id": "30a0ced5",
35
- "metadata": {
36
- "execution": {
37
- "iopub.execute_input": "2025-04-27T16:34:55.135528Z",
38
- "iopub.status.busy": "2025-04-27T16:34:55.135238Z",
39
- "iopub.status.idle": "2025-04-27T16:35:03.763400Z",
40
- "shell.execute_reply": "2025-04-27T16:35:03.762907Z"
41
- }
42
- },
43
- "outputs": [
44
- {
45
- "name": "stderr",
46
- "output_type": "stream",
47
- "text": [
48
- "natural_pdf.collections.pdf_collection - INFO - Initializing 2 PDF objects...\n"
49
- ]
50
- },
51
- {
52
- "data": {
53
- "application/vnd.jupyter.widget-view+json": {
54
- "model_id": "d7fd32b9ea9f487099b048e5b410f76f",
55
- "version_major": 2,
56
- "version_minor": 0
57
- },
58
- "text/plain": [
59
- "Loading PDFs: 0%| | 0/2 [00:00<?, ?it/s]"
60
- ]
61
- },
62
- "metadata": {},
63
- "output_type": "display_data"
64
- },
65
- {
66
- "name": "stderr",
67
- "output_type": "stream",
68
- "text": [
69
- "natural_pdf.core.pdf - INFO - Downloading PDF from URL: https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\n"
70
- ]
71
- },
72
- {
73
- "name": "stderr",
74
- "output_type": "stream",
75
- "text": [
76
- "natural_pdf.core.pdf - INFO - PDF downloaded to temporary file: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpc7n5dufd.pdf\n"
77
- ]
78
- },
79
- {
80
- "name": "stderr",
81
- "output_type": "stream",
82
- "text": [
83
- "natural_pdf.core.pdf - INFO - Initializing PDF from /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpc7n5dufd.pdf\n"
84
- ]
85
- },
86
- {
87
- "name": "stderr",
88
- "output_type": "stream",
89
- "text": [
90
- "natural_pdf.ocr.ocr_manager - INFO - OCRManager initialized.\n"
91
- ]
92
- },
93
- {
94
- "name": "stderr",
95
- "output_type": "stream",
96
- "text": [
97
- "natural_pdf.analyzers.layout.layout_manager - INFO - LayoutManager initialized. Available engines: ['yolo', 'tatr', 'paddle', 'surya', 'docling', 'gemini']\n"
98
- ]
99
- },
100
- {
101
- "name": "stderr",
102
- "output_type": "stream",
103
- "text": [
104
- "natural_pdf.core.highlighting_service - INFO - HighlightingService initialized with ColorManager.\n"
105
- ]
106
- },
107
- {
108
- "name": "stderr",
109
- "output_type": "stream",
110
- "text": [
111
- "natural_pdf.classification.manager - INFO - ClassificationManager initialized on device: None\n"
112
- ]
113
- },
114
- {
115
- "name": "stderr",
116
- "output_type": "stream",
117
- "text": [
118
- "natural_pdf.core.pdf - INFO - PDF 'https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf' initialized with 1 pages.\n"
119
- ]
120
- },
121
- {
122
- "name": "stderr",
123
- "output_type": "stream",
124
- "text": [
125
- "natural_pdf.classification.manager - INFO - ClassificationManager initialized on device: None\n"
126
- ]
127
- },
128
- {
129
- "name": "stderr",
130
- "output_type": "stream",
131
- "text": [
132
- "natural_pdf.extraction.manager - INFO - Initialized StructuredDataManager.\n"
133
- ]
134
- },
135
- {
136
- "name": "stderr",
137
- "output_type": "stream",
138
- "text": [
139
- "natural_pdf.core.pdf - INFO - Downloading PDF from URL: https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf\n"
140
- ]
141
- },
142
- {
143
- "name": "stderr",
144
- "output_type": "stream",
145
- "text": [
146
- "natural_pdf.core.pdf - INFO - PDF downloaded to temporary file: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmp5mocjexv.pdf\n"
147
- ]
148
- },
149
- {
150
- "name": "stderr",
151
- "output_type": "stream",
152
- "text": [
153
- "natural_pdf.core.pdf - INFO - Initializing PDF from /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmp5mocjexv.pdf\n"
154
- ]
155
- },
156
- {
157
- "name": "stderr",
158
- "output_type": "stream",
159
- "text": [
160
- "natural_pdf.ocr.ocr_manager - INFO - OCRManager initialized.\n"
161
- ]
162
- },
163
- {
164
- "name": "stderr",
165
- "output_type": "stream",
166
- "text": [
167
- "natural_pdf.analyzers.layout.layout_manager - INFO - LayoutManager initialized. Available engines: ['yolo', 'tatr', 'paddle', 'surya', 'docling', 'gemini']\n"
168
- ]
169
- },
170
- {
171
- "name": "stderr",
172
- "output_type": "stream",
173
- "text": [
174
- "natural_pdf.core.highlighting_service - INFO - HighlightingService initialized with ColorManager.\n"
175
- ]
176
- },
177
- {
178
- "name": "stderr",
179
- "output_type": "stream",
180
- "text": [
181
- "natural_pdf.classification.manager - INFO - ClassificationManager initialized on device: None\n"
182
- ]
183
- },
184
- {
185
- "name": "stderr",
186
- "output_type": "stream",
187
- "text": [
188
- "natural_pdf.core.pdf - INFO - PDF 'https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf' initialized with 5 pages.\n"
189
- ]
190
- },
191
- {
192
- "name": "stderr",
193
- "output_type": "stream",
194
- "text": [
195
- "natural_pdf.classification.manager - INFO - ClassificationManager initialized on device: None\n"
196
- ]
197
- },
198
- {
199
- "name": "stderr",
200
- "output_type": "stream",
201
- "text": [
202
- "natural_pdf.extraction.manager - INFO - Initialized StructuredDataManager.\n"
203
- ]
204
- },
205
- {
206
- "name": "stderr",
207
- "output_type": "stream",
208
- "text": [
209
- "natural_pdf.collections.pdf_collection - INFO - Successfully initialized 2 PDFs. Failed: 0\n"
210
- ]
211
- },
212
- {
213
- "name": "stdout",
214
- "output_type": "stream",
215
- "text": [
216
- "Created collection with 2 PDFs.\n"
217
- ]
218
- }
219
- ],
220
- "source": [
221
- "import logging\n",
222
- "import natural_pdf\n",
223
- "\n",
224
- "# Optional: Configure logging to see progress\n",
225
- "natural_pdf.configure_logging(level=logging.INFO)\n",
226
- "\n",
227
- "# Define the paths to your PDF files\n",
228
- "pdf_paths = [\n",
229
- " \"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\",\n",
230
- " \"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf\"\n",
231
- " # Add more PDF paths as needed\n",
232
- "]\n",
233
- "\n",
234
- "# Create a PDFCollection\n",
235
- "collection = natural_pdf.PDFCollection(pdf_paths)\n",
236
- "print(f\"Created collection with {len(collection.pdfs)} PDFs.\")"
237
- ]
238
- },
239
- {
240
- "cell_type": "markdown",
241
- "id": "32cf5680",
242
- "metadata": {},
243
- "source": [
244
- "## Initializing the Search Index\n",
245
- "\n",
246
- "Before performing a search, you need to initialize the search capabilities for the collection. This involves processing the documents and building an index."
247
- ]
248
- },
249
- {
250
- "cell_type": "code",
251
- "execution_count": 3,
252
- "id": "a4f73496",
253
- "metadata": {
254
- "execution": {
255
- "iopub.execute_input": "2025-04-27T16:35:03.765700Z",
256
- "iopub.status.busy": "2025-04-27T16:35:03.764847Z",
257
- "iopub.status.idle": "2025-04-27T16:35:06.500731Z",
258
- "shell.execute_reply": "2025-04-27T16:35:06.500375Z"
259
- }
260
- },
261
- "outputs": [
262
- {
263
- "name": "stderr",
264
- "output_type": "stream",
265
- "text": [
266
- "natural_pdf.search.searchable_mixin - INFO - Using default collection name 'default_collection' for in-memory service.\n"
267
- ]
268
- },
269
- {
270
- "name": "stderr",
271
- "output_type": "stream",
272
- "text": [
273
- "natural_pdf.search.searchable_mixin - INFO - Creating new SearchService: name='default_collection', persist=False, model=default\n"
274
- ]
275
- },
276
- {
277
- "name": "stderr",
278
- "output_type": "stream",
279
- "text": [
280
- "natural_pdf.search.haystack_search_service - INFO - HaystackSearchService initialized for collection='default_collection' (persist=False, model='sentence-transformers/all-MiniLM-L6-v2'). Default path: './natural_pdf_index'\n"
281
- ]
282
- },
283
- {
284
- "name": "stderr",
285
- "output_type": "stream",
286
- "text": [
287
- "natural_pdf.search - INFO - Created new HaystackSearchService instance for collection 'default_collection'.\n"
288
- ]
289
- },
290
- {
291
- "name": "stderr",
292
- "output_type": "stream",
293
- "text": [
294
- "natural_pdf.search.searchable_mixin - INFO - index=True: Proceeding to index collection immediately after search initialization.\n"
295
- ]
296
- },
297
- {
298
- "name": "stderr",
299
- "output_type": "stream",
300
- "text": [
301
- "natural_pdf.search.searchable_mixin - INFO - Starting internal indexing process into SearchService collection 'default_collection'...\n"
302
- ]
303
- },
304
- {
305
- "name": "stderr",
306
- "output_type": "stream",
307
- "text": [
308
- "natural_pdf.search.searchable_mixin - INFO - Prepared 6 indexable items for indexing.\n"
309
- ]
310
- },
311
- {
312
- "name": "stderr",
313
- "output_type": "stream",
314
- "text": [
315
- "natural_pdf.search.haystack_search_service - INFO - Index request for collection='default_collection', docs=6, model='sentence-transformers/all-MiniLM-L6-v2', force=False, persist=False\n"
316
- ]
317
- },
318
- {
319
- "name": "stderr",
320
- "output_type": "stream",
321
- "text": [
322
- "natural_pdf.search.haystack_search_service - INFO - Created SentenceTransformersDocumentEmbedder. Model: sentence-transformers/all-MiniLM-L6-v2, Device: ComponentDevice(_single_device=Device(type=<DeviceType.MPS: 'mps'>, id=None), _multiple_devices=None)\n"
323
- ]
324
- },
325
- {
326
- "name": "stderr",
327
- "output_type": "stream",
328
- "text": [
329
- "natural_pdf.search.haystack_search_service - INFO - Preparing Haystack Documents from 6 indexable items...\n"
330
- ]
331
- },
332
- {
333
- "name": "stderr",
334
- "output_type": "stream",
335
- "text": [
336
- "natural_pdf.search.haystack_search_service - INFO - Embedding 6 documents using 'sentence-transformers/all-MiniLM-L6-v2'...\n"
337
- ]
338
- },
339
- {
340
- "data": {
341
- "application/vnd.jupyter.widget-view+json": {
342
- "model_id": "168a989d9d724abf92f2c02538b32c39",
343
- "version_major": 2,
344
- "version_minor": 0
345
- },
346
- "text/plain": [
347
- "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
348
- ]
349
- },
350
- "metadata": {},
351
- "output_type": "display_data"
352
- },
353
- {
354
- "name": "stderr",
355
- "output_type": "stream",
356
- "text": [
357
- "natural_pdf.search.haystack_search_service - INFO - Successfully embedded 6 documents.\n"
358
- ]
359
- },
360
- {
361
- "name": "stderr",
362
- "output_type": "stream",
363
- "text": [
364
- "natural_pdf.search.haystack_search_service - INFO - Writing 6 embedded documents to store 'default_collection'...\n"
365
- ]
366
- },
367
- {
368
- "name": "stderr",
369
- "output_type": "stream",
370
- "text": [
371
- "natural_pdf.search.haystack_search_service - INFO - Successfully wrote 6 documents to store 'default_collection'.\n"
372
- ]
373
- },
374
- {
375
- "name": "stderr",
376
- "output_type": "stream",
377
- "text": [
378
- "natural_pdf.search.haystack_search_service - INFO - Store 'default_collection' document count after write: 6\n"
379
- ]
380
- },
381
- {
382
- "name": "stderr",
383
- "output_type": "stream",
384
- "text": [
385
- "natural_pdf.search.searchable_mixin - INFO - Successfully completed indexing into SearchService collection 'default_collection'.\n"
386
- ]
387
- },
388
- {
389
- "name": "stdout",
390
- "output_type": "stream",
391
- "text": [
392
- "Search index initialized.\n"
393
- ]
394
- }
395
- ],
396
- "source": [
397
- "# Initialize search. 'index=True' builds the index immediately.\n",
398
- "# This might take some time depending on the number and size of PDFs.\n",
399
- "collection.init_search(index=True) \n",
400
- "print(\"Search index initialized.\")"
401
- ]
402
- },
403
- {
404
- "cell_type": "markdown",
405
- "id": "897988b1",
406
- "metadata": {},
407
- "source": [
408
- "## Performing a Semantic Search\n",
409
- "\n",
410
- "Once the index is ready, you can use the `find_relevant()` method to search for content semantically related to your query."
411
- ]
412
- },
413
- {
414
- "cell_type": "code",
415
- "execution_count": 4,
416
- "id": "8de85856",
417
- "metadata": {
418
- "execution": {
419
- "iopub.execute_input": "2025-04-27T16:35:06.502157Z",
420
- "iopub.status.busy": "2025-04-27T16:35:06.502017Z",
421
- "iopub.status.idle": "2025-04-27T16:35:06.629731Z",
422
- "shell.execute_reply": "2025-04-27T16:35:06.629475Z"
423
- }
424
- },
425
- "outputs": [
426
- {
427
- "name": "stderr",
428
- "output_type": "stream",
429
- "text": [
430
- "natural_pdf.search.searchable_mixin - INFO - Searching collection 'default_collection' via HaystackSearchService...\n"
431
- ]
432
- },
433
- {
434
- "name": "stderr",
435
- "output_type": "stream",
436
- "text": [
437
- "natural_pdf.search.haystack_search_service - INFO - Search request for collection='default_collection', query_type=str, options=TextSearchOptions(top_k=10, retriever_top_k=20, filters=None, use_reranker=True, reranker_instance=None, reranker_model=None, reranker_api_key=None)\n"
438
- ]
439
- },
440
- {
441
- "name": "stderr",
442
- "output_type": "stream",
443
- "text": [
444
- "natural_pdf.search.haystack_search_service - INFO - Created SentenceTransformersTextEmbedder. Model: sentence-transformers/all-MiniLM-L6-v2, Device: ComponentDevice(_single_device=Device(type=<DeviceType.MPS: 'mps'>, id=None), _multiple_devices=None)\n"
445
- ]
446
- },
447
- {
448
- "data": {
449
- "application/vnd.jupyter.widget-view+json": {
450
- "model_id": "2a9bdf4a442c40ba81ca10ec0d504df4",
451
- "version_major": 2,
452
- "version_minor": 0
453
- },
454
- "text/plain": [
455
- "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
456
- ]
457
- },
458
- "metadata": {},
459
- "output_type": "display_data"
460
- },
461
- {
462
- "name": "stderr",
463
- "output_type": "stream",
464
- "text": [
465
- "natural_pdf.search.haystack_search_service - INFO - Running retrieval pipeline for collection 'default_collection'...\n"
466
- ]
467
- },
468
- {
469
- "name": "stderr",
470
- "output_type": "stream",
471
- "text": [
472
- "natural_pdf.search.haystack_search_service - INFO - Retrieved 6 documents.\n"
473
- ]
474
- },
475
- {
476
- "name": "stderr",
477
- "output_type": "stream",
478
- "text": [
479
- "natural_pdf.search.searchable_mixin - INFO - SearchService returned 6 results from collection 'default_collection'.\n"
480
- ]
481
- },
482
- {
483
- "name": "stdout",
484
- "output_type": "stream",
485
- "text": [
486
- "Found 6 results for 'american president':\n"
487
- ]
488
- }
489
- ],
490
- "source": [
491
- "# Perform a search query\n",
492
- "query = \"american president\"\n",
493
- "results = collection.find_relevant(query)\n",
494
- "\n",
495
- "print(f\"Found {len(results)} results for '{query}':\")"
496
- ]
497
- },
498
- {
499
- "cell_type": "markdown",
500
- "id": "af2db0c4",
501
- "metadata": {},
502
- "source": [
503
- "## Understanding Search Results\n",
504
- "\n",
505
- "The `find_relevant()` method returns a list of dictionaries, each representing a relevant text chunk found in one of the PDFs. Each result includes:\n",
506
- "\n",
507
- "* `pdf_path`: The path to the PDF document where the result was found.\n",
508
- "* `page_number`: The page number within the PDF.\n",
509
- "* `score`: A relevance score (higher means more relevant).\n",
510
- "* `content_snippet`: A snippet of the text chunk that matched the query."
511
- ]
512
- },
513
- {
514
- "cell_type": "code",
515
- "execution_count": 5,
516
- "id": "dca1b9a7",
517
- "metadata": {
518
- "execution": {
519
- "iopub.execute_input": "2025-04-27T16:35:06.630985Z",
520
- "iopub.status.busy": "2025-04-27T16:35:06.630862Z",
521
- "iopub.status.idle": "2025-04-27T16:35:06.633294Z",
522
- "shell.execute_reply": "2025-04-27T16:35:06.633045Z"
523
- }
524
- },
525
- "outputs": [
526
- {
527
- "name": "stdout",
528
- "output_type": "stream",
529
- "text": [
530
- " 1. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmp5mocjexv.pdf\n",
531
- " Page: 2 (Score: 0.0708)\n",
532
- " Snippet: Library Weeding Log Atlanta Public Schools\n",
533
- "From: 8/1/2017 To: 6/30/2023\n",
534
- "6/6/2023 - Copies Removed: 130\n",
535
- "The Anasazi (Removed: 1)\n",
536
- "Author: Petersen, David. ISBN: 0-516-01121-9 (trade) Published: 1991\n",
537
- "Sit...\n",
538
- " 2. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmp5mocjexv.pdf\n",
539
- " Page: 5 (Score: 0.0669)\n",
540
- " Snippet: Library Weeding Log Atlanta Public Schools\n",
541
- "From: 8/1/2017 To: 6/30/2023\n",
542
- "6/6/2023 - Copies Removed: 130\n",
543
- "Centennial Place 33170000562167 $13.10 11/5/1999 33554-43170\n",
544
- "Academy (Charter)\n",
545
- "Was Available -- W...\n",
546
- " 3. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpc7n5dufd.pdf\n",
547
- " Page: 1 (Score: -0.0040)\n",
548
- " Snippet: Jungle Health and Safety Inspection Service\n",
549
- "INS-UP70N51NCL41R\n",
550
- "Site: Durham’s Meatpacking Chicago, Ill.\n",
551
- "Date: February 3, 1905\n",
552
- "Violation Count: 7\n",
553
- "Summary: Worst of any, however, were the fertilizer men...\n",
554
- " 4. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmp5mocjexv.pdf\n",
555
- " Page: 4 (Score: -0.0245)\n",
556
- " Snippet: Library Weeding Log Atlanta Public Schools\n",
557
- "From: 8/1/2017 To: 6/30/2023\n",
558
- "6/6/2023 - Copies Removed: 130\n",
559
- "Children of the Philippines (Removed: 1)\n",
560
- "Author: Kinkade, Sheila, 1962- ISBN: 0-87614-993-X Publi...\n",
561
- " 5. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmp5mocjexv.pdf\n",
562
- " Page: 3 (Score: -0.0445)\n",
563
- " Snippet: Library Weeding Log Atlanta Public Schools\n",
564
- "From: 8/1/2017 To: 6/30/2023\n",
565
- "6/6/2023 - Copies Removed: 130\n",
566
- "Centennial Place 33170000507600 $19.45 2/21/2000 33554-43170\n",
567
- "Academy (Charter)\n",
568
- "Was Available -- W...\n",
569
- " 6. PDF: /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmp5mocjexv.pdf\n",
570
- " Page: 1 (Score: -0.0473)\n",
571
- " Snippet: Library Weeding Log Atlanta Public Schools\n",
572
- "From: 8/1/2017 To: 6/30/2023\n",
573
- "6/12/2023 - Copies Removed: 2\n",
574
- "Tristan Strong punches a hole in the sky (Removed: 1)\n",
575
- "Author: Mbalia, Kwame. ISBN: 978-1-36803993-...\n"
576
- ]
577
- }
578
- ],
579
- "source": [
580
- "# Process and display the results\n",
581
- "if results:\n",
582
- " for i, result in enumerate(results):\n",
583
- " print(f\" {i+1}. PDF: {result['pdf_path']}\")\n",
584
- " print(f\" Page: {result['page_number']} (Score: {result['score']:.4f})\")\n",
585
- " # Display a snippet of the content\n",
586
- " snippet = result.get('content_snippet', '')\n",
587
- " print(f\" Snippet: {snippet}...\") \n",
588
- "else:\n",
589
- " print(\" No relevant results found.\")\n",
590
- "\n",
591
- "# You can access the full content if needed via the result object, \n",
592
- "# though 'content_snippet' is usually sufficient for display."
593
- ]
594
- },
595
- {
596
- "cell_type": "markdown",
597
- "id": "71acde4d",
598
- "metadata": {},
599
- "source": [
600
- "Semantic search allows you to efficiently query large sets of documents to find the most relevant information without needing exact keyword matches, leveraging the meaning and context of your query. "
601
- ]
602
- }
603
- ],
604
- "metadata": {
605
- "jupytext": {
606
- "cell_metadata_filter": "-all",
607
- "main_language": "python",
608
- "notebook_metadata_filter": "-all"
609
- },
610
- "kernelspec": {
611
- "display_name": "Python (natural-pdf)",
612
- "language": "python",
613
- "name": "natural-pdf"
614
- },
615
- "language_info": {
616
- "codemirror_mode": {
617
- "name": "ipython",
618
- "version": 3
619
- },
620
- "file_extension": ".py",
621
- "mimetype": "text/x-python",
622
- "name": "python",
623
- "nbconvert_exporter": "python",
624
- "pygments_lexer": "ipython3",
625
- "version": "3.10.13"
626
- },
627
- "widgets": {
628
- "application/vnd.jupyter.widget-state+json": {
629
- "state": {
630
- "045481258c7a4764b11034c22edd1940": {
631
- "model_module": "@jupyter-widgets/controls",
632
- "model_module_version": "2.0.0",
633
- "model_name": "HTMLStyleModel",
634
- "state": {
635
- "_model_module": "@jupyter-widgets/controls",
636
- "_model_module_version": "2.0.0",
637
- "_model_name": "HTMLStyleModel",
638
- "_view_count": null,
639
- "_view_module": "@jupyter-widgets/base",
640
- "_view_module_version": "2.0.0",
641
- "_view_name": "StyleView",
642
- "background": null,
643
- "description_width": "",
644
- "font_size": null,
645
- "text_color": null
646
- }
647
- },
648
- "04f0fb9dfb2645cd8a728bf068b2ed2d": {
649
- "model_module": "@jupyter-widgets/base",
650
- "model_module_version": "2.0.0",
651
- "model_name": "LayoutModel",
652
- "state": {
653
- "_model_module": "@jupyter-widgets/base",
654
- "_model_module_version": "2.0.0",
655
- "_model_name": "LayoutModel",
656
- "_view_count": null,
657
- "_view_module": "@jupyter-widgets/base",
658
- "_view_module_version": "2.0.0",
659
- "_view_name": "LayoutView",
660
- "align_content": null,
661
- "align_items": null,
662
- "align_self": null,
663
- "border_bottom": null,
664
- "border_left": null,
665
- "border_right": null,
666
- "border_top": null,
667
- "bottom": null,
668
- "display": null,
669
- "flex": null,
670
- "flex_flow": null,
671
- "grid_area": null,
672
- "grid_auto_columns": null,
673
- "grid_auto_flow": null,
674
- "grid_auto_rows": null,
675
- "grid_column": null,
676
- "grid_gap": null,
677
- "grid_row": null,
678
- "grid_template_areas": null,
679
- "grid_template_columns": null,
680
- "grid_template_rows": null,
681
- "height": null,
682
- "justify_content": null,
683
- "justify_items": null,
684
- "left": null,
685
- "margin": null,
686
- "max_height": null,
687
- "max_width": null,
688
- "min_height": null,
689
- "min_width": null,
690
- "object_fit": null,
691
- "object_position": null,
692
- "order": null,
693
- "overflow": null,
694
- "padding": null,
695
- "right": null,
696
- "top": null,
697
- "visibility": null,
698
- "width": null
699
- }
700
- },
701
- "0a8d6584c2c745b19fbbe35ad99312a6": {
702
- "model_module": "@jupyter-widgets/controls",
703
- "model_module_version": "2.0.0",
704
- "model_name": "ProgressStyleModel",
705
- "state": {
706
- "_model_module": "@jupyter-widgets/controls",
707
- "_model_module_version": "2.0.0",
708
- "_model_name": "ProgressStyleModel",
709
- "_view_count": null,
710
- "_view_module": "@jupyter-widgets/base",
711
- "_view_module_version": "2.0.0",
712
- "_view_name": "StyleView",
713
- "bar_color": null,
714
- "description_width": ""
715
- }
716
- },
717
- "168a989d9d724abf92f2c02538b32c39": {
718
- "model_module": "@jupyter-widgets/controls",
719
- "model_module_version": "2.0.0",
720
- "model_name": "HBoxModel",
721
- "state": {
722
- "_dom_classes": [],
723
- "_model_module": "@jupyter-widgets/controls",
724
- "_model_module_version": "2.0.0",
725
- "_model_name": "HBoxModel",
726
- "_view_count": null,
727
- "_view_module": "@jupyter-widgets/controls",
728
- "_view_module_version": "2.0.0",
729
- "_view_name": "HBoxView",
730
- "box_style": "",
731
- "children": [
732
- "IPY_MODEL_f1c19e7db0684964b147abb8c00c0bbe",
733
- "IPY_MODEL_ccb14891737b488ebfc6ae39fb14a6a1",
734
- "IPY_MODEL_7690b035d029413bb7a6f6d849c4777e"
735
- ],
736
- "layout": "IPY_MODEL_563942a751b54656a2c953abf1b908cc",
737
- "tabbable": null,
738
- "tooltip": null
739
- }
740
- },
741
- "1fe57cefc03543c1af6ea64f44226e50": {
742
- "model_module": "@jupyter-widgets/controls",
743
- "model_module_version": "2.0.0",
744
- "model_name": "HTMLStyleModel",
745
- "state": {
746
- "_model_module": "@jupyter-widgets/controls",
747
- "_model_module_version": "2.0.0",
748
- "_model_name": "HTMLStyleModel",
749
- "_view_count": null,
750
- "_view_module": "@jupyter-widgets/base",
751
- "_view_module_version": "2.0.0",
752
- "_view_name": "StyleView",
753
- "background": null,
754
- "description_width": "",
755
- "font_size": null,
756
- "text_color": null
757
- }
758
- },
759
- "2a9bdf4a442c40ba81ca10ec0d504df4": {
760
- "model_module": "@jupyter-widgets/controls",
761
- "model_module_version": "2.0.0",
762
- "model_name": "HBoxModel",
763
- "state": {
764
- "_dom_classes": [],
765
- "_model_module": "@jupyter-widgets/controls",
766
- "_model_module_version": "2.0.0",
767
- "_model_name": "HBoxModel",
768
- "_view_count": null,
769
- "_view_module": "@jupyter-widgets/controls",
770
- "_view_module_version": "2.0.0",
771
- "_view_name": "HBoxView",
772
- "box_style": "",
773
- "children": [
774
- "IPY_MODEL_3b31de14259840818658de02f2c8a1c6",
775
- "IPY_MODEL_57b2175f96744c5da2054d5290a13f67",
776
- "IPY_MODEL_a6e177ae59604ba997934e000199f8ba"
777
- ],
778
- "layout": "IPY_MODEL_c46851eb620b40588a40695a0041cf9d",
779
- "tabbable": null,
780
- "tooltip": null
781
- }
782
- },
783
- "2d844fb4766849dab371d9e05be77694": {
784
- "model_module": "@jupyter-widgets/controls",
785
- "model_module_version": "2.0.0",
786
- "model_name": "HTMLModel",
787
- "state": {
788
- "_dom_classes": [],
789
- "_model_module": "@jupyter-widgets/controls",
790
- "_model_module_version": "2.0.0",
791
- "_model_name": "HTMLModel",
792
- "_view_count": null,
793
- "_view_module": "@jupyter-widgets/controls",
794
- "_view_module_version": "2.0.0",
795
- "_view_name": "HTMLView",
796
- "description": "",
797
- "description_allow_html": false,
798
- "layout": "IPY_MODEL_d20824fb12014cb8addd60a28c0356b7",
799
- "placeholder": "​",
800
- "style": "IPY_MODEL_1fe57cefc03543c1af6ea64f44226e50",
801
- "tabbable": null,
802
- "tooltip": null,
803
- "value": " 2/2 [00:01&lt;00:00,  1.20it/s]"
804
- }
805
- },
806
- "3b31de14259840818658de02f2c8a1c6": {
807
- "model_module": "@jupyter-widgets/controls",
808
- "model_module_version": "2.0.0",
809
- "model_name": "HTMLModel",
810
- "state": {
811
- "_dom_classes": [],
812
- "_model_module": "@jupyter-widgets/controls",
813
- "_model_module_version": "2.0.0",
814
- "_model_name": "HTMLModel",
815
- "_view_count": null,
816
- "_view_module": "@jupyter-widgets/controls",
817
- "_view_module_version": "2.0.0",
818
- "_view_name": "HTMLView",
819
- "description": "",
820
- "description_allow_html": false,
821
- "layout": "IPY_MODEL_c018bc738c14432abced177e7f346463",
822
- "placeholder": "​",
823
- "style": "IPY_MODEL_4632137f9ca8492bb0c1ff2a928ac0f8",
824
- "tabbable": null,
825
- "tooltip": null,
826
- "value": "Batches: 100%"
827
- }
828
- },
829
- "454ce0223f794fc580050bf4e13cc3a7": {
830
- "model_module": "@jupyter-widgets/base",
831
- "model_module_version": "2.0.0",
832
- "model_name": "LayoutModel",
833
- "state": {
834
- "_model_module": "@jupyter-widgets/base",
835
- "_model_module_version": "2.0.0",
836
- "_model_name": "LayoutModel",
837
- "_view_count": null,
838
- "_view_module": "@jupyter-widgets/base",
839
- "_view_module_version": "2.0.0",
840
- "_view_name": "LayoutView",
841
- "align_content": null,
842
- "align_items": null,
843
- "align_self": null,
844
- "border_bottom": null,
845
- "border_left": null,
846
- "border_right": null,
847
- "border_top": null,
848
- "bottom": null,
849
- "display": null,
850
- "flex": null,
851
- "flex_flow": null,
852
- "grid_area": null,
853
- "grid_auto_columns": null,
854
- "grid_auto_flow": null,
855
- "grid_auto_rows": null,
856
- "grid_column": null,
857
- "grid_gap": null,
858
- "grid_row": null,
859
- "grid_template_areas": null,
860
- "grid_template_columns": null,
861
- "grid_template_rows": null,
862
- "height": null,
863
- "justify_content": null,
864
- "justify_items": null,
865
- "left": null,
866
- "margin": null,
867
- "max_height": null,
868
- "max_width": null,
869
- "min_height": null,
870
- "min_width": null,
871
- "object_fit": null,
872
- "object_position": null,
873
- "order": null,
874
- "overflow": null,
875
- "padding": null,
876
- "right": null,
877
- "top": null,
878
- "visibility": null,
879
- "width": null
880
- }
881
- },
882
- "4632137f9ca8492bb0c1ff2a928ac0f8": {
883
- "model_module": "@jupyter-widgets/controls",
884
- "model_module_version": "2.0.0",
885
- "model_name": "HTMLStyleModel",
886
- "state": {
887
- "_model_module": "@jupyter-widgets/controls",
888
- "_model_module_version": "2.0.0",
889
- "_model_name": "HTMLStyleModel",
890
- "_view_count": null,
891
- "_view_module": "@jupyter-widgets/base",
892
- "_view_module_version": "2.0.0",
893
- "_view_name": "StyleView",
894
- "background": null,
895
- "description_width": "",
896
- "font_size": null,
897
- "text_color": null
898
- }
899
- },
900
- "52447d1bde014cb2b04eb51c7f7288b7": {
901
- "model_module": "@jupyter-widgets/base",
902
- "model_module_version": "2.0.0",
903
- "model_name": "LayoutModel",
904
- "state": {
905
- "_model_module": "@jupyter-widgets/base",
906
- "_model_module_version": "2.0.0",
907
- "_model_name": "LayoutModel",
908
- "_view_count": null,
909
- "_view_module": "@jupyter-widgets/base",
910
- "_view_module_version": "2.0.0",
911
- "_view_name": "LayoutView",
912
- "align_content": null,
913
- "align_items": null,
914
- "align_self": null,
915
- "border_bottom": null,
916
- "border_left": null,
917
- "border_right": null,
918
- "border_top": null,
919
- "bottom": null,
920
- "display": null,
921
- "flex": null,
922
- "flex_flow": null,
923
- "grid_area": null,
924
- "grid_auto_columns": null,
925
- "grid_auto_flow": null,
926
- "grid_auto_rows": null,
927
- "grid_column": null,
928
- "grid_gap": null,
929
- "grid_row": null,
930
- "grid_template_areas": null,
931
- "grid_template_columns": null,
932
- "grid_template_rows": null,
933
- "height": null,
934
- "justify_content": null,
935
- "justify_items": null,
936
- "left": null,
937
- "margin": null,
938
- "max_height": null,
939
- "max_width": null,
940
- "min_height": null,
941
- "min_width": null,
942
- "object_fit": null,
943
- "object_position": null,
944
- "order": null,
945
- "overflow": null,
946
- "padding": null,
947
- "right": null,
948
- "top": null,
949
- "visibility": null,
950
- "width": null
951
- }
952
- },
953
- "563942a751b54656a2c953abf1b908cc": {
954
- "model_module": "@jupyter-widgets/base",
955
- "model_module_version": "2.0.0",
956
- "model_name": "LayoutModel",
957
- "state": {
958
- "_model_module": "@jupyter-widgets/base",
959
- "_model_module_version": "2.0.0",
960
- "_model_name": "LayoutModel",
961
- "_view_count": null,
962
- "_view_module": "@jupyter-widgets/base",
963
- "_view_module_version": "2.0.0",
964
- "_view_name": "LayoutView",
965
- "align_content": null,
966
- "align_items": null,
967
- "align_self": null,
968
- "border_bottom": null,
969
- "border_left": null,
970
- "border_right": null,
971
- "border_top": null,
972
- "bottom": null,
973
- "display": null,
974
- "flex": null,
975
- "flex_flow": null,
976
- "grid_area": null,
977
- "grid_auto_columns": null,
978
- "grid_auto_flow": null,
979
- "grid_auto_rows": null,
980
- "grid_column": null,
981
- "grid_gap": null,
982
- "grid_row": null,
983
- "grid_template_areas": null,
984
- "grid_template_columns": null,
985
- "grid_template_rows": null,
986
- "height": null,
987
- "justify_content": null,
988
- "justify_items": null,
989
- "left": null,
990
- "margin": null,
991
- "max_height": null,
992
- "max_width": null,
993
- "min_height": null,
994
- "min_width": null,
995
- "object_fit": null,
996
- "object_position": null,
997
- "order": null,
998
- "overflow": null,
999
- "padding": null,
1000
- "right": null,
1001
- "top": null,
1002
- "visibility": null,
1003
- "width": null
1004
- }
1005
- },
1006
- "57b2175f96744c5da2054d5290a13f67": {
1007
- "model_module": "@jupyter-widgets/controls",
1008
- "model_module_version": "2.0.0",
1009
- "model_name": "FloatProgressModel",
1010
- "state": {
1011
- "_dom_classes": [],
1012
- "_model_module": "@jupyter-widgets/controls",
1013
- "_model_module_version": "2.0.0",
1014
- "_model_name": "FloatProgressModel",
1015
- "_view_count": null,
1016
- "_view_module": "@jupyter-widgets/controls",
1017
- "_view_module_version": "2.0.0",
1018
- "_view_name": "ProgressView",
1019
- "bar_style": "success",
1020
- "description": "",
1021
- "description_allow_html": false,
1022
- "layout": "IPY_MODEL_04f0fb9dfb2645cd8a728bf068b2ed2d",
1023
- "max": 1.0,
1024
- "min": 0.0,
1025
- "orientation": "horizontal",
1026
- "style": "IPY_MODEL_c6f89de8432b4587bec4e3e2cca32cbc",
1027
- "tabbable": null,
1028
- "tooltip": null,
1029
- "value": 1.0
1030
- }
1031
- },
1032
- "5d6e9dd429f14ee2b5de4b741602f944": {
1033
- "model_module": "@jupyter-widgets/base",
1034
- "model_module_version": "2.0.0",
1035
- "model_name": "LayoutModel",
1036
- "state": {
1037
- "_model_module": "@jupyter-widgets/base",
1038
- "_model_module_version": "2.0.0",
1039
- "_model_name": "LayoutModel",
1040
- "_view_count": null,
1041
- "_view_module": "@jupyter-widgets/base",
1042
- "_view_module_version": "2.0.0",
1043
- "_view_name": "LayoutView",
1044
- "align_content": null,
1045
- "align_items": null,
1046
- "align_self": null,
1047
- "border_bottom": null,
1048
- "border_left": null,
1049
- "border_right": null,
1050
- "border_top": null,
1051
- "bottom": null,
1052
- "display": null,
1053
- "flex": null,
1054
- "flex_flow": null,
1055
- "grid_area": null,
1056
- "grid_auto_columns": null,
1057
- "grid_auto_flow": null,
1058
- "grid_auto_rows": null,
1059
- "grid_column": null,
1060
- "grid_gap": null,
1061
- "grid_row": null,
1062
- "grid_template_areas": null,
1063
- "grid_template_columns": null,
1064
- "grid_template_rows": null,
1065
- "height": null,
1066
- "justify_content": null,
1067
- "justify_items": null,
1068
- "left": null,
1069
- "margin": null,
1070
- "max_height": null,
1071
- "max_width": null,
1072
- "min_height": null,
1073
- "min_width": null,
1074
- "object_fit": null,
1075
- "object_position": null,
1076
- "order": null,
1077
- "overflow": null,
1078
- "padding": null,
1079
- "right": null,
1080
- "top": null,
1081
- "visibility": null,
1082
- "width": null
1083
- }
1084
- },
1085
- "6024f87266144f52a440ee05bcc407c8": {
1086
- "model_module": "@jupyter-widgets/controls",
1087
- "model_module_version": "2.0.0",
1088
- "model_name": "HTMLStyleModel",
1089
- "state": {
1090
- "_model_module": "@jupyter-widgets/controls",
1091
- "_model_module_version": "2.0.0",
1092
- "_model_name": "HTMLStyleModel",
1093
- "_view_count": null,
1094
- "_view_module": "@jupyter-widgets/base",
1095
- "_view_module_version": "2.0.0",
1096
- "_view_name": "StyleView",
1097
- "background": null,
1098
- "description_width": "",
1099
- "font_size": null,
1100
- "text_color": null
1101
- }
1102
- },
1103
- "654a2b80c08b40beba55515d525a59af": {
1104
- "model_module": "@jupyter-widgets/base",
1105
- "model_module_version": "2.0.0",
1106
- "model_name": "LayoutModel",
1107
- "state": {
1108
- "_model_module": "@jupyter-widgets/base",
1109
- "_model_module_version": "2.0.0",
1110
- "_model_name": "LayoutModel",
1111
- "_view_count": null,
1112
- "_view_module": "@jupyter-widgets/base",
1113
- "_view_module_version": "2.0.0",
1114
- "_view_name": "LayoutView",
1115
- "align_content": null,
1116
- "align_items": null,
1117
- "align_self": null,
1118
- "border_bottom": null,
1119
- "border_left": null,
1120
- "border_right": null,
1121
- "border_top": null,
1122
- "bottom": null,
1123
- "display": null,
1124
- "flex": null,
1125
- "flex_flow": null,
1126
- "grid_area": null,
1127
- "grid_auto_columns": null,
1128
- "grid_auto_flow": null,
1129
- "grid_auto_rows": null,
1130
- "grid_column": null,
1131
- "grid_gap": null,
1132
- "grid_row": null,
1133
- "grid_template_areas": null,
1134
- "grid_template_columns": null,
1135
- "grid_template_rows": null,
1136
- "height": null,
1137
- "justify_content": null,
1138
- "justify_items": null,
1139
- "left": null,
1140
- "margin": null,
1141
- "max_height": null,
1142
- "max_width": null,
1143
- "min_height": null,
1144
- "min_width": null,
1145
- "object_fit": null,
1146
- "object_position": null,
1147
- "order": null,
1148
- "overflow": null,
1149
- "padding": null,
1150
- "right": null,
1151
- "top": null,
1152
- "visibility": null,
1153
- "width": null
1154
- }
1155
- },
1156
- "7690b035d029413bb7a6f6d849c4777e": {
1157
- "model_module": "@jupyter-widgets/controls",
1158
- "model_module_version": "2.0.0",
1159
- "model_name": "HTMLModel",
1160
- "state": {
1161
- "_dom_classes": [],
1162
- "_model_module": "@jupyter-widgets/controls",
1163
- "_model_module_version": "2.0.0",
1164
- "_model_name": "HTMLModel",
1165
- "_view_count": null,
1166
- "_view_module": "@jupyter-widgets/controls",
1167
- "_view_module_version": "2.0.0",
1168
- "_view_name": "HTMLView",
1169
- "description": "",
1170
- "description_allow_html": false,
1171
- "layout": "IPY_MODEL_a1972af713cf43aaaa9b6b695eaf5193",
1172
- "placeholder": "​",
1173
- "style": "IPY_MODEL_d35dfa8c17d64d4996e4ec6beb51d9cb",
1174
- "tabbable": null,
1175
- "tooltip": null,
1176
- "value": " 1/1 [00:00&lt;00:00,  4.47it/s]"
1177
- }
1178
- },
1179
- "a1972af713cf43aaaa9b6b695eaf5193": {
1180
- "model_module": "@jupyter-widgets/base",
1181
- "model_module_version": "2.0.0",
1182
- "model_name": "LayoutModel",
1183
- "state": {
1184
- "_model_module": "@jupyter-widgets/base",
1185
- "_model_module_version": "2.0.0",
1186
- "_model_name": "LayoutModel",
1187
- "_view_count": null,
1188
- "_view_module": "@jupyter-widgets/base",
1189
- "_view_module_version": "2.0.0",
1190
- "_view_name": "LayoutView",
1191
- "align_content": null,
1192
- "align_items": null,
1193
- "align_self": null,
1194
- "border_bottom": null,
1195
- "border_left": null,
1196
- "border_right": null,
1197
- "border_top": null,
1198
- "bottom": null,
1199
- "display": null,
1200
- "flex": null,
1201
- "flex_flow": null,
1202
- "grid_area": null,
1203
- "grid_auto_columns": null,
1204
- "grid_auto_flow": null,
1205
- "grid_auto_rows": null,
1206
- "grid_column": null,
1207
- "grid_gap": null,
1208
- "grid_row": null,
1209
- "grid_template_areas": null,
1210
- "grid_template_columns": null,
1211
- "grid_template_rows": null,
1212
- "height": null,
1213
- "justify_content": null,
1214
- "justify_items": null,
1215
- "left": null,
1216
- "margin": null,
1217
- "max_height": null,
1218
- "max_width": null,
1219
- "min_height": null,
1220
- "min_width": null,
1221
- "object_fit": null,
1222
- "object_position": null,
1223
- "order": null,
1224
- "overflow": null,
1225
- "padding": null,
1226
- "right": null,
1227
- "top": null,
1228
- "visibility": null,
1229
- "width": null
1230
- }
1231
- },
1232
- "a6e177ae59604ba997934e000199f8ba": {
1233
- "model_module": "@jupyter-widgets/controls",
1234
- "model_module_version": "2.0.0",
1235
- "model_name": "HTMLModel",
1236
- "state": {
1237
- "_dom_classes": [],
1238
- "_model_module": "@jupyter-widgets/controls",
1239
- "_model_module_version": "2.0.0",
1240
- "_model_name": "HTMLModel",
1241
- "_view_count": null,
1242
- "_view_module": "@jupyter-widgets/controls",
1243
- "_view_module_version": "2.0.0",
1244
- "_view_name": "HTMLView",
1245
- "description": "",
1246
- "description_allow_html": false,
1247
- "layout": "IPY_MODEL_b240a7aeec0a4e3faaef25c910fcf62e",
1248
- "placeholder": "​",
1249
- "style": "IPY_MODEL_6024f87266144f52a440ee05bcc407c8",
1250
- "tabbable": null,
1251
- "tooltip": null,
1252
- "value": " 1/1 [00:00&lt;00:00,  8.45it/s]"
1253
- }
1254
- },
1255
- "a72a536d760a4ccd95f857d2db3cc8be": {
1256
- "model_module": "@jupyter-widgets/base",
1257
- "model_module_version": "2.0.0",
1258
- "model_name": "LayoutModel",
1259
- "state": {
1260
- "_model_module": "@jupyter-widgets/base",
1261
- "_model_module_version": "2.0.0",
1262
- "_model_name": "LayoutModel",
1263
- "_view_count": null,
1264
- "_view_module": "@jupyter-widgets/base",
1265
- "_view_module_version": "2.0.0",
1266
- "_view_name": "LayoutView",
1267
- "align_content": null,
1268
- "align_items": null,
1269
- "align_self": null,
1270
- "border_bottom": null,
1271
- "border_left": null,
1272
- "border_right": null,
1273
- "border_top": null,
1274
- "bottom": null,
1275
- "display": null,
1276
- "flex": null,
1277
- "flex_flow": null,
1278
- "grid_area": null,
1279
- "grid_auto_columns": null,
1280
- "grid_auto_flow": null,
1281
- "grid_auto_rows": null,
1282
- "grid_column": null,
1283
- "grid_gap": null,
1284
- "grid_row": null,
1285
- "grid_template_areas": null,
1286
- "grid_template_columns": null,
1287
- "grid_template_rows": null,
1288
- "height": null,
1289
- "justify_content": null,
1290
- "justify_items": null,
1291
- "left": null,
1292
- "margin": null,
1293
- "max_height": null,
1294
- "max_width": null,
1295
- "min_height": null,
1296
- "min_width": null,
1297
- "object_fit": null,
1298
- "object_position": null,
1299
- "order": null,
1300
- "overflow": null,
1301
- "padding": null,
1302
- "right": null,
1303
- "top": null,
1304
- "visibility": null,
1305
- "width": null
1306
- }
1307
- },
1308
- "b240a7aeec0a4e3faaef25c910fcf62e": {
1309
- "model_module": "@jupyter-widgets/base",
1310
- "model_module_version": "2.0.0",
1311
- "model_name": "LayoutModel",
1312
- "state": {
1313
- "_model_module": "@jupyter-widgets/base",
1314
- "_model_module_version": "2.0.0",
1315
- "_model_name": "LayoutModel",
1316
- "_view_count": null,
1317
- "_view_module": "@jupyter-widgets/base",
1318
- "_view_module_version": "2.0.0",
1319
- "_view_name": "LayoutView",
1320
- "align_content": null,
1321
- "align_items": null,
1322
- "align_self": null,
1323
- "border_bottom": null,
1324
- "border_left": null,
1325
- "border_right": null,
1326
- "border_top": null,
1327
- "bottom": null,
1328
- "display": null,
1329
- "flex": null,
1330
- "flex_flow": null,
1331
- "grid_area": null,
1332
- "grid_auto_columns": null,
1333
- "grid_auto_flow": null,
1334
- "grid_auto_rows": null,
1335
- "grid_column": null,
1336
- "grid_gap": null,
1337
- "grid_row": null,
1338
- "grid_template_areas": null,
1339
- "grid_template_columns": null,
1340
- "grid_template_rows": null,
1341
- "height": null,
1342
- "justify_content": null,
1343
- "justify_items": null,
1344
- "left": null,
1345
- "margin": null,
1346
- "max_height": null,
1347
- "max_width": null,
1348
- "min_height": null,
1349
- "min_width": null,
1350
- "object_fit": null,
1351
- "object_position": null,
1352
- "order": null,
1353
- "overflow": null,
1354
- "padding": null,
1355
- "right": null,
1356
- "top": null,
1357
- "visibility": null,
1358
- "width": null
1359
- }
1360
- },
1361
- "c018bc738c14432abced177e7f346463": {
1362
- "model_module": "@jupyter-widgets/base",
1363
- "model_module_version": "2.0.0",
1364
- "model_name": "LayoutModel",
1365
- "state": {
1366
- "_model_module": "@jupyter-widgets/base",
1367
- "_model_module_version": "2.0.0",
1368
- "_model_name": "LayoutModel",
1369
- "_view_count": null,
1370
- "_view_module": "@jupyter-widgets/base",
1371
- "_view_module_version": "2.0.0",
1372
- "_view_name": "LayoutView",
1373
- "align_content": null,
1374
- "align_items": null,
1375
- "align_self": null,
1376
- "border_bottom": null,
1377
- "border_left": null,
1378
- "border_right": null,
1379
- "border_top": null,
1380
- "bottom": null,
1381
- "display": null,
1382
- "flex": null,
1383
- "flex_flow": null,
1384
- "grid_area": null,
1385
- "grid_auto_columns": null,
1386
- "grid_auto_flow": null,
1387
- "grid_auto_rows": null,
1388
- "grid_column": null,
1389
- "grid_gap": null,
1390
- "grid_row": null,
1391
- "grid_template_areas": null,
1392
- "grid_template_columns": null,
1393
- "grid_template_rows": null,
1394
- "height": null,
1395
- "justify_content": null,
1396
- "justify_items": null,
1397
- "left": null,
1398
- "margin": null,
1399
- "max_height": null,
1400
- "max_width": null,
1401
- "min_height": null,
1402
- "min_width": null,
1403
- "object_fit": null,
1404
- "object_position": null,
1405
- "order": null,
1406
- "overflow": null,
1407
- "padding": null,
1408
- "right": null,
1409
- "top": null,
1410
- "visibility": null,
1411
- "width": null
1412
- }
1413
- },
1414
- "c30057e29da04e6bb16cbbf1038338b9": {
1415
- "model_module": "@jupyter-widgets/controls",
1416
- "model_module_version": "2.0.0",
1417
- "model_name": "HTMLModel",
1418
- "state": {
1419
- "_dom_classes": [],
1420
- "_model_module": "@jupyter-widgets/controls",
1421
- "_model_module_version": "2.0.0",
1422
- "_model_name": "HTMLModel",
1423
- "_view_count": null,
1424
- "_view_module": "@jupyter-widgets/controls",
1425
- "_view_module_version": "2.0.0",
1426
- "_view_name": "HTMLView",
1427
- "description": "",
1428
- "description_allow_html": false,
1429
- "layout": "IPY_MODEL_52447d1bde014cb2b04eb51c7f7288b7",
1430
- "placeholder": "​",
1431
- "style": "IPY_MODEL_f2f35bcfdb8c44cda4afab0f92a73de8",
1432
- "tabbable": null,
1433
- "tooltip": null,
1434
- "value": "Loading PDFs: 100%"
1435
- }
1436
- },
1437
- "c46851eb620b40588a40695a0041cf9d": {
1438
- "model_module": "@jupyter-widgets/base",
1439
- "model_module_version": "2.0.0",
1440
- "model_name": "LayoutModel",
1441
- "state": {
1442
- "_model_module": "@jupyter-widgets/base",
1443
- "_model_module_version": "2.0.0",
1444
- "_model_name": "LayoutModel",
1445
- "_view_count": null,
1446
- "_view_module": "@jupyter-widgets/base",
1447
- "_view_module_version": "2.0.0",
1448
- "_view_name": "LayoutView",
1449
- "align_content": null,
1450
- "align_items": null,
1451
- "align_self": null,
1452
- "border_bottom": null,
1453
- "border_left": null,
1454
- "border_right": null,
1455
- "border_top": null,
1456
- "bottom": null,
1457
- "display": null,
1458
- "flex": null,
1459
- "flex_flow": null,
1460
- "grid_area": null,
1461
- "grid_auto_columns": null,
1462
- "grid_auto_flow": null,
1463
- "grid_auto_rows": null,
1464
- "grid_column": null,
1465
- "grid_gap": null,
1466
- "grid_row": null,
1467
- "grid_template_areas": null,
1468
- "grid_template_columns": null,
1469
- "grid_template_rows": null,
1470
- "height": null,
1471
- "justify_content": null,
1472
- "justify_items": null,
1473
- "left": null,
1474
- "margin": null,
1475
- "max_height": null,
1476
- "max_width": null,
1477
- "min_height": null,
1478
- "min_width": null,
1479
- "object_fit": null,
1480
- "object_position": null,
1481
- "order": null,
1482
- "overflow": null,
1483
- "padding": null,
1484
- "right": null,
1485
- "top": null,
1486
- "visibility": null,
1487
- "width": null
1488
- }
1489
- },
1490
- "c585a351229e48ba83f972610c1548bc": {
1491
- "model_module": "@jupyter-widgets/controls",
1492
- "model_module_version": "2.0.0",
1493
- "model_name": "FloatProgressModel",
1494
- "state": {
1495
- "_dom_classes": [],
1496
- "_model_module": "@jupyter-widgets/controls",
1497
- "_model_module_version": "2.0.0",
1498
- "_model_name": "FloatProgressModel",
1499
- "_view_count": null,
1500
- "_view_module": "@jupyter-widgets/controls",
1501
- "_view_module_version": "2.0.0",
1502
- "_view_name": "ProgressView",
1503
- "bar_style": "success",
1504
- "description": "",
1505
- "description_allow_html": false,
1506
- "layout": "IPY_MODEL_654a2b80c08b40beba55515d525a59af",
1507
- "max": 2.0,
1508
- "min": 0.0,
1509
- "orientation": "horizontal",
1510
- "style": "IPY_MODEL_f0bd73ebff9942168e0f68823c099dd1",
1511
- "tabbable": null,
1512
- "tooltip": null,
1513
- "value": 2.0
1514
- }
1515
- },
1516
- "c6f89de8432b4587bec4e3e2cca32cbc": {
1517
- "model_module": "@jupyter-widgets/controls",
1518
- "model_module_version": "2.0.0",
1519
- "model_name": "ProgressStyleModel",
1520
- "state": {
1521
- "_model_module": "@jupyter-widgets/controls",
1522
- "_model_module_version": "2.0.0",
1523
- "_model_name": "ProgressStyleModel",
1524
- "_view_count": null,
1525
- "_view_module": "@jupyter-widgets/base",
1526
- "_view_module_version": "2.0.0",
1527
- "_view_name": "StyleView",
1528
- "bar_color": null,
1529
- "description_width": ""
1530
- }
1531
- },
1532
- "ccb14891737b488ebfc6ae39fb14a6a1": {
1533
- "model_module": "@jupyter-widgets/controls",
1534
- "model_module_version": "2.0.0",
1535
- "model_name": "FloatProgressModel",
1536
- "state": {
1537
- "_dom_classes": [],
1538
- "_model_module": "@jupyter-widgets/controls",
1539
- "_model_module_version": "2.0.0",
1540
- "_model_name": "FloatProgressModel",
1541
- "_view_count": null,
1542
- "_view_module": "@jupyter-widgets/controls",
1543
- "_view_module_version": "2.0.0",
1544
- "_view_name": "ProgressView",
1545
- "bar_style": "success",
1546
- "description": "",
1547
- "description_allow_html": false,
1548
- "layout": "IPY_MODEL_5d6e9dd429f14ee2b5de4b741602f944",
1549
- "max": 1.0,
1550
- "min": 0.0,
1551
- "orientation": "horizontal",
1552
- "style": "IPY_MODEL_0a8d6584c2c745b19fbbe35ad99312a6",
1553
- "tabbable": null,
1554
- "tooltip": null,
1555
- "value": 1.0
1556
- }
1557
- },
1558
- "d20824fb12014cb8addd60a28c0356b7": {
1559
- "model_module": "@jupyter-widgets/base",
1560
- "model_module_version": "2.0.0",
1561
- "model_name": "LayoutModel",
1562
- "state": {
1563
- "_model_module": "@jupyter-widgets/base",
1564
- "_model_module_version": "2.0.0",
1565
- "_model_name": "LayoutModel",
1566
- "_view_count": null,
1567
- "_view_module": "@jupyter-widgets/base",
1568
- "_view_module_version": "2.0.0",
1569
- "_view_name": "LayoutView",
1570
- "align_content": null,
1571
- "align_items": null,
1572
- "align_self": null,
1573
- "border_bottom": null,
1574
- "border_left": null,
1575
- "border_right": null,
1576
- "border_top": null,
1577
- "bottom": null,
1578
- "display": null,
1579
- "flex": null,
1580
- "flex_flow": null,
1581
- "grid_area": null,
1582
- "grid_auto_columns": null,
1583
- "grid_auto_flow": null,
1584
- "grid_auto_rows": null,
1585
- "grid_column": null,
1586
- "grid_gap": null,
1587
- "grid_row": null,
1588
- "grid_template_areas": null,
1589
- "grid_template_columns": null,
1590
- "grid_template_rows": null,
1591
- "height": null,
1592
- "justify_content": null,
1593
- "justify_items": null,
1594
- "left": null,
1595
- "margin": null,
1596
- "max_height": null,
1597
- "max_width": null,
1598
- "min_height": null,
1599
- "min_width": null,
1600
- "object_fit": null,
1601
- "object_position": null,
1602
- "order": null,
1603
- "overflow": null,
1604
- "padding": null,
1605
- "right": null,
1606
- "top": null,
1607
- "visibility": null,
1608
- "width": null
1609
- }
1610
- },
1611
- "d35dfa8c17d64d4996e4ec6beb51d9cb": {
1612
- "model_module": "@jupyter-widgets/controls",
1613
- "model_module_version": "2.0.0",
1614
- "model_name": "HTMLStyleModel",
1615
- "state": {
1616
- "_model_module": "@jupyter-widgets/controls",
1617
- "_model_module_version": "2.0.0",
1618
- "_model_name": "HTMLStyleModel",
1619
- "_view_count": null,
1620
- "_view_module": "@jupyter-widgets/base",
1621
- "_view_module_version": "2.0.0",
1622
- "_view_name": "StyleView",
1623
- "background": null,
1624
- "description_width": "",
1625
- "font_size": null,
1626
- "text_color": null
1627
- }
1628
- },
1629
- "d7fd32b9ea9f487099b048e5b410f76f": {
1630
- "model_module": "@jupyter-widgets/controls",
1631
- "model_module_version": "2.0.0",
1632
- "model_name": "HBoxModel",
1633
- "state": {
1634
- "_dom_classes": [],
1635
- "_model_module": "@jupyter-widgets/controls",
1636
- "_model_module_version": "2.0.0",
1637
- "_model_name": "HBoxModel",
1638
- "_view_count": null,
1639
- "_view_module": "@jupyter-widgets/controls",
1640
- "_view_module_version": "2.0.0",
1641
- "_view_name": "HBoxView",
1642
- "box_style": "",
1643
- "children": [
1644
- "IPY_MODEL_c30057e29da04e6bb16cbbf1038338b9",
1645
- "IPY_MODEL_c585a351229e48ba83f972610c1548bc",
1646
- "IPY_MODEL_2d844fb4766849dab371d9e05be77694"
1647
- ],
1648
- "layout": "IPY_MODEL_454ce0223f794fc580050bf4e13cc3a7",
1649
- "tabbable": null,
1650
- "tooltip": null
1651
- }
1652
- },
1653
- "f0bd73ebff9942168e0f68823c099dd1": {
1654
- "model_module": "@jupyter-widgets/controls",
1655
- "model_module_version": "2.0.0",
1656
- "model_name": "ProgressStyleModel",
1657
- "state": {
1658
- "_model_module": "@jupyter-widgets/controls",
1659
- "_model_module_version": "2.0.0",
1660
- "_model_name": "ProgressStyleModel",
1661
- "_view_count": null,
1662
- "_view_module": "@jupyter-widgets/base",
1663
- "_view_module_version": "2.0.0",
1664
- "_view_name": "StyleView",
1665
- "bar_color": null,
1666
- "description_width": ""
1667
- }
1668
- },
1669
- "f1c19e7db0684964b147abb8c00c0bbe": {
1670
- "model_module": "@jupyter-widgets/controls",
1671
- "model_module_version": "2.0.0",
1672
- "model_name": "HTMLModel",
1673
- "state": {
1674
- "_dom_classes": [],
1675
- "_model_module": "@jupyter-widgets/controls",
1676
- "_model_module_version": "2.0.0",
1677
- "_model_name": "HTMLModel",
1678
- "_view_count": null,
1679
- "_view_module": "@jupyter-widgets/controls",
1680
- "_view_module_version": "2.0.0",
1681
- "_view_name": "HTMLView",
1682
- "description": "",
1683
- "description_allow_html": false,
1684
- "layout": "IPY_MODEL_a72a536d760a4ccd95f857d2db3cc8be",
1685
- "placeholder": "​",
1686
- "style": "IPY_MODEL_045481258c7a4764b11034c22edd1940",
1687
- "tabbable": null,
1688
- "tooltip": null,
1689
- "value": "Batches: 100%"
1690
- }
1691
- },
1692
- "f2f35bcfdb8c44cda4afab0f92a73de8": {
1693
- "model_module": "@jupyter-widgets/controls",
1694
- "model_module_version": "2.0.0",
1695
- "model_name": "HTMLStyleModel",
1696
- "state": {
1697
- "_model_module": "@jupyter-widgets/controls",
1698
- "_model_module_version": "2.0.0",
1699
- "_model_name": "HTMLStyleModel",
1700
- "_view_count": null,
1701
- "_view_module": "@jupyter-widgets/base",
1702
- "_view_module_version": "2.0.0",
1703
- "_view_name": "StyleView",
1704
- "background": null,
1705
- "description_width": "",
1706
- "font_size": null,
1707
- "text_color": null
1708
- }
1709
- }
1710
- },
1711
- "version_major": 2,
1712
- "version_minor": 0
1713
- }
1714
- }
1715
- },
1716
- "nbformat": 4,
1717
- "nbformat_minor": 5
1718
- }