natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +125 -97
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +907 -513
  81. natural_pdf/core/pdf.py +385 -287
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +708 -508
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
pdfs/.gitkeep ADDED
File without changes
pdfs/01-practice.pdf ADDED
@@ -0,0 +1,543 @@
1
+ %PDF-1.7
2
+ %µ¶
3
+
4
+ 1 0 obj
5
+ <</Type/Pages/Count 1/Kids[2 0 R]>>
6
+ endobj
7
+
8
+ 2 0 obj
9
+ <</Type/Page/MediaBox[0 0 612 792]/Resources 3 0 R/Parent 1 0 R/Contents 6 0 R>>
10
+ endobj
11
+
12
+ 3 0 obj
13
+ <</Font<</F0<</Type/Font/BaseFont/Helvetica/Subtype/Type1>>/F1<</Type/Font/BaseFont/Helvetica-Bold/Subtype/Type1>>>>>>
14
+ endobj
15
+
16
+ 4 0 obj
17
+ <</Length 4482>>
18
+ stream
19
+ q
20
+ 1 0 0 1 50 700 cm
21
+ 1 g
22
+ 325 65 180 -35 re
23
+ S
24
+ Q
25
+ q
26
+ 1 0 0 1 50 700 cm
27
+ BT
28
+ 335 50 Td
29
+ /F0 8 Tf
30
+ (Jungle Health and Safety Inspection Service) Tj
31
+ 0 -10 Td
32
+ 1 0 0 rg
33
+ (INS-UP70N51NCL41R) Tj
34
+ ET
35
+ Q
36
+ q
37
+ 1 0 0 1 50 700 cm
38
+ BT
39
+ /F1 10 Tf
40
+ (Site: ) Tj
41
+
42
+ /F0 10 Tf
43
+ (Durham's Meatpacking ) Tj
44
+
45
+ 0.5 g
46
+ (Chicago, Ill.) Tj
47
+ 0 g
48
+
49
+ 0 -20 Td
50
+
51
+ /F1 10 Tf
52
+ (Date: ) Tj
53
+
54
+ /F0 10 Tf
55
+ (February 3, 1905) Tj
56
+
57
+ 0 -20 Td
58
+
59
+ /F1 10 Tf
60
+ (Violation Count: ) Tj
61
+
62
+ /F0 10 Tf
63
+ (7) Tj
64
+
65
+ 0 -20 Td
66
+
67
+ /F1 10 Tf
68
+ (Summary: ) Tj
69
+
70
+ % 0 -20 Td
71
+
72
+ /F0 10 Tf
73
+ (Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.) Tj
74
+ 0 -16 Td
75
+ (These people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary ) Tj
76
+ 0 -16 Td
77
+ (visitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in ) Tj
78
+ 0 -16 Td
79
+ (some of which there were open vats near the level of the floor, their peculiar trouble was that they fell) Tj
80
+ 0 -16 Td
81
+ (into the vats; and when they were fished out, there was never enough of them left to be worth ) Tj
82
+ 0 -16 Td
83
+ (exhibiting - sometimes they would be overlooked for days, till all but the bones of them had gone out) Tj
84
+ 0 -16 Td
85
+ (to the world as Durham's Pure Leaf Lard!) Tj
86
+ ET
87
+ Q
88
+
89
+ q
90
+ 1 0 0 1 50 440 cm
91
+ 0 G
92
+ 2 w
93
+ 0 0 m
94
+ 500 0 l
95
+ S
96
+
97
+ BT
98
+ 0 -30 Td
99
+ /F1 12 Tf
100
+ (Violations) Tj
101
+ ET
102
+ Q
103
+
104
+
105
+ q
106
+ 1 0 0 1 50 400 cm
107
+ 0.5 G
108
+ 0 0 m
109
+ 500 0 l
110
+ S
111
+ 0 -20 m
112
+ 500 -20 l
113
+ S
114
+ 0 -40 m
115
+ 500 -40 l
116
+ S
117
+ 0 -60 m
118
+ 500 -60 l
119
+ S
120
+ 0 -80 m
121
+ 500 -80 l
122
+ S
123
+ 0 -100 m
124
+ 500 -100 l
125
+ S
126
+ 0 -120 m
127
+ 500 -120 l
128
+ S
129
+ 0 -140 m
130
+ 500 -140 l
131
+ S
132
+ 0 -160 m
133
+ 500 -160 l
134
+ S
135
+
136
+ 0 0 m
137
+ 0 -160 l
138
+ S
139
+
140
+ 50 0 m
141
+ 50 -160 l
142
+ S
143
+
144
+ 400 0 m
145
+ 400 -160 l
146
+ S
147
+
148
+ 450 0 m
149
+ 450 -160 l
150
+ S
151
+
152
+ 500 0 m
153
+ 500 -160 l
154
+ S
155
+ Q
156
+
157
+ q
158
+ 1 0 0 1 55 386 cm
159
+ BT
160
+ q
161
+ /F1 10 Tf
162
+ (Statute) Tj
163
+ 50 0 Td
164
+ (Description) Tj
165
+ 350 0 Td
166
+ (Level) Tj
167
+ 50 0 Td
168
+ (Repeat?) Tj
169
+ Q
170
+
171
+ /F0 10 Tf
172
+ q
173
+ 0 -20 Td
174
+ (4.12.7) Tj
175
+ 50 0 Td
176
+ (Unsanitary Working Conditions.) Tj
177
+ 350 0 Td
178
+ (Critical) Tj
179
+ Q
180
+ q
181
+ 0 -40 Td
182
+ (5.8.3) Tj
183
+ 50 0 Td
184
+ (Inadequate Protective Equipment.) Tj
185
+ 350 0 Td
186
+ (Serious) Tj
187
+ Q
188
+ q
189
+ 0 -60 Td
190
+ (6.3.9) Tj
191
+ 50 0 Td
192
+ (Ineffective Injury Prevention.) Tj
193
+ 350 0 Td
194
+ (Serious) Tj
195
+ Q
196
+ q
197
+ 0 -80 Td
198
+ (7.1.5) Tj
199
+ 50 0 Td
200
+ (Failure to Properly Store Hazardous Materials.) Tj
201
+ 350 0 Td
202
+ (Critical) Tj
203
+ Q
204
+ q
205
+ 0 -100 Td
206
+ (8.9.2) Tj
207
+ 50 0 Td
208
+ (Lack of Adequate Fire Safety Measures.) Tj
209
+ 350 0 Td
210
+ (Serious) Tj
211
+ Q
212
+ q
213
+ 0 -120 Td
214
+ (9.6.4) Tj
215
+ 50 0 Td
216
+ (Inadequate Ventilation Systems.) Tj
217
+ 350 0 Td
218
+ (Serious) Tj
219
+ Q
220
+ q
221
+ 0 -140 Td
222
+ (10.2.7) Tj
223
+ 50 0 Td
224
+ (Insufficient Employee Training for Safe Work Practices.) Tj
225
+ 350 0 Td
226
+ (Serious) Tj
227
+ Q
228
+
229
+ ET
230
+
231
+ q
232
+ 0 G
233
+ 0.5 w
234
+
235
+ 465 -20 8 8 re
236
+ S
237
+ 465 -20 m
238
+ 473 -12 l
239
+ s
240
+ 465 -12 m
241
+ 473 -20 l
242
+ s
243
+
244
+ 465 -40 8 8 re
245
+ S
246
+ 465 -40 m
247
+ 473 -32 l
248
+ s
249
+ 465 -32 m
250
+ 473 -40 l
251
+ s
252
+
253
+ 465 -60 8 8 re
254
+ S
255
+
256
+ 465 -80 8 8 re
257
+ S
258
+
259
+ 465 -100 8 8 re
260
+ S
261
+
262
+ 465 -120 8 8 re
263
+ S
264
+ 465 -120 m
265
+ 473 -112 l
266
+ s
267
+ 465 -112 m
268
+ 473 -120 l
269
+ s
270
+
271
+ 465 -140 8 8 re
272
+ S
273
+ Q
274
+ Q
275
+
276
+ q
277
+ 1 0 0 1 230 20 cm
278
+ BT
279
+ /F0 8 Tf
280
+ (Jungle Health and Safety Inspection Service) Tj
281
+ ET
282
+ Q
283
+
284
+
285
+ endstream
286
+ endobj
287
+
288
+ 5 0 obj
289
+ <</Type/Catalog/Pages 1 0 R>>
290
+ endobj
291
+
292
+ 6 0 obj
293
+ <</Length 2774>>
294
+ stream
295
+ q
296
+ 1 0 0 1 50 700 cm
297
+ 1 g
298
+ 325 65 180 -35 re
299
+ S
300
+ Q
301
+ q
302
+ 1 0 0 1 50 700 cm
303
+ BT
304
+ 335 50 Td
305
+ /F0 8 Tf
306
+ (Jungle Health and Safety Inspection Service) Tj
307
+ 0 -10 Td
308
+ 1 0 0 rg
309
+ (INS-UP70N51NCL41R) Tj
310
+ ET
311
+ Q
312
+ q
313
+ 1 0 0 1 50 700 cm
314
+ BT
315
+ /F1 10 Tf
316
+ (Site: ) Tj
317
+ /F0 10 Tf
318
+ (Durham's Meatpacking ) Tj
319
+ .5 g
320
+ (Chicago, Ill.) Tj
321
+ 0 g
322
+ 0 -20 Td
323
+ /F1 10 Tf
324
+ (Date: ) Tj
325
+ /F0 10 Tf
326
+ (February 3, 1905) Tj
327
+ 0 -20 Td
328
+ /F1 10 Tf
329
+ (Violation Count: ) Tj
330
+ /F0 10 Tf
331
+ (7) Tj
332
+ 0 -20 Td
333
+ /F1 10 Tf
334
+ (Summary: ) Tj
335
+ /F0 10 Tf
336
+ (Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.) Tj
337
+ 0 -16 Td
338
+ (These people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary ) Tj
339
+ 0 -16 Td
340
+ (visitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in ) Tj
341
+ 0 -16 Td
342
+ (some of which there were open vats near the level of the floor, their peculiar trouble was that they fell) Tj
343
+ 0 -16 Td
344
+ (into the vats; and when they were fished out, there was never enough of them left to be worth ) Tj
345
+ 0 -16 Td
346
+ (exhibiting - sometimes they would be overlooked for days, till all but the bones of them had gone out) Tj
347
+ 0 -16 Td
348
+ (to the world as Durham's Pure Leaf Lard!) Tj
349
+ ET
350
+ Q
351
+ q
352
+ 1 0 0 1 50 440 cm
353
+ 0 G
354
+ 2 w
355
+ 0 0 m
356
+ 500 0 l
357
+ S
358
+ BT
359
+ 0 -30 Td
360
+ /F1 12 Tf
361
+ (Violations) Tj
362
+ ET
363
+ Q
364
+ q
365
+ 1 0 0 1 50 400 cm
366
+ .5 G
367
+ 0 0 m
368
+ 500 0 l
369
+ S
370
+ 0 -20 m
371
+ 500 -20 l
372
+ S
373
+ 0 -40 m
374
+ 500 -40 l
375
+ S
376
+ 0 -60 m
377
+ 500 -60 l
378
+ S
379
+ 0 -80 m
380
+ 500 -80 l
381
+ S
382
+ 0 -100 m
383
+ 500 -100 l
384
+ S
385
+ 0 -120 m
386
+ 500 -120 l
387
+ S
388
+ 0 -140 m
389
+ 500 -140 l
390
+ S
391
+ 0 -160 m
392
+ 500 -160 l
393
+ S
394
+ 0 0 m
395
+ 0 -160 l
396
+ S
397
+ 50 0 m
398
+ 50 -160 l
399
+ S
400
+ 400 0 m
401
+ 400 -160 l
402
+ S
403
+ 450 0 m
404
+ 450 -160 l
405
+ S
406
+ 500 0 m
407
+ 500 -160 l
408
+ S
409
+ Q
410
+ q
411
+ 1 0 0 1 55 386 cm
412
+ BT
413
+ q
414
+ /F1 10 Tf
415
+ (Statute) Tj
416
+ 50 0 Td
417
+ (Description) Tj
418
+ 350 0 Td
419
+ (Level) Tj
420
+ 50 0 Td
421
+ (Repeat?) Tj
422
+ Q
423
+ /F0 10 Tf
424
+ q
425
+ 0 -20 Td
426
+ (4.12.7) Tj
427
+ 50 0 Td
428
+ (Unsanitary Working Conditions.) Tj
429
+ 350 0 Td
430
+ (Critical) Tj
431
+ Q
432
+ q
433
+ 0 -40 Td
434
+ (5.8.3) Tj
435
+ 50 0 Td
436
+ (Inadequate Protective Equipment.) Tj
437
+ 350 0 Td
438
+ (Serious) Tj
439
+ Q
440
+ q
441
+ 0 -60 Td
442
+ (6.3.9) Tj
443
+ 50 0 Td
444
+ (Ineffective Injury Prevention.) Tj
445
+ 350 0 Td
446
+ (Serious) Tj
447
+ Q
448
+ q
449
+ 0 -80 Td
450
+ (7.1.5) Tj
451
+ 50 0 Td
452
+ (Failure to Properly Store Hazardous Materials.) Tj
453
+ 350 0 Td
454
+ (Critical) Tj
455
+ Q
456
+ q
457
+ 0 -100 Td
458
+ (8.9.2) Tj
459
+ 50 0 Td
460
+ (Lack of Adequate Fire Safety Measures.) Tj
461
+ 350 0 Td
462
+ (Serious) Tj
463
+ Q
464
+ q
465
+ 0 -120 Td
466
+ (9.6.4) Tj
467
+ 50 0 Td
468
+ (Inadequate Ventilation Systems.) Tj
469
+ 350 0 Td
470
+ (Serious) Tj
471
+ Q
472
+ q
473
+ 0 -140 Td
474
+ (10.2.7) Tj
475
+ 50 0 Td
476
+ (Insufficient Employee Training for Safe Work Practices.) Tj
477
+ 350 0 Td
478
+ (Serious) Tj
479
+ Q
480
+ ET
481
+ q
482
+ 0 G
483
+ .5 w
484
+ 465 -20 8 8 re
485
+ S
486
+ 465 -20 m
487
+ 473 -12 l
488
+ s
489
+ 465 -12 m
490
+ 473 -20 l
491
+ s
492
+ 465 -40 8 8 re
493
+ S
494
+ 465 -40 m
495
+ 473 -32 l
496
+ s
497
+ 465 -32 m
498
+ 473 -40 l
499
+ s
500
+ 465 -60 8 8 re
501
+ S
502
+ 465 -80 8 8 re
503
+ S
504
+ 465 -100 8 8 re
505
+ S
506
+ 465 -120 8 8 re
507
+ S
508
+ 465 -120 m
509
+ 473 -112 l
510
+ s
511
+ 465 -112 m
512
+ 473 -120 l
513
+ s
514
+ 465 -140 8 8 re
515
+ S
516
+ Q
517
+ Q
518
+ q
519
+ 1 0 0 1 230 20 cm
520
+ BT
521
+ /F0 8 Tf
522
+ (Jungle Health and Safety Inspection Service) Tj
523
+ ET
524
+ Q
525
+
526
+ endstream
527
+ endobj
528
+
529
+ xref
530
+ 0 7
531
+ 0000000000 65536 f
532
+ 0000000016 00000 n
533
+ 0000000068 00000 n
534
+ 0000000165 00000 n
535
+ 0000000300 00000 n
536
+ 0000004833 00000 n
537
+ 0000004879 00000 n
538
+
539
+ trailer
540
+ <</Size 7/Root 5 0 R>>
541
+ startxref
542
+ 7704
543
+ %%EOF
Binary file
Binary file
Binary file
Binary file
pdfs/needs-ocr.pdf ADDED
Binary file
tests/test_loading.py ADDED
@@ -0,0 +1,50 @@
1
+ import os
2
+
3
+ import pytest
4
+
5
+ from natural_pdf import PDF
6
+
7
+ # URL for the test PDF used in the tutorial
8
+ TEST_PDF_URL = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf"
9
+
10
+
11
+ def test_pdf_loading_from_url():
12
+ """Tests if a PDF can be loaded successfully from a URL."""
13
+ try:
14
+ pdf = PDF(TEST_PDF_URL)
15
+ # Basic assertions after loading
16
+ assert pdf is not None
17
+ assert len(pdf.pages) > 0, "PDF should have at least one page"
18
+ assert os.path.exists(pdf.path), "PDF file should be downloaded locally"
19
+ # Check if metadata (like Title) is accessible, even if None
20
+ assert "Title" in pdf.metadata or pdf.metadata.get("Title") is None
21
+
22
+ except Exception as e:
23
+ pytest.fail(f"PDF loading from URL failed: {e}")
24
+
25
+
26
+ def test_page_text_extraction():
27
+ """Tests if text can be extracted from the first page."""
28
+ try:
29
+ pdf = PDF(TEST_PDF_URL)
30
+ assert len(pdf.pages) > 0, "PDF has no pages"
31
+ page = pdf.pages[0]
32
+ text = page.extract_text()
33
+ assert isinstance(text, str), "Extracted text should be a string"
34
+ assert len(text) > 50, "Extracted text seems too short or empty"
35
+ # Add a more specific assertion if you know some expected text
36
+ # assert "Expected sample text" in text
37
+
38
+ except Exception as e:
39
+ pytest.fail(f"Text extraction failed: {e}")
40
+
41
+
42
+ # Clean up downloaded file if necessary (optional, depends on PDF class behavior)
43
+ # You might want a fixture to handle setup/teardown of the downloaded file
44
+ # @pytest.fixture(scope="module")
45
+ # def downloaded_pdf():
46
+ # pdf = PDF(TEST_PDF_URL)
47
+ # yield pdf
48
+ # # Cleanup code here if PDF() doesn't handle it
49
+ # if os.path.exists(pdf.path):
50
+ # os.remove(pdf.path)