natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
pdfs/.gitkeep DELETED
File without changes
pdfs/01-practice.pdf DELETED
@@ -1,543 +0,0 @@
1
- %PDF-1.7
2
- %µ¶
3
-
4
- 1 0 obj
5
- <</Type/Pages/Count 1/Kids[2 0 R]>>
6
- endobj
7
-
8
- 2 0 obj
9
- <</Type/Page/MediaBox[0 0 612 792]/Resources 3 0 R/Parent 1 0 R/Contents 6 0 R>>
10
- endobj
11
-
12
- 3 0 obj
13
- <</Font<</F0<</Type/Font/BaseFont/Helvetica/Subtype/Type1>>/F1<</Type/Font/BaseFont/Helvetica-Bold/Subtype/Type1>>>>>>
14
- endobj
15
-
16
- 4 0 obj
17
- <</Length 4482>>
18
- stream
19
- q
20
- 1 0 0 1 50 700 cm
21
- 1 g
22
- 325 65 180 -35 re
23
- S
24
- Q
25
- q
26
- 1 0 0 1 50 700 cm
27
- BT
28
- 335 50 Td
29
- /F0 8 Tf
30
- (Jungle Health and Safety Inspection Service) Tj
31
- 0 -10 Td
32
- 1 0 0 rg
33
- (INS-UP70N51NCL41R) Tj
34
- ET
35
- Q
36
- q
37
- 1 0 0 1 50 700 cm
38
- BT
39
- /F1 10 Tf
40
- (Site: ) Tj
41
-
42
- /F0 10 Tf
43
- (Durham's Meatpacking ) Tj
44
-
45
- 0.5 g
46
- (Chicago, Ill.) Tj
47
- 0 g
48
-
49
- 0 -20 Td
50
-
51
- /F1 10 Tf
52
- (Date: ) Tj
53
-
54
- /F0 10 Tf
55
- (February 3, 1905) Tj
56
-
57
- 0 -20 Td
58
-
59
- /F1 10 Tf
60
- (Violation Count: ) Tj
61
-
62
- /F0 10 Tf
63
- (7) Tj
64
-
65
- 0 -20 Td
66
-
67
- /F1 10 Tf
68
- (Summary: ) Tj
69
-
70
- % 0 -20 Td
71
-
72
- /F0 10 Tf
73
- (Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.) Tj
74
- 0 -16 Td
75
- (These people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary ) Tj
76
- 0 -16 Td
77
- (visitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in ) Tj
78
- 0 -16 Td
79
- (some of which there were open vats near the level of the floor, their peculiar trouble was that they fell) Tj
80
- 0 -16 Td
81
- (into the vats; and when they were fished out, there was never enough of them left to be worth ) Tj
82
- 0 -16 Td
83
- (exhibiting - sometimes they would be overlooked for days, till all but the bones of them had gone out) Tj
84
- 0 -16 Td
85
- (to the world as Durham's Pure Leaf Lard!) Tj
86
- ET
87
- Q
88
-
89
- q
90
- 1 0 0 1 50 440 cm
91
- 0 G
92
- 2 w
93
- 0 0 m
94
- 500 0 l
95
- S
96
-
97
- BT
98
- 0 -30 Td
99
- /F1 12 Tf
100
- (Violations) Tj
101
- ET
102
- Q
103
-
104
-
105
- q
106
- 1 0 0 1 50 400 cm
107
- 0.5 G
108
- 0 0 m
109
- 500 0 l
110
- S
111
- 0 -20 m
112
- 500 -20 l
113
- S
114
- 0 -40 m
115
- 500 -40 l
116
- S
117
- 0 -60 m
118
- 500 -60 l
119
- S
120
- 0 -80 m
121
- 500 -80 l
122
- S
123
- 0 -100 m
124
- 500 -100 l
125
- S
126
- 0 -120 m
127
- 500 -120 l
128
- S
129
- 0 -140 m
130
- 500 -140 l
131
- S
132
- 0 -160 m
133
- 500 -160 l
134
- S
135
-
136
- 0 0 m
137
- 0 -160 l
138
- S
139
-
140
- 50 0 m
141
- 50 -160 l
142
- S
143
-
144
- 400 0 m
145
- 400 -160 l
146
- S
147
-
148
- 450 0 m
149
- 450 -160 l
150
- S
151
-
152
- 500 0 m
153
- 500 -160 l
154
- S
155
- Q
156
-
157
- q
158
- 1 0 0 1 55 386 cm
159
- BT
160
- q
161
- /F1 10 Tf
162
- (Statute) Tj
163
- 50 0 Td
164
- (Description) Tj
165
- 350 0 Td
166
- (Level) Tj
167
- 50 0 Td
168
- (Repeat?) Tj
169
- Q
170
-
171
- /F0 10 Tf
172
- q
173
- 0 -20 Td
174
- (4.12.7) Tj
175
- 50 0 Td
176
- (Unsanitary Working Conditions.) Tj
177
- 350 0 Td
178
- (Critical) Tj
179
- Q
180
- q
181
- 0 -40 Td
182
- (5.8.3) Tj
183
- 50 0 Td
184
- (Inadequate Protective Equipment.) Tj
185
- 350 0 Td
186
- (Serious) Tj
187
- Q
188
- q
189
- 0 -60 Td
190
- (6.3.9) Tj
191
- 50 0 Td
192
- (Ineffective Injury Prevention.) Tj
193
- 350 0 Td
194
- (Serious) Tj
195
- Q
196
- q
197
- 0 -80 Td
198
- (7.1.5) Tj
199
- 50 0 Td
200
- (Failure to Properly Store Hazardous Materials.) Tj
201
- 350 0 Td
202
- (Critical) Tj
203
- Q
204
- q
205
- 0 -100 Td
206
- (8.9.2) Tj
207
- 50 0 Td
208
- (Lack of Adequate Fire Safety Measures.) Tj
209
- 350 0 Td
210
- (Serious) Tj
211
- Q
212
- q
213
- 0 -120 Td
214
- (9.6.4) Tj
215
- 50 0 Td
216
- (Inadequate Ventilation Systems.) Tj
217
- 350 0 Td
218
- (Serious) Tj
219
- Q
220
- q
221
- 0 -140 Td
222
- (10.2.7) Tj
223
- 50 0 Td
224
- (Insufficient Employee Training for Safe Work Practices.) Tj
225
- 350 0 Td
226
- (Serious) Tj
227
- Q
228
-
229
- ET
230
-
231
- q
232
- 0 G
233
- 0.5 w
234
-
235
- 465 -20 8 8 re
236
- S
237
- 465 -20 m
238
- 473 -12 l
239
- s
240
- 465 -12 m
241
- 473 -20 l
242
- s
243
-
244
- 465 -40 8 8 re
245
- S
246
- 465 -40 m
247
- 473 -32 l
248
- s
249
- 465 -32 m
250
- 473 -40 l
251
- s
252
-
253
- 465 -60 8 8 re
254
- S
255
-
256
- 465 -80 8 8 re
257
- S
258
-
259
- 465 -100 8 8 re
260
- S
261
-
262
- 465 -120 8 8 re
263
- S
264
- 465 -120 m
265
- 473 -112 l
266
- s
267
- 465 -112 m
268
- 473 -120 l
269
- s
270
-
271
- 465 -140 8 8 re
272
- S
273
- Q
274
- Q
275
-
276
- q
277
- 1 0 0 1 230 20 cm
278
- BT
279
- /F0 8 Tf
280
- (Jungle Health and Safety Inspection Service) Tj
281
- ET
282
- Q
283
-
284
-
285
- endstream
286
- endobj
287
-
288
- 5 0 obj
289
- <</Type/Catalog/Pages 1 0 R>>
290
- endobj
291
-
292
- 6 0 obj
293
- <</Length 2774>>
294
- stream
295
- q
296
- 1 0 0 1 50 700 cm
297
- 1 g
298
- 325 65 180 -35 re
299
- S
300
- Q
301
- q
302
- 1 0 0 1 50 700 cm
303
- BT
304
- 335 50 Td
305
- /F0 8 Tf
306
- (Jungle Health and Safety Inspection Service) Tj
307
- 0 -10 Td
308
- 1 0 0 rg
309
- (INS-UP70N51NCL41R) Tj
310
- ET
311
- Q
312
- q
313
- 1 0 0 1 50 700 cm
314
- BT
315
- /F1 10 Tf
316
- (Site: ) Tj
317
- /F0 10 Tf
318
- (Durham's Meatpacking ) Tj
319
- .5 g
320
- (Chicago, Ill.) Tj
321
- 0 g
322
- 0 -20 Td
323
- /F1 10 Tf
324
- (Date: ) Tj
325
- /F0 10 Tf
326
- (February 3, 1905) Tj
327
- 0 -20 Td
328
- /F1 10 Tf
329
- (Violation Count: ) Tj
330
- /F0 10 Tf
331
- (7) Tj
332
- 0 -20 Td
333
- /F1 10 Tf
334
- (Summary: ) Tj
335
- /F0 10 Tf
336
- (Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.) Tj
337
- 0 -16 Td
338
- (These people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary ) Tj
339
- 0 -16 Td
340
- (visitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in ) Tj
341
- 0 -16 Td
342
- (some of which there were open vats near the level of the floor, their peculiar trouble was that they fell) Tj
343
- 0 -16 Td
344
- (into the vats; and when they were fished out, there was never enough of them left to be worth ) Tj
345
- 0 -16 Td
346
- (exhibiting - sometimes they would be overlooked for days, till all but the bones of them had gone out) Tj
347
- 0 -16 Td
348
- (to the world as Durham's Pure Leaf Lard!) Tj
349
- ET
350
- Q
351
- q
352
- 1 0 0 1 50 440 cm
353
- 0 G
354
- 2 w
355
- 0 0 m
356
- 500 0 l
357
- S
358
- BT
359
- 0 -30 Td
360
- /F1 12 Tf
361
- (Violations) Tj
362
- ET
363
- Q
364
- q
365
- 1 0 0 1 50 400 cm
366
- .5 G
367
- 0 0 m
368
- 500 0 l
369
- S
370
- 0 -20 m
371
- 500 -20 l
372
- S
373
- 0 -40 m
374
- 500 -40 l
375
- S
376
- 0 -60 m
377
- 500 -60 l
378
- S
379
- 0 -80 m
380
- 500 -80 l
381
- S
382
- 0 -100 m
383
- 500 -100 l
384
- S
385
- 0 -120 m
386
- 500 -120 l
387
- S
388
- 0 -140 m
389
- 500 -140 l
390
- S
391
- 0 -160 m
392
- 500 -160 l
393
- S
394
- 0 0 m
395
- 0 -160 l
396
- S
397
- 50 0 m
398
- 50 -160 l
399
- S
400
- 400 0 m
401
- 400 -160 l
402
- S
403
- 450 0 m
404
- 450 -160 l
405
- S
406
- 500 0 m
407
- 500 -160 l
408
- S
409
- Q
410
- q
411
- 1 0 0 1 55 386 cm
412
- BT
413
- q
414
- /F1 10 Tf
415
- (Statute) Tj
416
- 50 0 Td
417
- (Description) Tj
418
- 350 0 Td
419
- (Level) Tj
420
- 50 0 Td
421
- (Repeat?) Tj
422
- Q
423
- /F0 10 Tf
424
- q
425
- 0 -20 Td
426
- (4.12.7) Tj
427
- 50 0 Td
428
- (Unsanitary Working Conditions.) Tj
429
- 350 0 Td
430
- (Critical) Tj
431
- Q
432
- q
433
- 0 -40 Td
434
- (5.8.3) Tj
435
- 50 0 Td
436
- (Inadequate Protective Equipment.) Tj
437
- 350 0 Td
438
- (Serious) Tj
439
- Q
440
- q
441
- 0 -60 Td
442
- (6.3.9) Tj
443
- 50 0 Td
444
- (Ineffective Injury Prevention.) Tj
445
- 350 0 Td
446
- (Serious) Tj
447
- Q
448
- q
449
- 0 -80 Td
450
- (7.1.5) Tj
451
- 50 0 Td
452
- (Failure to Properly Store Hazardous Materials.) Tj
453
- 350 0 Td
454
- (Critical) Tj
455
- Q
456
- q
457
- 0 -100 Td
458
- (8.9.2) Tj
459
- 50 0 Td
460
- (Lack of Adequate Fire Safety Measures.) Tj
461
- 350 0 Td
462
- (Serious) Tj
463
- Q
464
- q
465
- 0 -120 Td
466
- (9.6.4) Tj
467
- 50 0 Td
468
- (Inadequate Ventilation Systems.) Tj
469
- 350 0 Td
470
- (Serious) Tj
471
- Q
472
- q
473
- 0 -140 Td
474
- (10.2.7) Tj
475
- 50 0 Td
476
- (Insufficient Employee Training for Safe Work Practices.) Tj
477
- 350 0 Td
478
- (Serious) Tj
479
- Q
480
- ET
481
- q
482
- 0 G
483
- .5 w
484
- 465 -20 8 8 re
485
- S
486
- 465 -20 m
487
- 473 -12 l
488
- s
489
- 465 -12 m
490
- 473 -20 l
491
- s
492
- 465 -40 8 8 re
493
- S
494
- 465 -40 m
495
- 473 -32 l
496
- s
497
- 465 -32 m
498
- 473 -40 l
499
- s
500
- 465 -60 8 8 re
501
- S
502
- 465 -80 8 8 re
503
- S
504
- 465 -100 8 8 re
505
- S
506
- 465 -120 8 8 re
507
- S
508
- 465 -120 m
509
- 473 -112 l
510
- s
511
- 465 -112 m
512
- 473 -120 l
513
- s
514
- 465 -140 8 8 re
515
- S
516
- Q
517
- Q
518
- q
519
- 1 0 0 1 230 20 cm
520
- BT
521
- /F0 8 Tf
522
- (Jungle Health and Safety Inspection Service) Tj
523
- ET
524
- Q
525
-
526
- endstream
527
- endobj
528
-
529
- xref
530
- 0 7
531
- 0000000000 65536 f
532
- 0000000016 00000 n
533
- 0000000068 00000 n
534
- 0000000165 00000 n
535
- 0000000300 00000 n
536
- 0000004833 00000 n
537
- 0000004879 00000 n
538
-
539
- trailer
540
- <</Size 7/Root 5 0 R>>
541
- startxref
542
- 7704
543
- %%EOF
pdfs/0500000US42001.pdf DELETED
Binary file
pdfs/0500000US42007.pdf DELETED
Binary file
pdfs/2014 Statistics.pdf DELETED
Binary file
pdfs/2019 Statistics.pdf DELETED
Binary file
Binary file
pdfs/needs-ocr.pdf DELETED
Binary file