natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +241 -158
  13. natural_pdf/classification/mixin.py +52 -38
  14. natural_pdf/classification/results.py +71 -45
  15. natural_pdf/collections/mixins.py +85 -20
  16. natural_pdf/collections/pdf_collection.py +245 -100
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +694 -195
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +610 -134
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.10.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
pdfs/.gitkeep DELETED
File without changes
pdfs/01-practice.pdf DELETED
@@ -1,543 +0,0 @@
1
- %PDF-1.7
2
- %µ¶
3
-
4
- 1 0 obj
5
- <</Type/Pages/Count 1/Kids[2 0 R]>>
6
- endobj
7
-
8
- 2 0 obj
9
- <</Type/Page/MediaBox[0 0 612 792]/Resources 3 0 R/Parent 1 0 R/Contents 6 0 R>>
10
- endobj
11
-
12
- 3 0 obj
13
- <</Font<</F0<</Type/Font/BaseFont/Helvetica/Subtype/Type1>>/F1<</Type/Font/BaseFont/Helvetica-Bold/Subtype/Type1>>>>>>
14
- endobj
15
-
16
- 4 0 obj
17
- <</Length 4482>>
18
- stream
19
- q
20
- 1 0 0 1 50 700 cm
21
- 1 g
22
- 325 65 180 -35 re
23
- S
24
- Q
25
- q
26
- 1 0 0 1 50 700 cm
27
- BT
28
- 335 50 Td
29
- /F0 8 Tf
30
- (Jungle Health and Safety Inspection Service) Tj
31
- 0 -10 Td
32
- 1 0 0 rg
33
- (INS-UP70N51NCL41R) Tj
34
- ET
35
- Q
36
- q
37
- 1 0 0 1 50 700 cm
38
- BT
39
- /F1 10 Tf
40
- (Site: ) Tj
41
-
42
- /F0 10 Tf
43
- (Durham's Meatpacking ) Tj
44
-
45
- 0.5 g
46
- (Chicago, Ill.) Tj
47
- 0 g
48
-
49
- 0 -20 Td
50
-
51
- /F1 10 Tf
52
- (Date: ) Tj
53
-
54
- /F0 10 Tf
55
- (February 3, 1905) Tj
56
-
57
- 0 -20 Td
58
-
59
- /F1 10 Tf
60
- (Violation Count: ) Tj
61
-
62
- /F0 10 Tf
63
- (7) Tj
64
-
65
- 0 -20 Td
66
-
67
- /F1 10 Tf
68
- (Summary: ) Tj
69
-
70
- % 0 -20 Td
71
-
72
- /F0 10 Tf
73
- (Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.) Tj
74
- 0 -16 Td
75
- (These people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary ) Tj
76
- 0 -16 Td
77
- (visitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in ) Tj
78
- 0 -16 Td
79
- (some of which there were open vats near the level of the floor, their peculiar trouble was that they fell) Tj
80
- 0 -16 Td
81
- (into the vats; and when they were fished out, there was never enough of them left to be worth ) Tj
82
- 0 -16 Td
83
- (exhibiting - sometimes they would be overlooked for days, till all but the bones of them had gone out) Tj
84
- 0 -16 Td
85
- (to the world as Durham's Pure Leaf Lard!) Tj
86
- ET
87
- Q
88
-
89
- q
90
- 1 0 0 1 50 440 cm
91
- 0 G
92
- 2 w
93
- 0 0 m
94
- 500 0 l
95
- S
96
-
97
- BT
98
- 0 -30 Td
99
- /F1 12 Tf
100
- (Violations) Tj
101
- ET
102
- Q
103
-
104
-
105
- q
106
- 1 0 0 1 50 400 cm
107
- 0.5 G
108
- 0 0 m
109
- 500 0 l
110
- S
111
- 0 -20 m
112
- 500 -20 l
113
- S
114
- 0 -40 m
115
- 500 -40 l
116
- S
117
- 0 -60 m
118
- 500 -60 l
119
- S
120
- 0 -80 m
121
- 500 -80 l
122
- S
123
- 0 -100 m
124
- 500 -100 l
125
- S
126
- 0 -120 m
127
- 500 -120 l
128
- S
129
- 0 -140 m
130
- 500 -140 l
131
- S
132
- 0 -160 m
133
- 500 -160 l
134
- S
135
-
136
- 0 0 m
137
- 0 -160 l
138
- S
139
-
140
- 50 0 m
141
- 50 -160 l
142
- S
143
-
144
- 400 0 m
145
- 400 -160 l
146
- S
147
-
148
- 450 0 m
149
- 450 -160 l
150
- S
151
-
152
- 500 0 m
153
- 500 -160 l
154
- S
155
- Q
156
-
157
- q
158
- 1 0 0 1 55 386 cm
159
- BT
160
- q
161
- /F1 10 Tf
162
- (Statute) Tj
163
- 50 0 Td
164
- (Description) Tj
165
- 350 0 Td
166
- (Level) Tj
167
- 50 0 Td
168
- (Repeat?) Tj
169
- Q
170
-
171
- /F0 10 Tf
172
- q
173
- 0 -20 Td
174
- (4.12.7) Tj
175
- 50 0 Td
176
- (Unsanitary Working Conditions.) Tj
177
- 350 0 Td
178
- (Critical) Tj
179
- Q
180
- q
181
- 0 -40 Td
182
- (5.8.3) Tj
183
- 50 0 Td
184
- (Inadequate Protective Equipment.) Tj
185
- 350 0 Td
186
- (Serious) Tj
187
- Q
188
- q
189
- 0 -60 Td
190
- (6.3.9) Tj
191
- 50 0 Td
192
- (Ineffective Injury Prevention.) Tj
193
- 350 0 Td
194
- (Serious) Tj
195
- Q
196
- q
197
- 0 -80 Td
198
- (7.1.5) Tj
199
- 50 0 Td
200
- (Failure to Properly Store Hazardous Materials.) Tj
201
- 350 0 Td
202
- (Critical) Tj
203
- Q
204
- q
205
- 0 -100 Td
206
- (8.9.2) Tj
207
- 50 0 Td
208
- (Lack of Adequate Fire Safety Measures.) Tj
209
- 350 0 Td
210
- (Serious) Tj
211
- Q
212
- q
213
- 0 -120 Td
214
- (9.6.4) Tj
215
- 50 0 Td
216
- (Inadequate Ventilation Systems.) Tj
217
- 350 0 Td
218
- (Serious) Tj
219
- Q
220
- q
221
- 0 -140 Td
222
- (10.2.7) Tj
223
- 50 0 Td
224
- (Insufficient Employee Training for Safe Work Practices.) Tj
225
- 350 0 Td
226
- (Serious) Tj
227
- Q
228
-
229
- ET
230
-
231
- q
232
- 0 G
233
- 0.5 w
234
-
235
- 465 -20 8 8 re
236
- S
237
- 465 -20 m
238
- 473 -12 l
239
- s
240
- 465 -12 m
241
- 473 -20 l
242
- s
243
-
244
- 465 -40 8 8 re
245
- S
246
- 465 -40 m
247
- 473 -32 l
248
- s
249
- 465 -32 m
250
- 473 -40 l
251
- s
252
-
253
- 465 -60 8 8 re
254
- S
255
-
256
- 465 -80 8 8 re
257
- S
258
-
259
- 465 -100 8 8 re
260
- S
261
-
262
- 465 -120 8 8 re
263
- S
264
- 465 -120 m
265
- 473 -112 l
266
- s
267
- 465 -112 m
268
- 473 -120 l
269
- s
270
-
271
- 465 -140 8 8 re
272
- S
273
- Q
274
- Q
275
-
276
- q
277
- 1 0 0 1 230 20 cm
278
- BT
279
- /F0 8 Tf
280
- (Jungle Health and Safety Inspection Service) Tj
281
- ET
282
- Q
283
-
284
-
285
- endstream
286
- endobj
287
-
288
- 5 0 obj
289
- <</Type/Catalog/Pages 1 0 R>>
290
- endobj
291
-
292
- 6 0 obj
293
- <</Length 2774>>
294
- stream
295
- q
296
- 1 0 0 1 50 700 cm
297
- 1 g
298
- 325 65 180 -35 re
299
- S
300
- Q
301
- q
302
- 1 0 0 1 50 700 cm
303
- BT
304
- 335 50 Td
305
- /F0 8 Tf
306
- (Jungle Health and Safety Inspection Service) Tj
307
- 0 -10 Td
308
- 1 0 0 rg
309
- (INS-UP70N51NCL41R) Tj
310
- ET
311
- Q
312
- q
313
- 1 0 0 1 50 700 cm
314
- BT
315
- /F1 10 Tf
316
- (Site: ) Tj
317
- /F0 10 Tf
318
- (Durham's Meatpacking ) Tj
319
- .5 g
320
- (Chicago, Ill.) Tj
321
- 0 g
322
- 0 -20 Td
323
- /F1 10 Tf
324
- (Date: ) Tj
325
- /F0 10 Tf
326
- (February 3, 1905) Tj
327
- 0 -20 Td
328
- /F1 10 Tf
329
- (Violation Count: ) Tj
330
- /F0 10 Tf
331
- (7) Tj
332
- 0 -20 Td
333
- /F1 10 Tf
334
- (Summary: ) Tj
335
- /F0 10 Tf
336
- (Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.) Tj
337
- 0 -16 Td
338
- (These people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary ) Tj
339
- 0 -16 Td
340
- (visitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in ) Tj
341
- 0 -16 Td
342
- (some of which there were open vats near the level of the floor, their peculiar trouble was that they fell) Tj
343
- 0 -16 Td
344
- (into the vats; and when they were fished out, there was never enough of them left to be worth ) Tj
345
- 0 -16 Td
346
- (exhibiting - sometimes they would be overlooked for days, till all but the bones of them had gone out) Tj
347
- 0 -16 Td
348
- (to the world as Durham's Pure Leaf Lard!) Tj
349
- ET
350
- Q
351
- q
352
- 1 0 0 1 50 440 cm
353
- 0 G
354
- 2 w
355
- 0 0 m
356
- 500 0 l
357
- S
358
- BT
359
- 0 -30 Td
360
- /F1 12 Tf
361
- (Violations) Tj
362
- ET
363
- Q
364
- q
365
- 1 0 0 1 50 400 cm
366
- .5 G
367
- 0 0 m
368
- 500 0 l
369
- S
370
- 0 -20 m
371
- 500 -20 l
372
- S
373
- 0 -40 m
374
- 500 -40 l
375
- S
376
- 0 -60 m
377
- 500 -60 l
378
- S
379
- 0 -80 m
380
- 500 -80 l
381
- S
382
- 0 -100 m
383
- 500 -100 l
384
- S
385
- 0 -120 m
386
- 500 -120 l
387
- S
388
- 0 -140 m
389
- 500 -140 l
390
- S
391
- 0 -160 m
392
- 500 -160 l
393
- S
394
- 0 0 m
395
- 0 -160 l
396
- S
397
- 50 0 m
398
- 50 -160 l
399
- S
400
- 400 0 m
401
- 400 -160 l
402
- S
403
- 450 0 m
404
- 450 -160 l
405
- S
406
- 500 0 m
407
- 500 -160 l
408
- S
409
- Q
410
- q
411
- 1 0 0 1 55 386 cm
412
- BT
413
- q
414
- /F1 10 Tf
415
- (Statute) Tj
416
- 50 0 Td
417
- (Description) Tj
418
- 350 0 Td
419
- (Level) Tj
420
- 50 0 Td
421
- (Repeat?) Tj
422
- Q
423
- /F0 10 Tf
424
- q
425
- 0 -20 Td
426
- (4.12.7) Tj
427
- 50 0 Td
428
- (Unsanitary Working Conditions.) Tj
429
- 350 0 Td
430
- (Critical) Tj
431
- Q
432
- q
433
- 0 -40 Td
434
- (5.8.3) Tj
435
- 50 0 Td
436
- (Inadequate Protective Equipment.) Tj
437
- 350 0 Td
438
- (Serious) Tj
439
- Q
440
- q
441
- 0 -60 Td
442
- (6.3.9) Tj
443
- 50 0 Td
444
- (Ineffective Injury Prevention.) Tj
445
- 350 0 Td
446
- (Serious) Tj
447
- Q
448
- q
449
- 0 -80 Td
450
- (7.1.5) Tj
451
- 50 0 Td
452
- (Failure to Properly Store Hazardous Materials.) Tj
453
- 350 0 Td
454
- (Critical) Tj
455
- Q
456
- q
457
- 0 -100 Td
458
- (8.9.2) Tj
459
- 50 0 Td
460
- (Lack of Adequate Fire Safety Measures.) Tj
461
- 350 0 Td
462
- (Serious) Tj
463
- Q
464
- q
465
- 0 -120 Td
466
- (9.6.4) Tj
467
- 50 0 Td
468
- (Inadequate Ventilation Systems.) Tj
469
- 350 0 Td
470
- (Serious) Tj
471
- Q
472
- q
473
- 0 -140 Td
474
- (10.2.7) Tj
475
- 50 0 Td
476
- (Insufficient Employee Training for Safe Work Practices.) Tj
477
- 350 0 Td
478
- (Serious) Tj
479
- Q
480
- ET
481
- q
482
- 0 G
483
- .5 w
484
- 465 -20 8 8 re
485
- S
486
- 465 -20 m
487
- 473 -12 l
488
- s
489
- 465 -12 m
490
- 473 -20 l
491
- s
492
- 465 -40 8 8 re
493
- S
494
- 465 -40 m
495
- 473 -32 l
496
- s
497
- 465 -32 m
498
- 473 -40 l
499
- s
500
- 465 -60 8 8 re
501
- S
502
- 465 -80 8 8 re
503
- S
504
- 465 -100 8 8 re
505
- S
506
- 465 -120 8 8 re
507
- S
508
- 465 -120 m
509
- 473 -112 l
510
- s
511
- 465 -112 m
512
- 473 -120 l
513
- s
514
- 465 -140 8 8 re
515
- S
516
- Q
517
- Q
518
- q
519
- 1 0 0 1 230 20 cm
520
- BT
521
- /F0 8 Tf
522
- (Jungle Health and Safety Inspection Service) Tj
523
- ET
524
- Q
525
-
526
- endstream
527
- endobj
528
-
529
- xref
530
- 0 7
531
- 0000000000 65536 f
532
- 0000000016 00000 n
533
- 0000000068 00000 n
534
- 0000000165 00000 n
535
- 0000000300 00000 n
536
- 0000004833 00000 n
537
- 0000004879 00000 n
538
-
539
- trailer
540
- <</Size 7/Root 5 0 R>>
541
- startxref
542
- 7704
543
- %%EOF
pdfs/0500000US42001.pdf DELETED
Binary file
pdfs/0500000US42007.pdf DELETED
Binary file
pdfs/2014 Statistics.pdf DELETED
Binary file
pdfs/2019 Statistics.pdf DELETED
Binary file
Binary file
pdfs/needs-ocr.pdf DELETED
Binary file