natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -115,10 +115,8 @@ buffered_max_length
115
115
  ```
116
116
 
117
117
  ```python
118
- import shutil
119
- from datetime import datetime
120
-
121
118
  MAX_ALLOWED = buffered_max_length
119
+ MIN_ALLOWED = 3
122
120
  removed = 0
123
121
  cleaned_lines = []
124
122
 
@@ -130,6 +128,12 @@ for i, line in enumerate(original_lines):
130
128
  if len(parts) == 2 and len(parts[1]) > MAX_ALLOWED:
131
129
  removed += 1
132
130
  print(f"⚠️ Line {i} exceeds max_text_length: {len(parts[1])} chars: {parts[1]}")
131
+ elif len(parts[1]) < MIN_ALLOWED:
132
+ removed += 1
133
+ print(f"⚠️ Line {i} under min_text_length: {len(parts[1])} chars: {parts[1]}")
134
+ elif "Sorry, I can't" in parts[1]:
135
+ removed += 1
136
+ print(f"⚠️ Line {i} was not OCR'd correctly")
133
137
  else:
134
138
  cleaned_lines.append(line)
135
139
 
@@ -284,7 +288,7 @@ We need the PaddleOCR repository for its training scripts. Once we have it we'll
284
288
  # Start training!
285
289
  # -c points to our config file
286
290
  # -o Override specific config options if needed (e.g., Global.epoch_num=10)
287
- !python paddleocr_repo/tools/train.py -c ../finetune_rec.yml
291
+ !python paddleocr_repo/tools/train.py -c finetune_rec.yml
288
292
  ```
289
293
 
290
294
  Training will begin, printing logs and saving checkpoints to the directory specified in `Global.save_model_dir` (`./output/finetune_rec/` in the example). Monitor the accuracy (`acc`) and loss on the training and validation sets. You can stop training early if validation accuracy plateaus or starts to decrease.
@@ -329,10 +333,11 @@ ocr = PaddleOCR(
329
333
  with open("finetune_data/val.txt", encoding="utf-8") as f:
330
334
  line = random.choice([l.strip() for l in f if l.strip()])
331
335
  img_path, ground_truth = line.split(maxsplit=1)
336
+ img_path = "finetune_data/" + img_path
332
337
 
333
338
  # Run inference
334
339
  result = ocr.ocr(img_path, det=False)
335
- prediction = result[0][0][1]['text'] if result else '[No result]'
340
+ prediction = result[0][0][1] if result else '[No result]'
336
341
 
337
342
  # Display
338
343
  display(Image(filename=img_path))
@@ -0,0 +1,8 @@
"""
Shared locks for thread synchronization across the natural-pdf library.
"""

import threading

# Global lock for PDF rendering operations to prevent PDFium concurrency issues.
# An RLock (re-entrant lock) is used, so the thread that already holds the lock
# can acquire it again without blocking itself.
# NOTE(review): the call sites that acquire this lock are not visible here --
# confirm every rendering path uses it (ideally via `with pdf_render_lock:`).
pdf_render_lock = threading.RLock()
@@ -2,7 +2,7 @@
2
2
  import logging
3
3
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
4
4
 
5
- from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
5
+ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox, cluster_objects
6
6
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
7
7
 
8
8
  if TYPE_CHECKING:
@@ -11,6 +11,57 @@ if TYPE_CHECKING:
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
13
 
def _get_layout_kwargs(
    layout_context_bbox: Optional[Tuple[float, float, float, float]] = None,
    user_kwargs: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Assemble the keyword arguments passed to pdfplumber's chars_to_textmap.

    The result is built in three layers: fixed defaults, values derived from
    the optional context bbox, and finally any caller-supplied overrides.

    Args:
        layout_context_bbox: Optional (x0, top, x1, bottom) tuple. When given,
            the layout width/height, the x/y shift and the bbox itself are
            added to the result.
        user_kwargs: Optional overrides. Only keys found in TEXTMAP_KWARGS or
            WORD_EXTRACTOR_KWARGS, plus the special 'layout' flag, are
            applied; any other key is dropped with a warning.

    Returns:
        A new dict of keyword arguments. The 'layout' key is always present
        and is True unless the caller overrode it.
    """
    # Keys a caller is permitted to override.
    permitted = set(TEXTMAP_KWARGS) | set(WORD_EXTRACTOR_KWARGS)

    # Layer 1: default values.
    result: Dict[str, Any] = {
        'x_tolerance': 5,
        'y_tolerance': 5,
        'x_density': 7.25,
        'y_density': 13,
        'mode': 'box',
        'min_words_vertical': 1,
        'min_words_horizontal': 1,
    }

    # Layer 2: geometry derived from the context bbox, if one was supplied.
    if layout_context_bbox:
        left, top, right, bottom = layout_context_bbox
        result.update(
            layout_width=right - left,
            layout_height=bottom - top,
            x_shift=left,
            y_shift=top,
            layout_bbox=layout_context_bbox,
        )

    # Layer 3: caller overrides; 'layout' is always accepted.
    for name, override in (user_kwargs or {}).items():
        if name in permitted or name == 'layout':
            result[name] = override
        else:
            logger.warning(f"Ignoring unsupported layout keyword argument: '{name}'")

    # The layout flag defaults to True when the caller did not set it.
    result.setdefault('layout', True)
    return result
64
+
14
65
  def filter_chars_spatially(
15
66
  char_dicts: List[Dict[str, Any]],
16
67
  exclusion_regions: List["Region"],
@@ -0,0 +1,43 @@
"""Pick the tqdm implementation best suited to the running environment.

At import time this module binds ``selected_tqdm`` to, in order of preference:

1. ``tqdm.notebook.tqdm`` when an IPython kernel with a notebook-like UI is
   detected and the notebook flavour can be imported,
2. ``tqdm.std.tqdm`` otherwise,
3. a minimal pass-through function when tqdm is not installed at all.

Use :func:`get_tqdm` to obtain the selected callable.
"""

import os
import sys

# Default to standard tqdm
try:
    from tqdm.std import tqdm as selected_tqdm
except ImportError:

    def selected_tqdm(*args, **kwargs):
        """Pass-through stand-in used when tqdm is not installed.

        Returns the iterable unchanged, whether it was given positionally or
        as ``iterable=...``, and ``None`` when no iterable was supplied. All
        other arguments (``desc``, ``total``, ...) are ignored.
        """
        # Do not test truthiness here: an empty list is a perfectly valid
        # iterable and must be returned as-is so ``for x in tqdm([])`` works.
        return args[0] if args else kwargs.get("iterable")


# Try to detect notebook environment
try:
    # Check 1: Are we running in an IPython kernel?
    from IPython import get_ipython

    ipython = get_ipython()
    if ipython and 'IPKernelApp' in ipython.config:
        # Check 2: Is it likely a notebook UI (Jupyter Notebook/Lab, VSCode, etc.)?
        # This checks for common indicators. Might not be foolproof.
        if 'VSCODE_PID' in os.environ or (
            'ipykernel' in sys.modules and 'spyder' not in sys.modules
        ):
            # Check 3: Can we import notebook version?
            try:
                from tqdm.notebook import tqdm as notebook_tqdm

                selected_tqdm = notebook_tqdm  # Use notebook version
            except ImportError:
                pass  # Stick with std if notebook version missing
except ImportError:
    pass  # Stick with std if IPython not available


def get_tqdm():
    """Returns the tqdm class best suited for the detected environment."""
    return selected_tqdm


# Example usage (for testing):
if __name__ == '__main__':
    import time

    tqdm_instance = get_tqdm()
    print(f"Using tqdm class: {tqdm_instance}")
    for i in tqdm_instance(range(10), desc="Testing tqdm"):
        time.sleep(0.1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.7
3
+ Version: 0.1.8
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -44,6 +44,10 @@ Requires-Dist: natural-pdf[core-ml]; extra == "docling"
44
44
  Provides-Extra: llm
45
45
  Requires-Dist: openai>=1.0; extra == "llm"
46
46
  Requires-Dist: pydantic; extra == "llm"
47
+ Provides-Extra: classification
48
+ Requires-Dist: sentence-transformers; extra == "classification"
49
+ Requires-Dist: timm; extra == "classification"
50
+ Requires-Dist: natural-pdf[core-ml]; extra == "classification"
47
51
  Provides-Extra: test
48
52
  Requires-Dist: pytest; extra == "test"
49
53
  Provides-Extra: dev
@@ -70,6 +74,7 @@ Requires-Dist: natural-pdf[qa]; extra == "all"
70
74
  Requires-Dist: natural-pdf[ocr-export]; extra == "all"
71
75
  Requires-Dist: natural-pdf[docling]; extra == "all"
72
76
  Requires-Dist: natural-pdf[llm]; extra == "all"
77
+ Requires-Dist: natural-pdf[classification]; extra == "all"
73
78
  Requires-Dist: natural-pdf[test]; extra == "all"
74
79
  Provides-Extra: core-ml
75
80
  Requires-Dist: torch; extra == "core-ml"
@@ -1,4 +1,4 @@
1
- docs/index.md,sha256=P1kXZc8aefnxH0bBjvBgj1o3puRiezjUiBLqS4bcUhM,4889
1
+ docs/index.md,sha256=FG4MYQs-gUR16NQ4XF0AVoQeLuykLeY8XxNwW3h-qUM,5572
2
2
  docs/api/index.md,sha256=4bn8nYklWJuNDrnY-Kt7sf7IejeAEDhcnqYmjH9GJTA,22405
3
3
  docs/assets/favicon.png,sha256=nxca8jM2Y4GxZKzkmagUHO1GpUREK-GRA5LEFue9OOU,284
4
4
  docs/assets/favicon.svg,sha256=nxca8jM2Y4GxZKzkmagUHO1GpUREK-GRA5LEFue9OOU,284
@@ -8,17 +8,19 @@ docs/assets/social-preview.png,sha256=AvyzzM8dC0j5SPFF63bvQrxU4GE1f9j-GUNUv0oA9t
8
8
  docs/assets/social-preview.svg,sha256=AvyzzM8dC0j5SPFF63bvQrxU4GE1f9j-GUNUv0oA9ts,1085
9
9
  docs/assets/javascripts/custom.js,sha256=0NVHGprwiLPFYdYunJcHjOphzk_EhBSNuOUz5Uzdv_k,594
10
10
  docs/assets/stylesheets/custom.css,sha256=PbTp3k77gzUBUQQ01pDXzpNwo4wUv3aJD-SMBQvQItY,1156
11
+ docs/categorizing-documents/index.md,sha256=tgKfv3DidZysrFhaOEM-FiIVDAzNPPnK02sKaE5pE2I,8196
12
+ docs/data-extraction/index.md,sha256=LwQ2MJVI5u5ELI51Iq0WUdDo5sl_s18GWG_cBABI8fQ,3430
11
13
  docs/document-qa/index.ipynb,sha256=MXJoFhi8TUKK6ZnRFiUBglLGpMbzwdb7LJYfzw8Gp48,528713
12
14
  docs/document-qa/index.md,sha256=mwuO4tothg0OzBXewnj73QEJu46Udq7f1pQBYrKOHwM,2131
13
- docs/element-selection/index.ipynb,sha256=-7PwKw1RbPlZ4stzN1Rd1GJ8mwjOD4ySsLcpqVX7chc,1193628
14
- docs/element-selection/index.md,sha256=_1P8vI64Y0aSVwUzdRJD4ayb80BJWBLED9TvVpveFx8,6979
15
+ docs/element-selection/index.ipynb,sha256=WuKd3bTTOnzBDfbuzkxmJxO6EzM9RAkFXoF0U3-8qRA,1223398
16
+ docs/element-selection/index.md,sha256=ZUkOD6VVK11K6WQ86FPnTeeco27PrFWtkObKw8j6Fok,7867
15
17
  docs/finetuning/index.md,sha256=Ur3zqSaR0X8PvBCSyI7cFiDv5qZ6Jtv4omBKXCKAzEk,9200
16
18
  docs/installation/index.md,sha256=nd4RZrQFR8_vv7Xm3xAzp7z-CQQr9ffAcGa7yuEYn2U,1594
17
19
  docs/interactive-widget/index.ipynb,sha256=zY1rz5N34OUW-OtgcbI6iiOjlIJqXjVcx9OoNWMjuyU,293111
18
20
  docs/interactive-widget/index.md,sha256=tZbq0uYI7Zwo9mLbhXpqeBriuAjazkIyEJeP-jasJ-Q,259
19
21
  docs/layout-analysis/index.ipynb,sha256=dkS_-cu-KGir5G2LGRcxBThKnW0dfA5nPPnwpoYGFtU,1869093
20
22
  docs/layout-analysis/index.md,sha256=ZnH5yd7B_eOLgGxW_4rNlzQs4Tn3Xx1cK3jX43CSpSM,5390
21
- docs/ocr/index.md,sha256=uuzTqcAgUmMN7jZVq8VkVcbRDHn8Yg2nJVvHJ-bDK-Y,8177
23
+ docs/ocr/index.md,sha256=BR8a3_X6zng5yAo8O8isOBhb2Gm9hM9FIasc58aYF78,11137
22
24
  docs/pdf-navigation/index.ipynb,sha256=h6yew0HePXK1_c5FmETqzjBQceUBT0MU-vnXx_y91mo,8018
23
25
  docs/pdf-navigation/index.md,sha256=P3b3tsmOcmRtnfRxpsMeTgwm7vApnH_4le_QIwJd51M,2391
24
26
  docs/regions/index.ipynb,sha256=5A-N5A4v4lcXNptOAeI4i7i9Gx66To-Yus8B816dHBk,1303347
@@ -29,38 +31,38 @@ docs/text-analysis/index.ipynb,sha256=iaup8pcQXGp0ZK3IWi-HHssQLdIzWYGYfvZK5i8yjj
29
31
  docs/text-analysis/index.md,sha256=02pfZemOgV37izV7H-XzKmHu7AedDKLidQ-sKhYaMVw,3527
30
32
  docs/text-extraction/index.ipynb,sha256=809y9ZamXT3bc3GhwwFyoDnlyEpO-kUZ3tIsZZWyrj8,2537087
31
33
  docs/text-extraction/index.md,sha256=b1KfQpvIEelc8cPbFETUnK92az7iB4b7-LqK2DRH8vw,6985
32
- docs/tutorials/01-loading-and-extraction.ipynb,sha256=SCW26hxW9PhOspiR-2X5CD6L1EiJRfXouO-OF_Nc718,4548
34
+ docs/tutorials/01-loading-and-extraction.ipynb,sha256=2vGLM1_2_Xcpn32HvMLXj_Ro8w4HPofSZNpxZ1qPtL8,520140
33
35
  docs/tutorials/01-loading-and-extraction.md,sha256=g40J8GhKz-ikM2URj5MqIatKKj4l5kTFozHeVjxDJQA,2191
34
- docs/tutorials/02-finding-elements.ipynb,sha256=k1CSz47_atA9D6DXfQzVS64t5-L-KjssU2VuFvdy7oU,524374
36
+ docs/tutorials/02-finding-elements.ipynb,sha256=yVW3B578mKXkFUWJQnBaDB0SlnNodROjemMbdx-LWBw,524506
35
37
  docs/tutorials/02-finding-elements.md,sha256=qOkjcWUzem05of54aKzKvy-MMzRX_S4CyZisVV-73QM,4162
36
- docs/tutorials/03-extracting-blocks.ipynb,sha256=1UjdP0j3kPCE3aU8p1jBCBqflG-xRLli2Ltx80DhOVk,260729
38
+ docs/tutorials/03-extracting-blocks.ipynb,sha256=qifBv5bsKcZIQVQAHtl84GqD6Wy-IZiUMkSXURCu3ug,263329
37
39
  docs/tutorials/03-extracting-blocks.md,sha256=_kqvhk6rSL7cGp2MSwTJk8LYlJGbK_r_umnCSBdR8XU,1665
38
- docs/tutorials/04-table-extraction.ipynb,sha256=u92Wppw1qHG__Mx3ZKtETm4AWuGF8X-Ln3kvmF8zCSo,3973
40
+ docs/tutorials/04-table-extraction.ipynb,sha256=Jj2OzN32I5z1_gfMVgdr2GGyEgbWTgI7harwMWfHxYc,4089
39
41
  docs/tutorials/04-table-extraction.md,sha256=4q4v17VX8K-ZBtWYy0nbWPccyqB_ybd5Vl_IROmxz6Q,2130
40
- docs/tutorials/05-excluding-content.ipynb,sha256=oSg8ll_nuWOfQHGLp0fNKVeyYyn_L8a-F7HJADjjdq8,336857
42
+ docs/tutorials/05-excluding-content.ipynb,sha256=EaZwfDJK3BUghY1iwQ4qR8Z9nXf9e8QUeHxvJmZ3xsw,336933
41
43
  docs/tutorials/05-excluding-content.md,sha256=U52SPlc5knqxiyhRokmxrj06T54r2ENyTfP7BMGykhY,3907
42
- docs/tutorials/06-document-qa.ipynb,sha256=Facyqns8jw2bTvsOSbNnsLskFH8kg1JTz4kmJ16dpcE,10303
44
+ docs/tutorials/06-document-qa.ipynb,sha256=sGesxP26CMSD2GD-47dXq7EnqK3tlEDzM-uu7sZVR2E,10421
43
45
  docs/tutorials/06-document-qa.md,sha256=PzPPgw0Rkkfe6sfz3XyKD9S9JbQ40qf4bDzCBvwH1P0,3026
44
- docs/tutorials/07-layout-analysis.ipynb,sha256=tdNnMro1V66YPx0h96HZnujSm-zDpy7o78euQix4lyU,559517
46
+ docs/tutorials/07-layout-analysis.ipynb,sha256=DgyocqPRt9Rxsz-Sjgi153MvvnoDF3Vpsyhq27N72sE,571321
45
47
  docs/tutorials/07-layout-analysis.md,sha256=NAYVzJTecDnXjo_isbPCSUBSn3c-xM1tELct1Zn5GmI,2533
46
- docs/tutorials/07-working-with-regions.ipynb,sha256=s4BFKKbKUemmURCpg6j91rNI8eFFOJUgxY4QN4alK4I,69584
48
+ docs/tutorials/07-working-with-regions.ipynb,sha256=JMUnjQ_tCBqs4dWIyZ2jNHQCnJkwAzTJuxQVRGBqLqI,67945
47
49
  docs/tutorials/07-working-with-regions.md,sha256=oanbTFSQ-topAVd9kjfkaPiMjHcx6Y8cqyxVbmxLhgs,4365
48
- docs/tutorials/08-spatial-navigation.ipynb,sha256=jfwF6OHLvrMvaaknp-9AfUvr-pPXjPljUyGnFKF9wsw,194523
50
+ docs/tutorials/08-spatial-navigation.ipynb,sha256=Q0N-az8ZiaMmS42HXMnpDYp97Z_9YPXfM-azC9Sf_f8,186624
49
51
  docs/tutorials/08-spatial-navigation.md,sha256=IMbOYBjayXKE7pHfBjApTxOoKRD8WYj7opf8fsJCtzA,4855
50
- docs/tutorials/09-section-extraction.ipynb,sha256=Aqcy08oXTJ1pkJCmVVumndje-4WXnbkl_QfJPhps7f8,1100736
52
+ docs/tutorials/09-section-extraction.ipynb,sha256=CPBXw28Y7WjWE3HY5SJlUnGlOFQQQ0ZUB65c_uVissA,1101081
51
53
  docs/tutorials/09-section-extraction.md,sha256=Jy_be8ftAl_VPBWl5nEv7_5sKSZPx22DLUcBVHMD3Nc,7832
52
- docs/tutorials/10-form-field-extraction.ipynb,sha256=yyopvBoS5vkKKtUQ6rZ4Kyo5E0Olp2WYnmunhfzSQkQ,281491
54
+ docs/tutorials/10-form-field-extraction.ipynb,sha256=S0S5cdnrioweeKVjdRQnZptUEG-b0VvgrROkOygjAzk,268148
53
55
  docs/tutorials/10-form-field-extraction.md,sha256=t9tPlW36vJEhDrKIsHGg_f3P_MK62DT4-ZK1thKFs4Y,5494
54
- docs/tutorials/11-enhanced-table-processing.ipynb,sha256=BWpVUhtjaAX7r4OOdiy5gQgrSqREaoB0L5TuHqoHEn8,1278
56
+ docs/tutorials/11-enhanced-table-processing.ipynb,sha256=2i8gQRwkLDH14Yie56-3K5YIhdaR83XbL7m-8pQ5cJU,1394
55
57
  docs/tutorials/11-enhanced-table-processing.md,sha256=2HK-r1UwU7FLn7zWr_pMG7iLk-i0L4U4-t6ubOEeduc,282
56
- docs/tutorials/12-ocr-integration.ipynb,sha256=xurkoPwgk2p6mhmPdCehy9ccuYHrAhBCb1zGnjRbZ7Y,26724
57
- docs/tutorials/12-ocr-integration.md,sha256=wU90sfnm1R6BoMFq-orbGpl8OUVcm-wEBTlK0bLgJC4,4572
58
- docs/tutorials/13-semantic-search.ipynb,sha256=5h806AIal3EwXPVuXJESbXwdUImCx7fo0mo5-f3Dj44,42817
58
+ docs/tutorials/12-ocr-integration.ipynb,sha256=DB1pWJG1vW4aNVdQ2g5w42a71TFThmzObaVQs8h63U0,194084
59
+ docs/tutorials/12-ocr-integration.md,sha256=-IW4wqLb10eOIWC00NHTGXwtD6jDv7Tp7d-UCOk9SuE,5057
60
+ docs/tutorials/13-semantic-search.ipynb,sha256=BwFepMsOuHrWTFqczvxikPgTh5o97sYX4uleylnOBmc,54126
59
61
  docs/tutorials/13-semantic-search.md,sha256=nsNjv0ipYUC3YPSqT5d6dga9ZjObEc04Mc8c0-gsRnU,2914
60
62
  docs/visual-debugging/index.ipynb,sha256=MJ92u3Q9sfRCyDAQM4KWmCrs4QhKwIagbn6ytPF83L4,2175800
61
63
  docs/visual-debugging/index.md,sha256=ueGD2kNFhEAgIHt7qxCfrLRLjHcR7NTD3AU9okBhX9k,4176
62
64
  docs/visual-debugging/region.png,sha256=ULAJs3ZTxMjpD9F4w1DKaZXmhxga3KRq3NrUsXgw28s,67835
63
- natural_pdf/__init__.py,sha256=UdS-I3d7MzSvpxL-QMQUSUO5IGhh8c5of34BIs49TaU,2670
65
+ natural_pdf/__init__.py,sha256=aCnIBTYZlUCL1j78sScPX8kXF88JnuQSHsErboTcjnM,2727
64
66
  natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
65
67
  natural_pdf/analyzers/text_options.py,sha256=nE2E1pp4psDPpxmtarvNtEQsgozPkyFRjv0TVP2HTyU,2865
66
68
  natural_pdf/analyzers/text_structure.py,sha256=9h8hKRz0JWnr13xQr3b4FFr_-hDIjue07WvG7LmT8nc,12827
@@ -76,34 +78,41 @@ natural_pdf/analyzers/layout/paddle.py,sha256=gTI9ZqNd5-t4H5IByGfL32WgcE6JrdchW6
76
78
  natural_pdf/analyzers/layout/surya.py,sha256=vhji6ynHPMyQLHuYRPQcplNi7m_lG4P4NYtWv6MzcME,13556
77
79
  natural_pdf/analyzers/layout/tatr.py,sha256=-GJhMy4d0yx6egkO9-ULAIdQkkQRyAKExoIta-b256U,12971
78
80
  natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
79
- natural_pdf/collections/pdf_collection.py,sha256=afE0tNIfwA7IRCc8g0EGgiBgJz3TuJbEzZ5meDNAnQw,13272
81
+ natural_pdf/classification/manager.py,sha256=pLcEDe1a5QARJCMimE5Ul_HKZD4jX-eREUCeUuniA0U,16445
82
+ natural_pdf/classification/mixin.py,sha256=aySe0bEjkaI9qYDmSkQe536w0Xrxcg4j6k3JGPvj-cY,6737
83
+ natural_pdf/classification/results.py,sha256=Hn-3xDSThR8x7XpoTlQLWpX6JE1VHVe2QpOeWNY2Ycw,2949
84
+ natural_pdf/collections/mixins.py,sha256=BXk4o_PRrczSXjR7vorIEe4WyEKyms4_qYnY8ZAZd-A,2737
85
+ natural_pdf/collections/pdf_collection.py,sha256=F_4Z-nrL9wFQ-mt4T4cJ2ERVUnkh2kyQdmOV8ASBgoM,27281
80
86
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
81
- natural_pdf/core/element_manager.py,sha256=RjLCzeHDRJCoCx1W_6jGg8KsiCTuXz7Uc2BoSY4M7mE,22144
87
+ natural_pdf/core/element_manager.py,sha256=KZ9yNtpFwuImDWmFUXgISAoWQdSib93E4t3ILUZzIic,24805
82
88
  natural_pdf/core/highlighting_service.py,sha256=CTVd7y-fpIreFSe70cTpMu1Pwl6HKMtTHp0bh2U7VXk,32609
83
- natural_pdf/core/page.py,sha256=emS6jJdb-J7xnK8Uo8Hs1n0plbIAGA_YH6kmp36wVgM,84955
84
- natural_pdf/core/pdf.py,sha256=hOR1i3bJjfJCBCI2m4pBNAMEYpmbtG905QbFe-l8gZU,46525
89
+ natural_pdf/core/page.py,sha256=4iykmXdVwmSQOpGukTxfJYU-5XEgSafNbKsnIedVaGA,94051
90
+ natural_pdf/core/pdf.py,sha256=yPAaOv5vNKZlC9oVk5sKsFxb4LdoRygz_Qkp2EaDtOY,43074
85
91
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
86
92
  natural_pdf/elements/base.py,sha256=UtoSD-c_s0yiLpWZrIIJjeJ9MgGz_4R0UHYcsFWH6bc,35157
87
- natural_pdf/elements/collections.py,sha256=w0JqLwn57Je00Aq4Ay8SeYmxPjPJvUOtkLbgfGM2-nM,68882
93
+ natural_pdf/elements/collections.py,sha256=CCQVgglxWLfhuy4FZvVHXdmgiZxU27Ay7Myt8ttQYWg,79467
88
94
  natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
89
95
  natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
90
- natural_pdf/elements/region.py,sha256=9E21LYQWB98coi_73Kpf9mQ60p9ElzGOJzxdtgOUfh4,69662
91
- natural_pdf/elements/text.py,sha256=8PNKSLUgXUhEu9IFfbNbSSpuu0Slm11T6UH8jn4O6hQ,11078
96
+ natural_pdf/elements/region.py,sha256=f7ArCPizkosIei9ePixHYqedK3K6LBVJotwKZ-y33a0,74058
97
+ natural_pdf/elements/text.py,sha256=ZpPluwZtAVfOkoeM4Fm2PDsN87BBZduURZaFWns03RM,11158
92
98
  natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
93
99
  natural_pdf/exporters/base.py,sha256=s-NpHoH81x80GQxs0oqjdhPGrzbUa8npjnK8apKOsHQ,2115
94
- natural_pdf/exporters/paddleocr.py,sha256=1G2bS2-CcuAtS78JZYRczO3r5k8fdO9jrExH0Kr9r7M,16249
100
+ natural_pdf/exporters/paddleocr.py,sha256=vyVetJ6RgEY46qS5Yl5mKl4cSJadwOxLWGGsdiDjico,16248
95
101
  natural_pdf/exporters/searchable_pdf.py,sha256=qsaPsnbOOaZHA_aplfZbwQnBoK9KghWm-wzbyRRomeY,16859
102
+ natural_pdf/extraction/manager.py,sha256=YH5dyUorMItGxuaZ-DhuJD5Sh_Ozjj0fa-WBMcQw1E0,4903
103
+ natural_pdf/extraction/mixin.py,sha256=6CWYyutGcKCxFVYun8yXC4H1IZWLMXaeYZ-cWJRx5FE,11430
104
+ natural_pdf/extraction/result.py,sha256=ihY1g_C2hsMACYqU7bcvAKRijuh-FHVtpnn0uoP--pk,1047
96
105
  natural_pdf/ocr/__init__.py,sha256=jKaDbo13CdCDcas1WiBmg5gjBvVeG-Z9uaeYxyzvaNY,2464
97
106
  natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,8741
98
- natural_pdf/ocr/engine_easyocr.py,sha256=rnDXLNa-keymonR3qbLEbbxA6bqk4QUAVCHKUDixqKg,9045
107
+ natural_pdf/ocr/engine_easyocr.py,sha256=9TbxJjmhWFrzM8mcNnZjoRtIDr6gwpuwKm4-Zfub2-8,9281
99
108
  natural_pdf/ocr/engine_paddle.py,sha256=2nIrvLBBAiZG1BxVo3eFVJulA6YGoOTXw_RN98p_BUk,6184
100
109
  natural_pdf/ocr/engine_surya.py,sha256=iySjG-Dahgh0cLICfbMtOcwUpRFcZjo-5Ed5Zwz-o5Y,4805
101
110
  natural_pdf/ocr/ocr_factory.py,sha256=IFccj0BB75YGV4hjcy4ECtGQX_JQzdptpvDFfeGxxgI,4391
102
- natural_pdf/ocr/ocr_manager.py,sha256=PqF1z1ET8emSw19r7jtEkC9_LZJXY7C5zK5cFklo57I,9238
103
- natural_pdf/ocr/ocr_options.py,sha256=MIH7cOe8esuiGcVe4AtArSeQdaIpUu9RaUZbuwwvKQw,3294
104
- natural_pdf/ocr/utils.py,sha256=kdO4sCBqCb5qB-9iPqdPN8_5t1jWwijpT-ci5UHnz6A,3867
111
+ natural_pdf/ocr/ocr_manager.py,sha256=ivk4Aqr5gsDJWiCxP1-FLkhuvfJiQtilwbPtgIPm--4,13320
112
+ natural_pdf/ocr/ocr_options.py,sha256=BcPVwJGYE3vMug7wsVh_ARUJlm_4emz9ynOAwYgwHBk,4257
113
+ natural_pdf/ocr/utils.py,sha256=4b_A47hfynfV00iR8I9OWmXCzDzRvSdEkQhZLcSV4kQ,4394
105
114
  natural_pdf/qa/__init__.py,sha256=Pjo62JTnUNEjGNsC437mvsS5KQ5m7X_BibGvavR9AW0,108
106
- natural_pdf/qa/document_qa.py,sha256=W4E4vS_Eox_IBsYpVb0ifQbJb0FP-PYEIG93CU3rUkE,15246
115
+ natural_pdf/qa/document_qa.py,sha256=Jw4yyq3Vifn57D0ANmOfUlZeG8CJjBkItZBV-8ZAmos,15111
107
116
  natural_pdf/search/__init__.py,sha256=EB_HRwlktJn5WGPVtSaRbOQNjLAZTxujeYf_eN-zd2U,4191
108
117
  natural_pdf/search/haystack_search_service.py,sha256=6RjTFWbTo3gaO-90IF6PEuo_9WRwOdj232eWn3OT0BQ,29270
109
118
  natural_pdf/search/haystack_utils.py,sha256=UI4eu3SVieGR_QnBtLhP8Fjtt2AJgeLgxrpa_dBmD6k,19289
@@ -111,9 +120,9 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
111
120
  natural_pdf/search/search_service_protocol.py,sha256=ybNcF_NoLZuIx0rb4XB1dsDl3o_LAaWR1fVVKld2TxI,6818
112
121
  natural_pdf/search/searchable_mixin.py,sha256=M2a6FaFVM0vcfh7FgjDH6BLhS-7ggeVpcfft4OOBDxY,26390
113
122
  natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
114
- natural_pdf/selectors/parser.py,sha256=59_GSsTApM6MFvtqhrrmbKaBfODPbGXMluvvQJcrqhE,15754
123
+ natural_pdf/selectors/parser.py,sha256=AKXGv4MaZDiaWT_jSfn_vU-qVlECB8b-IxnyocXtaaE,22671
115
124
  natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
116
- natural_pdf/templates/finetune/fine_tune_paddleocr.md,sha256=AGt6kQWSTJZ8F28iN1D4p_Q6f1bvFML9gyUk6QcSHDc,14517
125
+ natural_pdf/templates/finetune/fine_tune_paddleocr.md,sha256=H6Wmu3Nvi2qKK-rPwr8KUZfILzXz8VmWyCWYOTe6QCI,14764
117
126
  natural_pdf/templates/spa/index.html,sha256=6hLTp07OeV5Q4jUMp5Sgl-dwfBs3oPzBxqphG4kEs24,787
118
127
  natural_pdf/templates/spa/words.txt,sha256=vkGtl5Y7-Nq-3Vhx1daRWWF1Jp1UCVaw-ZZaiFwrurk,2493885
119
128
  natural_pdf/templates/spa/css/style.css,sha256=Qdl0U3L5HMyhBDNzyRPklfb3OxW6rMxCfQbzO8i8IW4,7643
@@ -122,14 +131,16 @@ natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg
122
131
  natural_pdf/utils/debug.py,sha256=lk_6qzxan8NagjEtJEZpZ2MS30SO8ce6iznBxmA0xgk,995
123
132
  natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
124
133
  natural_pdf/utils/identifiers.py,sha256=n61viCQiMlf5-E_jsPLe-FkPBdKkMKv-gfs5tGqlKiw,1117
134
+ natural_pdf/utils/locks.py,sha256=E_Fb6GnRNq-tF5aE7jnllkpidsNr8LXPhSaqgr56Ks4,215
125
135
  natural_pdf/utils/packaging.py,sha256=HSgpubpHICU75L4ZAZPU8iOjium055XWnklV9_YqoCA,21579
126
136
  natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
127
- natural_pdf/utils/text_extraction.py,sha256=ujhqU2C9y2YwzGDBfT9oiGPUvSz6mVqq72ttd3Ksskg,7712
137
+ natural_pdf/utils/text_extraction.py,sha256=qZfOuO57XeKg7p-Q7yzTBMTrpAvDRslYXjDSjiJLStI,9545
138
+ natural_pdf/utils/tqdm_utils.py,sha256=bKWvsoAOl0lPOPLJC2hkTtkdxBf5f9aVtcA3DmUE19M,1570
128
139
  natural_pdf/utils/visualization.py,sha256=5GbhxtvZW-77ONVnICupg-s2D-OaxLZNqkKlOrQESK4,8593
129
140
  natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
130
141
  natural_pdf/widgets/viewer.py,sha256=Aiw6kuBc0WkhcZrPNKyLNzzWbmtmU6rvOmHV0IuXCBk,40862
131
142
  natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
132
- natural_pdf-0.1.7.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
143
+ natural_pdf-0.1.8.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
133
144
  notebooks/Examples.ipynb,sha256=l4YMtMEx_DWBzWIjl9CmBkWTo0g_nK8l_XWOyzYooQM,4275170
134
145
  pdfs/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
135
146
  pdfs/01-practice.pdf,sha256=dxWyJIa2cm7bALE3BWDJ2dg3inyFlo1n8ntVyy0hkTo,7906
@@ -139,7 +150,7 @@ pdfs/2014 Statistics.pdf,sha256=B-30OQVjqj_3718-G9cGUefNddnz-MosPdHAzfGfkcc,9559
139
150
  pdfs/2019 Statistics.pdf,sha256=reuSJxvAlx9_P-pW7IPqzox0jFCxSPbK1i1-WFu-uGA,511439
140
151
  pdfs/Atlanta_Public_Schools_GA_sample.pdf,sha256=PLBh_uWJQH0MnBaSm5ng5Ima63_m6Mi11CjdravB_S8,137689
141
152
  pdfs/needs-ocr.pdf,sha256=vusKiLxSOlELUTetfZfaotNU54RtMj9PCzGfLc2cuNs,139305
142
- natural_pdf-0.1.7.dist-info/METADATA,sha256=BMzSroqVMlbJrti_56ilNFZkSEH2-hJc8vUVrjk3OZU,6766
143
- natural_pdf-0.1.7.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
144
- natural_pdf-0.1.7.dist-info/top_level.txt,sha256=7nDKUnpkN7B8cBI7DEpW5JM8S7OcOgHw3jXH-1iCX2o,32
145
- natural_pdf-0.1.7.dist-info/RECORD,,
153
+ natural_pdf-0.1.8.dist-info/METADATA,sha256=Qz_ePmFWt4poceUJnVcldvhJoIRWuo2lEIEoVp-mnwE,7030
154
+ natural_pdf-0.1.8.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
155
+ natural_pdf-0.1.8.dist-info/top_level.txt,sha256=7nDKUnpkN7B8cBI7DEpW5JM8S7OcOgHw3jXH-1iCX2o,32
156
+ natural_pdf-0.1.8.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (79.0.0)
2
+ Generator: setuptools (79.0.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5