natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +2 -0
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +321 -15
- natural_pdf/core/element_manager.py +67 -0
- natural_pdf/core/page.py +227 -64
- natural_pdf/core/pdf.py +387 -378
- natural_pdf/elements/collections.py +272 -41
- natural_pdf/elements/region.py +99 -15
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_manager.py +85 -25
- natural_pdf/ocr/ocr_options.py +33 -10
- natural_pdf/ocr/utils.py +14 -3
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/text_extraction.py +52 -1
- natural_pdf/utils/tqdm_utils.py +43 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -115,10 +115,8 @@ buffered_max_length
|
|
115
115
|
```
|
116
116
|
|
117
117
|
```python
|
118
|
-
import shutil
|
119
|
-
from datetime import datetime
|
120
|
-
|
121
118
|
MAX_ALLOWED = buffered_max_length
|
119
|
+
MIN_ALLOWED = 3
|
122
120
|
removed = 0
|
123
121
|
cleaned_lines = []
|
124
122
|
|
@@ -130,6 +128,12 @@ for i, line in enumerate(original_lines):
|
|
130
128
|
if len(parts) == 2 and len(parts[1]) > MAX_ALLOWED:
|
131
129
|
removed += 1
|
132
130
|
print(f"⚠️ Line {i} exceeds max_text_length: {len(parts[1])} chars: {parts[1]}")
|
131
|
+
elif len(parts[1]) < MIN_ALLOWED:
|
132
|
+
removed += 1
|
133
|
+
print(f"⚠️ Line {i} under min_text_length: {len(parts[1])} chars: {parts[1]}")
|
134
|
+
elif "Sorry, I can't" in parts[1]:
|
135
|
+
removed += 1
|
136
|
+
print(f"⚠️ Line {i} was not OCR'd correctly")
|
133
137
|
else:
|
134
138
|
cleaned_lines.append(line)
|
135
139
|
|
@@ -284,7 +288,7 @@ We need the PaddleOCR repository for its training scripts. Once we have it we'll
|
|
284
288
|
# Start training!
|
285
289
|
# -c points to our config file
|
286
290
|
# -o Override specific config options if needed (e.g., Global.epoch_num=10)
|
287
|
-
!python paddleocr_repo/tools/train.py -c
|
291
|
+
!python paddleocr_repo/tools/train.py -c finetune_rec.yml
|
288
292
|
```
|
289
293
|
|
290
294
|
Training will begin, printing logs and saving checkpoints to the directory specified in `Global.save_model_dir` (`./output/finetune_rec/` in the example). Monitor the accuracy (`acc`) and loss on the training and validation sets. You can stop training early if validation accuracy plateaus or starts to decrease.
|
@@ -329,10 +333,11 @@ ocr = PaddleOCR(
|
|
329
333
|
with open("finetune_data/val.txt", encoding="utf-8") as f:
|
330
334
|
line = random.choice([l.strip() for l in f if l.strip()])
|
331
335
|
img_path, ground_truth = line.split(maxsplit=1)
|
336
|
+
img_path = "finetune_data/" + img_path
|
332
337
|
|
333
338
|
# Run inference
|
334
339
|
result = ocr.ocr(img_path, det=False)
|
335
|
-
prediction = result[0][0][1]
|
340
|
+
prediction = result[0][0][1] if result else '[No result]'
|
336
341
|
|
337
342
|
# Display
|
338
343
|
display(Image(filename=img_path))
|
@@ -2,7 +2,7 @@
|
|
2
2
|
import logging
|
3
3
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
4
4
|
|
5
|
-
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
5
|
+
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox, cluster_objects
|
6
6
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
7
7
|
|
8
8
|
if TYPE_CHECKING:
|
@@ -11,6 +11,57 @@ if TYPE_CHECKING:
|
|
11
11
|
logger = logging.getLogger(__name__)
|
12
12
|
|
13
13
|
|
14
|
+
def _get_layout_kwargs(
|
15
|
+
layout_context_bbox: Optional[Tuple[float, float, float, float]] = None,
|
16
|
+
user_kwargs: Optional[Dict[str, Any]] = None,
|
17
|
+
) -> Dict[str, Any]:
|
18
|
+
"""
|
19
|
+
Prepares the keyword arguments for pdfplumber's chars_to_textmap based
|
20
|
+
on defaults, context bbox, and allowed user overrides.
|
21
|
+
"""
|
22
|
+
# 1. Start with an empty dict for layout kwargs
|
23
|
+
layout_kwargs = {}
|
24
|
+
|
25
|
+
# Build allowed keys set without trying to copy the constants
|
26
|
+
allowed_keys = set(TEXTMAP_KWARGS) | set(WORD_EXTRACTOR_KWARGS)
|
27
|
+
|
28
|
+
# Add common, well-known default values
|
29
|
+
layout_kwargs.update({
|
30
|
+
'x_tolerance': 5,
|
31
|
+
'y_tolerance': 5,
|
32
|
+
'x_density': 7.25,
|
33
|
+
'y_density': 13,
|
34
|
+
'mode': 'box',
|
35
|
+
'min_words_vertical': 1,
|
36
|
+
'min_words_horizontal': 1,
|
37
|
+
})
|
38
|
+
|
39
|
+
# 2. Apply context if provided
|
40
|
+
if layout_context_bbox:
|
41
|
+
ctx_x0, ctx_top, ctx_x1, ctx_bottom = layout_context_bbox
|
42
|
+
layout_kwargs["layout_width"] = ctx_x1 - ctx_x0
|
43
|
+
layout_kwargs["layout_height"] = ctx_bottom - ctx_top
|
44
|
+
layout_kwargs["x_shift"] = ctx_x0
|
45
|
+
layout_kwargs["y_shift"] = ctx_top
|
46
|
+
# Add layout_bbox itself
|
47
|
+
layout_kwargs["layout_bbox"] = layout_context_bbox
|
48
|
+
|
49
|
+
# 3. Apply user overrides (only for allowed keys)
|
50
|
+
if user_kwargs:
|
51
|
+
for key, value in user_kwargs.items():
|
52
|
+
if key in allowed_keys:
|
53
|
+
layout_kwargs[key] = value
|
54
|
+
elif key == 'layout': # Always allow layout flag
|
55
|
+
layout_kwargs[key] = value
|
56
|
+
else:
|
57
|
+
logger.warning(f"Ignoring unsupported layout keyword argument: '{key}'")
|
58
|
+
|
59
|
+
# 4. Ensure layout flag is present, defaulting to True
|
60
|
+
if 'layout' not in layout_kwargs:
|
61
|
+
layout_kwargs['layout'] = True
|
62
|
+
|
63
|
+
return layout_kwargs
|
64
|
+
|
14
65
|
def filter_chars_spatially(
|
15
66
|
char_dicts: List[Dict[str, Any]],
|
16
67
|
exclusion_regions: List["Region"],
|
@@ -0,0 +1,43 @@
|
|
1
|
+
import sys
|
2
|
+
import os
|
3
|
+
|
4
|
+
# Default to standard tqdm
|
5
|
+
try:
|
6
|
+
from tqdm.std import tqdm as selected_tqdm
|
7
|
+
except ImportError:
|
8
|
+
# Basic fallback if even std is missing (though unlikely)
|
9
|
+
def selected_tqdm(*args, **kwargs):
|
10
|
+
iterable = args[0] if args else None
|
11
|
+
if iterable:
|
12
|
+
return iterable
|
13
|
+
return None # Simple passthrough if no iterable
|
14
|
+
|
15
|
+
# Try to detect notebook environment
|
16
|
+
try:
|
17
|
+
# Check 1: Are we running in an IPython kernel?
|
18
|
+
from IPython import get_ipython
|
19
|
+
ipython = get_ipython()
|
20
|
+
if ipython and 'IPKernelApp' in ipython.config:
|
21
|
+
# Check 2: Is it likely a notebook UI (Jupyter Notebook/Lab, VSCode, etc.)?
|
22
|
+
# This checks for common indicators. Might not be foolproof.
|
23
|
+
if 'VSCODE_PID' in os.environ or ('ipykernel' in sys.modules and 'spyder' not in sys.modules):
|
24
|
+
# Check 3: Can we import notebook version?
|
25
|
+
try:
|
26
|
+
from tqdm.notebook import tqdm as notebook_tqdm
|
27
|
+
selected_tqdm = notebook_tqdm # Use notebook version
|
28
|
+
except ImportError:
|
29
|
+
pass # Stick with std if notebook version missing
|
30
|
+
except ImportError:
|
31
|
+
pass # Stick with std if IPython not available
|
32
|
+
|
33
|
+
def get_tqdm():
|
34
|
+
"""Returns the tqdm class best suited for the detected environment."""
|
35
|
+
return selected_tqdm
|
36
|
+
|
37
|
+
# Example usage (for testing):
|
38
|
+
if __name__ == '__main__':
|
39
|
+
import time
|
40
|
+
tqdm_instance = get_tqdm()
|
41
|
+
print(f"Using tqdm class: {tqdm_instance}")
|
42
|
+
for i in tqdm_instance(range(10), desc="Testing tqdm"):
|
43
|
+
time.sleep(0.1)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.8
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -44,6 +44,10 @@ Requires-Dist: natural-pdf[core-ml]; extra == "docling"
|
|
44
44
|
Provides-Extra: llm
|
45
45
|
Requires-Dist: openai>=1.0; extra == "llm"
|
46
46
|
Requires-Dist: pydantic; extra == "llm"
|
47
|
+
Provides-Extra: classification
|
48
|
+
Requires-Dist: sentence-transformers; extra == "classification"
|
49
|
+
Requires-Dist: timm; extra == "classification"
|
50
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "classification"
|
47
51
|
Provides-Extra: test
|
48
52
|
Requires-Dist: pytest; extra == "test"
|
49
53
|
Provides-Extra: dev
|
@@ -70,6 +74,7 @@ Requires-Dist: natural-pdf[qa]; extra == "all"
|
|
70
74
|
Requires-Dist: natural-pdf[ocr-export]; extra == "all"
|
71
75
|
Requires-Dist: natural-pdf[docling]; extra == "all"
|
72
76
|
Requires-Dist: natural-pdf[llm]; extra == "all"
|
77
|
+
Requires-Dist: natural-pdf[classification]; extra == "all"
|
73
78
|
Requires-Dist: natural-pdf[test]; extra == "all"
|
74
79
|
Provides-Extra: core-ml
|
75
80
|
Requires-Dist: torch; extra == "core-ml"
|
@@ -1,4 +1,4 @@
|
|
1
|
-
docs/index.md,sha256=
|
1
|
+
docs/index.md,sha256=FG4MYQs-gUR16NQ4XF0AVoQeLuykLeY8XxNwW3h-qUM,5572
|
2
2
|
docs/api/index.md,sha256=4bn8nYklWJuNDrnY-Kt7sf7IejeAEDhcnqYmjH9GJTA,22405
|
3
3
|
docs/assets/favicon.png,sha256=nxca8jM2Y4GxZKzkmagUHO1GpUREK-GRA5LEFue9OOU,284
|
4
4
|
docs/assets/favicon.svg,sha256=nxca8jM2Y4GxZKzkmagUHO1GpUREK-GRA5LEFue9OOU,284
|
@@ -8,17 +8,19 @@ docs/assets/social-preview.png,sha256=AvyzzM8dC0j5SPFF63bvQrxU4GE1f9j-GUNUv0oA9t
|
|
8
8
|
docs/assets/social-preview.svg,sha256=AvyzzM8dC0j5SPFF63bvQrxU4GE1f9j-GUNUv0oA9ts,1085
|
9
9
|
docs/assets/javascripts/custom.js,sha256=0NVHGprwiLPFYdYunJcHjOphzk_EhBSNuOUz5Uzdv_k,594
|
10
10
|
docs/assets/stylesheets/custom.css,sha256=PbTp3k77gzUBUQQ01pDXzpNwo4wUv3aJD-SMBQvQItY,1156
|
11
|
+
docs/categorizing-documents/index.md,sha256=tgKfv3DidZysrFhaOEM-FiIVDAzNPPnK02sKaE5pE2I,8196
|
12
|
+
docs/data-extraction/index.md,sha256=LwQ2MJVI5u5ELI51Iq0WUdDo5sl_s18GWG_cBABI8fQ,3430
|
11
13
|
docs/document-qa/index.ipynb,sha256=MXJoFhi8TUKK6ZnRFiUBglLGpMbzwdb7LJYfzw8Gp48,528713
|
12
14
|
docs/document-qa/index.md,sha256=mwuO4tothg0OzBXewnj73QEJu46Udq7f1pQBYrKOHwM,2131
|
13
|
-
docs/element-selection/index.ipynb,sha256
|
14
|
-
docs/element-selection/index.md,sha256=
|
15
|
+
docs/element-selection/index.ipynb,sha256=WuKd3bTTOnzBDfbuzkxmJxO6EzM9RAkFXoF0U3-8qRA,1223398
|
16
|
+
docs/element-selection/index.md,sha256=ZUkOD6VVK11K6WQ86FPnTeeco27PrFWtkObKw8j6Fok,7867
|
15
17
|
docs/finetuning/index.md,sha256=Ur3zqSaR0X8PvBCSyI7cFiDv5qZ6Jtv4omBKXCKAzEk,9200
|
16
18
|
docs/installation/index.md,sha256=nd4RZrQFR8_vv7Xm3xAzp7z-CQQr9ffAcGa7yuEYn2U,1594
|
17
19
|
docs/interactive-widget/index.ipynb,sha256=zY1rz5N34OUW-OtgcbI6iiOjlIJqXjVcx9OoNWMjuyU,293111
|
18
20
|
docs/interactive-widget/index.md,sha256=tZbq0uYI7Zwo9mLbhXpqeBriuAjazkIyEJeP-jasJ-Q,259
|
19
21
|
docs/layout-analysis/index.ipynb,sha256=dkS_-cu-KGir5G2LGRcxBThKnW0dfA5nPPnwpoYGFtU,1869093
|
20
22
|
docs/layout-analysis/index.md,sha256=ZnH5yd7B_eOLgGxW_4rNlzQs4Tn3Xx1cK3jX43CSpSM,5390
|
21
|
-
docs/ocr/index.md,sha256=
|
23
|
+
docs/ocr/index.md,sha256=BR8a3_X6zng5yAo8O8isOBhb2Gm9hM9FIasc58aYF78,11137
|
22
24
|
docs/pdf-navigation/index.ipynb,sha256=h6yew0HePXK1_c5FmETqzjBQceUBT0MU-vnXx_y91mo,8018
|
23
25
|
docs/pdf-navigation/index.md,sha256=P3b3tsmOcmRtnfRxpsMeTgwm7vApnH_4le_QIwJd51M,2391
|
24
26
|
docs/regions/index.ipynb,sha256=5A-N5A4v4lcXNptOAeI4i7i9Gx66To-Yus8B816dHBk,1303347
|
@@ -29,38 +31,38 @@ docs/text-analysis/index.ipynb,sha256=iaup8pcQXGp0ZK3IWi-HHssQLdIzWYGYfvZK5i8yjj
|
|
29
31
|
docs/text-analysis/index.md,sha256=02pfZemOgV37izV7H-XzKmHu7AedDKLidQ-sKhYaMVw,3527
|
30
32
|
docs/text-extraction/index.ipynb,sha256=809y9ZamXT3bc3GhwwFyoDnlyEpO-kUZ3tIsZZWyrj8,2537087
|
31
33
|
docs/text-extraction/index.md,sha256=b1KfQpvIEelc8cPbFETUnK92az7iB4b7-LqK2DRH8vw,6985
|
32
|
-
docs/tutorials/01-loading-and-extraction.ipynb,sha256=
|
34
|
+
docs/tutorials/01-loading-and-extraction.ipynb,sha256=2vGLM1_2_Xcpn32HvMLXj_Ro8w4HPofSZNpxZ1qPtL8,520140
|
33
35
|
docs/tutorials/01-loading-and-extraction.md,sha256=g40J8GhKz-ikM2URj5MqIatKKj4l5kTFozHeVjxDJQA,2191
|
34
|
-
docs/tutorials/02-finding-elements.ipynb,sha256=
|
36
|
+
docs/tutorials/02-finding-elements.ipynb,sha256=yVW3B578mKXkFUWJQnBaDB0SlnNodROjemMbdx-LWBw,524506
|
35
37
|
docs/tutorials/02-finding-elements.md,sha256=qOkjcWUzem05of54aKzKvy-MMzRX_S4CyZisVV-73QM,4162
|
36
|
-
docs/tutorials/03-extracting-blocks.ipynb,sha256=
|
38
|
+
docs/tutorials/03-extracting-blocks.ipynb,sha256=qifBv5bsKcZIQVQAHtl84GqD6Wy-IZiUMkSXURCu3ug,263329
|
37
39
|
docs/tutorials/03-extracting-blocks.md,sha256=_kqvhk6rSL7cGp2MSwTJk8LYlJGbK_r_umnCSBdR8XU,1665
|
38
|
-
docs/tutorials/04-table-extraction.ipynb,sha256=
|
40
|
+
docs/tutorials/04-table-extraction.ipynb,sha256=Jj2OzN32I5z1_gfMVgdr2GGyEgbWTgI7harwMWfHxYc,4089
|
39
41
|
docs/tutorials/04-table-extraction.md,sha256=4q4v17VX8K-ZBtWYy0nbWPccyqB_ybd5Vl_IROmxz6Q,2130
|
40
|
-
docs/tutorials/05-excluding-content.ipynb,sha256=
|
42
|
+
docs/tutorials/05-excluding-content.ipynb,sha256=EaZwfDJK3BUghY1iwQ4qR8Z9nXf9e8QUeHxvJmZ3xsw,336933
|
41
43
|
docs/tutorials/05-excluding-content.md,sha256=U52SPlc5knqxiyhRokmxrj06T54r2ENyTfP7BMGykhY,3907
|
42
|
-
docs/tutorials/06-document-qa.ipynb,sha256=
|
44
|
+
docs/tutorials/06-document-qa.ipynb,sha256=sGesxP26CMSD2GD-47dXq7EnqK3tlEDzM-uu7sZVR2E,10421
|
43
45
|
docs/tutorials/06-document-qa.md,sha256=PzPPgw0Rkkfe6sfz3XyKD9S9JbQ40qf4bDzCBvwH1P0,3026
|
44
|
-
docs/tutorials/07-layout-analysis.ipynb,sha256=
|
46
|
+
docs/tutorials/07-layout-analysis.ipynb,sha256=DgyocqPRt9Rxsz-Sjgi153MvvnoDF3Vpsyhq27N72sE,571321
|
45
47
|
docs/tutorials/07-layout-analysis.md,sha256=NAYVzJTecDnXjo_isbPCSUBSn3c-xM1tELct1Zn5GmI,2533
|
46
|
-
docs/tutorials/07-working-with-regions.ipynb,sha256=
|
48
|
+
docs/tutorials/07-working-with-regions.ipynb,sha256=JMUnjQ_tCBqs4dWIyZ2jNHQCnJkwAzTJuxQVRGBqLqI,67945
|
47
49
|
docs/tutorials/07-working-with-regions.md,sha256=oanbTFSQ-topAVd9kjfkaPiMjHcx6Y8cqyxVbmxLhgs,4365
|
48
|
-
docs/tutorials/08-spatial-navigation.ipynb,sha256=
|
50
|
+
docs/tutorials/08-spatial-navigation.ipynb,sha256=Q0N-az8ZiaMmS42HXMnpDYp97Z_9YPXfM-azC9Sf_f8,186624
|
49
51
|
docs/tutorials/08-spatial-navigation.md,sha256=IMbOYBjayXKE7pHfBjApTxOoKRD8WYj7opf8fsJCtzA,4855
|
50
|
-
docs/tutorials/09-section-extraction.ipynb,sha256=
|
52
|
+
docs/tutorials/09-section-extraction.ipynb,sha256=CPBXw28Y7WjWE3HY5SJlUnGlOFQQQ0ZUB65c_uVissA,1101081
|
51
53
|
docs/tutorials/09-section-extraction.md,sha256=Jy_be8ftAl_VPBWl5nEv7_5sKSZPx22DLUcBVHMD3Nc,7832
|
52
|
-
docs/tutorials/10-form-field-extraction.ipynb,sha256=
|
54
|
+
docs/tutorials/10-form-field-extraction.ipynb,sha256=S0S5cdnrioweeKVjdRQnZptUEG-b0VvgrROkOygjAzk,268148
|
53
55
|
docs/tutorials/10-form-field-extraction.md,sha256=t9tPlW36vJEhDrKIsHGg_f3P_MK62DT4-ZK1thKFs4Y,5494
|
54
|
-
docs/tutorials/11-enhanced-table-processing.ipynb,sha256=
|
56
|
+
docs/tutorials/11-enhanced-table-processing.ipynb,sha256=2i8gQRwkLDH14Yie56-3K5YIhdaR83XbL7m-8pQ5cJU,1394
|
55
57
|
docs/tutorials/11-enhanced-table-processing.md,sha256=2HK-r1UwU7FLn7zWr_pMG7iLk-i0L4U4-t6ubOEeduc,282
|
56
|
-
docs/tutorials/12-ocr-integration.ipynb,sha256=
|
57
|
-
docs/tutorials/12-ocr-integration.md,sha256
|
58
|
-
docs/tutorials/13-semantic-search.ipynb,sha256=
|
58
|
+
docs/tutorials/12-ocr-integration.ipynb,sha256=DB1pWJG1vW4aNVdQ2g5w42a71TFThmzObaVQs8h63U0,194084
|
59
|
+
docs/tutorials/12-ocr-integration.md,sha256=-IW4wqLb10eOIWC00NHTGXwtD6jDv7Tp7d-UCOk9SuE,5057
|
60
|
+
docs/tutorials/13-semantic-search.ipynb,sha256=BwFepMsOuHrWTFqczvxikPgTh5o97sYX4uleylnOBmc,54126
|
59
61
|
docs/tutorials/13-semantic-search.md,sha256=nsNjv0ipYUC3YPSqT5d6dga9ZjObEc04Mc8c0-gsRnU,2914
|
60
62
|
docs/visual-debugging/index.ipynb,sha256=MJ92u3Q9sfRCyDAQM4KWmCrs4QhKwIagbn6ytPF83L4,2175800
|
61
63
|
docs/visual-debugging/index.md,sha256=ueGD2kNFhEAgIHt7qxCfrLRLjHcR7NTD3AU9okBhX9k,4176
|
62
64
|
docs/visual-debugging/region.png,sha256=ULAJs3ZTxMjpD9F4w1DKaZXmhxga3KRq3NrUsXgw28s,67835
|
63
|
-
natural_pdf/__init__.py,sha256=
|
65
|
+
natural_pdf/__init__.py,sha256=aCnIBTYZlUCL1j78sScPX8kXF88JnuQSHsErboTcjnM,2727
|
64
66
|
natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
|
65
67
|
natural_pdf/analyzers/text_options.py,sha256=nE2E1pp4psDPpxmtarvNtEQsgozPkyFRjv0TVP2HTyU,2865
|
66
68
|
natural_pdf/analyzers/text_structure.py,sha256=9h8hKRz0JWnr13xQr3b4FFr_-hDIjue07WvG7LmT8nc,12827
|
@@ -76,34 +78,41 @@ natural_pdf/analyzers/layout/paddle.py,sha256=gTI9ZqNd5-t4H5IByGfL32WgcE6JrdchW6
|
|
76
78
|
natural_pdf/analyzers/layout/surya.py,sha256=vhji6ynHPMyQLHuYRPQcplNi7m_lG4P4NYtWv6MzcME,13556
|
77
79
|
natural_pdf/analyzers/layout/tatr.py,sha256=-GJhMy4d0yx6egkO9-ULAIdQkkQRyAKExoIta-b256U,12971
|
78
80
|
natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
|
79
|
-
natural_pdf/
|
81
|
+
natural_pdf/classification/manager.py,sha256=pLcEDe1a5QARJCMimE5Ul_HKZD4jX-eREUCeUuniA0U,16445
|
82
|
+
natural_pdf/classification/mixin.py,sha256=aySe0bEjkaI9qYDmSkQe536w0Xrxcg4j6k3JGPvj-cY,6737
|
83
|
+
natural_pdf/classification/results.py,sha256=Hn-3xDSThR8x7XpoTlQLWpX6JE1VHVe2QpOeWNY2Ycw,2949
|
84
|
+
natural_pdf/collections/mixins.py,sha256=BXk4o_PRrczSXjR7vorIEe4WyEKyms4_qYnY8ZAZd-A,2737
|
85
|
+
natural_pdf/collections/pdf_collection.py,sha256=F_4Z-nrL9wFQ-mt4T4cJ2ERVUnkh2kyQdmOV8ASBgoM,27281
|
80
86
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
81
|
-
natural_pdf/core/element_manager.py,sha256=
|
87
|
+
natural_pdf/core/element_manager.py,sha256=KZ9yNtpFwuImDWmFUXgISAoWQdSib93E4t3ILUZzIic,24805
|
82
88
|
natural_pdf/core/highlighting_service.py,sha256=CTVd7y-fpIreFSe70cTpMu1Pwl6HKMtTHp0bh2U7VXk,32609
|
83
|
-
natural_pdf/core/page.py,sha256=
|
84
|
-
natural_pdf/core/pdf.py,sha256=
|
89
|
+
natural_pdf/core/page.py,sha256=4iykmXdVwmSQOpGukTxfJYU-5XEgSafNbKsnIedVaGA,94051
|
90
|
+
natural_pdf/core/pdf.py,sha256=yPAaOv5vNKZlC9oVk5sKsFxb4LdoRygz_Qkp2EaDtOY,43074
|
85
91
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
86
92
|
natural_pdf/elements/base.py,sha256=UtoSD-c_s0yiLpWZrIIJjeJ9MgGz_4R0UHYcsFWH6bc,35157
|
87
|
-
natural_pdf/elements/collections.py,sha256=
|
93
|
+
natural_pdf/elements/collections.py,sha256=CCQVgglxWLfhuy4FZvVHXdmgiZxU27Ay7Myt8ttQYWg,79467
|
88
94
|
natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
|
89
95
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
90
|
-
natural_pdf/elements/region.py,sha256=
|
91
|
-
natural_pdf/elements/text.py,sha256=
|
96
|
+
natural_pdf/elements/region.py,sha256=f7ArCPizkosIei9ePixHYqedK3K6LBVJotwKZ-y33a0,74058
|
97
|
+
natural_pdf/elements/text.py,sha256=ZpPluwZtAVfOkoeM4Fm2PDsN87BBZduURZaFWns03RM,11158
|
92
98
|
natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
|
93
99
|
natural_pdf/exporters/base.py,sha256=s-NpHoH81x80GQxs0oqjdhPGrzbUa8npjnK8apKOsHQ,2115
|
94
|
-
natural_pdf/exporters/paddleocr.py,sha256=
|
100
|
+
natural_pdf/exporters/paddleocr.py,sha256=vyVetJ6RgEY46qS5Yl5mKl4cSJadwOxLWGGsdiDjico,16248
|
95
101
|
natural_pdf/exporters/searchable_pdf.py,sha256=qsaPsnbOOaZHA_aplfZbwQnBoK9KghWm-wzbyRRomeY,16859
|
102
|
+
natural_pdf/extraction/manager.py,sha256=YH5dyUorMItGxuaZ-DhuJD5Sh_Ozjj0fa-WBMcQw1E0,4903
|
103
|
+
natural_pdf/extraction/mixin.py,sha256=6CWYyutGcKCxFVYun8yXC4H1IZWLMXaeYZ-cWJRx5FE,11430
|
104
|
+
natural_pdf/extraction/result.py,sha256=ihY1g_C2hsMACYqU7bcvAKRijuh-FHVtpnn0uoP--pk,1047
|
96
105
|
natural_pdf/ocr/__init__.py,sha256=jKaDbo13CdCDcas1WiBmg5gjBvVeG-Z9uaeYxyzvaNY,2464
|
97
106
|
natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,8741
|
98
|
-
natural_pdf/ocr/engine_easyocr.py,sha256=
|
107
|
+
natural_pdf/ocr/engine_easyocr.py,sha256=9TbxJjmhWFrzM8mcNnZjoRtIDr6gwpuwKm4-Zfub2-8,9281
|
99
108
|
natural_pdf/ocr/engine_paddle.py,sha256=2nIrvLBBAiZG1BxVo3eFVJulA6YGoOTXw_RN98p_BUk,6184
|
100
109
|
natural_pdf/ocr/engine_surya.py,sha256=iySjG-Dahgh0cLICfbMtOcwUpRFcZjo-5Ed5Zwz-o5Y,4805
|
101
110
|
natural_pdf/ocr/ocr_factory.py,sha256=IFccj0BB75YGV4hjcy4ECtGQX_JQzdptpvDFfeGxxgI,4391
|
102
|
-
natural_pdf/ocr/ocr_manager.py,sha256=
|
103
|
-
natural_pdf/ocr/ocr_options.py,sha256=
|
104
|
-
natural_pdf/ocr/utils.py,sha256=
|
111
|
+
natural_pdf/ocr/ocr_manager.py,sha256=ivk4Aqr5gsDJWiCxP1-FLkhuvfJiQtilwbPtgIPm--4,13320
|
112
|
+
natural_pdf/ocr/ocr_options.py,sha256=BcPVwJGYE3vMug7wsVh_ARUJlm_4emz9ynOAwYgwHBk,4257
|
113
|
+
natural_pdf/ocr/utils.py,sha256=4b_A47hfynfV00iR8I9OWmXCzDzRvSdEkQhZLcSV4kQ,4394
|
105
114
|
natural_pdf/qa/__init__.py,sha256=Pjo62JTnUNEjGNsC437mvsS5KQ5m7X_BibGvavR9AW0,108
|
106
|
-
natural_pdf/qa/document_qa.py,sha256=
|
115
|
+
natural_pdf/qa/document_qa.py,sha256=Jw4yyq3Vifn57D0ANmOfUlZeG8CJjBkItZBV-8ZAmos,15111
|
107
116
|
natural_pdf/search/__init__.py,sha256=EB_HRwlktJn5WGPVtSaRbOQNjLAZTxujeYf_eN-zd2U,4191
|
108
117
|
natural_pdf/search/haystack_search_service.py,sha256=6RjTFWbTo3gaO-90IF6PEuo_9WRwOdj232eWn3OT0BQ,29270
|
109
118
|
natural_pdf/search/haystack_utils.py,sha256=UI4eu3SVieGR_QnBtLhP8Fjtt2AJgeLgxrpa_dBmD6k,19289
|
@@ -111,9 +120,9 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
|
|
111
120
|
natural_pdf/search/search_service_protocol.py,sha256=ybNcF_NoLZuIx0rb4XB1dsDl3o_LAaWR1fVVKld2TxI,6818
|
112
121
|
natural_pdf/search/searchable_mixin.py,sha256=M2a6FaFVM0vcfh7FgjDH6BLhS-7ggeVpcfft4OOBDxY,26390
|
113
122
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
114
|
-
natural_pdf/selectors/parser.py,sha256=
|
123
|
+
natural_pdf/selectors/parser.py,sha256=AKXGv4MaZDiaWT_jSfn_vU-qVlECB8b-IxnyocXtaaE,22671
|
115
124
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
116
|
-
natural_pdf/templates/finetune/fine_tune_paddleocr.md,sha256=
|
125
|
+
natural_pdf/templates/finetune/fine_tune_paddleocr.md,sha256=H6Wmu3Nvi2qKK-rPwr8KUZfILzXz8VmWyCWYOTe6QCI,14764
|
117
126
|
natural_pdf/templates/spa/index.html,sha256=6hLTp07OeV5Q4jUMp5Sgl-dwfBs3oPzBxqphG4kEs24,787
|
118
127
|
natural_pdf/templates/spa/words.txt,sha256=vkGtl5Y7-Nq-3Vhx1daRWWF1Jp1UCVaw-ZZaiFwrurk,2493885
|
119
128
|
natural_pdf/templates/spa/css/style.css,sha256=Qdl0U3L5HMyhBDNzyRPklfb3OxW6rMxCfQbzO8i8IW4,7643
|
@@ -122,14 +131,16 @@ natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg
|
|
122
131
|
natural_pdf/utils/debug.py,sha256=lk_6qzxan8NagjEtJEZpZ2MS30SO8ce6iznBxmA0xgk,995
|
123
132
|
natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
|
124
133
|
natural_pdf/utils/identifiers.py,sha256=n61viCQiMlf5-E_jsPLe-FkPBdKkMKv-gfs5tGqlKiw,1117
|
134
|
+
natural_pdf/utils/locks.py,sha256=E_Fb6GnRNq-tF5aE7jnllkpidsNr8LXPhSaqgr56Ks4,215
|
125
135
|
natural_pdf/utils/packaging.py,sha256=HSgpubpHICU75L4ZAZPU8iOjium055XWnklV9_YqoCA,21579
|
126
136
|
natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
|
127
|
-
natural_pdf/utils/text_extraction.py,sha256=
|
137
|
+
natural_pdf/utils/text_extraction.py,sha256=qZfOuO57XeKg7p-Q7yzTBMTrpAvDRslYXjDSjiJLStI,9545
|
138
|
+
natural_pdf/utils/tqdm_utils.py,sha256=bKWvsoAOl0lPOPLJC2hkTtkdxBf5f9aVtcA3DmUE19M,1570
|
128
139
|
natural_pdf/utils/visualization.py,sha256=5GbhxtvZW-77ONVnICupg-s2D-OaxLZNqkKlOrQESK4,8593
|
129
140
|
natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
|
130
141
|
natural_pdf/widgets/viewer.py,sha256=Aiw6kuBc0WkhcZrPNKyLNzzWbmtmU6rvOmHV0IuXCBk,40862
|
131
142
|
natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
|
132
|
-
natural_pdf-0.1.
|
143
|
+
natural_pdf-0.1.8.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
133
144
|
notebooks/Examples.ipynb,sha256=l4YMtMEx_DWBzWIjl9CmBkWTo0g_nK8l_XWOyzYooQM,4275170
|
134
145
|
pdfs/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
135
146
|
pdfs/01-practice.pdf,sha256=dxWyJIa2cm7bALE3BWDJ2dg3inyFlo1n8ntVyy0hkTo,7906
|
@@ -139,7 +150,7 @@ pdfs/2014 Statistics.pdf,sha256=B-30OQVjqj_3718-G9cGUefNddnz-MosPdHAzfGfkcc,9559
|
|
139
150
|
pdfs/2019 Statistics.pdf,sha256=reuSJxvAlx9_P-pW7IPqzox0jFCxSPbK1i1-WFu-uGA,511439
|
140
151
|
pdfs/Atlanta_Public_Schools_GA_sample.pdf,sha256=PLBh_uWJQH0MnBaSm5ng5Ima63_m6Mi11CjdravB_S8,137689
|
141
152
|
pdfs/needs-ocr.pdf,sha256=vusKiLxSOlELUTetfZfaotNU54RtMj9PCzGfLc2cuNs,139305
|
142
|
-
natural_pdf-0.1.
|
143
|
-
natural_pdf-0.1.
|
144
|
-
natural_pdf-0.1.
|
145
|
-
natural_pdf-0.1.
|
153
|
+
natural_pdf-0.1.8.dist-info/METADATA,sha256=Qz_ePmFWt4poceUJnVcldvhJoIRWuo2lEIEoVp-mnwE,7030
|
154
|
+
natural_pdf-0.1.8.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
155
|
+
natural_pdf-0.1.8.dist-info/top_level.txt,sha256=7nDKUnpkN7B8cBI7DEpW5JM8S7OcOgHw3jXH-1iCX2o,32
|
156
|
+
natural_pdf-0.1.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|