natural-pdf 25.3.16.2__py3-none-any.whl → 25.3.17.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/direct_qa_example.py +17 -111
- examples/docling_comprehensive_test.py +325 -0
- examples/docling_example.py +192 -0
- examples/docling_hierarchy_example.py +230 -0
- examples/docling_text_sources.py +241 -0
- examples/improved_qa_example.py +66 -0
- natural_pdf/analyzers/document_layout.py +276 -0
- natural_pdf/core/page.py +72 -21
- natural_pdf/core/pdf.py +50 -68
- natural_pdf/elements/region.py +174 -19
- natural_pdf/qa/document_qa.py +29 -38
- natural_pdf/selectors/parser.py +6 -2
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/METADATA +19 -2
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/RECORD +17 -12
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/LICENSE +0 -0
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/WHEEL +0 -0
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -18,7 +18,7 @@ class Region:
|
|
18
18
|
Represents a rectangular region on a page.
|
19
19
|
"""
|
20
20
|
|
21
|
-
def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None):
|
21
|
+
def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None):
|
22
22
|
"""
|
23
23
|
Initialize a region.
|
24
24
|
|
@@ -26,6 +26,7 @@ class Region:
|
|
26
26
|
page: Parent page
|
27
27
|
bbox: Bounding box as (x0, top, x1, bottom)
|
28
28
|
polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
|
29
|
+
parent: Optional parent region (for hierarchical document structure)
|
29
30
|
"""
|
30
31
|
self._page = page
|
31
32
|
self._bbox = bbox
|
@@ -48,6 +49,12 @@ class Region:
|
|
48
49
|
# Region management attributes
|
49
50
|
self.name = None
|
50
51
|
self.source = None # Will be set by creation methods
|
52
|
+
|
53
|
+
# Hierarchy support for nested document structure
|
54
|
+
self.parent_region = parent
|
55
|
+
self.child_regions = []
|
56
|
+
self.text_content = None # Direct text content (e.g., from Docling)
|
57
|
+
self.associated_text_elements = [] # Native text elements that overlap with this region
|
51
58
|
|
52
59
|
@property
|
53
60
|
def page(self) -> 'Page':
|
@@ -387,6 +394,11 @@ class Region:
|
|
387
394
|
"""
|
388
395
|
Extract text from this region using pdfplumber's native functionality.
|
389
396
|
|
397
|
+
For regions created by Docling, this will first try to use:
|
398
|
+
1. Associated text elements from the PDF (if available)
|
399
|
+
2. Direct text content from Docling (if available)
|
400
|
+
3. Fall back to standard pdfplumber extraction
|
401
|
+
|
390
402
|
Args:
|
391
403
|
keep_blank_chars: Whether to keep blank characters (legacy parameter)
|
392
404
|
apply_exclusions: Whether to apply exclusion regions
|
@@ -398,6 +410,28 @@ class Region:
|
|
398
410
|
Returns:
|
399
411
|
Extracted text as string
|
400
412
|
"""
|
413
|
+
import logging
|
414
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
415
|
+
|
416
|
+
# Check for Docling model or if we have direct text content
|
417
|
+
if self.model == 'docling' or hasattr(self, 'text_content'):
|
418
|
+
# First priority: check if we have associated native text elements
|
419
|
+
if hasattr(self, 'associated_text_elements') and self.associated_text_elements:
|
420
|
+
source_count = len(self.associated_text_elements)
|
421
|
+
logger.info(f"Region {self.region_type}: Using {source_count} native PDF text elements")
|
422
|
+
# Sort elements in reading order
|
423
|
+
sorted_elements = sorted(self.associated_text_elements, key=lambda e: (e.top, e.x0))
|
424
|
+
# Extract and join their text
|
425
|
+
text_result = " ".join(elem.text for elem in sorted_elements)
|
426
|
+
return text_result
|
427
|
+
|
428
|
+
# Second priority: use direct text content from Docling
|
429
|
+
elif self.text_content:
|
430
|
+
logger.info(f"Region {self.region_type}: Using Docling OCR text content")
|
431
|
+
return self.text_content
|
432
|
+
|
433
|
+
logger.debug(f"Region {self.region_type}: No Docling text found, falling back to standard extraction")
|
434
|
+
|
401
435
|
# Handle preserve_whitespace parameter for consistency with Page.extract_text
|
402
436
|
if preserve_whitespace is not None:
|
403
437
|
keep_blank_chars = preserve_whitespace
|
@@ -1346,21 +1380,142 @@ class Region:
|
|
1346
1380
|
"source_elements": list of elements that contain the answer (if found)
|
1347
1381
|
}
|
1348
1382
|
"""
|
1349
|
-
|
1350
|
-
|
1351
|
-
|
1352
|
-
|
1353
|
-
|
1354
|
-
|
1355
|
-
|
1356
|
-
|
1357
|
-
|
1358
|
-
|
1359
|
-
|
1360
|
-
|
1361
|
-
|
1362
|
-
|
1363
|
-
|
1364
|
-
|
1365
|
-
|
1366
|
-
|
1383
|
+
from natural_pdf.qa.document_qa import get_qa_engine
|
1384
|
+
|
1385
|
+
# Get or initialize QA engine with specified model
|
1386
|
+
qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
|
1387
|
+
|
1388
|
+
# Ask the question using the QA engine
|
1389
|
+
|
1390
|
+
def add_child(self, child):
|
1391
|
+
"""
|
1392
|
+
Add a child region to this region.
|
1393
|
+
|
1394
|
+
Used for hierarchical document structure when using models like Docling
|
1395
|
+
that understand document hierarchy.
|
1396
|
+
|
1397
|
+
Args:
|
1398
|
+
child: Region object to add as a child
|
1399
|
+
|
1400
|
+
Returns:
|
1401
|
+
Self for method chaining
|
1402
|
+
"""
|
1403
|
+
self.child_regions.append(child)
|
1404
|
+
child.parent_region = self
|
1405
|
+
return self
|
1406
|
+
|
1407
|
+
def get_children(self, selector=None):
|
1408
|
+
"""
|
1409
|
+
Get immediate child regions, optionally filtered by selector.
|
1410
|
+
|
1411
|
+
Args:
|
1412
|
+
selector: Optional selector to filter children
|
1413
|
+
|
1414
|
+
Returns:
|
1415
|
+
List of child regions matching the selector
|
1416
|
+
"""
|
1417
|
+
import logging
|
1418
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
1419
|
+
|
1420
|
+
if selector is None:
|
1421
|
+
return self.child_regions
|
1422
|
+
|
1423
|
+
# Use existing selector parser to filter
|
1424
|
+
from natural_pdf.selectors.parser import match_elements_with_selector
|
1425
|
+
matched = match_elements_with_selector(self.child_regions, selector)
|
1426
|
+
logger.debug(f"get_children: found {len(matched)} of {len(self.child_regions)} children matching '{selector}'")
|
1427
|
+
return matched
|
1428
|
+
|
1429
|
+
def get_descendants(self, selector=None):
|
1430
|
+
"""
|
1431
|
+
Get all descendant regions (children, grandchildren, etc.), optionally filtered by selector.
|
1432
|
+
|
1433
|
+
Args:
|
1434
|
+
selector: Optional selector to filter descendants
|
1435
|
+
|
1436
|
+
Returns:
|
1437
|
+
List of descendant regions matching the selector
|
1438
|
+
"""
|
1439
|
+
import logging
|
1440
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
1441
|
+
|
1442
|
+
all_descendants = []
|
1443
|
+
|
1444
|
+
# First add direct children
|
1445
|
+
all_descendants.extend(self.child_regions)
|
1446
|
+
|
1447
|
+
# Then recursively add their descendants
|
1448
|
+
for child in self.child_regions:
|
1449
|
+
all_descendants.extend(child.get_descendants())
|
1450
|
+
|
1451
|
+
logger.debug(f"get_descendants: found {len(all_descendants)} total descendants")
|
1452
|
+
|
1453
|
+
# Filter by selector if provided
|
1454
|
+
if selector is not None:
|
1455
|
+
from natural_pdf.selectors.parser import match_elements_with_selector
|
1456
|
+
matched = match_elements_with_selector(all_descendants, selector)
|
1457
|
+
logger.debug(f"get_descendants: filtered to {len(matched)} matching '{selector}'")
|
1458
|
+
return matched
|
1459
|
+
|
1460
|
+
return all_descendants
|
1461
|
+
|
1462
|
+
def find_all(self, selector, recursive=True, **kwargs):
|
1463
|
+
"""
|
1464
|
+
Find all matching elements within this region, with optional recursion through child regions.
|
1465
|
+
|
1466
|
+
Args:
|
1467
|
+
selector: The selector to find elements with
|
1468
|
+
recursive: Whether to search recursively through child regions
|
1469
|
+
**kwargs: Additional parameters to pass to the selector parser
|
1470
|
+
|
1471
|
+
Returns:
|
1472
|
+
Collection of matching elements
|
1473
|
+
"""
|
1474
|
+
# Get direct matches
|
1475
|
+
direct_matches = self.page.find_all(selector, region=self, **kwargs)
|
1476
|
+
|
1477
|
+
if not recursive or not self.child_regions:
|
1478
|
+
return direct_matches
|
1479
|
+
|
1480
|
+
# Get recursive matches from children
|
1481
|
+
from natural_pdf.elements.collections import ElementCollection
|
1482
|
+
all_matches = list(direct_matches)
|
1483
|
+
|
1484
|
+
for child in self.child_regions:
|
1485
|
+
child_matches = child.find_all(selector, recursive=True, **kwargs)
|
1486
|
+
for match in child_matches:
|
1487
|
+
if match not in all_matches:
|
1488
|
+
all_matches.append(match)
|
1489
|
+
|
1490
|
+
return ElementCollection(all_matches)
|
1491
|
+
|
1492
|
+
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
|
1493
|
+
"""
|
1494
|
+
Ask a question about the region content using document QA.
|
1495
|
+
|
1496
|
+
This method uses a document question answering model to extract answers from the region content.
|
1497
|
+
It leverages both textual content and layout information for better understanding.
|
1498
|
+
|
1499
|
+
Args:
|
1500
|
+
question: The question to ask about the region content
|
1501
|
+
min_confidence: Minimum confidence threshold for answers (0.0-1.0)
|
1502
|
+
model: Optional model name to use for QA (if None, uses default model)
|
1503
|
+
**kwargs: Additional parameters to pass to the QA engine
|
1504
|
+
|
1505
|
+
Returns:
|
1506
|
+
Dictionary with answer details: {
|
1507
|
+
"answer": extracted text,
|
1508
|
+
"confidence": confidence score,
|
1509
|
+
"found": whether an answer was found,
|
1510
|
+
"page_num": page number,
|
1511
|
+
"region": reference to this region,
|
1512
|
+
"source_elements": list of elements that contain the answer (if found)
|
1513
|
+
}
|
1514
|
+
"""
|
1515
|
+
from natural_pdf.qa.document_qa import get_qa_engine
|
1516
|
+
|
1517
|
+
# Get or initialize QA engine with specified model
|
1518
|
+
qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
|
1519
|
+
|
1520
|
+
# Ask the question using the QA engine
|
1521
|
+
return qa_engine.ask_pdf_region(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
|
natural_pdf/qa/document_qa.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import List, Dict, Any, Optional, Union, Tuple
|
3
3
|
import numpy as np
|
4
|
-
from PIL import Image
|
4
|
+
from PIL import Image, ImageDraw
|
5
5
|
import os
|
6
6
|
import tempfile
|
7
7
|
import json
|
@@ -207,47 +207,38 @@ class DocumentQA:
|
|
207
207
|
logger.info(f"Visualization: {vis_path}")
|
208
208
|
|
209
209
|
# Run the query through the pipeline
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
|
227
|
-
return {
|
228
|
-
"answer": "",
|
229
|
-
"confidence": result["score"],
|
230
|
-
"start": result.get("start", -1),
|
231
|
-
"end": result.get("end", -1),
|
232
|
-
"found": False
|
233
|
-
}
|
234
|
-
|
235
|
-
return {
|
236
|
-
"answer": result["answer"],
|
237
|
-
"confidence": result["score"],
|
238
|
-
"start": result.get("start", 0),
|
239
|
-
"end": result.get("end", 0),
|
240
|
-
"found": True
|
241
|
-
}
|
242
|
-
|
243
|
-
except Exception as e:
|
244
|
-
logger.error(f"Error in document QA: {e}")
|
210
|
+
logger.info(f"Running document QA pipeline with question: {question}")
|
211
|
+
result = self.pipe(query)[0]
|
212
|
+
logger.info(f"Raw result: {result}")
|
213
|
+
|
214
|
+
# Save the result if debugging
|
215
|
+
if debug:
|
216
|
+
result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
|
217
|
+
with open(result_path, 'w') as f:
|
218
|
+
# Convert any non-serializable data
|
219
|
+
serializable_result = {k: str(v) if not isinstance(v, (str, int, float, bool, list, dict, type(None))) else v
|
220
|
+
for k, v in result.items()}
|
221
|
+
json.dump(serializable_result, f, indent=2)
|
222
|
+
|
223
|
+
# Check confidence against threshold
|
224
|
+
if result["score"] < min_confidence:
|
225
|
+
logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
|
245
226
|
return {
|
246
227
|
"answer": "",
|
247
|
-
"confidence":
|
248
|
-
"
|
228
|
+
"confidence": result["score"],
|
229
|
+
"start": result.get("start", -1),
|
230
|
+
"end": result.get("end", -1),
|
249
231
|
"found": False
|
250
232
|
}
|
233
|
+
|
234
|
+
return {
|
235
|
+
"answer": result["answer"],
|
236
|
+
"confidence": result["score"],
|
237
|
+
"start": result.get("start", 0),
|
238
|
+
"end": result.get("end", 0),
|
239
|
+
"found": True
|
240
|
+
}
|
241
|
+
|
251
242
|
|
252
243
|
def ask_pdf_page(self, page, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
|
253
244
|
"""
|
natural_pdf/selectors/parser.py
CHANGED
@@ -162,8 +162,12 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
|
|
162
162
|
return False
|
163
163
|
|
164
164
|
# If 'type' attribute specified, it will be checked in the attributes section
|
165
|
-
#
|
166
|
-
elif element.
|
165
|
+
# Check for Docling-specific types (section-header, etc.)
|
166
|
+
elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
|
167
|
+
# This is a direct match with a Docling region type
|
168
|
+
pass
|
169
|
+
# Otherwise, require exact match with the element's type attribute
|
170
|
+
elif not hasattr(element, 'type') or element.type != selector['type']:
|
167
171
|
return False
|
168
172
|
|
169
173
|
# Check attributes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 25.3.16.2
|
3
|
+
Version: 25.3.17.2
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Home-page: https://github.com/jsoma/natural-pdf
|
6
6
|
Author: Jonathan Soma
|
@@ -60,7 +60,7 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
|
|
60
60
|
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
61
61
|
|
62
62
|
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
63
|
-
- [Live demo here](https://colab.research.google.com/github/jsoma/)
|
63
|
+
- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
|
64
64
|
|
65
65
|
## Features
|
66
66
|
|
@@ -268,6 +268,23 @@ Logs follow a hierarchical structure matching the library's module organization:
|
|
268
268
|
- `natural_pdf.analyzers` - Layout analysis operations
|
269
269
|
- `natural_pdf.ocr` - OCR engine operations
|
270
270
|
|
271
|
+
## Document QA
|
272
|
+
|
273
|
+
Ask questions directly to your documents:
|
274
|
+
|
275
|
+
```python
|
276
|
+
# Ask questions about the document content
|
277
|
+
result = pdf.ask("What was the company's revenue in 2022?")
|
278
|
+
print(f"Answer: {result['answer']}")
|
279
|
+
print(f"Confidence: {result['confidence']:.2f}")
|
280
|
+
|
281
|
+
# Access more details in the result dictionary
|
282
|
+
result = pdf.ask("Who is the CEO?")
|
283
|
+
print(f"Answer: {result['answer']}")
|
284
|
+
print(f"Found on page: {result['page_num']}")
|
285
|
+
print(f"Source text: {result.get('source_text', 'N/A')}")
|
286
|
+
```
|
287
|
+
|
271
288
|
## More details
|
272
289
|
|
273
290
|
[Complete documentation here](https://jsoma.github.io/natural-pdf)
|
@@ -10,7 +10,11 @@ examples/color_test.py,sha256=owJ64qH5j5WgVhb7Nl2F-v89ilr1mQNwFm1xphvNnX4,2320
|
|
10
10
|
examples/debug_ocr.py,sha256=oHTHWyamz2B9Est1uuYaC1W4Acubh7YNQcb_4eTMZPw,2521
|
11
11
|
examples/direct_ocr_test.py,sha256=eO151awEYv24qKToj1704G4gq0SLgnx9bDXJgMHln64,5661
|
12
12
|
examples/direct_paddle_test.py,sha256=awoHkR0WLpDEzcOD_HtxhXoBe9FmVBjzzBwtN4HcbVE,3335
|
13
|
-
examples/direct_qa_example.py,sha256=
|
13
|
+
examples/direct_qa_example.py,sha256=bT79h491kVg3SAtK5jBSX6n1GPSW9egPbHfF9HpRr4g,2311
|
14
|
+
examples/docling_comprehensive_test.py,sha256=0SxnIuvjVCzBRdbWIVMBT8P9lfBFam9SzvNWc2MmoIE,11129
|
15
|
+
examples/docling_example.py,sha256=BUZFaA2aUpY7v-bF5AQCKL_iq8DZipeAgyXtwx_khc4,6582
|
16
|
+
examples/docling_hierarchy_example.py,sha256=Il5L-LNr8yale5fhB-x4HmDja1I7843wBygxmVtdHFQ,8032
|
17
|
+
examples/docling_text_sources.py,sha256=mjgkvNZ66Js0hdeaQFOOcAMzusSlWQ0835ICKiRVeZ0,8212
|
14
18
|
examples/document_layout_analysis.py,sha256=zpr2LroZrrTqoGeb0BWhexc3gwfBIPB6FeEL-VhKyqU,4928
|
15
19
|
examples/document_qa_example.py,sha256=IKe7FKnKbdaDPwQ-1uwm3KJenxennzqigQJGiQBIL1g,7240
|
16
20
|
examples/exclusion_count_debug.py,sha256=UKrWiFkV42iuXShMR1AlzZyCVJpl9IoalIRcfazbVm4,4990
|
@@ -30,6 +34,7 @@ examples/highlight_test.py,sha256=nlirCchUB6P3jV9shn55kVSABZd5YcVozx1R47c39Gk,58
|
|
30
34
|
examples/highlighting_example.py,sha256=8WoXL92nef61e4IDAEqL2EMC_nH06dM09OTXG7EJZO0,4893
|
31
35
|
examples/image_width_example.py,sha256=jq5Xekx9oWHzNFfVOGt_H9nIcdlM1xZGbNjRWzdq9Kc,3414
|
32
36
|
examples/improved_api_example.py,sha256=R_y_mzZYLJR1WnsAblgIBVGNBPKesLBMvnNEwhlhGpQ,4807
|
37
|
+
examples/improved_qa_example.py,sha256=kpai2QjnmgZ6AeP4DXnm_LrYf0TLXlA4u-dLIteA5rc,2406
|
33
38
|
examples/layout_confidence_display_test.py,sha256=9Ywg5YYUb8yLICtkKWmnRuyHkFsnwC3e4kBfqvq2MSE,2605
|
34
39
|
examples/layout_confidence_test.py,sha256=hDr0Fwrv6p1UZ1vfMRXT3oR_SOgwj5GkLw3pd3D_Qaw,3438
|
35
40
|
examples/layout_coordinate_debug.py,sha256=H8qG7kAsdw1ZkmOTdcH9tU3WGgMdMGpFj8DDnNAXTO8,11405
|
@@ -74,26 +79,26 @@ examples/url_pdf_example.py,sha256=WjZMlKyIlcXhJNdSe9uMmWdTHixnRfeacnp64gQkNAo,1
|
|
74
79
|
examples/very_basics.py,sha256=cNLnr1z701ri0LgE1cVM4gfMMND0C9UnvvWybnwum6g,418
|
75
80
|
natural_pdf/__init__.py,sha256=kKHL7SWzk0_ydDDX12X5W3s9-vEKgVYOBubXzp_SCdM,1784
|
76
81
|
natural_pdf/analyzers/__init__.py,sha256=XhxlbwiqbGpeIlS88c4P2t7-MLP98U3CcIr-3nGp488,188
|
77
|
-
natural_pdf/analyzers/document_layout.py,sha256=
|
82
|
+
natural_pdf/analyzers/document_layout.py,sha256=vAvHCXAykdJnar6GqwC2YH1N1APzEtKORUT6coib51w,43431
|
78
83
|
natural_pdf/analyzers/text_structure.py,sha256=ZmUsBMNBENjEYcABHqwziDXIHyCVYdUaEyAW0Ohagzc,5208
|
79
84
|
natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
|
80
|
-
natural_pdf/core/page.py,sha256=
|
81
|
-
natural_pdf/core/pdf.py,sha256=
|
85
|
+
natural_pdf/core/page.py,sha256=zkBApbTMjaND_bu65veZSwd53uVDOtstXEnU-hCmqlQ,108813
|
86
|
+
natural_pdf/core/pdf.py,sha256=lqTY6tCS-ZN9A4_LJ0w39Sr1tvgjpGRu0W3RK_0BEXQ,24878
|
82
87
|
natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
|
83
88
|
natural_pdf/elements/base.py,sha256=QJmhk6sYDKErLGrQ5VYhloytuntufxiP6wTzGfZ__9w,22754
|
84
89
|
natural_pdf/elements/collections.py,sha256=vFFeMS0XiBL3p9PyNmwXndKMlPhlwp8os3xKLveN_8k,31558
|
85
90
|
natural_pdf/elements/line.py,sha256=GrlpcfwcjugUGlneSDLaGa4ojz98A0l16wknlzrBCe0,4056
|
86
91
|
natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
|
87
|
-
natural_pdf/elements/region.py,sha256=
|
92
|
+
natural_pdf/elements/region.py,sha256=MNzZtawVlhrRBO1EQY6OOw1RLGZBLsDp-7Ujq0HD2Xk,64701
|
88
93
|
natural_pdf/elements/text.py,sha256=OAuy0ozaemj6yjMwhXPsJ76VZtRPeJbmrFTzpDJA2_U,11017
|
89
94
|
natural_pdf/ocr/__init__.py,sha256=m2_hQRNEQihg9yIx6tMkurKfAnsBU7jbREvZyR_c2fs,2206
|
90
95
|
natural_pdf/ocr/easyocr_engine.py,sha256=wwAgOf6IrCETyTGppjaVN6d46VlYRgK9ehbpwCE0LVA,9392
|
91
96
|
natural_pdf/ocr/engine.py,sha256=NuvZszyHRQoEGwJC7cNdR8UMumgaLADwWiwK0ey9sls,5679
|
92
97
|
natural_pdf/ocr/paddleocr_engine.py,sha256=iPKxfZhOLO146rOekOhRRB8y2BqqVTqrm8pkDgkaJgI,9157
|
93
98
|
natural_pdf/qa/__init__.py,sha256=kagdfqNMpTnyzjC2EFy_PBX5us38NnJL548ESSQVzfI,107
|
94
|
-
natural_pdf/qa/document_qa.py,sha256=
|
99
|
+
natural_pdf/qa/document_qa.py,sha256=x_AYE0kbs7_4n5NC7zWcxQpHFh0vxP3g3q-l_w4RgSU,15845
|
95
100
|
natural_pdf/selectors/__init__.py,sha256=Jfk-JBZEpQ7V5FWVGuLJQLH-qOfqNLC2AdicncMhrmY,121
|
96
|
-
natural_pdf/selectors/parser.py,sha256=
|
101
|
+
natural_pdf/selectors/parser.py,sha256=SFbiL0UdbX4VlA1bF26ahOq33fjXmBOUQEAGbqji65g,14536
|
97
102
|
natural_pdf/templates/__init__.py,sha256=i7N8epDxZoDDsK4p2iUiMwzKVs97i_KtNk8ATArqlC4,19
|
98
103
|
natural_pdf/templates/ocr_debug.html,sha256=Zy9StzBeHFQU8ity6cjFSZLe3TY0QOabUux4c5WQUzs,19171
|
99
104
|
natural_pdf/utils/__init__.py,sha256=iAq5tqNSGcX5-t5CZkyyt6EFpAwH84spIXTCuJ46YMc,87
|
@@ -103,8 +108,8 @@ natural_pdf/utils/reading_order.py,sha256=1oihH9ZTqQvIVDYc2oVEYqIXyPzi94ERtelp6T
|
|
103
108
|
natural_pdf/utils/visualization.py,sha256=Dujxp5xKbEap6UpoVEpArpkHChJLa_Je7FGz2S3Iwvw,5403
|
104
109
|
tests/__init__.py,sha256=34RJiJqy8uDxasGCbzXIaJlHQklHprscPcA4xp2s97g,30
|
105
110
|
tests/test_pdf.py,sha256=Ud-DI-GHAvnSJGMJewM_EwHtI_UgWTi7Gn9uIwQcpfE,1001
|
106
|
-
natural_pdf-25.3.
|
107
|
-
natural_pdf-25.3.
|
108
|
-
natural_pdf-25.3.
|
109
|
-
natural_pdf-25.3.
|
110
|
-
natural_pdf-25.3.
|
111
|
+
natural_pdf-25.3.17.2.dist-info/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
112
|
+
natural_pdf-25.3.17.2.dist-info/METADATA,sha256=YqtGq0o7CHwugehRetW1fvvZTH-6JC8mia7cRXamApk,9082
|
113
|
+
natural_pdf-25.3.17.2.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
114
|
+
natural_pdf-25.3.17.2.dist-info/top_level.txt,sha256=2AueS3xkctrmlcDA_te2-_WG0A0wGhS0UQNwnr_cbFQ,27
|
115
|
+
natural_pdf-25.3.17.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|