onflow-awb-ocr 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onflow_awb_ocr-0.1.0/PKG-INFO +141 -0
- onflow_awb_ocr-0.1.0/README.md +128 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/__init__.py +11 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/constants.py +48 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/extractor.py +36 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/input.py +116 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/ocr.py +130 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/postprocess.py +39 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/py.typed +1 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/receiver.py +13 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/text_layer.py +209 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/types.py +9 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr/utils.py +45 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr.egg-info/PKG-INFO +141 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr.egg-info/SOURCES.txt +18 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr.egg-info/dependency_links.txt +1 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr.egg-info/requires.txt +7 -0
- onflow_awb_ocr-0.1.0/onflow_awb_ocr.egg-info/top_level.txt +1 -0
- onflow_awb_ocr-0.1.0/pyproject.toml +27 -0
- onflow_awb_ocr-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: onflow-awb-ocr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python SDK for extracting receiver information from AWB/shipping labels.
|
|
5
|
+
Requires-Python: >=3.8
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: PyMuPDF>=1.20
|
|
8
|
+
Requires-Dist: requests>=2.25
|
|
9
|
+
Provides-Extra: ocr
|
|
10
|
+
Requires-Dist: opencv-python>=4.5; extra == "ocr"
|
|
11
|
+
Requires-Dist: pytesseract>=0.3; extra == "ocr"
|
|
12
|
+
Requires-Dist: numpy>=1.21; extra == "ocr"
|
|
13
|
+
|
|
14
|
+
# Onflow AWB OCR
|
|
15
|
+
|
|
16
|
+
Python SDK for extracting receiver information from AWB and shipping label files.
|
|
17
|
+
|
|
18
|
+
The package supports PDF files with a text layer first, then falls back to OCR for
|
|
19
|
+
scanned PDFs and image files when OCR dependencies are installed.
|
|
20
|
+
|
|
21
|
+
## Requirements
|
|
22
|
+
|
|
23
|
+
- Python 3.8+
|
|
24
|
+
- PyMuPDF for PDF text-layer extraction
|
|
25
|
+
- Optional OCR stack for scanned files and images:
|
|
26
|
+
- Tesseract OCR
|
|
27
|
+
- Vietnamese Tesseract language data
|
|
28
|
+
- Poppler `pdftoppm`
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
Install from PyPI:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install onflow-awb-ocr
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Install with OCR dependencies:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install "onflow-awb-ocr[ocr]"
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
On Ubuntu, install the native OCR tools:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
sudo apt install -y tesseract-ocr tesseract-ocr-vie poppler-utils
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
For local development:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install -e ".[ocr]"
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from onflow_awb_ocr import OnflowAwbOcr
|
|
60
|
+
|
|
61
|
+
ocr = OnflowAwbOcr(lang="vie+eng")
|
|
62
|
+
result = ocr.extract("label.pdf")
|
|
63
|
+
|
|
64
|
+
print(result)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Example result:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
{
|
|
71
|
+
"name": "Nguyen Van A",
|
|
72
|
+
"address": "123 Nguyen Trai\nQuan 1, TP. Ho Chi Minh",
|
|
73
|
+
"strategy": "shopee",
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
If no receiver can be detected, `extract()` returns `None`.
|
|
78
|
+
|
|
79
|
+
## Supported Inputs
|
|
80
|
+
|
|
81
|
+
`extract()` accepts:
|
|
82
|
+
|
|
83
|
+
- Local file path as `str`
|
|
84
|
+
- Local file path as `pathlib.Path`
|
|
85
|
+
- HTTP/HTTPS URL
|
|
86
|
+
- `bytes`
|
|
87
|
+
- `bytearray`
|
|
88
|
+
- Binary file-like object
|
|
89
|
+
|
|
90
|
+
Examples:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from pathlib import Path
|
|
94
|
+
|
|
95
|
+
from onflow_awb_ocr import OnflowAwbOcr
|
|
96
|
+
|
|
97
|
+
ocr = OnflowAwbOcr()
|
|
98
|
+
|
|
99
|
+
from_path = ocr.extract(Path("label.pdf"))
|
|
100
|
+
from_url = ocr.extract("https://example.com/label.pdf")
|
|
101
|
+
|
|
102
|
+
with open("label.pdf", "rb") as file:
|
|
103
|
+
from_file = ocr.extract(file)
|
|
104
|
+
|
|
105
|
+
with open("label.png", "rb") as file:
|
|
106
|
+
from_bytes = ocr.extract(file.read())
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Compatibility
|
|
110
|
+
|
|
111
|
+
The old `ReceiverExtractor` class name is still available as an alias:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from onflow_awb_ocr import ReceiverExtractor
|
|
115
|
+
|
|
116
|
+
ocr = ReceiverExtractor()
|
|
117
|
+
result = ocr.extract("label.pdf")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Package Structure
|
|
121
|
+
|
|
122
|
+
- `extractor.py`: public `OnflowAwbOcr` class
|
|
123
|
+
- `input.py`: input preparation for paths, URLs, bytes, and binary streams
|
|
124
|
+
- `text_layer.py`: PDF text-layer extraction strategies
|
|
125
|
+
- `ocr.py`: OCR fallback for scanned PDFs and images
|
|
126
|
+
- `postprocess.py`: address cleanup
|
|
127
|
+
- `types.py`, `constants.py`, `utils.py`: shared types, constants, and helpers
|
|
128
|
+
|
|
129
|
+
## Publishing
|
|
130
|
+
|
|
131
|
+
GitHub Actions builds and publishes the package to PyPI on every push to `main`.
|
|
132
|
+
|
|
133
|
+
The repository must define this GitHub secret:
|
|
134
|
+
|
|
135
|
+
```text
|
|
136
|
+
PYPI_API_TOKEN
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
PyPI does not allow replacing an existing version. If a commit on `main` does not
|
|
140
|
+
bump `project.version` in `pyproject.toml`, the publish step skips the existing
|
|
141
|
+
distribution.
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# Onflow AWB OCR
|
|
2
|
+
|
|
3
|
+
Python SDK for extracting receiver information from AWB and shipping label files.
|
|
4
|
+
|
|
5
|
+
The package supports PDF files with a text layer first, then falls back to OCR for
|
|
6
|
+
scanned PDFs and image files when OCR dependencies are installed.
|
|
7
|
+
|
|
8
|
+
## Requirements
|
|
9
|
+
|
|
10
|
+
- Python 3.8+
|
|
11
|
+
- PyMuPDF for PDF text-layer extraction
|
|
12
|
+
- Optional OCR stack for scanned files and images:
|
|
13
|
+
- Tesseract OCR
|
|
14
|
+
- Vietnamese Tesseract language data
|
|
15
|
+
- Poppler `pdftoppm`
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
Install from PyPI:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install onflow-awb-ocr
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Install with OCR dependencies:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install "onflow-awb-ocr[ocr]"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
On Ubuntu, install the native OCR tools:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
sudo apt install -y tesseract-ocr tesseract-ocr-vie poppler-utils
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
For local development:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install -e ".[ocr]"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from onflow_awb_ocr import OnflowAwbOcr
|
|
47
|
+
|
|
48
|
+
ocr = OnflowAwbOcr(lang="vie+eng")
|
|
49
|
+
result = ocr.extract("label.pdf")
|
|
50
|
+
|
|
51
|
+
print(result)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Example result:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
{
|
|
58
|
+
"name": "Nguyen Van A",
|
|
59
|
+
"address": "123 Nguyen Trai\nQuan 1, TP. Ho Chi Minh",
|
|
60
|
+
"strategy": "shopee",
|
|
61
|
+
}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
If no receiver can be detected, `extract()` returns `None`.
|
|
65
|
+
|
|
66
|
+
## Supported Inputs
|
|
67
|
+
|
|
68
|
+
`extract()` accepts:
|
|
69
|
+
|
|
70
|
+
- Local file path as `str`
|
|
71
|
+
- Local file path as `pathlib.Path`
|
|
72
|
+
- HTTP/HTTPS URL
|
|
73
|
+
- `bytes`
|
|
74
|
+
- `bytearray`
|
|
75
|
+
- Binary file-like object
|
|
76
|
+
|
|
77
|
+
Examples:
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from pathlib import Path
|
|
81
|
+
|
|
82
|
+
from onflow_awb_ocr import OnflowAwbOcr
|
|
83
|
+
|
|
84
|
+
ocr = OnflowAwbOcr()
|
|
85
|
+
|
|
86
|
+
from_path = ocr.extract(Path("label.pdf"))
|
|
87
|
+
from_url = ocr.extract("https://example.com/label.pdf")
|
|
88
|
+
|
|
89
|
+
with open("label.pdf", "rb") as file:
|
|
90
|
+
from_file = ocr.extract(file)
|
|
91
|
+
|
|
92
|
+
with open("label.png", "rb") as file:
|
|
93
|
+
from_bytes = ocr.extract(file.read())
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Compatibility
|
|
97
|
+
|
|
98
|
+
The old `ReceiverExtractor` class name is still available as an alias:
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from onflow_awb_ocr import ReceiverExtractor
|
|
102
|
+
|
|
103
|
+
ocr = ReceiverExtractor()
|
|
104
|
+
result = ocr.extract("label.pdf")
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Package Structure
|
|
108
|
+
|
|
109
|
+
- `extractor.py`: public `OnflowAwbOcr` class
|
|
110
|
+
- `input.py`: input preparation for paths, URLs, bytes, and binary streams
|
|
111
|
+
- `text_layer.py`: PDF text-layer extraction strategies
|
|
112
|
+
- `ocr.py`: OCR fallback for scanned PDFs and images
|
|
113
|
+
- `postprocess.py`: address cleanup
|
|
114
|
+
- `types.py`, `constants.py`, `utils.py`: shared types, constants, and helpers
|
|
115
|
+
|
|
116
|
+
## Publishing
|
|
117
|
+
|
|
118
|
+
GitHub Actions builds and publishes the package to PyPI on every push to `main`.
|
|
119
|
+
|
|
120
|
+
The repository must define this GitHub secret:
|
|
121
|
+
|
|
122
|
+
```text
|
|
123
|
+
PYPI_API_TOKEN
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
PyPI does not allow replacing an existing version. If a commit on `main` does not
|
|
127
|
+
bump `project.version` in `pyproject.toml`, the publish step skips the existing
|
|
128
|
+
distribution.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Python SDK for extracting receiver information from AWB/shipping labels."""
|
|
2
|
+
|
|
3
|
+
from .extractor import OnflowAwbOcr, ReceiverExtractor
|
|
4
|
+
from .types import FileInput, ReceiverResult
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"FileInput",
|
|
8
|
+
"OnflowAwbOcr",
|
|
9
|
+
"ReceiverExtractor",
|
|
10
|
+
"ReceiverResult",
|
|
11
|
+
]
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Static extraction constants."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
HEADERS = {
|
|
5
|
+
"User-Agent": (
|
|
6
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
7
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
8
|
+
"Chrome/124.0.0.0 Safari/537.36"
|
|
9
|
+
)
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
STOP_KEYWORDS = [
|
|
13
|
+
"nội dung hàng",
|
|
14
|
+
"tổng sl",
|
|
15
|
+
"ngày đặt hàng",
|
|
16
|
+
"khối lượng",
|
|
17
|
+
"tiền thu",
|
|
18
|
+
"cod",
|
|
19
|
+
"chữ ký",
|
|
20
|
+
"mã vận đơn",
|
|
21
|
+
"mã đơn hàng",
|
|
22
|
+
"chỉ dẫn giao hàng",
|
|
23
|
+
"xác nhận",
|
|
24
|
+
"chuyển hoàn",
|
|
25
|
+
"lưu kho",
|
|
26
|
+
"người gửi",
|
|
27
|
+
"spx tuyển",
|
|
28
|
+
"hc-",
|
|
29
|
+
"in transit",
|
|
30
|
+
"product name",
|
|
31
|
+
"tên sản phẩm",
|
|
32
|
+
"ghi chú",
|
|
33
|
+
"trọng lượng",
|
|
34
|
+
"order id",
|
|
35
|
+
"thời gian",
|
|
36
|
+
"được đồng kiểm",
|
|
37
|
+
"không đồng kiểm",
|
|
38
|
+
"lex",
|
|
39
|
+
"lazada",
|
|
40
|
+
"shopee",
|
|
41
|
+
"tiktok",
|
|
42
|
+
"j&t",
|
|
43
|
+
"viettel",
|
|
44
|
+
"thu hộ",
|
|
45
|
+
"non-cod",
|
|
46
|
+
"đã thanh toán",
|
|
47
|
+
"được mở hàng",
|
|
48
|
+
]
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Public SDK class."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from .input import prepare_input
|
|
7
|
+
from .ocr import extract_from_ocr
|
|
8
|
+
from .postprocess import post_process
|
|
9
|
+
from .text_layer import extract_from_text_layer
|
|
10
|
+
from .types import FileInput, ReceiverResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class OnflowAwbOcr:
    """SDK client that extracts a receiver's name and address from a label.

    Args:
        lang: Default OCR language string used when ``extract`` is called
            without an explicit ``lang``.
    """

    def __init__(self, lang: str = "vie+eng") -> None:
        self.lang = lang

    def extract(self, source: FileInput, lang: Optional[str] = None) -> Optional[ReceiverResult]:
        """Run extraction on ``source`` and post-process the outcome.

        Args:
            source: Input handed to ``prepare_input``.
            lang: Per-call language override; a falsy value falls back to
                ``self.lang``.

        Returns:
            Whatever ``post_process`` returns for the raw extraction result.
        """
        prepared = prepare_input(source)
        try:
            chosen_lang = lang if lang else self.lang
            raw_result = self._extract_from_path(prepared.path, chosen_lang)
            return post_process(raw_result)
        finally:
            # Always give the prepared input a chance to remove its temp file.
            prepared.cleanup()

    def _extract_from_path(self, file_path: str, lang: str) -> Optional[ReceiverResult]:
        """Try the PDF text layer first (``.pdf`` only), then fall back to OCR."""
        is_pdf = Path(file_path).suffix.lower() == ".pdf"
        text_layer_result = extract_from_text_layer(file_path) if is_pdf else None
        if text_layer_result:
            return text_layer_result
        return extract_from_ocr(file_path, lang)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
ReceiverExtractor = OnflowAwbOcr
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Input preparation for path, URL, and binary SDK sources."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import tempfile
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import BinaryIO, Optional, Union
|
|
8
|
+
from urllib.parse import urlparse
|
|
9
|
+
|
|
10
|
+
from .constants import HEADERS
|
|
11
|
+
from .types import FileInput
|
|
12
|
+
from .utils import is_url
|
|
13
|
+
|
|
14
|
+
_CONTENT_TYPE_SUFFIXES = {
|
|
15
|
+
"pdf": ".pdf",
|
|
16
|
+
"png": ".png",
|
|
17
|
+
"jpeg": ".jpg",
|
|
18
|
+
"jpg": ".jpg",
|
|
19
|
+
"gif": ".gif",
|
|
20
|
+
"tiff": ".tif",
|
|
21
|
+
"bmp": ".bmp",
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
# Leading byte signatures paired with the file suffix they imply.
_MAGIC_SUFFIXES = (
    (b"%PDF", ".pdf"),
    (b"\x89PNG\r\n\x1a\n", ".png"),
    (b"\xff\xd8\xff", ".jpg"),
    (b"GIF87a", ".gif"),
    (b"GIF89a", ".gif"),
    (b"II*\x00", ".tif"),
    (b"MM\x00*", ".tif"),
    (b"BM", ".bmp"),
)


def guess_suffix_from_bytes(data: bytes) -> str:
    """Return the suffix of the first signature ``data`` starts with.

    Signatures are tried in table order; ``".bin"`` is returned when none
    of them matches.
    """
    matches = (suffix for signature, suffix in _MAGIC_SUFFIXES if data.startswith(signature))
    return next(matches, ".bin")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def write_bytes_to_temp(data: bytes, suffix: Optional[str] = None) -> str:
    """Write ``data`` to a new persistent temporary file and return its path.

    Args:
        data: Bytes to store.
        suffix: File suffix to use; when falsy, the suffix is guessed from
            the leading bytes of ``data``.

    Returns:
        Path of the created file. The file is not deleted automatically
        (``delete=False``).
    """
    chosen_suffix = suffix if suffix else guess_suffix_from_bytes(data)
    with tempfile.NamedTemporaryFile(suffix=chosen_suffix, delete=False) as handle:
        handle.write(data)
        return handle.name
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def read_binary_source(source: Union[bytes, bytearray, BinaryIO]) -> bytes:
    """Return the content of ``source`` as ``bytes``.

    ``bytes`` is returned unchanged, ``bytearray`` is copied into ``bytes``,
    and any other object is read with ``source.read()``.

    Raises:
        TypeError: If ``read()`` returns something other than bytes or
            bytearray (with a dedicated message for ``str``).
    """
    if isinstance(source, bytes):
        return source
    if isinstance(source, bytearray):
        return bytes(source)

    payload = source.read()
    if isinstance(payload, (bytes, bytearray)):
        return bytes(payload)
    if isinstance(payload, str):
        raise TypeError("Binary file-like input must return bytes, not str.")
    raise TypeError("Binary file-like input must return bytes.")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def suffix_from_response(url: str, content_type: str) -> Optional[str]:
    """Pick a file suffix from a response's content type or, failing that, its URL.

    Args:
        url: URL that was requested.
        content_type: Value of the Content-Type header (may be empty).

    Returns:
        The suffix of the first content-type marker found in the lowercased
        header; otherwise the suffix of the URL path, or None when the path
        has no suffix.
    """
    lowered = content_type.lower()
    from_header = next(
        (suffix for marker, suffix in _CONTENT_TYPE_SUFFIXES.items() if marker in lowered),
        None,
    )
    if from_header is not None:
        return from_header

    from_url = Path(urlparse(url).path).suffix
    return from_url if from_url else None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def download_to_temp(url: str) -> str:
    """Download ``url`` into a temporary file and return the file's path.

    The request is sent with the module's ``HEADERS`` and a 30 second
    timeout. HTTP error statuses are raised via ``raise_for_status()``.

    Raises:
        RuntimeError: If the ``requests`` package cannot be imported.
    """
    try:
        import requests
    except ImportError as exc:
        raise RuntimeError("requests is required to read URL inputs.") from exc

    reply = requests.get(url, headers=HEADERS, timeout=30)
    reply.raise_for_status()
    declared_type = reply.headers.get("Content-Type", "")
    return write_bytes_to_temp(reply.content, suffix_from_response(url, declared_type))
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass(frozen=True)
class PreparedInput:
    """A filesystem path ready for extraction, plus whether to delete it afterwards."""

    # Path of the file to process.
    path: str
    # True when ``cleanup`` should remove the file at ``path``.
    should_cleanup: bool

    def cleanup(self) -> None:
        """Delete the file when flagged for cleanup; a missing file is ignored."""
        if self.should_cleanup:
            try:
                os.unlink(self.path)
            except FileNotFoundError:
                # Already gone, nothing left to remove.
                pass
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def prepare_input(source: FileInput) -> PreparedInput:
    """Turn any supported input into a ``PreparedInput`` pointing at a file.

    Args:
        source: One of
            - a path-like object (``pathlib.Path`` or any ``os.PathLike``,
              e.g. an ``os.DirEntry``): used in place, never deleted;
            - a ``str``: downloaded to a temp file when ``is_url`` says it
              is a URL, otherwise treated as a local path;
            - ``bytes``, ``bytearray`` or an object with a ``read``
              attribute: written to a temp file.

    Returns:
        A ``PreparedInput`` whose ``should_cleanup`` flag is True only for
        temp files created here.

    Raises:
        TypeError: If ``source`` is none of the supported kinds.
    """
    # os.PathLike covers pathlib.Path as well as other path objects;
    # fsdecode also handles path objects whose __fspath__ returns bytes.
    if isinstance(source, os.PathLike):
        return PreparedInput(os.fsdecode(source), False)

    if isinstance(source, str):
        if is_url(source):
            return PreparedInput(download_to_temp(source), True)
        return PreparedInput(source, False)

    if isinstance(source, (bytes, bytearray)) or hasattr(source, "read"):
        data = read_binary_source(source)  # type: ignore[arg-type]
        return PreparedInput(write_bytes_to_temp(data), True)

    raise TypeError("source must be a path, URL, bytes, bytearray, or binary file-like object.")
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""OCR fallback extraction."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import subprocess
|
|
5
|
+
import tempfile
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
8
|
+
|
|
9
|
+
from .types import ReceiverResult
|
|
10
|
+
from .utils import is_stop
|
|
11
|
+
|
|
12
|
+
# Receiver-label anchors. The \b word boundaries keep the short,
# case-insensitive alternatives ("To", "Den") from matching inside longer
# words such as "Total", "Photo" or "Toà".
_ANCHOR_RE = re.compile(
    r"\b(Đến|Den|To|Receiver|Recipient|Ng[uư][ờơ]i\s*nh[ậa]n)\b",
    re.IGNORECASE,
)
# Same labels at the start of a line, plus an optional separator. The \b
# after the label prevents eating the beginning of a name ("Tony" -> "ny").
_ANCHOR_PREFIX_RE = re.compile(
    r"^(Đến|Den|To|Receiver|Recipient|Ng[uư][ờơ]i\s*nh[ậa]n)\b\s*[::-]?\s*",
    re.IGNORECASE,
)
|
|
20
|
+
_OCR_SCALE = 2.0
|
|
21
|
+
_MIN_CONFIDENCE = 20
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def load_images_for_ocr(file_path: str) -> List[Any]:
    """Load the image(s) to OCR from ``file_path``.

    A ``.pdf`` file is rendered to JPEG pages at 200 DPI with the external
    ``pdftoppm`` tool inside a temporary directory; any other file is read
    directly with ``cv2.imread``.

    Returns:
        The images that could be read. An empty list is returned when
        ``cv2`` is not importable, when ``pdftoppm`` is missing or fails,
        or when nothing could be decoded.
    """
    try:
        import cv2
    except ImportError:
        return []

    if Path(file_path).suffix.lower() != ".pdf":
        single = cv2.imread(file_path)
        return [] if single is None else [single]

    with tempfile.TemporaryDirectory() as work_dir:
        output_prefix = Path(work_dir) / "page"
        command = ["pdftoppm", "-jpeg", "-r", "200", file_path, str(output_prefix)]
        try:
            subprocess.run(command, check=True, capture_output=True)
        except (FileNotFoundError, subprocess.CalledProcessError):
            return []

        pages = []
        for rendered in sorted(Path(work_dir).glob("page-*.jpg")):
            frame = cv2.imread(str(rendered))
            if frame is not None:
                pages.append(frame)
        return pages
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def parse_confidence(value: Any) -> float:
    """Convert an OCR confidence value to ``float``.

    Args:
        value: Confidence value of any type.

    Returns:
        The parsed confidence, or ``-1.0`` when the value cannot be
        converted or is NaN. NaN is rejected because every ``<`` comparison
        with NaN is False, so it would pass a minimum-confidence filter.
    """
    import math

    try:
        confidence = float(value)
    except (TypeError, ValueError):
        return -1.0
    return -1.0 if math.isnan(confidence) else confidence
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def extract_from_ocr(file_path: str, lang: str = "vie+eng") -> Optional[ReceiverResult]:
    """Extract the receiver from ``file_path`` using Tesseract OCR.

    Each image returned by ``load_images_for_ocr`` is OCR'd once to locate a
    receiver label line, then the region starting at that label is OCR'd
    again at a larger scale to read the name and address lines.

    Args:
        file_path: Path of the PDF or image file.
        lang: Language string passed to pytesseract.

    Returns:
        ``{"name", "address", "strategy": "ocr"}`` for the first image that
        yields a receiver, or None when nothing is found or when ``cv2`` /
        ``pytesseract`` cannot be imported.
    """
    try:
        import cv2
        import pytesseract
        from pytesseract import Output
    except ImportError:
        return None

    def preprocess(image: Any, scale: float = _OCR_SCALE) -> Any:
        """Upscale, convert to grayscale, denoise and binarize (Otsu) an image."""
        image = cv2.resize(image, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        gray = cv2.medianBlur(gray, 3)
        return cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    def ocr_extract(image: Any) -> Optional[ReceiverResult]:
        """Locate the receiver label in one image and read the text after it."""
        data = pytesseract.image_to_data(
            preprocess(image),
            lang=lang,
            config="--psm 6",
            output_type=Output.DICT,
        )

        # Group confident, non-empty words by (block, paragraph, line).
        lines = {}  # type: Dict[Tuple[int, int, int], List[int]]
        for idx, text in enumerate(data["text"]):
            text = text.strip()
            if not text or parse_confidence(data["conf"][idx]) < _MIN_CONFIDENCE:
                continue
            key = (data["block_num"][idx], data["par_num"][idx], data["line_num"][idx])
            lines.setdefault(key, []).append(idx)

        # The first line (in key order) containing a receiver label is the anchor.
        anchor = None  # type: Optional[Tuple[int, int, str]]
        for _key, indexes in sorted(lines.items()):
            line_text = " ".join(data["text"][idx].strip() for idx in indexes).strip()
            if _ANCHOR_RE.search(line_text):
                # Word boxes come from the image upscaled by _OCR_SCALE;
                # divide to get back to original-image coordinates.
                x1 = min(int(data["left"][idx] / _OCR_SCALE) for idx in indexes)
                y1 = min(int(data["top"][idx] / _OCR_SCALE) for idx in indexes)
                anchor = (x1, y1, line_text)
                break

        if not anchor:
            return None

        anchor_x, anchor_y, _anchor_text = anchor
        height, width = image.shape[:2]
        # Region: from just above/left of the anchor, a quarter of the image
        # height down, and across to 98% of the image width.
        top = max(0, anchor_y - 10)
        bottom = min(height, anchor_y + int(height * 0.25))
        left = max(0, anchor_x - 10)
        right = min(width, int(width * 0.98))
        if bottom <= top or right <= left:
            # An anchor at the extreme right/bottom would give an empty crop,
            # which cannot be resized; treat it as "nothing found".
            return None

        crop = image[top:bottom, left:right]
        raw = pytesseract.image_to_string(
            preprocess(crop, scale=3.0),
            lang=lang,
            config="--psm 6",
        )

        output = []
        for line in raw.splitlines():
            line = line.strip()
            if not line:
                continue
            # Drop a leading receiver label so only the value remains.
            line = _ANCHOR_PREFIX_RE.sub("", line).strip()
            if not line:
                continue
            if is_stop(line):
                break
            output.append(line)

        if not output:
            return None
        return {"name": output[0], "address": "\n".join(output[1:]), "strategy": "ocr"}

    for image in load_images_for_ocr(file_path):
        result = ocr_extract(image)
        if result:
            return result
    return None
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Result post-processing."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Iterable, List, Optional, Pattern
|
|
5
|
+
|
|
6
|
+
from .types import ReceiverResult
|
|
7
|
+
|
|
8
|
+
# Whole-line patterns that mark a line as noise rather than address text.
_NOISE_PATTERNS = (
    re.compile(r"^[A-Za-z]{0,6}\d{6,}[A-Za-z]{0,3}$"),
    re.compile(r"^[A-Z]{2,4}(\s+\d{1,4})?$"),
    re.compile(r"^[A-Za-z0-9]+-[A-Za-z0-9]+-?[A-Za-z0-9]*$"),
    re.compile(r"^\d{3,4}$"),
)


def is_noise_line(line: str, patterns: Iterable[Pattern[str]] = _NOISE_PATTERNS) -> bool:
    """Tell whether ``line`` is noise.

    A line is noise when it has at most two characters or when any of
    ``patterns`` matches it from the start.
    """
    if len(line) < 3:
        return True
    for pattern in patterns:
        if pattern.match(line):
            return True
    return False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def clean_address_lines(lines: List[str]) -> List[str]:
    """Return the stripped lines that are neither empty nor noise.

    Input order is preserved; noise is decided by ``is_noise_line``.
    """
    stripped_lines = (raw.strip() for raw in lines)
    return [entry for entry in stripped_lines if entry and not is_noise_line(entry)]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def post_process(result: Optional[ReceiverResult]) -> Optional[ReceiverResult]:
    """Return a copy of ``result`` whose address has noise lines removed.

    A falsy ``result`` is returned unchanged. Otherwise the "address" value
    (empty string when absent) is split into lines, filtered with
    ``clean_address_lines`` and re-joined with newlines.
    """
    if not result:
        return result

    kept_lines = clean_address_lines(result.get("address", "").splitlines())
    processed = dict(result)
    processed["address"] = "\n".join(kept_lines)
    return processed
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Compatibility exports for receiver extraction."""
|
|
2
|
+
|
|
3
|
+
from .extractor import OnflowAwbOcr, ReceiverExtractor
|
|
4
|
+
from .types import FileInput, ReceiverResult
|
|
5
|
+
from .utils import format_result
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"FileInput",
|
|
9
|
+
"OnflowAwbOcr",
|
|
10
|
+
"ReceiverExtractor",
|
|
11
|
+
"ReceiverResult",
|
|
12
|
+
"format_result",
|
|
13
|
+
]
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""PDF text-layer receiver extraction strategies."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
5
|
+
|
|
6
|
+
from .types import ReceiverResult, Span
|
|
7
|
+
from .utils import is_stop
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
_RE_NR_INLINE = re.compile(
|
|
11
|
+
r"(?:Ng[uư][ờơ]i|Nguoi)\s*nh[ậa]n\s*[:\-]?\s*(.+)",
|
|
12
|
+
re.IGNORECASE,
|
|
13
|
+
)
|
|
14
|
+
_RE_DEN = re.compile(r"^(?:Đ[eế]n|Den)\s*:?\s*$", re.IGNORECASE)
|
|
15
|
+
_RE_DEN_INLINE = re.compile(r"(?:Đ[eế]n|Den)\s*:\s*", re.IGNORECASE)
|
|
16
|
+
_RE_TU = re.compile(r"^(?:T[uừ]|Tu)\s*:?\s*$", re.IGNORECASE)
|
|
17
|
+
_RE_NR_LABEL = re.compile(r"(?:Ng[uư][ờơ]i|Nguoi)\s*nh[ậa]n", re.IGNORECASE)
|
|
18
|
+
# Sender label ("Người gửi" / "Nguoi gui"). The vowel of "gửi" is a single
# character, so all accepted variants belong in one character class.
_RE_SENDER_LABEL = re.compile(r"(?:Ng[uư][ờơ]i|Nguoi)\s*g[uưửừ]i", re.IGNORECASE)
|
|
19
|
+
_Strategy = Tuple[str, Callable[[List[Span]], Optional[ReceiverResult]]]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_spans(page: Any) -> List[Span]:
    """Collect one ``(y, x, size, text)`` tuple per non-empty text line of ``page``.

    For every line, ``y`` and ``size`` come from the line's first span, ``x``
    is the smallest span origin x, and ``text`` is the concatenation of all
    span texts, stripped. Blocks whose ``type`` is not 0 are skipped. The
    result is sorted by ``(y, x)``.
    """
    collected = []
    text_blocks = (block for block in page.get_text("dict")["blocks"] if block["type"] == 0)
    for block in text_blocks:
        for line in block["lines"]:
            parts = line["spans"]
            if not parts:
                continue
            text = "".join(part["text"] for part in parts).strip()
            if not text:
                continue
            first = parts[0]
            leftmost = min(part["origin"][0] for part in parts)
            collected.append(
                (round(first["origin"][1], 0), round(leftmost, 0), round(first["size"], 1), text)
            )
    collected.sort(key=lambda entry: (entry[0], entry[1]))
    return collected
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def extract_lex(spans: List[Span]) -> Optional[ReceiverResult]:
    """LEX / Lazada Logistics: "Người nhận: <name>" on a single span.

    The first span that starts with the receiver label and carries a
    non-empty value provides the name. The following non-empty spans are
    address lines, up to the first sender label or stop keyword.

    Args:
        spans: ``(y, x, size, text)`` tuples in reading order.

    Returns:
        A dict with "name" and "address", or None when no inline receiver
        label with a value is found.
    """
    anchor_idx = None
    name = ""
    for idx, (_y, _x, _size, text) in enumerate(spans):
        match = _RE_NR_INLINE.match(text.strip())
        if not match:
            continue
        # For a label-only span such as "Người nhận:" the regex backtracks
        # and captures the separator itself; drop leading separators so the
        # span is not mistaken for a name.
        name = match.group(1).strip().lstrip(":-").strip()
        if name:
            anchor_idx = idx
            break

    if anchor_idx is None:
        return None

    address_lines = []
    for _y, _x, _size, text in spans[anchor_idx + 1 :]:
        value = text.strip()
        if not value:
            continue
        if _RE_SENDER_LABEL.match(value) or is_stop(value):
            break
        address_lines.append(value)

    return {"name": name, "address": "\n".join(address_lines)}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def extract_shopee(spans: List[Span]) -> Optional[ReceiverResult]:
    """Shopee SPX / Ahamove: read the receiver at or below the "Đến" label.

    Two layouts are handled. When a span starts with "Đến:" followed by
    text, that text is the name and later spans become address lines. When
    "Đến" stands alone on a span, name and address are read from the spans
    below the label that are not to its left.

    Args:
        spans: ``(y, x, size, text)`` tuples; presumably sorted by (y, x) as
            produced by ``get_spans`` -- verify against caller.

    Returns:
        A dict with "name" and "address", or None when no label is found or
        no text follows a stand-alone label.
    """
    den_y = None
    den_x = None

    for y, x, _size, text in spans:
        stripped = text.strip()
        # Stand-alone label: remember its position and handle it below.
        if _RE_DEN.match(stripped):
            den_y = y
            den_x = x
            break

        # Inline label: the rest of this span is the receiver name.
        if _RE_DEN_INLINE.match(stripped):
            after = _RE_DEN_INLINE.sub("", text, count=1).strip()
            if after:
                den_y = y
                den_x = x
                lines = [after]
                for y2, x2, _size2, text2 in spans:
                    # Only spans strictly below the label line are considered.
                    if y2 <= den_y:
                        continue
                    # Spans more than 10 units left of the label are not
                    # collected, but a stop keyword there still ends the scan.
                    if x2 < den_x - 10:
                        if is_stop(text2):
                            break
                        continue
                    if is_stop(text2):
                        break
                    lines.append(text2.strip())
                # ``lines`` always holds at least ``after`` here.
                if lines:
                    return {"name": lines[0], "address": "\n".join(lines[1:])}

    if den_y is None or den_x is None:
        return None

    # Stand-alone label: gather text per y coordinate from spans that are
    # below the label and not more than 5 units to its left.
    right_lines = {}  # type: Dict[float, List[str]]
    stop_y = float("inf")
    for y, x, _size, text in spans:
        if y <= den_y:
            continue
        if x < den_x - 5:
            # A stop keyword in the left part fixes the y at which collection ends.
            if is_stop(text):
                stop_y = min(stop_y, y)
            continue
        if y >= stop_y:
            break
        if is_stop(text):
            stop_y = min(stop_y, y)
            break
        # Skip bare "Đến" / "Từ" labels that appear in the collected area.
        if _RE_DEN.match(text.strip()) or _RE_TU.match(text.strip()):
            continue
        right_lines.setdefault(y, []).append(text.strip())

    # Merge the pieces that share a y coordinate into one line, top to bottom.
    lines = []
    for y in sorted(right_lines):
        merged = " ".join(right_lines[y]).strip()
        if merged:
            lines.append(merged)

    if not lines:
        return None
    return {"name": lines[0], "address": "\n".join(lines[1:])}
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def extract_jt_vtp(spans: List[Span]) -> Optional[ReceiverResult]:
    """J&T / Viettel Post: small "Người nhận" label, larger name/address below.

    The anchor is the first span containing the receiver label with a size
    below 9. The name is taken from the first non-empty span after the
    anchor, and the address from the spans located below the name.

    Args:
        spans: ``(y, x, size, text)`` tuples; presumably sorted by (y, x) as
            produced by ``get_spans`` -- verify against caller.

    Returns:
        A dict with "name" and "address", or None when no anchor or no name
        is found.
    """
    anchor_idx = None
    for idx, (_y, _x, size, text) in enumerate(spans):
        if _RE_NR_LABEL.search(text.strip()) and size < 9:
            anchor_idx = idx
            break

    if anchor_idx is None:
        return None

    anchor_y = spans[anchor_idx][0]
    anchor_x = spans[anchor_idx][1]

    name = ""
    name_y = None
    for y, x, size, text in spans[anchor_idx + 1 :]:
        value = text.strip()
        if not value:
            continue
        if is_stop(value):
            break
        # Name on (almost) the same line as the label, to its right.
        if abs(y - anchor_y) < 5 and x > anchor_x:
            name = value
            name_y = y
            break
        # Name below the label, with a size above 8.
        if y > anchor_y and size > 8:
            name = value
            name_y = y
            break
        # NOTE(review): this unconditional break means only the first
        # non-empty span after the anchor is ever examined -- confirm that
        # giving up here (rather than continuing the scan) is intended.
        break

    if not name:
        return None

    address_lines = []
    for y, _x, _size, text in spans:
        # Only spans strictly below the name line belong to the address.
        if name_y is None or y <= name_y:
            continue
        value = text.strip()
        if not value:
            continue
        if is_stop(value):
            break
        # Skip lines starting with "(+<digits>)" or "0<digits>" followed by asterisks.
        if re.match(r"^\(\+\d+\)|^0\d+\*+", value):
            continue
        address_lines.append(value)

    return {"name": name, "address": "\n".join(address_lines)}
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def extract_from_text_layer(pdf_path: str) -> Optional[ReceiverResult]:
    """Extract the receiver from the text layer of a PDF's first page.

    The spans of page 0 are passed to each strategy in order (lex, shopee,
    jt_vtp). The first result with a non-empty name is returned, tagged
    with the strategy's name under the "strategy" key.

    Returns:
        The tagged result, or None when the page has no text spans or no
        strategy finds a name.

    Raises:
        RuntimeError: If PyMuPDF (``fitz``) cannot be imported.
    """
    try:
        import fitz
    except ImportError as exc:
        raise RuntimeError("PyMuPDF is required to read PDF text layers.") from exc

    document = fitz.open(pdf_path)
    try:
        first_page_spans = get_spans(document[0])
    finally:
        document.close()

    if not first_page_spans:
        return None

    ordered_strategies = (  # type: Tuple[_Strategy, ...]
        ("lex", extract_lex),
        ("shopee", extract_shopee),
        ("jt_vtp", extract_jt_vtp),
    )
    for label, run_strategy in ordered_strategies:
        candidate = run_strategy(first_page_spans)
        if not candidate or not candidate["name"]:
            continue
        candidate["strategy"] = label
        return candidate

    return None
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Shared text helpers."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import unicodedata
|
|
5
|
+
from typing import Iterable, Optional
|
|
6
|
+
|
|
7
|
+
from .constants import STOP_KEYWORDS
|
|
8
|
+
from .types import ReceiverResult
|
|
9
|
+
|
|
10
|
+
# Characters outside this set are replaced by a space during normalization.
_SAFE_TEXT_RE = re.compile(r"[^a-z0-9\s:/.-]")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize(text: str) -> str:
    """Return a lowercase, accent-free, whitespace-collapsed form of ``text``.

    Steps: strip and lowercase, map "đ" to "d", decompose with NFKD and drop
    combining marks, replace every character outside ``[a-z0-9\\s:/.-]`` by
    a space, then collapse whitespace runs and strip.
    """
    # "đ" (U+0111) has no Unicode decomposition, so NFKD leaves it intact and
    # the safe-character filter would turn it into a space ("đặt" -> " at").
    lowered = text.strip().lower().replace("đ", "d")
    decomposed = unicodedata.normalize("NFKD", lowered)
    without_marks = "".join(char for char in decomposed if not unicodedata.combining(char))
    safe_text = _SAFE_TEXT_RE.sub(" ", without_marks)
    return _WHITESPACE_RE.sub(" ", safe_text).strip()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _normalize_all(values: Iterable[str]) -> tuple:
    """Normalize every string in ``values`` and return the results as a tuple."""
    return tuple(map(normalize, values))
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Stop keywords normalized once at import time so each lookup only has to
# normalize the candidate line.
_NORMALIZED_STOP_KEYWORDS = _normalize_all(STOP_KEYWORDS)


def is_stop(line: str) -> bool:
    """Return True if the normalized *line* contains any stop keyword."""
    candidate = normalize(line)
    for stop_keyword in _NORMALIZED_STOP_KEYWORDS:
        if stop_keyword in candidate:
            return True
    return False
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def is_url(value: str) -> bool:
    """Return True when *value* begins with an ``http://`` or ``https://`` scheme.

    The check is case-sensitive and does not tolerate leading whitespace.
    """
    return value.startswith(("http://", "https://"))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def format_result(result: Optional[ReceiverResult]) -> str:
    """Render an extraction result as plain text: name first, then address.

    Args:
        result: Mapping with optional ``"name"`` and ``"address"`` entries,
            or ``None``/empty when nothing was extracted.

    Returns:
        The non-empty parts joined by a newline (the address is stripped of
        surrounding whitespace), or a fixed "not found" message when
        *result* is falsy.
    """
    if not result:
        return "Không tìm thấy thông tin receiver."

    # ``or ""`` also covers keys that are present but set to None, which
    # would otherwise fail on ``.strip()``.
    name = result.get("name") or ""
    address = (result.get("address") or "").strip()
    return "\n".join(part for part in (name, address) if part)
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: onflow-awb-ocr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python SDK for extracting receiver information from AWB/shipping labels.
|
|
5
|
+
Requires-Python: >=3.8
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: PyMuPDF>=1.20
|
|
8
|
+
Requires-Dist: requests>=2.25
|
|
9
|
+
Provides-Extra: ocr
|
|
10
|
+
Requires-Dist: opencv-python>=4.5; extra == "ocr"
|
|
11
|
+
Requires-Dist: pytesseract>=0.3; extra == "ocr"
|
|
12
|
+
Requires-Dist: numpy>=1.21; extra == "ocr"
|
|
13
|
+
|
|
14
|
+
# Onflow AWB OCR
|
|
15
|
+
|
|
16
|
+
Python SDK for extracting receiver information from AWB and shipping label files.
|
|
17
|
+
|
|
18
|
+
The package reads the text layer of PDF files first, then falls back to OCR for
|
|
19
|
+
scanned PDFs and image files when OCR dependencies are installed.
|
|
20
|
+
|
|
21
|
+
## Requirements
|
|
22
|
+
|
|
23
|
+
- Python 3.8+
|
|
24
|
+
- PyMuPDF for PDF text-layer extraction
|
|
25
|
+
- Optional OCR stack for scanned files and images:
|
|
26
|
+
- Tesseract OCR
|
|
27
|
+
- Vietnamese Tesseract language data
|
|
28
|
+
- Poppler `pdftoppm`
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
Install from PyPI:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install onflow-awb-ocr
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Install with OCR dependencies:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install "onflow-awb-ocr[ocr]"
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
On Ubuntu, install the native OCR tools:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
sudo apt install -y tesseract-ocr tesseract-ocr-vie poppler-utils
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
For local development:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install -e ".[ocr]"
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from onflow_awb_ocr import OnflowAwbOcr
|
|
60
|
+
|
|
61
|
+
ocr = OnflowAwbOcr(lang="vie+eng")
|
|
62
|
+
result = ocr.extract("label.pdf")
|
|
63
|
+
|
|
64
|
+
print(result)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Example result:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
{
|
|
71
|
+
"name": "Nguyen Van A",
|
|
72
|
+
"address": "123 Nguyen Trai\nQuan 1, TP. Ho Chi Minh",
|
|
73
|
+
"strategy": "shopee",
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
If no receiver can be detected, `extract()` returns `None`.
|
|
78
|
+
|
|
79
|
+
## Supported Inputs
|
|
80
|
+
|
|
81
|
+
`extract()` accepts:
|
|
82
|
+
|
|
83
|
+
- Local file path as `str`
|
|
84
|
+
- Local file path as `pathlib.Path`
|
|
85
|
+
- HTTP/HTTPS URL
|
|
86
|
+
- `bytes`
|
|
87
|
+
- `bytearray`
|
|
88
|
+
- Binary file-like object
|
|
89
|
+
|
|
90
|
+
Examples:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from pathlib import Path
|
|
94
|
+
|
|
95
|
+
from onflow_awb_ocr import OnflowAwbOcr
|
|
96
|
+
|
|
97
|
+
ocr = OnflowAwbOcr()
|
|
98
|
+
|
|
99
|
+
from_path = ocr.extract(Path("label.pdf"))
|
|
100
|
+
from_url = ocr.extract("https://example.com/label.pdf")
|
|
101
|
+
|
|
102
|
+
with open("label.pdf", "rb") as file:
|
|
103
|
+
from_file = ocr.extract(file)
|
|
104
|
+
|
|
105
|
+
with open("label.png", "rb") as file:
|
|
106
|
+
from_bytes = ocr.extract(file.read())
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Compatibility
|
|
110
|
+
|
|
111
|
+
The old `ReceiverExtractor` class name is still available as an alias:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from onflow_awb_ocr import ReceiverExtractor
|
|
115
|
+
|
|
116
|
+
ocr = ReceiverExtractor()
|
|
117
|
+
result = ocr.extract("label.pdf")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Package Structure
|
|
121
|
+
|
|
122
|
+
- `extractor.py`: public `OnflowAwbOcr` class
|
|
123
|
+
- `input.py`: input preparation for paths, URLs, bytes, and binary streams
|
|
124
|
+
- `text_layer.py`: PDF text-layer extraction strategies
|
|
125
|
+
- `ocr.py`: OCR fallback for scanned PDFs and images
|
|
126
|
+
- `postprocess.py`: address cleanup
|
|
127
|
+
- `types.py`, `constants.py`, `utils.py`: shared types, constants, and helpers
|
|
128
|
+
|
|
129
|
+
## Publishing
|
|
130
|
+
|
|
131
|
+
GitHub Actions builds and publishes the package to PyPI on every push to `main`.
|
|
132
|
+
|
|
133
|
+
The repository must define this GitHub secret:
|
|
134
|
+
|
|
135
|
+
```text
|
|
136
|
+
PYPI_API_TOKEN
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
PyPI does not allow replacing an existing version. If a commit on `main` does not
|
|
140
|
+
bump `project.version` in `pyproject.toml`, the publish step skips the existing
|
|
141
|
+
distribution.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
onflow_awb_ocr/__init__.py
|
|
4
|
+
onflow_awb_ocr/constants.py
|
|
5
|
+
onflow_awb_ocr/extractor.py
|
|
6
|
+
onflow_awb_ocr/input.py
|
|
7
|
+
onflow_awb_ocr/ocr.py
|
|
8
|
+
onflow_awb_ocr/postprocess.py
|
|
9
|
+
onflow_awb_ocr/py.typed
|
|
10
|
+
onflow_awb_ocr/receiver.py
|
|
11
|
+
onflow_awb_ocr/text_layer.py
|
|
12
|
+
onflow_awb_ocr/types.py
|
|
13
|
+
onflow_awb_ocr/utils.py
|
|
14
|
+
onflow_awb_ocr.egg-info/PKG-INFO
|
|
15
|
+
onflow_awb_ocr.egg-info/SOURCES.txt
|
|
16
|
+
onflow_awb_ocr.egg-info/dependency_links.txt
|
|
17
|
+
onflow_awb_ocr.egg-info/requires.txt
|
|
18
|
+
onflow_awb_ocr.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
onflow_awb_ocr
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "onflow-awb-ocr"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Python SDK for extracting receiver information from AWB/shipping labels."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"PyMuPDF>=1.20",
|
|
13
|
+
"requests>=2.25",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
ocr = [
|
|
18
|
+
"opencv-python>=4.5",
|
|
19
|
+
"pytesseract>=0.3",
|
|
20
|
+
"numpy>=1.21",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[tool.setuptools.packages.find]
|
|
24
|
+
include = ["onflow_awb_ocr*"]
|
|
25
|
+
|
|
26
|
+
[tool.setuptools.package-data]
|
|
27
|
+
onflow_awb_ocr = ["py.typed"]
|