onflow-awb-ocr 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,141 @@
1
+ Metadata-Version: 2.4
2
+ Name: onflow-awb-ocr
3
+ Version: 0.1.0
4
+ Summary: Python SDK for extracting receiver information from AWB/shipping labels.
5
+ Requires-Python: >=3.8
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: PyMuPDF>=1.20
8
+ Requires-Dist: requests>=2.25
9
+ Provides-Extra: ocr
10
+ Requires-Dist: opencv-python>=4.5; extra == "ocr"
11
+ Requires-Dist: pytesseract>=0.3; extra == "ocr"
12
+ Requires-Dist: numpy>=1.21; extra == "ocr"
13
+
14
+ # Onflow AWB OCR
15
+
16
+ Python SDK for extracting receiver information from AWB and shipping label files.
17
+
18
+ The package first tries to read the PDF text layer, then falls back to OCR for
19
+ scanned PDFs and image files when the OCR dependencies are installed.
20
+
21
+ ## Requirements
22
+
23
+ - Python 3.8+
24
+ - PyMuPDF for PDF text-layer extraction
25
+ - Optional OCR stack for scanned files and images:
26
+ - Tesseract OCR
27
+ - Vietnamese Tesseract language data
28
+ - Poppler `pdftoppm`
29
+
30
+ ## Installation
31
+
32
+ Install from PyPI:
33
+
34
+ ```bash
35
+ pip install onflow-awb-ocr
36
+ ```
37
+
38
+ Install with OCR dependencies:
39
+
40
+ ```bash
41
+ pip install "onflow-awb-ocr[ocr]"
42
+ ```
43
+
44
+ On Ubuntu, install the native OCR tools:
45
+
46
+ ```bash
47
+ sudo apt install -y tesseract-ocr tesseract-ocr-vie poppler-utils
48
+ ```
49
+
50
+ For local development:
51
+
52
+ ```bash
53
+ pip install -e ".[ocr]"
54
+ ```
55
+
56
+ ## Usage
57
+
58
+ ```python
59
+ from onflow_awb_ocr import OnflowAwbOcr
60
+
61
+ ocr = OnflowAwbOcr(lang="vie+eng")
62
+ result = ocr.extract("label.pdf")
63
+
64
+ print(result)
65
+ ```
66
+
67
+ Example result:
68
+
69
+ ```python
70
+ {
71
+ "name": "Nguyen Van A",
72
+ "address": "123 Nguyen Trai\nQuan 1, TP. Ho Chi Minh",
73
+ "strategy": "shopee",
74
+ }
75
+ ```
76
+
77
+ If no receiver can be detected, `extract()` returns `None`.
78
+
79
+ ## Supported Inputs
80
+
81
+ `extract()` accepts:
82
+
83
+ - Local file path as `str`
84
+ - Local file path as `pathlib.Path`
85
+ - HTTP/HTTPS URL
86
+ - `bytes`
87
+ - `bytearray`
88
+ - Binary file-like object
89
+
90
+ Examples:
91
+
92
+ ```python
93
+ from pathlib import Path
94
+
95
+ from onflow_awb_ocr import OnflowAwbOcr
96
+
97
+ ocr = OnflowAwbOcr()
98
+
99
+ from_path = ocr.extract(Path("label.pdf"))
100
+ from_url = ocr.extract("https://example.com/label.pdf")
101
+
102
+ with open("label.pdf", "rb") as file:
103
+ from_file = ocr.extract(file)
104
+
105
+ with open("label.png", "rb") as file:
106
+ from_bytes = ocr.extract(file.read())
107
+ ```
108
+
109
+ ## Compatibility
110
+
111
+ The old `ReceiverExtractor` class name is still available as an alias:
112
+
113
+ ```python
114
+ from onflow_awb_ocr import ReceiverExtractor
115
+
116
+ ocr = ReceiverExtractor()
117
+ result = ocr.extract("label.pdf")
118
+ ```
119
+
120
+ ## Package Structure
121
+
122
+ - `extractor.py`: public `OnflowAwbOcr` class
123
+ - `input.py`: input preparation for paths, URLs, bytes, and binary streams
124
+ - `text_layer.py`: PDF text-layer extraction strategies
125
+ - `ocr.py`: OCR fallback for scanned PDFs and images
126
+ - `postprocess.py`: address cleanup
127
+ - `types.py`, `constants.py`, `utils.py`: shared types, constants, and helpers
128
+
129
+ ## Publishing
130
+
131
+ GitHub Actions builds and publishes the package to PyPI on every push to `main`.
132
+
133
+ The repository must define this GitHub secret:
134
+
135
+ ```text
136
+ PYPI_API_TOKEN
137
+ ```
138
+
139
+ PyPI does not allow replacing an existing version. If a commit on `main` does not
140
+ bump `project.version` in `pyproject.toml`, the publish step skips the existing
141
+ distribution.
@@ -0,0 +1,128 @@
1
+ # Onflow AWB OCR
2
+
3
+ Python SDK for extracting receiver information from AWB and shipping label files.
4
+
5
+ The package first tries to read the PDF text layer, then falls back to OCR for
6
+ scanned PDFs and image files when the OCR dependencies are installed.
7
+
8
+ ## Requirements
9
+
10
+ - Python 3.8+
11
+ - PyMuPDF for PDF text-layer extraction
12
+ - Optional OCR stack for scanned files and images:
13
+ - Tesseract OCR
14
+ - Vietnamese Tesseract language data
15
+ - Poppler `pdftoppm`
16
+
17
+ ## Installation
18
+
19
+ Install from PyPI:
20
+
21
+ ```bash
22
+ pip install onflow-awb-ocr
23
+ ```
24
+
25
+ Install with OCR dependencies:
26
+
27
+ ```bash
28
+ pip install "onflow-awb-ocr[ocr]"
29
+ ```
30
+
31
+ On Ubuntu, install the native OCR tools:
32
+
33
+ ```bash
34
+ sudo apt install -y tesseract-ocr tesseract-ocr-vie poppler-utils
35
+ ```
36
+
37
+ For local development:
38
+
39
+ ```bash
40
+ pip install -e ".[ocr]"
41
+ ```
42
+
43
+ ## Usage
44
+
45
+ ```python
46
+ from onflow_awb_ocr import OnflowAwbOcr
47
+
48
+ ocr = OnflowAwbOcr(lang="vie+eng")
49
+ result = ocr.extract("label.pdf")
50
+
51
+ print(result)
52
+ ```
53
+
54
+ Example result:
55
+
56
+ ```python
57
+ {
58
+ "name": "Nguyen Van A",
59
+ "address": "123 Nguyen Trai\nQuan 1, TP. Ho Chi Minh",
60
+ "strategy": "shopee",
61
+ }
62
+ ```
63
+
64
+ If no receiver can be detected, `extract()` returns `None`.
65
+
66
+ ## Supported Inputs
67
+
68
+ `extract()` accepts:
69
+
70
+ - Local file path as `str`
71
+ - Local file path as `pathlib.Path`
72
+ - HTTP/HTTPS URL
73
+ - `bytes`
74
+ - `bytearray`
75
+ - Binary file-like object
76
+
77
+ Examples:
78
+
79
+ ```python
80
+ from pathlib import Path
81
+
82
+ from onflow_awb_ocr import OnflowAwbOcr
83
+
84
+ ocr = OnflowAwbOcr()
85
+
86
+ from_path = ocr.extract(Path("label.pdf"))
87
+ from_url = ocr.extract("https://example.com/label.pdf")
88
+
89
+ with open("label.pdf", "rb") as file:
90
+ from_file = ocr.extract(file)
91
+
92
+ with open("label.png", "rb") as file:
93
+ from_bytes = ocr.extract(file.read())
94
+ ```
95
+
96
+ ## Compatibility
97
+
98
+ The old `ReceiverExtractor` class name is still available as an alias:
99
+
100
+ ```python
101
+ from onflow_awb_ocr import ReceiverExtractor
102
+
103
+ ocr = ReceiverExtractor()
104
+ result = ocr.extract("label.pdf")
105
+ ```
106
+
107
+ ## Package Structure
108
+
109
+ - `extractor.py`: public `OnflowAwbOcr` class
110
+ - `input.py`: input preparation for paths, URLs, bytes, and binary streams
111
+ - `text_layer.py`: PDF text-layer extraction strategies
112
+ - `ocr.py`: OCR fallback for scanned PDFs and images
113
+ - `postprocess.py`: address cleanup
114
+ - `types.py`, `constants.py`, `utils.py`: shared types, constants, and helpers
115
+
116
+ ## Publishing
117
+
118
+ GitHub Actions builds and publishes the package to PyPI on every push to `main`.
119
+
120
+ The repository must define this GitHub secret:
121
+
122
+ ```text
123
+ PYPI_API_TOKEN
124
+ ```
125
+
126
+ PyPI does not allow replacing an existing version. If a commit on `main` does not
127
+ bump `project.version` in `pyproject.toml`, the publish step skips the existing
128
+ distribution.
@@ -0,0 +1,11 @@
1
+ """Python SDK for extracting receiver information from AWB/shipping labels."""
2
+
3
+ from .extractor import OnflowAwbOcr, ReceiverExtractor
4
+ from .types import FileInput, ReceiverResult
5
+
6
+ __all__ = [
7
+ "FileInput",
8
+ "OnflowAwbOcr",
9
+ "ReceiverExtractor",
10
+ "ReceiverResult",
11
+ ]
@@ -0,0 +1,48 @@
1
+ """Static extraction constants."""
2
+
3
+
4
# HTTP request headers for URL downloads: a single browser-style User-Agent.
# The value is assembled from adjacent string literals, so it is one header
# string, not a tuple.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    )
}
11
+
12
# Phrases that mark the end of the receiver block on a label (order details,
# COD/payment notes, signature areas, sender section, carrier names, ...).
# Entries are written in lowercase; the mix of Vietnamese and English reflects
# the label layouts handled by the extraction strategies.
# NOTE(review): short entries such as "cod", "lex" and "hc-" are presumably
# matched as substrings by the consumer of this list — confirm they cannot
# collide with receiver names or street names.
STOP_KEYWORDS = [
    "nội dung hàng",
    "tổng sl",
    "ngày đặt hàng",
    "khối lượng",
    "tiền thu",
    "cod",
    "chữ ký",
    "mã vận đơn",
    "mã đơn hàng",
    "chỉ dẫn giao hàng",
    "xác nhận",
    "chuyển hoàn",
    "lưu kho",
    "người gửi",
    "spx tuyển",
    "hc-",
    "in transit",
    "product name",
    "tên sản phẩm",
    "ghi chú",
    "trọng lượng",
    "order id",
    "thời gian",
    "được đồng kiểm",
    "không đồng kiểm",
    "lex",
    "lazada",
    "shopee",
    "tiktok",
    "j&t",
    "viettel",
    "thu hộ",
    "non-cod",
    "đã thanh toán",
    "được mở hàng",
]
@@ -0,0 +1,36 @@
1
+ """Public SDK class."""
2
+
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ from .input import prepare_input
7
+ from .ocr import extract_from_ocr
8
+ from .postprocess import post_process
9
+ from .text_layer import extract_from_text_layer
10
+ from .types import FileInput, ReceiverResult
11
+
12
+
13
class OnflowAwbOcr:
    """SDK entry point that pulls the receiver name and address out of a label.

    An instance only stores the default Tesseract language string, so it can
    be reused for any number of ``extract()`` calls.
    """

    def __init__(self, lang: str = "vie+eng") -> None:
        # Default OCR language(s); can be overridden per call in ``extract``.
        self.lang = lang

    def extract(self, source: FileInput, lang: Optional[str] = None) -> Optional[ReceiverResult]:
        """Extract receiver information from ``source``.

        Args:
            source: Path, URL, raw bytes or binary file-like object.
            lang: OCR language override; a falsy value selects ``self.lang``.

        Returns:
            The post-processed result, or ``None`` when nothing was detected.
        """
        prepared = prepare_input(source)
        try:
            effective_lang = lang or self.lang
            raw_result = self._extract_from_path(prepared.path, effective_lang)
            return post_process(raw_result)
        finally:
            # Removes the temporary file (if one was created) even on failure.
            prepared.cleanup()

    def _extract_from_path(self, file_path: str, lang: str) -> Optional[ReceiverResult]:
        """Try the PDF text layer first (for ``.pdf`` paths), then fall back to OCR."""
        is_pdf = Path(file_path).suffix.lower() == ".pdf"
        text_result = extract_from_text_layer(file_path) if is_pdf else None
        return text_result or extract_from_ocr(file_path, lang)


# Backwards-compatible alias for the previous public class name.
ReceiverExtractor = OnflowAwbOcr
@@ -0,0 +1,116 @@
1
+ """Input preparation for path, URL, and binary SDK sources."""
2
+
3
+ import os
4
+ import tempfile
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import BinaryIO, Optional, Union
8
+ from urllib.parse import urlparse
9
+
10
+ from .constants import HEADERS
11
+ from .types import FileInput
12
+ from .utils import is_url
13
+
14
# Maps a marker found inside an HTTP Content-Type value to the file suffix
# used for the downloaded temporary file. Markers are lowercase fragments
# (e.g. "pdf" is contained in "application/pdf"), not full MIME types.
_CONTENT_TYPE_SUFFIXES = {
    "pdf": ".pdf",
    "png": ".png",
    "jpeg": ".jpg",
    "jpg": ".jpg",
    "gif": ".gif",
    "tiff": ".tif",
    "bmp": ".bmp",
}
23
+
24
# (leading magic bytes, file suffix) pairs, checked in order.
_MAGIC_SUFFIXES = (
    (b"%PDF", ".pdf"),
    (b"\x89PNG\r\n\x1a\n", ".png"),
    (b"\xff\xd8\xff", ".jpg"),
    (b"GIF87a", ".gif"),
    (b"GIF89a", ".gif"),
    (b"II*\x00", ".tif"),
    (b"MM\x00*", ".tif"),
    (b"BM", ".bmp"),
)


def guess_suffix_from_bytes(data: bytes) -> str:
    """Return a file suffix inferred from the leading bytes of ``data``.

    The first signature in ``_MAGIC_SUFFIXES`` that prefixes ``data`` wins;
    ``".bin"`` is returned when no signature matches (including empty input).
    """
    matches = (suffix for signature, suffix in _MAGIC_SUFFIXES if data.startswith(signature))
    return next(matches, ".bin")
41
+
42
+
43
def write_bytes_to_temp(data: bytes, suffix: Optional[str] = None) -> str:
    """Persist ``data`` to a new temporary file and return its path.

    Args:
        data: Raw file content.
        suffix: File suffix to use; when falsy it is guessed from ``data``.

    Returns:
        Path of the created file. The file is not deleted automatically
        (``delete=False``); removing it is the caller's responsibility.
    """
    chosen_suffix = suffix if suffix else guess_suffix_from_bytes(data)
    with tempfile.NamedTemporaryFile(suffix=chosen_suffix, delete=False) as handle:
        handle.write(data)
        temp_path = handle.name
    return temp_path
51
+
52
+
53
def read_binary_source(source: Union[bytes, bytearray, BinaryIO]) -> bytes:
    """Return the content of an in-memory or file-like binary source as ``bytes``.

    Args:
        source: ``bytes``, ``bytearray`` or an object whose ``read()`` yields
            binary data.

    Raises:
        TypeError: If ``read()`` returns ``str`` or any other non-bytes value.
    """
    if isinstance(source, bytes):
        return source
    if isinstance(source, bytearray):
        return bytes(source)

    payload = source.read()
    if isinstance(payload, (bytes, bytearray)):
        return bytes(payload)
    # Text-mode streams get a dedicated message because they are the most
    # likely mistake (file opened without "b").
    if isinstance(payload, str):
        raise TypeError("Binary file-like input must return bytes, not str.")
    raise TypeError("Binary file-like input must return bytes.")
65
+
66
+
67
def suffix_from_response(url: str, content_type: str) -> Optional[str]:
    """Pick a file suffix for a downloaded resource.

    The Content-Type value is checked first: the first marker of
    ``_CONTENT_TYPE_SUFFIXES`` contained in it (case-insensitively) decides.
    Otherwise the suffix of the URL path is used.

    Returns:
        The suffix including the leading dot, or ``None`` when neither the
        content type nor the URL path provides one.
    """
    lowered_type = content_type.lower()
    from_header = next(
        (suffix for marker, suffix in _CONTENT_TYPE_SUFFIXES.items() if marker in lowered_type),
        None,
    )
    if from_header is not None:
        return from_header

    from_url = Path(urlparse(url).path).suffix
    return from_url if from_url else None
75
+
76
+
77
def download_to_temp(url: str) -> str:
    """Download ``url`` into a temporary file and return the file path.

    The request sends ``HEADERS`` and uses a 30 second timeout; the whole
    body is read into memory before it is written to disk.

    Raises:
        RuntimeError: If the ``requests`` package is not installed.
        requests.HTTPError: Propagated from ``raise_for_status()`` on an
            error status code.
    """
    try:
        import requests
    except ImportError as exc:
        raise RuntimeError("requests is required to read URL inputs.") from exc

    reply = requests.get(url, headers=HEADERS, timeout=30)
    reply.raise_for_status()

    declared_type = reply.headers.get("Content-Type", "")
    chosen_suffix = suffix_from_response(url, declared_type)
    # A ``None`` suffix lets write_bytes_to_temp sniff the type from the bytes.
    return write_bytes_to_temp(reply.content, chosen_suffix)
87
+
88
+
89
@dataclass(frozen=True)
class PreparedInput:
    """A local file path ready for extraction, plus its cleanup policy."""

    # Filesystem path of the file to process.
    path: str
    # True when ``path`` is a temporary file that ``cleanup()`` must delete.
    should_cleanup: bool

    def cleanup(self) -> None:
        """Delete the file when it is marked temporary; a missing file is ignored."""
        if self.should_cleanup:
            try:
                os.unlink(self.path)
            except FileNotFoundError:
                # Already gone: nothing left to clean up.
                pass
101
+
102
+
103
def prepare_input(source: FileInput) -> PreparedInput:
    """Turn any supported input into a local file path.

    ``Path`` objects and non-URL strings are used as-is. URLs are downloaded,
    and bytes, bytearrays and objects with a ``read`` attribute are written
    to a temporary file; those results are flagged for cleanup.

    Raises:
        TypeError: If ``source`` is none of the supported kinds.
    """
    if isinstance(source, Path):
        local_path, temporary = str(source), False
    elif isinstance(source, str):
        temporary = is_url(source)
        local_path = download_to_temp(source) if temporary else source
    elif isinstance(source, (bytes, bytearray)) or hasattr(source, "read"):
        payload = read_binary_source(source)  # type: ignore[arg-type]
        local_path, temporary = write_bytes_to_temp(payload), True
    else:
        raise TypeError("source must be a path, URL, bytes, bytearray, or binary file-like object.")

    return PreparedInput(local_path, temporary)
@@ -0,0 +1,130 @@
1
+ """OCR fallback extraction."""
2
+
3
+ import re
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ from .types import ReceiverResult
10
+ from .utils import is_stop
11
+
12
# Receiver-label words ("Đến", "To", "Receiver", "Người nhận", ...) used to
# locate the anchor line in the OCR output.
# NOTE(review): the pattern has no word boundaries, so when it is applied with
# ``search`` it also hits inside longer words (e.g. "To" in "Total") — confirm
# this is acceptable.
_ANCHOR_RE = re.compile(
    r"(Đến|Den|To|Receiver|Recipient|Ng[uư][ờơ]i\s*nh[ậa]n)",
    re.IGNORECASE,
)
# Same label words anchored at the start of a line, followed by an optional
# ":" or "-" separator and whitespace; used to strip the label from a line.
_ANCHOR_PREFIX_RE = re.compile(
    r"^(Đến|Den|To|Receiver|Recipient|Ng[uư][ờơ]i\s*nh[ậa]n)\s*[::-]?\s*",
    re.IGNORECASE,
)
# Upscaling factor applied before the first OCR pass; OCR coordinates are
# divided by it to get back to original-image pixels.
_OCR_SCALE = 2.0
# Words with a Tesseract confidence below this value are ignored.
_MIN_CONFIDENCE = 20
22
+
23
+
24
def load_images_for_ocr(file_path: str) -> List[Any]:
    """Load the image(s) to run OCR on.

    PDFs are rasterised page by page with ``pdftoppm`` (JPEG, 200 dpi) into a
    temporary directory; any other file is read directly with OpenCV.

    Returns:
        The decoded images in page order. The list is empty when OpenCV is not
        installed, ``pdftoppm`` is missing or fails, or nothing could be decoded.
    """
    try:
        import cv2
    except ImportError:
        return []

    if Path(file_path).suffix.lower() != ".pdf":
        single = cv2.imread(file_path)
        return [] if single is None else [single]

    with tempfile.TemporaryDirectory() as work_dir:
        output_prefix = Path(work_dir) / "page"
        command = ["pdftoppm", "-jpeg", "-r", "200", file_path, str(output_prefix)]
        try:
            subprocess.run(command, check=True, capture_output=True)
        except (FileNotFoundError, subprocess.CalledProcessError):
            return []

        pages = []
        # Sorted so pages are returned in the order of their file names.
        for page_file in sorted(Path(work_dir).glob("page-*.jpg")):
            decoded = cv2.imread(str(page_file))
            if decoded is not None:
                pages.append(decoded)
        return pages
48
+
49
+
50
def parse_confidence(value: Any) -> float:
    """Convert an OCR confidence value to a number, or ``-1`` if it is not numeric."""
    try:
        confidence = float(value)
    except (TypeError, ValueError):
        confidence = -1
    return confidence
55
+
56
+
57
def extract_from_ocr(file_path: str, lang: str = "vie+eng") -> Optional[ReceiverResult]:
    """Extract the receiver block from a scanned PDF or image via Tesseract OCR.

    Each image is processed in two passes: a word-level pass locates the first
    line containing a receiver label (the anchor), then the region to the
    right of and below the anchor is cropped, upscaled and OCR'd again to read
    the name and address lines.

    Args:
        file_path: Path to a PDF or image file.
        lang: Tesseract language string.

    Returns:
        ``{"name", "address", "strategy": "ocr"}`` for the first image that
        yields a result, or ``None`` when the OCR Python packages are not
        installed or no receiver block is found.
    """
    try:
        import cv2
        import pytesseract
        from pytesseract import Output
    except ImportError:
        return None

    def preprocess(image: Any, scale: float = _OCR_SCALE) -> Any:
        # Upscale, convert to grayscale, denoise and binarise with Otsu.
        image = cv2.resize(image, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        gray = cv2.medianBlur(gray, 3)
        return cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    def ocr_extract(image: Any) -> Optional[ReceiverResult]:
        data = pytesseract.image_to_data(
            preprocess(image),
            lang=lang,
            config="--psm 6",
            output_type=Output.DICT,
        )
        # Group confident, non-empty words by (block, paragraph, line).
        lines = {}  # type: Dict[Tuple[int, int, int], List[int]]
        for idx, text in enumerate(data["text"]):
            text = text.strip()
            if not text or parse_confidence(data["conf"][idx]) < _MIN_CONFIDENCE:
                continue
            key = (data["block_num"][idx], data["par_num"][idx], data["line_num"][idx])
            lines.setdefault(key, []).append(idx)

        # The first line (in block/paragraph/line order) with a receiver label
        # becomes the anchor.
        anchor = None  # type: Optional[Tuple[int, int, str]]
        for _key, indexes in sorted(lines.items()):
            line_text = " ".join(data["text"][idx].strip() for idx in indexes).strip()
            if _ANCHOR_RE.search(line_text):
                # OCR ran on the upscaled image; map back to original pixels.
                x1 = min(int(data["left"][idx] / _OCR_SCALE) for idx in indexes)
                y1 = min(int(data["top"][idx] / _OCR_SCALE) for idx in indexes)
                anchor = (x1, y1, line_text)
                break

        if not anchor:
            return None

        anchor_x, anchor_y, _anchor_text = anchor
        height, width = image.shape[:2]
        # Region from slightly above/left of the anchor down 25% of the page
        # height and right to 98% of the page width.
        crop = image[
            max(0, anchor_y - 10) : min(height, anchor_y + int(height * 0.25)),
            max(0, anchor_x - 10) : min(width, int(width * 0.98)),
        ]
        # An anchor at the far right edge produces a zero-width crop, and
        # cv2.resize raises on an empty image; treat it as "nothing found".
        if crop.size == 0:
            return None

        raw = pytesseract.image_to_string(
            preprocess(crop, scale=3.0),
            lang=lang,
            config="--psm 6",
        )

        output = []
        for line in raw.splitlines():
            line = line.strip()
            if not line:
                continue
            # Drop the label itself ("Đến:", "To:", ...) from the line.
            line = _ANCHOR_PREFIX_RE.sub("", line).strip()
            if not line:
                continue
            if is_stop(line):
                break
            output.append(line)

        if not output:
            return None
        return {"name": output[0], "address": "\n".join(output[1:]), "strategy": "ocr"}

    for image in load_images_for_ocr(file_path):
        result = ocr_extract(image)
        if result:
            return result
    return None
@@ -0,0 +1,39 @@
1
+ """Result post-processing."""
2
+
3
+ import re
4
+ from typing import Iterable, List, Optional, Pattern
5
+
6
+ from .types import ReceiverResult
7
+
8
# Whole-line patterns treated as layout noise, in order: long numeric codes
# with short letter prefix/suffix, 2-4 capital letters with an optional short
# number, dash-separated alphanumeric codes, and bare 3-4 digit numbers.
_NOISE_PATTERNS = (
    re.compile(r"^[A-Za-z]{0,6}\d{6,}[A-Za-z]{0,3}$"),
    re.compile(r"^[A-Z]{2,4}(\s+\d{1,4})?$"),
    re.compile(r"^[A-Za-z0-9]+-[A-Za-z0-9]+-?[A-Za-z0-9]*$"),
    re.compile(r"^\d{3,4}$"),
)


def is_noise_line(line: str, patterns: Iterable[Pattern[str]] = _NOISE_PATTERNS) -> bool:
    """Tell whether ``line`` is noise rather than address text.

    Lines of at most two characters are always noise; longer lines are noise
    when any of ``patterns`` matches them from the start.
    """
    if len(line) < 3:
        return True
    for pattern in patterns:
        if pattern.match(line):
            return True
    return False
20
+
21
+
22
def clean_address_lines(lines: List[str]) -> List[str]:
    """Strip each line and drop empty lines and lines recognised as noise."""
    stripped_lines = (raw.strip() for raw in lines)
    return [text for text in stripped_lines if text and not is_noise_line(text)]
31
+
32
+
33
def post_process(result: Optional[ReceiverResult]) -> Optional[ReceiverResult]:
    """Return a copy of ``result`` whose address has noise lines removed.

    A falsy ``result`` is returned unchanged. The input dict is not mutated.
    """
    if not result:
        return result
    raw_address = result.get("address", "")
    kept_lines = clean_address_lines(raw_address.splitlines())
    return {**result, "address": "\n".join(kept_lines)}
@@ -0,0 +1,13 @@
1
+ """Compatibility exports for receiver extraction."""
2
+
3
+ from .extractor import OnflowAwbOcr, ReceiverExtractor
4
+ from .types import FileInput, ReceiverResult
5
+ from .utils import format_result
6
+
7
+ __all__ = [
8
+ "FileInput",
9
+ "OnflowAwbOcr",
10
+ "ReceiverExtractor",
11
+ "ReceiverResult",
12
+ "format_result",
13
+ ]
@@ -0,0 +1,209 @@
1
+ """PDF text-layer receiver extraction strategies."""
2
+
3
+ import re
4
+ from typing import Any, Callable, Dict, List, Optional, Tuple
5
+
6
+ from .types import ReceiverResult, Span
7
+ from .utils import is_stop
8
+
9
+
10
# "Người nhận" (receiver) label followed on the same line by its value,
# captured in group 1; the ":" or "-" separator is optional.
_RE_NR_INLINE = re.compile(
    r"(?:Ng[uư][ờơ]i|Nguoi)\s*nh[ậa]n\s*[:\-]?\s*(.+)",
    re.IGNORECASE,
)
# A line consisting only of the "Đến" (to) label, with an optional colon.
_RE_DEN = re.compile(r"^(?:Đ[eế]n|Den)\s*:?\s*$", re.IGNORECASE)
# The "Đến:" label (colon required) with surrounding whitespace; not anchored.
_RE_DEN_INLINE = re.compile(r"(?:Đ[eế]n|Den)\s*:\s*", re.IGNORECASE)
# A line consisting only of the "Từ" (from) label, with an optional colon.
_RE_TU = re.compile(r"^(?:T[uừ]|Tu)\s*:?\s*$", re.IGNORECASE)
# "Người nhận" label without a value capture.
_RE_NR_LABEL = re.compile(r"(?:Ng[uư][ờơ]i|Nguoi)\s*nh[ậa]n", re.IGNORECASE)
# "Người gửi" (sender) label.
_RE_SENDER_LABEL = re.compile(r"(?:Ng[uư][ờơ]i|Nguoi)\s*g[uư][ửừ]i", re.IGNORECASE)
# A named extraction strategy: (strategy name, function over the page spans).
_Strategy = Tuple[str, Callable[[List[Span]], Optional[ReceiverResult]]]
20
+
21
+
22
def get_spans(page: Any) -> List[Span]:
    """Collect the page's text lines as ``(y, x, size, text)`` tuples.

    For every non-empty line of a text block (``type == 0``): ``y`` and
    ``size`` come from the line's first span, ``x`` is the leftmost span
    origin, and ``text`` is the concatenation of all span texts, stripped.
    ``y`` and ``x`` are rounded to whole units, ``size`` to one decimal.

    Returns:
        The tuples sorted top-to-bottom, then left-to-right.
    """
    collected = []
    text_blocks = (block for block in page.get_text("dict")["blocks"] if block["type"] == 0)
    for block in text_blocks:
        for line in block["lines"]:
            parts = line["spans"]
            if not parts:
                continue
            content = "".join(part["text"] for part in parts).strip()
            if not content:
                continue
            first = parts[0]
            leftmost = min(part["origin"][0] for part in parts)
            collected.append(
                (round(first["origin"][1], 0), round(leftmost, 0), round(first["size"], 1), content)
            )
    collected.sort(key=lambda item: item[:2])
    return collected
39
+
40
+
41
def extract_lex(spans: List[Span]) -> Optional[ReceiverResult]:
    """LEX / Lazada Logistics layout: the name follows "Người nhận:" inline.

    The first span that starts with the receiver label and carries a
    non-empty value gives the name. Following spans are collected as address
    lines until a sender label or a stop keyword is reached.

    Returns:
        ``{"name", "address"}`` or ``None`` when no inline label is found.
    """
    name = ""
    anchor_idx = None
    for idx, span in enumerate(spans):
        found = _RE_NR_INLINE.match(span[3].strip())
        if found is None:
            continue
        name = found.group(1).strip()
        if name:
            anchor_idx = idx
            break

    if anchor_idx is None:
        return None

    address = []
    for span in spans[anchor_idx + 1 :]:
        value = span[3].strip()
        if not value:
            continue
        if _RE_SENDER_LABEL.match(value) or is_stop(value):
            break
        address.append(value)

    return {"name": name, "address": "\n".join(address)}
66
+
67
+
68
def extract_shopee(spans: List[Span]) -> Optional[ReceiverResult]:
    """Shopee SPX / Ahamove: right column below "Đến:".

    Two label shapes are handled:

    * "Đến: <value>" on one line — the value is the name and the spans below
      it (not left of the label) are address lines.
    * a standalone "Đến" label — the spans below it and at or right of its
      x position are grouped by row; the first row is the name.

    Returns:
        ``{"name", "address"}`` or ``None`` when no label or no text is found.
    """
    den_y = None
    den_x = None

    for y, x, _size, text in spans:
        stripped = text.strip()
        # Standalone label: remember its position and use the column logic below.
        if _RE_DEN.match(stripped):
            den_y = y
            den_x = x
            break

        # Inline label: the rest of the line is the receiver name.
        if _RE_DEN_INLINE.match(stripped):
            after = _RE_DEN_INLINE.sub("", text, count=1).strip()
            if after:
                den_y = y
                den_x = x
                lines = [after]
                for y2, x2, _size2, text2 in spans:
                    # Only spans strictly below the label line are considered.
                    if y2 <= den_y:
                        continue
                    # Spans left of the label column are skipped, but a stop
                    # keyword there still ends the block.
                    if x2 < den_x - 10:
                        if is_stop(text2):
                            break
                        continue
                    if is_stop(text2):
                        break
                    lines.append(text2.strip())
                # ``lines`` always holds at least ``after`` at this point.
                if lines:
                    return {"name": lines[0], "address": "\n".join(lines[1:])}

    if den_y is None or den_x is None:
        return None

    # Row (y) -> texts found in the column at or right of the label.
    right_lines = {}  # type: Dict[float, List[str]]
    # y of the first stop keyword seen so far; rows from there on are ignored.
    stop_y = float("inf")
    for y, x, _size, text in spans:
        if y <= den_y:
            continue
        if x < den_x - 5:
            # Left-column text is not collected, but a stop keyword there
            # still bounds the block vertically.
            if is_stop(text):
                stop_y = min(stop_y, y)
            continue
        if y >= stop_y:
            break
        if is_stop(text):
            stop_y = min(stop_y, y)
            break
        # Skip repeated "Đến" / "Từ" labels inside the column.
        if _RE_DEN.match(text.strip()) or _RE_TU.match(text.strip()):
            continue
        right_lines.setdefault(y, []).append(text.strip())

    # Merge the texts of each row into one line, top to bottom.
    lines = []
    for y in sorted(right_lines):
        merged = " ".join(right_lines[y]).strip()
        if merged:
            lines.append(merged)

    if not lines:
        return None
    return {"name": lines[0], "address": "\n".join(lines[1:])}
129
+
130
+
131
def extract_jt_vtp(spans: List[Span]) -> Optional[ReceiverResult]:
    """J&T / Viettel Post: small "Người nhận" label, larger name/address below.

    The anchor is the first span containing the receiver label with a font
    size below 9. The name is taken from the next non-empty span, and every
    later row becomes an address line until a stop keyword is reached.

    Returns:
        ``{"name", "address"}`` or ``None`` when no label or name is found.
    """
    anchor_idx = None
    for idx, (_y, _x, size, text) in enumerate(spans):
        if _RE_NR_LABEL.search(text.strip()) and size < 9:
            anchor_idx = idx
            break

    if anchor_idx is None:
        return None

    anchor_y = spans[anchor_idx][0]
    anchor_x = spans[anchor_idx][1]

    name = ""
    name_y = None
    for y, x, size, text in spans[anchor_idx + 1 :]:
        value = text.strip()
        if not value:
            continue
        if is_stop(value):
            break
        # Name on the same row as the label, to its right.
        if abs(y - anchor_y) < 5 and x > anchor_x:
            name = value
            name_y = y
            break
        # Name on a later row, printed larger than the label.
        if y > anchor_y and size > 8:
            name = value
            name_y = y
            break
        # NOTE(review): this unconditional break means only the first
        # non-empty span after the label is ever examined — confirm that
        # giving up here (instead of scanning further) is intended.
        break

    if not name:
        return None

    address_lines = []
    for y, _x, _size, text in spans:
        # Only rows strictly below the name row belong to the address.
        if name_y is None or y <= name_y:
            continue
        value = text.strip()
        if not value:
            continue
        if is_stop(value):
            break
        # Skip lines starting like "(+84)..." or "09****" (presumably masked
        # phone numbers — TODO confirm).
        if re.match(r"^\(\+\d+\)|^0\d+\*+", value):
            continue
        address_lines.append(value)

    return {"name": name, "address": "\n".join(address_lines)}
180
+
181
+
182
def extract_from_text_layer(pdf_path: str) -> Optional[ReceiverResult]:
    """Extract the receiver from the text layer of a PDF's first page.

    Only the first page is inspected. The layout strategies are tried in a
    fixed order (``lex``, ``shopee``, ``jt_vtp``) and the first one that
    yields a non-empty name wins; its name is stored under ``"strategy"``.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        ``{"name", "address", "strategy"}`` or ``None`` when the document has
        no pages, the page has no text, or no strategy matches.

    Raises:
        RuntimeError: If PyMuPDF (``fitz``) is not installed.
    """
    try:
        import fitz
    except ImportError as exc:
        raise RuntimeError("PyMuPDF is required to read PDF text layers.") from exc

    doc = fitz.open(pdf_path)
    try:
        # A document without pages would make ``doc[0]`` raise; report
        # "nothing found" instead so the caller can fall back to OCR.
        if len(doc) == 0:
            return None
        spans = get_spans(doc[0])
    finally:
        doc.close()

    if not spans:
        return None

    strategies = (  # type: Tuple[_Strategy, ...]
        ("lex", extract_lex),
        ("shopee", extract_shopee),
        ("jt_vtp", extract_jt_vtp),
    )
    for strategy_name, strategy in strategies:
        result = strategy(spans)
        if result and result["name"]:
            result["strategy"] = strategy_name
            return result

    return None
@@ -0,0 +1,9 @@
1
+ """Shared type aliases for the SDK."""
2
+
3
+ from pathlib import Path
4
+ from typing import BinaryIO, Dict, Tuple, Union
5
+
6
+
7
# Extraction result as a plain string-to-string mapping.
# NOTE(review): presumably carries the keys "name", "address" and "strategy" —
# confirm against the extraction functions that build it.
ReceiverResult = Dict[str, str]
# Every input kind accepted by the SDK: a path (str or Path) or URL string,
# raw bytes/bytearray, or a binary file-like object.
FileInput = Union[str, Path, bytes, bytearray, BinaryIO]
# A tuple of three floats and a string.
# NOTE(review): presumably one PDF text line as (y, x, font size, text) —
# confirm against the code that produces spans.
Span = Tuple[float, float, float, str]
@@ -0,0 +1,45 @@
1
+ """Shared text helpers."""
2
+
3
+ import re
4
+ import unicodedata
5
+ from typing import Iterable, Optional
6
+
7
+ from .constants import STOP_KEYWORDS
8
+ from .types import ReceiverResult
9
+
10
# Anything outside lowercase ASCII letters, digits, whitespace and ":/.-".
_SAFE_TEXT_RE = re.compile(r"[^a-z0-9\s:/.-]")
# Runs of whitespace, collapsed to a single space.
_WHITESPACE_RE = re.compile(r"\s+")


def normalize(text: str) -> str:
    """Fold ``text`` to a lowercase, accent-free ASCII form for keyword matching.

    Steps: strip and lowercase, decompose with NFKD and drop combining marks,
    fold the Vietnamese "đ" to "d", replace every remaining character outside
    ``[a-z0-9\\s:/.-]`` with a space, then collapse whitespace.

    Args:
        text: Arbitrary label text.

    Returns:
        The normalized string (possibly empty).
    """
    lowered = text.strip().lower()
    decomposed = unicodedata.normalize("NFKD", lowered)
    without_marks = "".join(char for char in decomposed if not unicodedata.combining(char))
    # "đ" (U+0111) has no Unicode decomposition, so NFKD leaves it untouched.
    # Without this fold it was replaced by a space below, and unaccented text
    # ("ma van don") never matched the accented keyword ("mã vận đơn").
    # "Đ" is already covered because lower() maps it to "đ".
    folded = without_marks.replace("\u0111", "d")
    safe_text = _SAFE_TEXT_RE.sub(" ", folded)
    return _WHITESPACE_RE.sub(" ", safe_text).strip()
19
+
20
+
21
def _normalize_all(values: Iterable[str]) -> tuple:
    """Return the normalized form of every value, as a tuple in input order."""
    return tuple(map(normalize, values))


# Stop keywords normalized once at import time so that each ``is_stop`` call
# only has to normalize the line being checked.
_NORMALIZED_STOP_KEYWORDS = _normalize_all(STOP_KEYWORDS)
26
+
27
+
28
def is_stop(line: str) -> bool:
    """Tell whether ``line`` contains a stop keyword.

    Both sides are compared in normalized form, and the keyword only has to
    appear as a substring of the line.

    NOTE(review): substring matching has no word boundaries, so a keyword can
    also match inside a longer word — confirm that is acceptable for names
    and street names.
    """
    normalized_line = normalize(line)
    for keyword in _NORMALIZED_STOP_KEYWORDS:
        if keyword in normalized_line:
            return True
    return False
31
+
32
+
33
def is_url(value: str) -> bool:
    """Tell whether ``value`` is an HTTP or HTTPS URL.

    URL schemes are case-insensitive, so ``HTTPS://...`` is recognised as
    well instead of being mistaken for a local file path.
    """
    # Only the first eight characters can hold the longest prefix ("https://").
    return value[:8].lower().startswith(("http://", "https://"))
35
+
36
+
37
def format_result(result: Optional[ReceiverResult]) -> str:
    """Render a result as text: the name, then the address on following lines.

    A falsy ``result`` gives a fixed "not found" message. Empty parts are
    left out; the address is stripped, the name is used as-is.
    """
    if not result:
        return "Không tìm thấy thông tin receiver."

    name = result.get("name", "")
    address = result.get("address", "").strip()
    return "\n".join(filter(None, (name, address)))
@@ -0,0 +1,141 @@
1
+ Metadata-Version: 2.4
2
+ Name: onflow-awb-ocr
3
+ Version: 0.1.0
4
+ Summary: Python SDK for extracting receiver information from AWB/shipping labels.
5
+ Requires-Python: >=3.8
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: PyMuPDF>=1.20
8
+ Requires-Dist: requests>=2.25
9
+ Provides-Extra: ocr
10
+ Requires-Dist: opencv-python>=4.5; extra == "ocr"
11
+ Requires-Dist: pytesseract>=0.3; extra == "ocr"
12
+ Requires-Dist: numpy>=1.21; extra == "ocr"
13
+
14
+ # Onflow AWB OCR
15
+
16
+ Python SDK for extracting receiver information from AWB and shipping label files.
17
+
18
+ The package first tries to read the PDF text layer, then falls back to OCR for
19
+ scanned PDFs and image files when the OCR dependencies are installed.
20
+
21
+ ## Requirements
22
+
23
+ - Python 3.8+
24
+ - PyMuPDF for PDF text-layer extraction
25
+ - Optional OCR stack for scanned files and images:
26
+ - Tesseract OCR
27
+ - Vietnamese Tesseract language data
28
+ - Poppler `pdftoppm`
29
+
30
+ ## Installation
31
+
32
+ Install from PyPI:
33
+
34
+ ```bash
35
+ pip install onflow-awb-ocr
36
+ ```
37
+
38
+ Install with OCR dependencies:
39
+
40
+ ```bash
41
+ pip install "onflow-awb-ocr[ocr]"
42
+ ```
43
+
44
+ On Ubuntu, install the native OCR tools:
45
+
46
+ ```bash
47
+ sudo apt install -y tesseract-ocr tesseract-ocr-vie poppler-utils
48
+ ```
49
+
50
+ For local development:
51
+
52
+ ```bash
53
+ pip install -e ".[ocr]"
54
+ ```
55
+
56
+ ## Usage
57
+
58
+ ```python
59
+ from onflow_awb_ocr import OnflowAwbOcr
60
+
61
+ ocr = OnflowAwbOcr(lang="vie+eng")
62
+ result = ocr.extract("label.pdf")
63
+
64
+ print(result)
65
+ ```
66
+
67
+ Example result:
68
+
69
+ ```python
70
+ {
71
+ "name": "Nguyen Van A",
72
+ "address": "123 Nguyen Trai\nQuan 1, TP. Ho Chi Minh",
73
+ "strategy": "shopee",
74
+ }
75
+ ```
76
+
77
+ If no receiver can be detected, `extract()` returns `None`.
78
+
79
+ ## Supported Inputs
80
+
81
+ `extract()` accepts:
82
+
83
+ - Local file path as `str`
84
+ - Local file path as `pathlib.Path`
85
+ - HTTP/HTTPS URL
86
+ - `bytes`
87
+ - `bytearray`
88
+ - Binary file-like object
89
+
90
+ Examples:
91
+
92
+ ```python
93
+ from pathlib import Path
94
+
95
+ from onflow_awb_ocr import OnflowAwbOcr
96
+
97
+ ocr = OnflowAwbOcr()
98
+
99
+ from_path = ocr.extract(Path("label.pdf"))
100
+ from_url = ocr.extract("https://example.com/label.pdf")
101
+
102
+ with open("label.pdf", "rb") as file:
103
+ from_file = ocr.extract(file)
104
+
105
+ with open("label.png", "rb") as file:
106
+ from_bytes = ocr.extract(file.read())
107
+ ```
108
+
109
+ ## Compatibility
110
+
111
+ The old `ReceiverExtractor` class name is still available as an alias:
112
+
113
+ ```python
114
+ from onflow_awb_ocr import ReceiverExtractor
115
+
116
+ ocr = ReceiverExtractor()
117
+ result = ocr.extract("label.pdf")
118
+ ```
119
+
120
+ ## Package Structure
121
+
122
+ - `extractor.py`: public `OnflowAwbOcr` class
123
+ - `input.py`: input preparation for paths, URLs, bytes, and binary streams
124
+ - `text_layer.py`: PDF text-layer extraction strategies
125
+ - `ocr.py`: OCR fallback for scanned PDFs and images
126
+ - `postprocess.py`: address cleanup
127
+ - `types.py`, `constants.py`, `utils.py`: shared types, constants, and helpers
128
+
129
+ ## Publishing
130
+
131
+ GitHub Actions builds and publishes the package to PyPI on every push to `main`.
132
+
133
+ The repository must define this GitHub secret:
134
+
135
+ ```text
136
+ PYPI_API_TOKEN
137
+ ```
138
+
139
+ PyPI does not allow replacing an existing version. If a commit on `main` does not
140
+ bump `project.version` in `pyproject.toml`, the publish step skips the existing
141
+ distribution.
@@ -0,0 +1,18 @@
1
+ README.md
2
+ pyproject.toml
3
+ onflow_awb_ocr/__init__.py
4
+ onflow_awb_ocr/constants.py
5
+ onflow_awb_ocr/extractor.py
6
+ onflow_awb_ocr/input.py
7
+ onflow_awb_ocr/ocr.py
8
+ onflow_awb_ocr/postprocess.py
9
+ onflow_awb_ocr/py.typed
10
+ onflow_awb_ocr/receiver.py
11
+ onflow_awb_ocr/text_layer.py
12
+ onflow_awb_ocr/types.py
13
+ onflow_awb_ocr/utils.py
14
+ onflow_awb_ocr.egg-info/PKG-INFO
15
+ onflow_awb_ocr.egg-info/SOURCES.txt
16
+ onflow_awb_ocr.egg-info/dependency_links.txt
17
+ onflow_awb_ocr.egg-info/requires.txt
18
+ onflow_awb_ocr.egg-info/top_level.txt
@@ -0,0 +1,7 @@
1
+ PyMuPDF>=1.20
2
+ requests>=2.25
3
+
4
+ [ocr]
5
+ opencv-python>=4.5
6
+ pytesseract>=0.3
7
+ numpy>=1.21
@@ -0,0 +1 @@
1
+ onflow_awb_ocr
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "onflow-awb-ocr"
7
+ version = "0.1.0"
8
+ description = "Python SDK for extracting receiver information from AWB/shipping labels."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ dependencies = [
12
+ "PyMuPDF>=1.20",
13
+ "requests>=2.25",
14
+ ]
15
+
16
+ [project.optional-dependencies]
17
+ ocr = [
18
+ "opencv-python>=4.5",
19
+ "pytesseract>=0.3",
20
+ "numpy>=1.21",
21
+ ]
22
+
23
+ [tool.setuptools.packages.find]
24
+ include = ["onflow_awb_ocr*"]
25
+
26
+ [tool.setuptools.package-data]
27
+ onflow_awb_ocr = ["py.typed"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+