awb-extractor 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awb_extractor-0.1.1/PKG-INFO +179 -0
- awb_extractor-0.1.1/README.md +166 -0
- awb_extractor-0.1.1/awb_extractor/__init__.py +18 -0
- awb_extractor-0.1.1/awb_extractor/exceptions.py +22 -0
- awb_extractor-0.1.1/awb_extractor/extractor.py +129 -0
- awb_extractor-0.1.1/awb_extractor/models.py +37 -0
- awb_extractor-0.1.1/awb_extractor.egg-info/PKG-INFO +179 -0
- awb_extractor-0.1.1/awb_extractor.egg-info/SOURCES.txt +12 -0
- awb_extractor-0.1.1/awb_extractor.egg-info/dependency_links.txt +1 -0
- awb_extractor-0.1.1/awb_extractor.egg-info/requires.txt +6 -0
- awb_extractor-0.1.1/awb_extractor.egg-info/top_level.txt +1 -0
- awb_extractor-0.1.1/pyproject.toml +25 -0
- awb_extractor-0.1.1/setup.cfg +4 -0
- awb_extractor-0.1.1/tests/test_extractor.py +170 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: awb-extractor
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Extract recipient address from AWB/shipping label PDF using Claude AI
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: anthropic>=0.40.0
|
|
9
|
+
Requires-Dist: httpx>=0.27.0
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
12
|
+
Requires-Dist: pytest-mock>=3.14; extra == "dev"
|
|
13
|
+
|
|
14
|
+
# AWB Extractor
|
|
15
|
+
|
|
16
|
+
Python SDK for extracting receiver and shipment information from AWB/shipping
|
|
17
|
+
label PDF files using Claude AI.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- Extract from PDF bytes, local PDF files, or PDF URLs
|
|
22
|
+
- Batch extraction from multiple URLs
|
|
23
|
+
- Optional default HTTP headers for protected AWB URLs
|
|
24
|
+
- Typed `AWBResult` dataclass output
|
|
25
|
+
- Custom exceptions for API key, PDF download, and JSON parsing failures
|
|
26
|
+
|
|
27
|
+
## Requirements
|
|
28
|
+
|
|
29
|
+
- Python 3.10+
|
|
30
|
+
- Anthropic API key
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
Install from PyPI:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install awb-extractor
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
For local development:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install -e ".[dev]"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from awb_extractor import AWBExtractor
|
|
50
|
+
|
|
51
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
52
|
+
result = extractor.from_file("label.pdf")
|
|
53
|
+
|
|
54
|
+
print(result.recipient_name)
|
|
55
|
+
print(result.to_dict())
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Example result:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
{
|
|
62
|
+
"tracking_number": "NHSVC972103440",
|
|
63
|
+
"recipient_name": "Nguyen Van A",
|
|
64
|
+
"recipient_phone": "(+84)03******37",
|
|
65
|
+
"recipient_address": "237 Nguyen Trai",
|
|
66
|
+
"recipient_ward": "Phuong Ben Thanh",
|
|
67
|
+
"recipient_district": "Quan 1",
|
|
68
|
+
"recipient_province": "TP. Ho Chi Minh",
|
|
69
|
+
"sender_name": "Onflow",
|
|
70
|
+
"sender_address": "TP. Ho Chi Minh",
|
|
71
|
+
"cod": "0",
|
|
72
|
+
"weight": "0.700 KG",
|
|
73
|
+
"order_id": "584425059595159079",
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Supported Inputs
|
|
78
|
+
|
|
79
|
+
### PDF bytes
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from awb_extractor import AWBExtractor
|
|
83
|
+
|
|
84
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
85
|
+
|
|
86
|
+
with open("label.pdf", "rb") as file:
|
|
87
|
+
result = extractor.from_bytes(file.read())
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Local PDF file
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from awb_extractor import AWBExtractor
|
|
94
|
+
|
|
95
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
96
|
+
result = extractor.from_file("label.pdf")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### PDF URL
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from awb_extractor import AWBExtractor
|
|
103
|
+
|
|
104
|
+
extractor = AWBExtractor(
|
|
105
|
+
api_key="sk-ant-...",
|
|
106
|
+
http_headers={"Authorization": "Bearer token"},
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
result = extractor.from_url("https://example.com/awb.pdf")
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
You can pass request-specific headers with `extra_headers`:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
result = extractor.from_url(
|
|
116
|
+
"https://example.com/awb.pdf",
|
|
117
|
+
extra_headers={"X-Request-ID": "request-123"},
|
|
118
|
+
)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Multiple URLs
|
|
122
|
+
|
|
123
|
+
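`from_urls()` returns a list of dictionaries, one per input URL, with the keys `url`, `data` (an `AWBResult`, or `None` if that URL failed), and `error` (the error message, or `None` on success).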
|
|
124
|
+
Failed URLs do not stop the whole batch.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from awb_extractor import AWBExtractor
|
|
128
|
+
|
|
129
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
130
|
+
results = extractor.from_urls([
|
|
131
|
+
"https://example.com/good.pdf",
|
|
132
|
+
"https://example.com/bad.pdf",
|
|
133
|
+
])
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Result Fields
|
|
137
|
+
|
|
138
|
+
`AWBResult` includes:
|
|
139
|
+
|
|
140
|
+
- `tracking_number`
|
|
141
|
+
- `recipient_name`
|
|
142
|
+
- `recipient_phone`
|
|
143
|
+
- `recipient_address`
|
|
144
|
+
- `recipient_ward`
|
|
145
|
+
- `recipient_district`
|
|
146
|
+
- `recipient_province`
|
|
147
|
+
- `sender_name`
|
|
148
|
+
- `sender_address`
|
|
149
|
+
- `cod`
|
|
150
|
+
- `weight`
|
|
151
|
+
- `order_id`
|
|
152
|
+
|
|
153
|
+
Use `to_dict()` or `to_json()` to serialize the result.
|
|
154
|
+
|
|
155
|
+
## Exceptions
|
|
156
|
+
|
|
157
|
+
- `APIKeyError`: missing API key
|
|
158
|
+
- `PDFDownloadError`: PDF URL download failed
|
|
159
|
+
- `ExtractionError`: Claude response could not be parsed as JSON
|
|
160
|
+
|
|
161
|
+
## Package Structure
|
|
162
|
+
|
|
163
|
+
- `awb_extractor/extractor.py`: public `AWBExtractor` class
|
|
164
|
+
- `awb_extractor/models.py`: `AWBResult` dataclass
|
|
165
|
+
- `awb_extractor/exceptions.py`: package exceptions
|
|
166
|
+
|
|
167
|
+
## Publishing
|
|
168
|
+
|
|
169
|
+
GitHub Actions builds and publishes the package to PyPI on every push to `main`.
|
|
170
|
+
|
|
171
|
+
The repository must define this GitHub secret:
|
|
172
|
+
|
|
173
|
+
```text
|
|
174
|
+
PYPI_API_TOKEN
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
PyPI does not allow replacing an existing version. If a commit on `main` does not
|
|
178
|
+
bump `project.version` in `pyproject.toml`, the publish step skips the existing
|
|
179
|
+
distribution.
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# AWB Extractor
|
|
2
|
+
|
|
3
|
+
Python SDK for extracting receiver and shipment information from AWB/shipping
|
|
4
|
+
label PDF files using Claude AI.
|
|
5
|
+
|
|
6
|
+
## Features
|
|
7
|
+
|
|
8
|
+
- Extract from PDF bytes, local PDF files, or PDF URLs
|
|
9
|
+
- Batch extraction from multiple URLs
|
|
10
|
+
- Optional default HTTP headers for protected AWB URLs
|
|
11
|
+
- Typed `AWBResult` dataclass output
|
|
12
|
+
- Custom exceptions for API key, PDF download, and JSON parsing failures
|
|
13
|
+
|
|
14
|
+
## Requirements
|
|
15
|
+
|
|
16
|
+
- Python 3.10+
|
|
17
|
+
- Anthropic API key
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
Install from PyPI:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install awb-extractor
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
For local development:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install -e ".[dev]"
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from awb_extractor import AWBExtractor
|
|
37
|
+
|
|
38
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
39
|
+
result = extractor.from_file("label.pdf")
|
|
40
|
+
|
|
41
|
+
print(result.recipient_name)
|
|
42
|
+
print(result.to_dict())
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Example result:
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
{
|
|
49
|
+
"tracking_number": "NHSVC972103440",
|
|
50
|
+
"recipient_name": "Nguyen Van A",
|
|
51
|
+
"recipient_phone": "(+84)03******37",
|
|
52
|
+
"recipient_address": "237 Nguyen Trai",
|
|
53
|
+
"recipient_ward": "Phuong Ben Thanh",
|
|
54
|
+
"recipient_district": "Quan 1",
|
|
55
|
+
"recipient_province": "TP. Ho Chi Minh",
|
|
56
|
+
"sender_name": "Onflow",
|
|
57
|
+
"sender_address": "TP. Ho Chi Minh",
|
|
58
|
+
"cod": "0",
|
|
59
|
+
"weight": "0.700 KG",
|
|
60
|
+
"order_id": "584425059595159079",
|
|
61
|
+
}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Supported Inputs
|
|
65
|
+
|
|
66
|
+
### PDF bytes
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from awb_extractor import AWBExtractor
|
|
70
|
+
|
|
71
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
72
|
+
|
|
73
|
+
with open("label.pdf", "rb") as file:
|
|
74
|
+
result = extractor.from_bytes(file.read())
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Local PDF file
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from awb_extractor import AWBExtractor
|
|
81
|
+
|
|
82
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
83
|
+
result = extractor.from_file("label.pdf")
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### PDF URL
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from awb_extractor import AWBExtractor
|
|
90
|
+
|
|
91
|
+
extractor = AWBExtractor(
|
|
92
|
+
api_key="sk-ant-...",
|
|
93
|
+
http_headers={"Authorization": "Bearer token"},
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
result = extractor.from_url("https://example.com/awb.pdf")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
You can pass request-specific headers with `extra_headers`:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
result = extractor.from_url(
|
|
103
|
+
"https://example.com/awb.pdf",
|
|
104
|
+
extra_headers={"X-Request-ID": "request-123"},
|
|
105
|
+
)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Multiple URLs
|
|
109
|
+
|
|
110
|
+
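`from_urls()` returns a list of dictionaries, one per input URL, with the keys `url`, `data` (an `AWBResult`, or `None` if that URL failed), and `error` (the error message, or `None` on success).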
|
|
111
|
+
Failed URLs do not stop the whole batch.
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from awb_extractor import AWBExtractor
|
|
115
|
+
|
|
116
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
117
|
+
results = extractor.from_urls([
|
|
118
|
+
"https://example.com/good.pdf",
|
|
119
|
+
"https://example.com/bad.pdf",
|
|
120
|
+
])
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Result Fields
|
|
124
|
+
|
|
125
|
+
`AWBResult` includes:
|
|
126
|
+
|
|
127
|
+
- `tracking_number`
|
|
128
|
+
- `recipient_name`
|
|
129
|
+
- `recipient_phone`
|
|
130
|
+
- `recipient_address`
|
|
131
|
+
- `recipient_ward`
|
|
132
|
+
- `recipient_district`
|
|
133
|
+
- `recipient_province`
|
|
134
|
+
- `sender_name`
|
|
135
|
+
- `sender_address`
|
|
136
|
+
- `cod`
|
|
137
|
+
- `weight`
|
|
138
|
+
- `order_id`
|
|
139
|
+
|
|
140
|
+
Use `to_dict()` or `to_json()` to serialize the result.
|
|
141
|
+
|
|
142
|
+
## Exceptions
|
|
143
|
+
|
|
144
|
+
- `APIKeyError`: missing API key
|
|
145
|
+
- `PDFDownloadError`: PDF URL download failed
|
|
146
|
+
- `ExtractionError`: Claude response could not be parsed as JSON
|
|
147
|
+
|
|
148
|
+
## Package Structure
|
|
149
|
+
|
|
150
|
+
- `awb_extractor/extractor.py`: public `AWBExtractor` class
|
|
151
|
+
- `awb_extractor/models.py`: `AWBResult` dataclass
|
|
152
|
+
- `awb_extractor/exceptions.py`: package exceptions
|
|
153
|
+
|
|
154
|
+
## Publishing
|
|
155
|
+
|
|
156
|
+
GitHub Actions builds and publishes the package to PyPI on every push to `main`.
|
|
157
|
+
|
|
158
|
+
The repository must define this GitHub secret:
|
|
159
|
+
|
|
160
|
+
```text
|
|
161
|
+
PYPI_API_TOKEN
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
PyPI does not allow replacing an existing version. If a commit on `main` does not
|
|
165
|
+
bump `project.version` in `pyproject.toml`, the publish step skips the existing
|
|
166
|
+
distribution.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from .extractor import AWBExtractor
|
|
2
|
+
from .models import AWBResult
|
|
3
|
+
from .exceptions import (
|
|
4
|
+
AWBExtractorError,
|
|
5
|
+
APIKeyError,
|
|
6
|
+
PDFDownloadError,
|
|
7
|
+
ExtractionError,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
__version__ = "0.1.1"
|
|
11
|
+
__all__ = [
|
|
12
|
+
"AWBExtractor",
|
|
13
|
+
"AWBResult",
|
|
14
|
+
"AWBExtractorError",
|
|
15
|
+
"APIKeyError",
|
|
16
|
+
"PDFDownloadError",
|
|
17
|
+
"ExtractionError",
|
|
18
|
+
]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
class AWBExtractorError(Exception):
|
|
2
|
+
pass
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class APIKeyError(AWBExtractorError):
|
|
6
|
+
pass
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PDFDownloadError(AWBExtractorError):
|
|
10
|
+
def __init__(self, url: str, status_code: int = None, message: str = None):
|
|
11
|
+
self.url = url
|
|
12
|
+
self.status_code = status_code
|
|
13
|
+
detail = f"status={status_code}" if status_code else message or "unknown error"
|
|
14
|
+
super().__init__(f"Failed to download PDF from {url!r}: {detail}")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ExtractionError(AWBExtractorError):
|
|
18
|
+
def __init__(self, raw_response: str = None):
|
|
19
|
+
self.raw_response = raw_response
|
|
20
|
+
super().__init__(
|
|
21
|
+
f"Failed to parse Claude response as JSON. Raw: {raw_response!r}"
|
|
22
|
+
)
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import anthropic
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from .exceptions import APIKeyError, ExtractionError, PDFDownloadError
|
|
10
|
+
from .models import AWBResult
|
|
11
|
+
|
|
12
|
+
_SYSTEM_PROMPT = """You are an OCR system that extracts information from shipping labels (AWB).
|
|
13
|
+
Return only raw JSON, no explanation, no markdown. Required format:
|
|
14
|
+
{
|
|
15
|
+
"tracking_number": "",
|
|
16
|
+
"recipient_name": "",
|
|
17
|
+
"recipient_phone": "",
|
|
18
|
+
"recipient_address": "",
|
|
19
|
+
"recipient_ward": "",
|
|
20
|
+
"recipient_district": "",
|
|
21
|
+
"recipient_province": "",
|
|
22
|
+
"sender_name": "",
|
|
23
|
+
"sender_address": "",
|
|
24
|
+
"cod": "",
|
|
25
|
+
"weight": "",
|
|
26
|
+
"order_id": ""
|
|
27
|
+
}"""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AWBExtractor:
    """Extract shipping-label (AWB) fields from a PDF using the Anthropic API.

    The PDF can be supplied as raw bytes, a local file path, or a URL. The
    document is sent to the configured model together with a system prompt
    that requests a JSON object, which is converted to an ``AWBResult``.
    """

    DEFAULT_MODEL = "claude-haiku-4-5-20251001"

    def __init__(
        self,
        api_key: str,
        http_headers: Optional[dict] = None,
        model: str = DEFAULT_MODEL,
        timeout: int = 30,
    ):
        """Create an extractor.

        Args:
            api_key: Anthropic API key; must be non-empty.
            http_headers: Default headers sent with every PDF download.
            model: Model identifier passed to the Messages API.
            timeout: Timeout passed to the HTTP client for PDF downloads.

        Raises:
            APIKeyError: If ``api_key`` is empty or ``None``.
        """
        if not api_key:
            raise APIKeyError("api_key is required.")
        self._client = anthropic.Anthropic(api_key=api_key)
        # Copy so that later mutation of the caller's dict cannot silently
        # change the defaults used by this instance.
        self._http_headers = dict(http_headers or {})
        self._model = model
        self._timeout = timeout

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def from_bytes(self, pdf_bytes: bytes) -> AWBResult:
        """Extract label data from the raw bytes of a PDF.

        Raises:
            ExtractionError: If the model reply is not a JSON object.
        """
        raw_json = self._call_claude(pdf_bytes)
        return AWBResult.from_dict(raw_json)

    def from_file(self, pdf_path: str) -> AWBResult:
        """Read the PDF at ``pdf_path`` and extract label data from it."""
        pdf_bytes = Path(pdf_path).read_bytes()
        return self.from_bytes(pdf_bytes)

    def from_url(self, url: str, extra_headers: Optional[dict] = None) -> AWBResult:
        """Download the PDF at ``url`` and extract label data from it.

        Args:
            url: Location of the PDF.
            extra_headers: Headers for this request only; they override the
                instance-level defaults on key collisions.

        Raises:
            PDFDownloadError: If the download fails.
            ExtractionError: If the model reply is not a JSON object.
        """
        pdf_bytes = self._download(url, extra_headers)
        return self.from_bytes(pdf_bytes)

    def from_urls(
        self,
        urls: list[str],
        extra_headers: Optional[dict] = None,
    ) -> list[dict]:
        """Extract label data from several URLs without aborting on failure.

        Args:
            urls: Locations of the PDFs, processed in order.
            extra_headers: Optional headers applied to every download in the
                batch, on top of the instance-level defaults.

        Returns:
            One dict per input URL, in order, with the keys ``url``, ``data``
            (an ``AWBResult``, or ``None`` on failure) and ``error`` (the
            exception text, or ``None`` on success).
        """
        results = []
        for url in urls:
            try:
                data = self.from_url(url, extra_headers)
            except Exception as e:
                # Deliberate best-effort: any failure is recorded against
                # its URL so the rest of the batch still runs.
                results.append({"url": url, "data": None, "error": str(e)})
            else:
                results.append({"url": url, "data": data, "error": None})
        return results

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _download(self, url: str, extra_headers: Optional[dict] = None) -> bytes:
        """Fetch ``url`` and return the response body.

        Per-call headers are merged over the instance defaults. Redirects are
        followed. HTTP status errors and request errors are both re-raised as
        ``PDFDownloadError`` with the original exception chained.
        """
        headers = {**self._http_headers, **(extra_headers or {})}
        try:
            resp = httpx.get(url, headers=headers, follow_redirects=True, timeout=self._timeout)
            resp.raise_for_status()
            return resp.content
        except httpx.HTTPStatusError as e:
            raise PDFDownloadError(url, status_code=e.response.status_code) from e
        except httpx.RequestError as e:
            raise PDFDownloadError(url, message=str(e)) from e

    def _call_claude(self, pdf_bytes: bytes) -> dict:
        """Send the PDF to the model and return the parsed JSON object.

        Raises:
            ExtractionError: If the reply contains no text, is not valid
                JSON, or is valid JSON that is not an object.
        """
        pdf_base64 = base64.standard_b64encode(pdf_bytes).decode("utf-8")

        response = self._client.messages.create(
            model=self._model,
            max_tokens=1024,
            system=_SYSTEM_PROMPT,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "document",
                            "source": {
                                "type": "base64",
                                "media_type": "application/pdf",
                                "data": pdf_base64,
                            },
                        },
                        {
                            "type": "text",
                            "text": "Extract all information from this shipping label.",
                        },
                    ],
                }
            ],
        )

        # Collect text from every block that carries some instead of
        # indexing content[0]: an empty reply or a non-text first block must
        # surface as ExtractionError, not IndexError/AttributeError.
        raw = "".join(
            text
            for block in response.content
            if isinstance(text := getattr(block, "text", None), str)
        ).strip()
        # The model sometimes wraps the JSON in a markdown fence despite the
        # system prompt; strip the fence markers before parsing.
        raw = raw.replace("```json", "").replace("```", "").strip()

        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError as e:
            raise ExtractionError(raw_response=raw) from e
        # Callers expect a mapping; a JSON list/string/number is unusable.
        if not isinstance(parsed, dict):
            raise ExtractionError(raw_response=raw)
        return parsed
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from dataclasses import dataclass, field, asdict
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class AWBResult:
    """Fields extracted from one AWB/shipping label.

    Every field defaults to ``None``, which also represents a value that was
    blank in the source data.
    """

    tracking_number: Optional[str] = None
    recipient_name: Optional[str] = None
    recipient_phone: Optional[str] = None
    recipient_address: Optional[str] = None
    recipient_ward: Optional[str] = None
    recipient_district: Optional[str] = None
    recipient_province: Optional[str] = None
    sender_name: Optional[str] = None
    sender_address: Optional[str] = None
    cod: Optional[str] = None
    weight: Optional[str] = None
    order_id: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict) -> "AWBResult":
        """Build a result from a mapping, ignoring keys that are not fields.

        Blank (empty or whitespace-only) strings become ``None``. Any other
        value is kept as given, so a meaningful falsy value such as a numeric
        ``0`` is not discarded.
        """
        valid_keys = cls.__dataclass_fields__.keys()
        filtered = {
            key: cls._blank_to_none(value)
            for key, value in data.items()
            if key in valid_keys
        }
        return cls(**filtered)

    @staticmethod
    def _blank_to_none(value):
        """Return ``None`` for a blank string, otherwise ``value`` unchanged."""
        if isinstance(value, str) and not value.strip():
            return None
        return value

    def to_dict(self) -> dict:
        """Return all fields, including unset ones, as a plain dict."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize the fields to JSON, keeping non-ASCII characters as-is."""
        return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent)

    def __repr__(self) -> str:
        """Show only the fields that are set, in declaration order."""
        fields = ", ".join(
            f"{k}={v!r}" for k, v in self.to_dict().items() if v is not None
        )
        return f"AWBResult({fields})"
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: awb-extractor
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Extract recipient address from AWB/shipping label PDF using Claude AI
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: anthropic>=0.40.0
|
|
9
|
+
Requires-Dist: httpx>=0.27.0
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
12
|
+
Requires-Dist: pytest-mock>=3.14; extra == "dev"
|
|
13
|
+
|
|
14
|
+
# AWB Extractor
|
|
15
|
+
|
|
16
|
+
Python SDK for extracting receiver and shipment information from AWB/shipping
|
|
17
|
+
label PDF files using Claude AI.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- Extract from PDF bytes, local PDF files, or PDF URLs
|
|
22
|
+
- Batch extraction from multiple URLs
|
|
23
|
+
- Optional default HTTP headers for protected AWB URLs
|
|
24
|
+
- Typed `AWBResult` dataclass output
|
|
25
|
+
- Custom exceptions for API key, PDF download, and JSON parsing failures
|
|
26
|
+
|
|
27
|
+
## Requirements
|
|
28
|
+
|
|
29
|
+
- Python 3.10+
|
|
30
|
+
- Anthropic API key
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
Install from PyPI:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install awb-extractor
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
For local development:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install -e ".[dev]"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from awb_extractor import AWBExtractor
|
|
50
|
+
|
|
51
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
52
|
+
result = extractor.from_file("label.pdf")
|
|
53
|
+
|
|
54
|
+
print(result.recipient_name)
|
|
55
|
+
print(result.to_dict())
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Example result:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
{
|
|
62
|
+
"tracking_number": "NHSVC972103440",
|
|
63
|
+
"recipient_name": "Nguyen Van A",
|
|
64
|
+
"recipient_phone": "(+84)03******37",
|
|
65
|
+
"recipient_address": "237 Nguyen Trai",
|
|
66
|
+
"recipient_ward": "Phuong Ben Thanh",
|
|
67
|
+
"recipient_district": "Quan 1",
|
|
68
|
+
"recipient_province": "TP. Ho Chi Minh",
|
|
69
|
+
"sender_name": "Onflow",
|
|
70
|
+
"sender_address": "TP. Ho Chi Minh",
|
|
71
|
+
"cod": "0",
|
|
72
|
+
"weight": "0.700 KG",
|
|
73
|
+
"order_id": "584425059595159079",
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Supported Inputs
|
|
78
|
+
|
|
79
|
+
### PDF bytes
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from awb_extractor import AWBExtractor
|
|
83
|
+
|
|
84
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
85
|
+
|
|
86
|
+
with open("label.pdf", "rb") as file:
|
|
87
|
+
result = extractor.from_bytes(file.read())
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Local PDF file
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from awb_extractor import AWBExtractor
|
|
94
|
+
|
|
95
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
96
|
+
result = extractor.from_file("label.pdf")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### PDF URL
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from awb_extractor import AWBExtractor
|
|
103
|
+
|
|
104
|
+
extractor = AWBExtractor(
|
|
105
|
+
api_key="sk-ant-...",
|
|
106
|
+
http_headers={"Authorization": "Bearer token"},
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
result = extractor.from_url("https://example.com/awb.pdf")
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
You can pass request-specific headers with `extra_headers`:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
result = extractor.from_url(
|
|
116
|
+
"https://example.com/awb.pdf",
|
|
117
|
+
extra_headers={"X-Request-ID": "request-123"},
|
|
118
|
+
)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Multiple URLs
|
|
122
|
+
|
|
123
|
+
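`from_urls()` returns a list of dictionaries, one per input URL, with the keys `url`, `data` (an `AWBResult`, or `None` if that URL failed), and `error` (the error message, or `None` on success).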
|
|
124
|
+
Failed URLs do not stop the whole batch.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from awb_extractor import AWBExtractor
|
|
128
|
+
|
|
129
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
130
|
+
results = extractor.from_urls([
|
|
131
|
+
"https://example.com/good.pdf",
|
|
132
|
+
"https://example.com/bad.pdf",
|
|
133
|
+
])
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Result Fields
|
|
137
|
+
|
|
138
|
+
`AWBResult` includes:
|
|
139
|
+
|
|
140
|
+
- `tracking_number`
|
|
141
|
+
- `recipient_name`
|
|
142
|
+
- `recipient_phone`
|
|
143
|
+
- `recipient_address`
|
|
144
|
+
- `recipient_ward`
|
|
145
|
+
- `recipient_district`
|
|
146
|
+
- `recipient_province`
|
|
147
|
+
- `sender_name`
|
|
148
|
+
- `sender_address`
|
|
149
|
+
- `cod`
|
|
150
|
+
- `weight`
|
|
151
|
+
- `order_id`
|
|
152
|
+
|
|
153
|
+
Use `to_dict()` or `to_json()` to serialize the result.
|
|
154
|
+
|
|
155
|
+
## Exceptions
|
|
156
|
+
|
|
157
|
+
- `APIKeyError`: missing API key
|
|
158
|
+
- `PDFDownloadError`: PDF URL download failed
|
|
159
|
+
- `ExtractionError`: Claude response could not be parsed as JSON
|
|
160
|
+
|
|
161
|
+
## Package Structure
|
|
162
|
+
|
|
163
|
+
- `awb_extractor/extractor.py`: public `AWBExtractor` class
|
|
164
|
+
- `awb_extractor/models.py`: `AWBResult` dataclass
|
|
165
|
+
- `awb_extractor/exceptions.py`: package exceptions
|
|
166
|
+
|
|
167
|
+
## Publishing
|
|
168
|
+
|
|
169
|
+
GitHub Actions builds and publishes the package to PyPI on every push to `main`.
|
|
170
|
+
|
|
171
|
+
The repository must define this GitHub secret:
|
|
172
|
+
|
|
173
|
+
```text
|
|
174
|
+
PYPI_API_TOKEN
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
PyPI does not allow replacing an existing version. If a commit on `main` does not
|
|
178
|
+
bump `project.version` in `pyproject.toml`, the publish step skips the existing
|
|
179
|
+
distribution.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
awb_extractor/__init__.py
|
|
4
|
+
awb_extractor/exceptions.py
|
|
5
|
+
awb_extractor/extractor.py
|
|
6
|
+
awb_extractor/models.py
|
|
7
|
+
awb_extractor.egg-info/PKG-INFO
|
|
8
|
+
awb_extractor.egg-info/SOURCES.txt
|
|
9
|
+
awb_extractor.egg-info/dependency_links.txt
|
|
10
|
+
awb_extractor.egg-info/requires.txt
|
|
11
|
+
awb_extractor.egg-info/top_level.txt
|
|
12
|
+
tests/test_extractor.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
awb_extractor
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "awb-extractor"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Extract recipient address from AWB/shipping label PDF using Claude AI"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
dependencies = [
|
|
13
|
+
"anthropic>=0.40.0",
|
|
14
|
+
"httpx>=0.27.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.optional-dependencies]
|
|
18
|
+
dev = [
|
|
19
|
+
"pytest>=8.0",
|
|
20
|
+
"pytest-mock>=3.14",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[tool.setuptools.packages.find]
|
|
24
|
+
where = ["."]
|
|
25
|
+
include = ["awb_extractor*"]
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pytest
|
|
3
|
+
from unittest.mock import MagicMock, patch
|
|
4
|
+
|
|
5
|
+
from awb_extractor import AWBExtractor, AWBResult
|
|
6
|
+
from awb_extractor.exceptions import APIKeyError, ExtractionError, PDFDownloadError
|
|
7
|
+
|
|
8
|
+
# Placeholder PDF payload; the tests never parse it, they only pass it along.
MOCK_PDF = b"%PDF-1.4 fake content"
# Extraction payload the mocked Claude client returns, keyed by AWBResult
# field names.
MOCK_RESPONSE = {
    "tracking_number": "NHSVC972103440",
    "recipient_name": "Nguyễn Thanh Huyền",
    "recipient_phone": "(+84)03******37",
    "recipient_address": "Trường Mầm Non Trung Chính, 237HNC19",
    "recipient_ward": "Xã Trung Chính",
    "recipient_district": "Huyện Nông Cống",
    "recipient_province": "Thanh Hóa",
    "sender_name": "BZU BZU VIỆT NAM",
    "sender_address": "Số 3A đường An Phú Đông 13, Q.12, TP.HCM",
    "cod": "Được đồng kiểm",
    "weight": "0.700 KG",
    "order_id": "584425059595159079",
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def make_extractor():
    """Return an ``AWBExtractor`` built with a dummy API key."""
    extractor = AWBExtractor(api_key="sk-ant-test")
    return extractor
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def mock_claude_response(data: dict):
    """Return a fake message whose single content block holds ``data`` as JSON text."""
    text_block = MagicMock(text=json.dumps(data))
    message = MagicMock()
    message.content = [text_block]
    return message
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ------------------------------------------------------------------
|
|
36
|
+
# AWBResult
|
|
37
|
+
# ------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
def test_awb_result_from_dict():
    """``from_dict`` copies known keys onto the matching attributes."""
    parsed = AWBResult.from_dict(MOCK_RESPONSE)
    expected = {
        "tracking_number": "NHSVC972103440",
        "recipient_name": "Nguyễn Thanh Huyền",
        "recipient_province": "Thanh Hóa",
    }
    for attr, value in expected.items():
        assert getattr(parsed, attr) == value
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_awb_result_ignores_unknown_keys():
    """Keys that are not dataclass fields never become attributes."""
    payload = dict(MOCK_RESPONSE, unknown_field="should be ignored")
    parsed = AWBResult.from_dict(payload)
    assert not hasattr(parsed, "unknown_field")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_awb_result_to_dict():
    """``to_dict`` exposes field values under their field names."""
    as_dict = AWBResult.from_dict(MOCK_RESPONSE).to_dict()
    assert as_dict["tracking_number"] == "NHSVC972103440"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_awb_result_to_json():
    """``to_json`` output parses back to the original field values."""
    serialized = AWBResult.from_dict(MOCK_RESPONSE).to_json()
    round_tripped = json.loads(serialized)
    assert round_tripped["order_id"] == "584425059595159079"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_awb_result_empty_string_becomes_none():
    """An empty string in the payload is stored as ``None``."""
    payload = dict(MOCK_RESPONSE, cod="")
    parsed = AWBResult.from_dict(payload)
    assert parsed.cod is None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ------------------------------------------------------------------
|
|
71
|
+
# AWBExtractor init
|
|
72
|
+
# ------------------------------------------------------------------
|
|
73
|
+
|
|
74
|
+
def test_raises_if_no_api_key():
    """Constructing an extractor with an empty key raises ``APIKeyError``."""
    empty_key = ""
    with pytest.raises(APIKeyError):
        AWBExtractor(api_key=empty_key)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ------------------------------------------------------------------
|
|
80
|
+
# from_bytes
|
|
81
|
+
# ------------------------------------------------------------------
|
|
82
|
+
|
|
83
|
+
def test_from_bytes_returns_awb_result():
    """``from_bytes`` turns a JSON reply from the client into an ``AWBResult``."""
    extractor = make_extractor()
    claude_reply = mock_claude_response(MOCK_RESPONSE)
    with patch.object(extractor._client.messages, "create", return_value=claude_reply):
        outcome = extractor.from_bytes(MOCK_PDF)
    assert isinstance(outcome, AWBResult)
    assert outcome.tracking_number == "NHSVC972103440"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_from_bytes_raises_extraction_error_on_bad_json():
    """A reply that is not JSON surfaces as ``ExtractionError``."""
    extractor = make_extractor()
    garbage_reply = MagicMock()
    garbage_reply.content = [MagicMock(text="not json at all")]
    create_patch = patch.object(
        extractor._client.messages, "create", return_value=garbage_reply
    )
    with create_patch, pytest.raises(ExtractionError):
        extractor.from_bytes(MOCK_PDF)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# ------------------------------------------------------------------
|
|
101
|
+
# from_file
|
|
102
|
+
# ------------------------------------------------------------------
|
|
103
|
+
|
|
104
|
+
def test_from_file(tmp_path):
    """``from_file`` reads the PDF from disk and extracts from its bytes."""
    label_path = tmp_path / "test.pdf"
    label_path.write_bytes(MOCK_PDF)
    extractor = make_extractor()
    claude_reply = mock_claude_response(MOCK_RESPONSE)
    with patch.object(extractor._client.messages, "create", return_value=claude_reply):
        outcome = extractor.from_file(str(label_path))
    assert outcome.recipient_name == "Nguyễn Thanh Huyền"
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# ------------------------------------------------------------------
|
|
114
|
+
# from_url
|
|
115
|
+
# ------------------------------------------------------------------
|
|
116
|
+
|
|
117
|
+
def test_from_url_success():
    """``from_url`` downloads the PDF and returns the extracted result."""
    extractor = make_extractor()
    claude_reply = mock_claude_response(MOCK_RESPONSE)
    http_patch = patch("awb_extractor.extractor.httpx.get")
    create_patch = patch.object(
        extractor._client.messages, "create", return_value=claude_reply
    )
    with http_patch as fake_get, create_patch:
        fake_get.return_value = MagicMock(content=MOCK_PDF, raise_for_status=lambda: None)
        outcome = extractor.from_url("https://example.com/awb.pdf")
    assert outcome.order_id == "584425059595159079"
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def test_from_url_raises_download_error_on_http_error():
    """An HTTP status error becomes ``PDFDownloadError`` carrying the status code."""
    import httpx

    extractor = make_extractor()
    forbidden = MagicMock()
    forbidden.status_code = 403
    status_error = httpx.HTTPStatusError(
        "403", request=MagicMock(), response=forbidden
    )
    with patch("awb_extractor.extractor.httpx.get") as fake_get:
        fake_get.return_value.raise_for_status.side_effect = status_error
        with pytest.raises(PDFDownloadError) as caught:
            extractor.from_url("https://example.com/awb.pdf")
    assert caught.value.status_code == 403
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# ------------------------------------------------------------------
|
|
141
|
+
# from_urls (batch)
|
|
142
|
+
# ------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
def test_from_urls_handles_mixed_results():
    """A failing URL yields an error entry without aborting the batch."""
    import httpx

    extractor = make_extractor()

    def fake_get(url, **kwargs):
        # URLs containing "good" download fine; everything else answers 404.
        response = MagicMock()
        if "good" not in url:
            not_found = MagicMock(status_code=404)
            response.raise_for_status.side_effect = httpx.HTTPStatusError(
                "404", request=MagicMock(), response=not_found
            )
            return response
        response.content = MOCK_PDF
        response.raise_for_status = lambda: None
        return response

    claude_reply = mock_claude_response(MOCK_RESPONSE)
    http_patch = patch("awb_extractor.extractor.httpx.get", side_effect=fake_get)
    create_patch = patch.object(
        extractor._client.messages, "create", return_value=claude_reply
    )
    with http_patch, create_patch:
        results = extractor.from_urls([
            "https://example.com/good.pdf",
            "https://example.com/bad.pdf",
        ])

    good, bad = results[0], results[1]
    assert good["data"] is not None
    assert good["error"] is None
    assert bad["data"] is None
    assert bad["error"] is not None
|