awb-extractor 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awb_extractor/__init__.py +18 -0
- awb_extractor/exceptions.py +22 -0
- awb_extractor/extractor.py +129 -0
- awb_extractor/models.py +37 -0
- awb_extractor-0.1.1.dist-info/METADATA +179 -0
- awb_extractor-0.1.1.dist-info/RECORD +8 -0
- awb_extractor-0.1.1.dist-info/WHEEL +5 -0
- awb_extractor-0.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from .extractor import AWBExtractor
|
|
2
|
+
from .models import AWBResult
|
|
3
|
+
from .exceptions import (
|
|
4
|
+
AWBExtractorError,
|
|
5
|
+
APIKeyError,
|
|
6
|
+
PDFDownloadError,
|
|
7
|
+
ExtractionError,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
# Package version string; the distribution METADATA declares the same value.
__version__ = "0.1.1"
# Names re-exported as the package's public API: the extractor class, the
# result model, and the package's exception classes.
__all__ = [
    "AWBExtractor",
    "AWBResult",
    "AWBExtractorError",
    "APIKeyError",
    "PDFDownloadError",
    "ExtractionError",
]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
class AWBExtractorError(Exception):
    """Root of the package's custom exception hierarchy."""
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class APIKeyError(AWBExtractorError):
    """Signals that no usable API key was supplied."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PDFDownloadError(AWBExtractorError):
    """Raised when a PDF cannot be fetched from a URL.

    Attributes:
        url: The URL whose download failed.
        status_code: The status code supplied by the raiser, or ``None``
            when none was given.
    """

    def __init__(
        self,
        url: str,
        status_code: int | None = None,
        message: str | None = None,
    ):
        """Record the failure details and build the error message.

        Args:
            url: The URL that was being downloaded.
            status_code: HTTP status code of the failed response, if known.
            message: Free-form failure description; used only when no
                status code is given.
        """
        self.url = url
        self.status_code = status_code
        # Compare against None rather than relying on truthiness so that any
        # supplied status code is reported instead of being treated as absent.
        if status_code is not None:
            detail = f"status={status_code}"
        else:
            detail = message or "unknown error"
        super().__init__(f"Failed to download PDF from {url!r}: {detail}")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ExtractionError(AWBExtractorError):
    """Raised when the model's reply cannot be parsed as JSON.

    Attributes:
        raw_response: The reply text that failed to parse, or ``None`` when
            no text was supplied.
    """

    def __init__(self, raw_response: str | None = None):
        """Store the offending reply and build the error message.

        Args:
            raw_response: The text that could not be parsed, if available.
        """
        self.raw_response = raw_response
        super().__init__(
            f"Failed to parse Claude response as JSON. Raw: {raw_response!r}"
        )
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import anthropic
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from .exceptions import APIKeyError, ExtractionError, PDFDownloadError
|
|
10
|
+
from .models import AWBResult
|
|
11
|
+
|
|
12
|
+
_SYSTEM_PROMPT = """You are an OCR system that extracts information from shipping labels (AWB).
|
|
13
|
+
Return only raw JSON, no explanation, no markdown. Required format:
|
|
14
|
+
{
|
|
15
|
+
"tracking_number": "",
|
|
16
|
+
"recipient_name": "",
|
|
17
|
+
"recipient_phone": "",
|
|
18
|
+
"recipient_address": "",
|
|
19
|
+
"recipient_ward": "",
|
|
20
|
+
"recipient_district": "",
|
|
21
|
+
"recipient_province": "",
|
|
22
|
+
"sender_name": "",
|
|
23
|
+
"sender_address": "",
|
|
24
|
+
"cod": "",
|
|
25
|
+
"weight": "",
|
|
26
|
+
"order_id": ""
|
|
27
|
+
}"""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AWBExtractor:
    """Extract structured shipping-label (AWB) data from PDF documents.

    A PDF may be supplied as raw bytes, a local file path, or a URL. The PDF
    is base64-encoded and sent to the Anthropic Messages API as a document
    block; the JSON reply is converted into an ``AWBResult``.
    """

    DEFAULT_MODEL = "claude-haiku-4-5-20251001"

    def __init__(
        self,
        api_key: str,
        http_headers: Optional[dict] = None,
        model: str = DEFAULT_MODEL,
        timeout: int = 30,
    ):
        """Create an extractor.

        Args:
            api_key: Anthropic API key used to build the client.
            http_headers: Default headers sent with every PDF download.
            model: Model identifier passed to the Messages API.
            timeout: Timeout passed to ``httpx.get`` for PDF downloads. It is
                not applied to the Anthropic client.

        Raises:
            APIKeyError: If ``api_key`` is empty or ``None``.
        """
        if not api_key:
            raise APIKeyError("api_key is required.")
        self._client = anthropic.Anthropic(api_key=api_key)
        self._http_headers = http_headers or {}
        self._model = model
        self._timeout = timeout

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def from_bytes(self, pdf_bytes: bytes) -> AWBResult:
        """Extract label data from in-memory PDF bytes.

        Raises:
            ExtractionError: If the model reply is not a JSON object.
        """
        raw_json = self._call_claude(pdf_bytes)
        return AWBResult.from_dict(raw_json)

    def from_file(self, pdf_path: str) -> AWBResult:
        """Read the PDF at ``pdf_path`` and extract label data from it."""
        pdf_bytes = Path(pdf_path).read_bytes()
        return self.from_bytes(pdf_bytes)

    def from_url(self, url: str, extra_headers: Optional[dict] = None) -> AWBResult:
        """Download the PDF at ``url`` and extract label data from it.

        Args:
            url: Location of the PDF.
            extra_headers: Per-request headers; these override the default
                headers given to the constructor when keys collide.

        Raises:
            PDFDownloadError: If the download fails.
            ExtractionError: If the model reply is not a JSON object.
        """
        pdf_bytes = self._download(url, extra_headers)
        return self.from_bytes(pdf_bytes)

    def from_urls(self, urls: list[str]) -> list[dict]:
        """Extract label data from several URLs, one after another.

        A failure on one URL does not stop the batch.

        Returns:
            One dict per input URL, in input order, with the keys ``url``,
            ``data`` (an ``AWBResult``, or ``None`` on failure) and ``error``
            (a non-empty message, or ``None`` on success).
        """
        results = []
        for url in urls:
            try:
                data = self.from_url(url)
            except Exception as exc:
                # Deliberate best-effort boundary: any failure is recorded
                # against its URL so the remaining URLs are still processed.
                # Fall back to the exception class name so a failure never
                # produces an empty (falsy) error string.
                results.append({
                    "url": url,
                    "data": None,
                    "error": str(exc) or type(exc).__name__,
                })
            else:
                results.append({
                    "url": url,
                    "data": data,
                    "error": None,
                })
        return results

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _download(self, url: str, extra_headers: Optional[dict] = None) -> bytes:
        """Fetch ``url`` and return the response body.

        Raises:
            PDFDownloadError: On an HTTP error status or a request failure.
        """
        # Per-request headers take precedence over the instance defaults.
        headers = {**self._http_headers, **(extra_headers or {})}
        try:
            resp = httpx.get(url, headers=headers, follow_redirects=True, timeout=self._timeout)
            resp.raise_for_status()
            return resp.content
        except httpx.HTTPStatusError as e:
            raise PDFDownloadError(url, status_code=e.response.status_code) from e
        except httpx.RequestError as e:
            raise PDFDownloadError(url, message=str(e)) from e

    def _call_claude(self, pdf_bytes: bytes) -> dict:
        """Send the PDF to the model and return the parsed JSON object.

        Raises:
            ExtractionError: If the reply contains no text, the text is not
                valid JSON, or the JSON value is not an object.
        """
        pdf_base64 = base64.standard_b64encode(pdf_bytes).decode("utf-8")

        response = self._client.messages.create(
            model=self._model,
            max_tokens=1024,
            system=_SYSTEM_PROMPT,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "document",
                            "source": {
                                "type": "base64",
                                "media_type": "application/pdf",
                                "data": pdf_base64,
                            },
                        },
                        {
                            "type": "text",
                            "text": "Extract all information from this shipping label.",
                        },
                    ],
                }
            ],
        )

        # Use the first content block that carries text instead of indexing
        # content[0] blindly, which fails on an empty or non-text reply.
        raw = next(
            (
                block.text
                for block in response.content
                if isinstance(getattr(block, "text", None), str)
            ),
            None,
        )
        if raw is None:
            raise ExtractionError(raw_response=None)

        # The prompt asks for bare JSON, but strip Markdown code fences in
        # case the model adds them anyway.
        raw = raw.strip()
        raw = raw.replace("```json", "").replace("```", "").strip()

        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError as e:
            raise ExtractionError(raw_response=raw) from e

        # Valid JSON that is not an object (a list, string, number, ...)
        # cannot be mapped onto AWBResult fields; report it the same way.
        if not isinstance(parsed, dict):
            raise ExtractionError(raw_response=raw)
        return parsed
|
awb_extractor/models.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from dataclasses import dataclass, field, asdict
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class AWBResult:
    """Shipping-label fields extracted from an AWB document.

    Every field is optional and defaults to ``None``, meaning the value was
    not present in the source data.
    """

    tracking_number: Optional[str] = None
    recipient_name: Optional[str] = None
    recipient_phone: Optional[str] = None
    recipient_address: Optional[str] = None
    recipient_ward: Optional[str] = None
    recipient_district: Optional[str] = None
    recipient_province: Optional[str] = None
    sender_name: Optional[str] = None
    sender_address: Optional[str] = None
    cod: Optional[str] = None
    weight: Optional[str] = None
    order_id: Optional[str] = None

    @staticmethod
    def _normalize(value):
        """Map empty values to ``None`` while keeping numeric zero."""
        # A number such as 0 is real data (e.g. a COD amount of zero) and
        # must not be discarded just because it is falsy. bool is excluded
        # so it keeps following the generic falsy rule below.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return value
        return value or None

    @classmethod
    def from_dict(cls, data: dict) -> "AWBResult":
        """Build a result from a mapping, ignoring keys that are not fields.

        Empty values (such as ``""`` or ``None``) are stored as ``None``;
        numeric values, including zero, are kept as given.
        """
        valid_keys = cls.__dataclass_fields__.keys()
        filtered = {
            key: cls._normalize(value)
            for key, value in data.items()
            if key in valid_keys
        }
        return cls(**filtered)

    def to_dict(self) -> dict:
        """Return all fields, including unset ones, as a plain dict."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize all fields to JSON, leaving non-ASCII text unescaped."""
        return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent)

    def __repr__(self) -> str:
        """Return a compact representation listing only fields that are set."""
        fields = ", ".join(
            f"{k}={v!r}" for k, v in self.to_dict().items() if v is not None
        )
        return f"AWBResult({fields})"
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: awb-extractor
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Extract recipient address from AWB/shipping label PDF using Claude AI
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: anthropic>=0.40.0
|
|
9
|
+
Requires-Dist: httpx>=0.27.0
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
12
|
+
Requires-Dist: pytest-mock>=3.14; extra == "dev"
|
|
13
|
+
|
|
14
|
+
# AWB Extractor
|
|
15
|
+
|
|
16
|
+
Python SDK for extracting recipient, sender, and shipment information from AWB/shipping
|
|
17
|
+
label PDF files using Claude AI.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- Extract from PDF bytes, local PDF files, or PDF URLs
|
|
22
|
+
- Batch extraction from multiple URLs
|
|
23
|
+
- Optional default HTTP headers for protected AWB URLs
|
|
24
|
+
- Typed `AWBResult` dataclass output
|
|
25
|
+
- Custom exceptions for API key, PDF download, and JSON parsing failures
|
|
26
|
+
|
|
27
|
+
## Requirements
|
|
28
|
+
|
|
29
|
+
- Python 3.10+
|
|
30
|
+
- Anthropic API key
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
Install from PyPI:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install awb-extractor
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
For local development:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install -e ".[dev]"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from awb_extractor import AWBExtractor
|
|
50
|
+
|
|
51
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
52
|
+
result = extractor.from_file("label.pdf")
|
|
53
|
+
|
|
54
|
+
print(result.recipient_name)
|
|
55
|
+
print(result.to_dict())
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Example result:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
{
|
|
62
|
+
"tracking_number": "NHSVC972103440",
|
|
63
|
+
"recipient_name": "Nguyen Van A",
|
|
64
|
+
"recipient_phone": "(+84)03******37",
|
|
65
|
+
"recipient_address": "237 Nguyen Trai",
|
|
66
|
+
"recipient_ward": "Phuong Ben Thanh",
|
|
67
|
+
"recipient_district": "Quan 1",
|
|
68
|
+
"recipient_province": "TP. Ho Chi Minh",
|
|
69
|
+
"sender_name": "Onflow",
|
|
70
|
+
"sender_address": "TP. Ho Chi Minh",
|
|
71
|
+
"cod": "0",
|
|
72
|
+
"weight": "0.700 KG",
|
|
73
|
+
"order_id": "584425059595159079",
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Supported Inputs
|
|
78
|
+
|
|
79
|
+
### PDF bytes
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from awb_extractor import AWBExtractor
|
|
83
|
+
|
|
84
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
85
|
+
|
|
86
|
+
with open("label.pdf", "rb") as file:
|
|
87
|
+
result = extractor.from_bytes(file.read())
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Local PDF file
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from awb_extractor import AWBExtractor
|
|
94
|
+
|
|
95
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
96
|
+
result = extractor.from_file("label.pdf")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### PDF URL
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from awb_extractor import AWBExtractor
|
|
103
|
+
|
|
104
|
+
extractor = AWBExtractor(
|
|
105
|
+
api_key="sk-ant-...",
|
|
106
|
+
http_headers={"Authorization": "Bearer token"},
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
result = extractor.from_url("https://example.com/awb.pdf")
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
You can pass request-specific headers with `extra_headers`:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
result = extractor.from_url(
|
|
116
|
+
"https://example.com/awb.pdf",
|
|
117
|
+
extra_headers={"X-Request-ID": "request-123"},
|
|
118
|
+
)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Multiple URLs
|
|
122
|
+
|
|
123
|
+
`from_urls()` returns a list of dictionaries with the keys `url`, `data` (an `AWBResult`, or `None` on failure), and `error` (the error message, or `None` on success).
|
|
124
|
+
Failed URLs do not stop the whole batch.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from awb_extractor import AWBExtractor
|
|
128
|
+
|
|
129
|
+
extractor = AWBExtractor(api_key="sk-ant-...")
|
|
130
|
+
results = extractor.from_urls([
|
|
131
|
+
"https://example.com/good.pdf",
|
|
132
|
+
"https://example.com/bad.pdf",
|
|
133
|
+
])
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Result Fields
|
|
137
|
+
|
|
138
|
+
`AWBResult` includes:
|
|
139
|
+
|
|
140
|
+
- `tracking_number`
|
|
141
|
+
- `recipient_name`
|
|
142
|
+
- `recipient_phone`
|
|
143
|
+
- `recipient_address`
|
|
144
|
+
- `recipient_ward`
|
|
145
|
+
- `recipient_district`
|
|
146
|
+
- `recipient_province`
|
|
147
|
+
- `sender_name`
|
|
148
|
+
- `sender_address`
|
|
149
|
+
- `cod`
|
|
150
|
+
- `weight`
|
|
151
|
+
- `order_id`
|
|
152
|
+
|
|
153
|
+
Use `to_dict()` or `to_json()` to serialize the result.
|
|
154
|
+
|
|
155
|
+
## Exceptions
|
|
156
|
+
|
|
157
|
+
- `APIKeyError`: missing API key
|
|
158
|
+
- `PDFDownloadError`: PDF URL download failed
|
|
159
|
+
- `ExtractionError`: Claude response could not be parsed as JSON
|
|
160
|
+
|
|
161
|
+
## Package Structure
|
|
162
|
+
|
|
163
|
+
- `awb_extractor/extractor.py`: public `AWBExtractor` class
|
|
164
|
+
- `awb_extractor/models.py`: `AWBResult` dataclass
|
|
165
|
+
- `awb_extractor/exceptions.py`: package exceptions
|
|
166
|
+
|
|
167
|
+
## Publishing
|
|
168
|
+
|
|
169
|
+
GitHub Actions builds and publishes the package to PyPI on every push to `main`.
|
|
170
|
+
|
|
171
|
+
The repository must define this GitHub secret:
|
|
172
|
+
|
|
173
|
+
```text
|
|
174
|
+
PYPI_API_TOKEN
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
PyPI does not allow replacing an existing version. If a commit on `main` does not
|
|
178
|
+
bump `project.version` in `pyproject.toml`, the publish step skips the existing
|
|
179
|
+
distribution.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
awb_extractor/__init__.py,sha256=GfsBdTsmNHcNuKKmAo4wgQEaBjZsB5TYoUNhlu4y41w,342
|
|
2
|
+
awb_extractor/exceptions.py,sha256=PRf0h1WLyxmcoSbaEyQtmrnx8Nz-ZifaZMW9f-PA1ic,693
|
|
3
|
+
awb_extractor/extractor.py,sha256=GJb5XRbRLmgZgKW3jhRHgs7oM04T9fkoHYp75X7oaLI,4250
|
|
4
|
+
awb_extractor/models.py,sha256=cY3T1hcGpOzZKaEma_eevsAXKCndNe1GXOMpqq4JTvg,1224
|
|
5
|
+
awb_extractor-0.1.1.dist-info/METADATA,sha256=AnhxwhUFBi0xqLxC0sCCD6rYZb1M6Qk0bjrTwdK_8pg,3830
|
|
6
|
+
awb_extractor-0.1.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
awb_extractor-0.1.1.dist-info/top_level.txt,sha256=OjoN1DneQqcJc6-VrQC53k8s0JvGCSoMPD73t5YGpig,14
|
|
8
|
+
awb_extractor-0.1.1.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
awb_extractor
|