awb-extractor 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ from .extractor import AWBExtractor
2
+ from .models import AWBResult
3
+ from .exceptions import (
4
+ AWBExtractorError,
5
+ APIKeyError,
6
+ PDFDownloadError,
7
+ ExtractionError,
8
+ )
9
+
10
+ __version__ = "0.1.1"
11
+ __all__ = [
12
+ "AWBExtractor",
13
+ "AWBResult",
14
+ "AWBExtractorError",
15
+ "APIKeyError",
16
+ "PDFDownloadError",
17
+ "ExtractionError",
18
+ ]
@@ -0,0 +1,22 @@
1
+ class AWBExtractorError(Exception):
2
+ pass
3
+
4
+
5
+ class APIKeyError(AWBExtractorError):
6
+ pass
7
+
8
+
9
+ class PDFDownloadError(AWBExtractorError):
10
+ def __init__(self, url: str, status_code: int = None, message: str = None):
11
+ self.url = url
12
+ self.status_code = status_code
13
+ detail = f"status={status_code}" if status_code else message or "unknown error"
14
+ super().__init__(f"Failed to download PDF from {url!r}: {detail}")
15
+
16
+
17
+ class ExtractionError(AWBExtractorError):
18
+ def __init__(self, raw_response: str = None):
19
+ self.raw_response = raw_response
20
+ super().__init__(
21
+ f"Failed to parse Claude response as JSON. Raw: {raw_response!r}"
22
+ )
@@ -0,0 +1,129 @@
1
+ import base64
2
+ import json
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ import anthropic
7
+ import httpx
8
+
9
+ from .exceptions import APIKeyError, ExtractionError, PDFDownloadError
10
+ from .models import AWBResult
11
+
12
+ _SYSTEM_PROMPT = """You are an OCR system that extracts information from shipping labels (AWB).
13
+ Return only raw JSON, no explanation, no markdown. Required format:
14
+ {
15
+ "tracking_number": "",
16
+ "recipient_name": "",
17
+ "recipient_phone": "",
18
+ "recipient_address": "",
19
+ "recipient_ward": "",
20
+ "recipient_district": "",
21
+ "recipient_province": "",
22
+ "sender_name": "",
23
+ "sender_address": "",
24
+ "cod": "",
25
+ "weight": "",
26
+ "order_id": ""
27
+ }"""
28
+
29
+
30
class AWBExtractor:
    """Extract shipping-label (AWB) fields from PDF documents using Claude.

    A PDF can be supplied as raw bytes, a local file path, or a URL. The PDF
    is sent base64-encoded to the Anthropic Messages API together with
    ``_SYSTEM_PROMPT``, and the JSON reply is converted into an ``AWBResult``.
    """

    DEFAULT_MODEL = "claude-haiku-4-5-20251001"

    def __init__(
        self,
        api_key: str,
        http_headers: Optional[dict] = None,
        model: str = DEFAULT_MODEL,
        timeout: int = 30,
    ):
        """Create an extractor.

        Args:
            api_key: Anthropic API key; must be non-empty.
            http_headers: Default headers sent with every PDF download.
            model: Model identifier passed to the Messages API.
            timeout: Timeout passed to ``httpx.get`` when downloading PDFs.
                It is not applied to the Anthropic client.

        Raises:
            APIKeyError: If ``api_key`` is empty or ``None``.
        """
        if not api_key:
            raise APIKeyError("api_key is required.")
        self._client = anthropic.Anthropic(api_key=api_key)
        self._http_headers = http_headers or {}
        self._model = model
        self._timeout = timeout

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def from_bytes(self, pdf_bytes: bytes) -> "AWBResult":
        """Extract label fields from an in-memory PDF.

        Raises:
            ExtractionError: If the model reply contains no text or is not a
                JSON object.
        """
        raw_json = self._call_claude(pdf_bytes)
        return AWBResult.from_dict(raw_json)

    def from_file(self, pdf_path: str) -> "AWBResult":
        """Read the PDF at ``pdf_path`` and extract its label fields."""
        pdf_bytes = Path(pdf_path).read_bytes()
        return self.from_bytes(pdf_bytes)

    def from_url(self, url: str, extra_headers: Optional[dict] = None) -> "AWBResult":
        """Download the PDF at ``url`` and extract its label fields.

        Args:
            url: Location of the PDF.
            extra_headers: Per-request headers; they override the instance's
                default headers on key collisions.

        Raises:
            PDFDownloadError: If the download fails.
            ExtractionError: If the model reply cannot be parsed.
        """
        pdf_bytes = self._download(url, extra_headers)
        return self.from_bytes(pdf_bytes)

    def from_urls(
        self,
        urls: list[str],
        extra_headers: Optional[dict] = None,
    ) -> list[dict]:
        """Extract label fields from several URLs without aborting on failure.

        Args:
            urls: PDF locations, processed in order.
            extra_headers: Optional per-request headers forwarded to every
                download, exactly as in ``from_url``.

        Returns:
            One dict per URL, in input order, with the keys ``url``, ``data``
            (the ``AWBResult``, or ``None`` on failure) and ``error`` (the
            error message, or ``None`` on success).
        """
        results = []
        for url in urls:
            try:
                data = self.from_url(url, extra_headers)
            except Exception as exc:
                # Deliberate best-effort batch: record the failure and go on.
                results.append({"url": url, "data": None, "error": str(exc)})
            else:
                results.append({"url": url, "data": data, "error": None})
        return results

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _download(self, url: str, extra_headers: Optional[dict] = None) -> bytes:
        """Fetch ``url`` (following redirects) and return the response body.

        Raises:
            PDFDownloadError: On an HTTP error status or a request failure.
        """
        # Per-request headers win over the instance defaults.
        headers = {**self._http_headers, **(extra_headers or {})}
        try:
            resp = httpx.get(
                url,
                headers=headers,
                follow_redirects=True,
                timeout=self._timeout,
            )
            resp.raise_for_status()
            return resp.content
        except httpx.HTTPStatusError as e:
            raise PDFDownloadError(url, status_code=e.response.status_code) from e
        except httpx.RequestError as e:
            raise PDFDownloadError(url, message=str(e)) from e

    def _call_claude(self, pdf_bytes: bytes) -> dict:
        """Send the PDF to the Messages API and return the parsed JSON object.

        Raises:
            ExtractionError: If the reply has no text block or its text is not
                a JSON object.
        """
        pdf_base64 = base64.standard_b64encode(pdf_bytes).decode("utf-8")

        response = self._client.messages.create(
            model=self._model,
            max_tokens=1024,
            system=_SYSTEM_PROMPT,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "document",
                            "source": {
                                "type": "base64",
                                "media_type": "application/pdf",
                                "data": pdf_base64,
                            },
                        },
                        {
                            "type": "text",
                            "text": "Extract all information from this shipping label.",
                        },
                    ],
                }
            ],
        )

        return self._parse_json_object(self._response_text(response))

    @staticmethod
    def _response_text(response) -> str:
        """Return the concatenated text of all text blocks in ``response``.

        Raises:
            ExtractionError: If the response holds no text block at all.
        """
        # Do not assume the first block is text: the list may be empty or may
        # start with a block that has no ``text`` attribute.
        parts = [
            block.text
            for block in (getattr(response, "content", None) or [])
            if getattr(block, "type", None) == "text"
        ]
        if not parts:
            raise ExtractionError(raw_response=None)
        return "".join(parts).strip()

    @staticmethod
    def _parse_json_object(raw: str) -> dict:
        """Parse ``raw`` as a JSON object, tolerating a Markdown code fence.

        Only a fence that wraps the whole reply is removed, so backticks that
        appear inside JSON string values are preserved.

        Raises:
            ExtractionError: If the text is not valid JSON or is not an object.
        """
        text = raw.strip()
        if text.startswith("```"):
            text = text[3:]
            # Optional language tag directly after the opening fence.
            if text[:4].lower() == "json":
                text = text[4:]
        if text.endswith("```"):
            text = text[:-3]
        text = text.strip()

        try:
            parsed = json.loads(text)
        except json.JSONDecodeError as e:
            raise ExtractionError(raw_response=text) from e
        # A list or scalar is valid JSON but cannot populate an AWBResult.
        if not isinstance(parsed, dict):
            raise ExtractionError(raw_response=text)
        return parsed
@@ -0,0 +1,37 @@
1
+ from dataclasses import dataclass, field, asdict
2
+ from typing import Optional
3
+ import json
4
+
5
+
6
@dataclass
class AWBResult:
    """Fields extracted from one AWB / shipping label.

    Every field defaults to ``None``, which marks a value that was missing or
    empty in the source data.
    """

    tracking_number: Optional[str] = None
    recipient_name: Optional[str] = None
    recipient_phone: Optional[str] = None
    recipient_address: Optional[str] = None
    recipient_ward: Optional[str] = None
    recipient_district: Optional[str] = None
    recipient_province: Optional[str] = None
    sender_name: Optional[str] = None
    sender_address: Optional[str] = None
    cod: Optional[str] = None
    weight: Optional[str] = None
    order_id: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict) -> "AWBResult":
        """Build a result from a mapping, ignoring keys that are not fields.

        Empty values (``""``, ``None``) become ``None``. A numeric ``0`` is
        kept, because it is a real value rather than a missing one.
        """
        valid_keys = cls.__dataclass_fields__.keys()
        filtered = {
            key: cls._normalize(value)
            for key, value in data.items()
            if key in valid_keys
        }
        return cls(**filtered)

    @staticmethod
    def _normalize(value):
        """Map empty values to ``None`` while preserving numbers such as 0."""
        # bool is a subclass of int; keep treating False as "missing".
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return value
        return value or None

    def to_dict(self) -> dict:
        """Return all fields, including ``None`` ones, as a plain dict."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize the fields to JSON without escaping non-ASCII characters."""
        return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent)

    def __repr__(self) -> str:
        """Show only the fields that are set, to keep the repr short."""
        fields = ", ".join(
            f"{k}={v!r}" for k, v in self.to_dict().items() if v is not None
        )
        return f"AWBResult({fields})"
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.4
2
+ Name: awb-extractor
3
+ Version: 0.1.1
4
+ Summary: Extract recipient address from AWB/shipping label PDF using Claude AI
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: anthropic>=0.40.0
9
+ Requires-Dist: httpx>=0.27.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=8.0; extra == "dev"
12
+ Requires-Dist: pytest-mock>=3.14; extra == "dev"
13
+
14
+ # AWB Extractor
15
+
16
+ Python SDK for extracting receiver and shipment information from AWB/shipping
17
+ label PDF files using Claude AI.
18
+
19
+ ## Features
20
+
21
+ - Extract from PDF bytes, local PDF files, or PDF URLs
22
+ - Batch extraction from multiple URLs
23
+ - Optional default HTTP headers for protected AWB URLs
24
+ - Typed `AWBResult` dataclass output
25
+ - Custom exceptions for API key, PDF download, and JSON parsing failures
26
+
27
+ ## Requirements
28
+
29
+ - Python 3.10+
30
+ - Anthropic API key
31
+
32
+ ## Installation
33
+
34
+ Install from PyPI:
35
+
36
+ ```bash
37
+ pip install awb-extractor
38
+ ```
39
+
40
+ For local development:
41
+
42
+ ```bash
43
+ pip install -e ".[dev]"
44
+ ```
45
+
46
+ ## Usage
47
+
48
+ ```python
49
+ from awb_extractor import AWBExtractor
50
+
51
+ extractor = AWBExtractor(api_key="sk-ant-...")
52
+ result = extractor.from_file("label.pdf")
53
+
54
+ print(result.recipient_name)
55
+ print(result.to_dict())
56
+ ```
57
+
58
+ Example result:
59
+
60
+ ```python
61
+ {
62
+ "tracking_number": "NHSVC972103440",
63
+ "recipient_name": "Nguyen Van A",
64
+ "recipient_phone": "(+84)03******37",
65
+ "recipient_address": "237 Nguyen Trai",
66
+ "recipient_ward": "Phuong Ben Thanh",
67
+ "recipient_district": "Quan 1",
68
+ "recipient_province": "TP. Ho Chi Minh",
69
+ "sender_name": "Onflow",
70
+ "sender_address": "TP. Ho Chi Minh",
71
+ "cod": "0",
72
+ "weight": "0.700 KG",
73
+ "order_id": "584425059595159079",
74
+ }
75
+ ```
76
+
77
+ ## Supported Inputs
78
+
79
+ ### PDF bytes
80
+
81
+ ```python
82
+ from awb_extractor import AWBExtractor
83
+
84
+ extractor = AWBExtractor(api_key="sk-ant-...")
85
+
86
+ with open("label.pdf", "rb") as file:
87
+ result = extractor.from_bytes(file.read())
88
+ ```
89
+
90
+ ### Local PDF file
91
+
92
+ ```python
93
+ from awb_extractor import AWBExtractor
94
+
95
+ extractor = AWBExtractor(api_key="sk-ant-...")
96
+ result = extractor.from_file("label.pdf")
97
+ ```
98
+
99
+ ### PDF URL
100
+
101
+ ```python
102
+ from awb_extractor import AWBExtractor
103
+
104
+ extractor = AWBExtractor(
105
+ api_key="sk-ant-...",
106
+ http_headers={"Authorization": "Bearer token"},
107
+ )
108
+
109
+ result = extractor.from_url("https://example.com/awb.pdf")
110
+ ```
111
+
112
+ You can pass request-specific headers with `extra_headers`:
113
+
114
+ ```python
115
+ result = extractor.from_url(
116
+ "https://example.com/awb.pdf",
117
+ extra_headers={"X-Request-ID": "request-123"},
118
+ )
119
+ ```
120
+
121
+ ### Multiple URLs
122
+
123
+ `from_urls()` returns one dictionary per URL with the keys `url`, `data` (an
124
+ `AWBResult`, or `None` on failure), and `error` (the error message, or `None` on success). Failed URLs do not stop the whole batch.
125
+
126
+ ```python
127
+ from awb_extractor import AWBExtractor
128
+
129
+ extractor = AWBExtractor(api_key="sk-ant-...")
130
+ results = extractor.from_urls([
131
+ "https://example.com/good.pdf",
132
+ "https://example.com/bad.pdf",
133
+ ])
134
+ ```
135
+
136
+ ## Result Fields
137
+
138
+ `AWBResult` includes:
139
+
140
+ - `tracking_number`
141
+ - `recipient_name`
142
+ - `recipient_phone`
143
+ - `recipient_address`
144
+ - `recipient_ward`
145
+ - `recipient_district`
146
+ - `recipient_province`
147
+ - `sender_name`
148
+ - `sender_address`
149
+ - `cod`
150
+ - `weight`
151
+ - `order_id`
152
+
153
+ Use `to_dict()` or `to_json()` to serialize the result.
154
+
155
+ ## Exceptions
156
+
157
+ - `APIKeyError`: missing API key
158
+ - `PDFDownloadError`: PDF URL download failed
159
+ - `ExtractionError`: Claude response could not be parsed as JSON
160
+
161
+ ## Package Structure
162
+
163
+ - `awb_extractor/extractor.py`: public `AWBExtractor` class
164
+ - `awb_extractor/models.py`: `AWBResult` dataclass
165
+ - `awb_extractor/exceptions.py`: package exceptions
166
+
167
+ ## Publishing
168
+
169
+ GitHub Actions builds and publishes the package to PyPI on every push to `main`.
170
+
171
+ The repository must define this GitHub secret:
172
+
173
+ ```text
174
+ PYPI_API_TOKEN
175
+ ```
176
+
177
+ PyPI does not allow replacing an existing version. If a commit on `main` does not
178
+ bump `project.version` in `pyproject.toml`, the publish step skips the existing
179
+ distribution.
@@ -0,0 +1,8 @@
1
+ awb_extractor/__init__.py,sha256=GfsBdTsmNHcNuKKmAo4wgQEaBjZsB5TYoUNhlu4y41w,342
2
+ awb_extractor/exceptions.py,sha256=PRf0h1WLyxmcoSbaEyQtmrnx8Nz-ZifaZMW9f-PA1ic,693
3
+ awb_extractor/extractor.py,sha256=GJb5XRbRLmgZgKW3jhRHgs7oM04T9fkoHYp75X7oaLI,4250
4
+ awb_extractor/models.py,sha256=cY3T1hcGpOzZKaEma_eevsAXKCndNe1GXOMpqq4JTvg,1224
5
+ awb_extractor-0.1.1.dist-info/METADATA,sha256=AnhxwhUFBi0xqLxC0sCCD6rYZb1M6Qk0bjrTwdK_8pg,3830
6
+ awb_extractor-0.1.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
7
+ awb_extractor-0.1.1.dist-info/top_level.txt,sha256=OjoN1DneQqcJc6-VrQC53k8s0JvGCSoMPD73t5YGpig,14
8
+ awb_extractor-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ awb_extractor