awb-extractor 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.4
2
+ Name: awb-extractor
3
+ Version: 0.1.1
4
+ Summary: Extract recipient address from AWB/shipping label PDF using Claude AI
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: anthropic>=0.40.0
9
+ Requires-Dist: httpx>=0.27.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=8.0; extra == "dev"
12
+ Requires-Dist: pytest-mock>=3.14; extra == "dev"
13
+
14
+ # AWB Extractor
15
+
16
+ Python SDK for extracting receiver and shipment information from AWB/shipping
17
+ label PDF files using Claude AI.
18
+
19
+ ## Features
20
+
21
+ - Extract from PDF bytes, local PDF files, or PDF URLs
22
+ - Batch extraction from multiple URLs
23
+ - Optional default HTTP headers for protected AWB URLs
24
+ - Typed `AWBResult` dataclass output
25
+ - Custom exceptions for API key, PDF download, and JSON parsing failures
26
+
27
+ ## Requirements
28
+
29
+ - Python 3.10+
30
+ - Anthropic API key
31
+
32
+ ## Installation
33
+
34
+ Install from PyPI:
35
+
36
+ ```bash
37
+ pip install awb-extractor
38
+ ```
39
+
40
+ For local development:
41
+
42
+ ```bash
43
+ pip install -e ".[dev]"
44
+ ```
45
+
46
+ ## Usage
47
+
48
+ ```python
49
+ from awb_extractor import AWBExtractor
50
+
51
+ extractor = AWBExtractor(api_key="sk-ant-...")
52
+ result = extractor.from_file("label.pdf")
53
+
54
+ print(result.recipient_name)
55
+ print(result.to_dict())
56
+ ```
57
+
58
+ Example result:
59
+
60
+ ```python
61
+ {
62
+ "tracking_number": "NHSVC972103440",
63
+ "recipient_name": "Nguyen Van A",
64
+ "recipient_phone": "(+84)03******37",
65
+ "recipient_address": "237 Nguyen Trai",
66
+ "recipient_ward": "Phuong Ben Thanh",
67
+ "recipient_district": "Quan 1",
68
+ "recipient_province": "TP. Ho Chi Minh",
69
+ "sender_name": "Onflow",
70
+ "sender_address": "TP. Ho Chi Minh",
71
+ "cod": "0",
72
+ "weight": "0.700 KG",
73
+ "order_id": "584425059595159079",
74
+ }
75
+ ```
76
+
77
+ ## Supported Inputs
78
+
79
+ ### PDF bytes
80
+
81
+ ```python
82
+ from awb_extractor import AWBExtractor
83
+
84
+ extractor = AWBExtractor(api_key="sk-ant-...")
85
+
86
+ with open("label.pdf", "rb") as file:
87
+ result = extractor.from_bytes(file.read())
88
+ ```
89
+
90
+ ### Local PDF file
91
+
92
+ ```python
93
+ from awb_extractor import AWBExtractor
94
+
95
+ extractor = AWBExtractor(api_key="sk-ant-...")
96
+ result = extractor.from_file("label.pdf")
97
+ ```
98
+
99
+ ### PDF URL
100
+
101
+ ```python
102
+ from awb_extractor import AWBExtractor
103
+
104
+ extractor = AWBExtractor(
105
+ api_key="sk-ant-...",
106
+ http_headers={"Authorization": "Bearer token"},
107
+ )
108
+
109
+ result = extractor.from_url("https://example.com/awb.pdf")
110
+ ```
111
+
112
+ You can pass request-specific headers with `extra_headers`:
113
+
114
+ ```python
115
+ result = extractor.from_url(
116
+ "https://example.com/awb.pdf",
117
+ extra_headers={"X-Request-ID": "request-123"},
118
+ )
119
+ ```
120
+
121
+ ### Multiple URLs
122
+
123
+ `from_urls()` returns a list of dictionaries with `url`, `data`, and `error`.
124
+ Failed URLs do not stop the whole batch.
125
+
126
+ ```python
127
+ from awb_extractor import AWBExtractor
128
+
129
+ extractor = AWBExtractor(api_key="sk-ant-...")
130
+ results = extractor.from_urls([
131
+ "https://example.com/good.pdf",
132
+ "https://example.com/bad.pdf",
133
+ ])
134
+ ```
135
+
136
+ ## Result Fields
137
+
138
+ `AWBResult` includes:
139
+
140
+ - `tracking_number`
141
+ - `recipient_name`
142
+ - `recipient_phone`
143
+ - `recipient_address`
144
+ - `recipient_ward`
145
+ - `recipient_district`
146
+ - `recipient_province`
147
+ - `sender_name`
148
+ - `sender_address`
149
+ - `cod`
150
+ - `weight`
151
+ - `order_id`
152
+
153
+ Use `to_dict()` or `to_json()` to serialize the result.
154
+
155
+ ## Exceptions
156
+
157
+ - `APIKeyError`: missing API key
158
+ - `PDFDownloadError`: PDF URL download failed
159
+ - `ExtractionError`: Claude response could not be parsed as JSON
160
+
161
+ ## Package Structure
162
+
163
+ - `awb_extractor/extractor.py`: public `AWBExtractor` class
164
+ - `awb_extractor/models.py`: `AWBResult` dataclass
165
+ - `awb_extractor/exceptions.py`: package exceptions
166
+
167
+ ## Publishing
168
+
169
+ GitHub Actions builds and publishes the package to PyPI on every push to `main`.
170
+
171
+ The repository must define this GitHub secret:
172
+
173
+ ```text
174
+ PYPI_API_TOKEN
175
+ ```
176
+
177
+ PyPI does not allow replacing an existing version. If a commit on `main` does not
178
+ bump `project.version` in `pyproject.toml`, the publish step skips the existing
179
+ distribution.
@@ -0,0 +1,166 @@
1
+ # AWB Extractor
2
+
3
+ Python SDK for extracting recipient, sender, and shipment information from
4
+ AWB/shipping label PDF files using Claude AI.
5
+
6
+ ## Features
7
+
8
+ - Extract from PDF bytes, local PDF files, or PDF URLs
9
+ - Batch extraction from multiple URLs
10
+ - Optional default HTTP headers for protected AWB URLs
11
+ - Typed `AWBResult` dataclass output
12
+ - Custom exceptions for API key, PDF download, and JSON parsing failures
13
+
14
+ ## Requirements
15
+
16
+ - Python 3.10+
17
+ - Anthropic API key
18
+
19
+ ## Installation
20
+
21
+ Install from PyPI:
22
+
23
+ ```bash
24
+ pip install awb-extractor
25
+ ```
26
+
27
+ For local development:
28
+
29
+ ```bash
30
+ pip install -e ".[dev]"
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ ```python
36
+ from awb_extractor import AWBExtractor
37
+
38
+ extractor = AWBExtractor(api_key="sk-ant-...")
39
+ result = extractor.from_file("label.pdf")
40
+
41
+ print(result.recipient_name)
42
+ print(result.to_dict())
43
+ ```
44
+
45
+ Example result:
46
+
47
+ ```python
48
+ {
49
+ "tracking_number": "NHSVC972103440",
50
+ "recipient_name": "Nguyen Van A",
51
+ "recipient_phone": "(+84)03******37",
52
+ "recipient_address": "237 Nguyen Trai",
53
+ "recipient_ward": "Phuong Ben Thanh",
54
+ "recipient_district": "Quan 1",
55
+ "recipient_province": "TP. Ho Chi Minh",
56
+ "sender_name": "Onflow",
57
+ "sender_address": "TP. Ho Chi Minh",
58
+ "cod": "0",
59
+ "weight": "0.700 KG",
60
+ "order_id": "584425059595159079",
61
+ }
62
+ ```
63
+
64
+ ## Supported Inputs
65
+
66
+ ### PDF bytes
67
+
68
+ ```python
69
+ from awb_extractor import AWBExtractor
70
+
71
+ extractor = AWBExtractor(api_key="sk-ant-...")
72
+
73
+ with open("label.pdf", "rb") as file:
74
+ result = extractor.from_bytes(file.read())
75
+ ```
76
+
77
+ ### Local PDF file
78
+
79
+ ```python
80
+ from awb_extractor import AWBExtractor
81
+
82
+ extractor = AWBExtractor(api_key="sk-ant-...")
83
+ result = extractor.from_file("label.pdf")
84
+ ```
85
+
86
+ ### PDF URL
87
+
88
+ ```python
89
+ from awb_extractor import AWBExtractor
90
+
91
+ extractor = AWBExtractor(
92
+ api_key="sk-ant-...",
93
+ http_headers={"Authorization": "Bearer token"},
94
+ )
95
+
96
+ result = extractor.from_url("https://example.com/awb.pdf")
97
+ ```
98
+
99
+ You can pass request-specific headers with `extra_headers`:
100
+
101
+ ```python
102
+ result = extractor.from_url(
103
+ "https://example.com/awb.pdf",
104
+ extra_headers={"X-Request-ID": "request-123"},
105
+ )
106
+ ```
107
+
108
+ ### Multiple URLs
109
+
110
+ `from_urls()` returns one dictionary per URL with the keys `url`, `data` (an `AWBResult`, or `None` on failure), and `error` (the error message, or `None` on success).
111
+ A failed URL does not stop the rest of the batch.
112
+
113
+ ```python
114
+ from awb_extractor import AWBExtractor
115
+
116
+ extractor = AWBExtractor(api_key="sk-ant-...")
117
+ results = extractor.from_urls([
118
+ "https://example.com/good.pdf",
119
+ "https://example.com/bad.pdf",
120
+ ])
121
+ ```
122
+
123
+ ## Result Fields
124
+
125
+ `AWBResult` includes:
126
+
127
+ - `tracking_number`
128
+ - `recipient_name`
129
+ - `recipient_phone`
130
+ - `recipient_address`
131
+ - `recipient_ward`
132
+ - `recipient_district`
133
+ - `recipient_province`
134
+ - `sender_name`
135
+ - `sender_address`
136
+ - `cod`
137
+ - `weight`
138
+ - `order_id`
139
+
140
+ Use `to_dict()` or `to_json()` to serialize the result.
141
+
142
+ ## Exceptions
143
+
144
+ - `APIKeyError`: missing API key
145
+ - `PDFDownloadError`: PDF URL download failed
146
+ - `ExtractionError`: Claude response could not be parsed as JSON
147
+
148
+ ## Package Structure
149
+
150
+ - `awb_extractor/extractor.py`: public `AWBExtractor` class
151
+ - `awb_extractor/models.py`: `AWBResult` dataclass
152
+ - `awb_extractor/exceptions.py`: package exceptions
153
+
154
+ ## Publishing
155
+
156
+ GitHub Actions builds and publishes the package to PyPI on every push to `main`.
157
+
158
+ The repository must define this GitHub secret:
159
+
160
+ ```text
161
+ PYPI_API_TOKEN
162
+ ```
163
+
164
+ PyPI does not allow replacing an existing version. If a commit on `main` does not
165
+ bump `project.version` in `pyproject.toml`, the publish step skips the upload because
166
+ a distribution for that version already exists on PyPI.
@@ -0,0 +1,18 @@
1
+ from .extractor import AWBExtractor
2
+ from .models import AWBResult
3
+ from .exceptions import (
4
+ AWBExtractorError,
5
+ APIKeyError,
6
+ PDFDownloadError,
7
+ ExtractionError,
8
+ )
9
+
10
# Package version string.
__version__ = "0.1.1"
# Names exported by `from awb_extractor import *`: the extractor class, the
# result dataclass, and the package's exception classes.
__all__ = [
    "AWBExtractor",
    "AWBResult",
    "AWBExtractorError",
    "APIKeyError",
    "PDFDownloadError",
    "ExtractionError",
]
@@ -0,0 +1,22 @@
1
class AWBExtractorError(Exception):
    """Common base class for the exceptions defined by this package."""
3
+
4
+
5
class APIKeyError(AWBExtractorError):
    """Signals a missing or empty API key."""
7
+
8
+
9
class PDFDownloadError(AWBExtractorError):
    """Raised when a PDF could not be downloaded from a URL.

    Attributes:
        url: The URL that was requested.
        status_code: HTTP status code of the failed response, or ``None``
            when the failure was not an HTTP status error.
        message: Free-form description of the failure, or ``None``.
    """

    def __init__(
        self,
        url: str,
        status_code: int | None = None,
        message: str | None = None,
    ):
        """Build the error and its human-readable description.

        Args:
            url: The URL that failed to download.
            status_code: HTTP status code, when one is available.
            message: Extra detail, e.g. the text of a transport error.
        """
        self.url = url
        self.status_code = status_code
        self.message = message
        # Compare against None rather than relying on truthiness so that a
        # supplied status code is never mistaken for "not provided".
        if status_code is not None:
            detail = f"status={status_code}"
            if message:
                # Keep the caller's detail instead of silently dropping it.
                detail = f"{detail} ({message})"
        else:
            detail = message or "unknown error"
        super().__init__(f"Failed to download PDF from {url!r}: {detail}")
15
+
16
+
17
class ExtractionError(AWBExtractorError):
    """Raised when the model's reply cannot be parsed as JSON.

    The offending text is kept on ``raw_response`` for inspection.
    """

    def __init__(self, raw_response: str = None):
        description = (
            f"Failed to parse Claude response as JSON. Raw: {raw_response!r}"
        )
        super().__init__(description)
        self.raw_response = raw_response
@@ -0,0 +1,129 @@
1
+ import base64
2
+ import json
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ import anthropic
7
+ import httpx
8
+
9
+ from .exceptions import APIKeyError, ExtractionError, PDFDownloadError
10
+ from .models import AWBResult
11
+
12
+ _SYSTEM_PROMPT = """You are an OCR system that extracts information from shipping labels (AWB).
13
+ Return only raw JSON, no explanation, no markdown. Required format:
14
+ {
15
+ "tracking_number": "",
16
+ "recipient_name": "",
17
+ "recipient_phone": "",
18
+ "recipient_address": "",
19
+ "recipient_ward": "",
20
+ "recipient_district": "",
21
+ "recipient_province": "",
22
+ "sender_name": "",
23
+ "sender_address": "",
24
+ "cod": "",
25
+ "weight": "",
26
+ "order_id": ""
27
+ }"""
28
+
29
+
30
class AWBExtractor:
    """Extract shipping-label (AWB) fields from PDF documents using Claude.

    A PDF can be supplied as raw bytes, as a local file path, or as a URL.
    The PDF is sent to the Anthropic Messages API together with a system
    prompt asking for a fixed JSON object, and the reply is converted into
    an ``AWBResult``.
    """

    DEFAULT_MODEL = "claude-haiku-4-5-20251001"

    def __init__(
        self,
        api_key: str,
        http_headers: Optional[dict] = None,
        model: str = DEFAULT_MODEL,
        timeout: int = 30,
    ):
        """Create an extractor.

        Args:
            api_key: Anthropic API key; must be non-empty.
            http_headers: Default headers sent with every PDF download.
            model: Model identifier passed to the Messages API.
            timeout: Timeout value passed to ``httpx.get`` for downloads.

        Raises:
            APIKeyError: If ``api_key`` is empty or ``None``.
        """
        if not api_key:
            raise APIKeyError("api_key is required.")
        self._client = anthropic.Anthropic(api_key=api_key)
        # Copy so that later mutation of the caller's dict cannot silently
        # change the defaults used by this instance.
        self._http_headers = dict(http_headers or {})
        self._model = model
        self._timeout = timeout

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def from_bytes(self, pdf_bytes: bytes) -> AWBResult:
        """Extract label fields from in-memory PDF bytes.

        Raises:
            ExtractionError: If the model reply is not a JSON object.
        """
        raw_json = self._call_claude(pdf_bytes)
        return AWBResult.from_dict(raw_json)

    def from_file(self, pdf_path: str) -> AWBResult:
        """Read the PDF at ``pdf_path`` and extract its label fields."""
        pdf_bytes = Path(pdf_path).read_bytes()
        return self.from_bytes(pdf_bytes)

    def from_url(self, url: str, extra_headers: Optional[dict] = None) -> AWBResult:
        """Download the PDF at ``url`` and extract its label fields.

        Args:
            url: Location of the PDF.
            extra_headers: Headers for this request only; they override the
                instance-level defaults on key collisions.

        Raises:
            PDFDownloadError: If the download fails.
            ExtractionError: If the model reply is not a JSON object.
        """
        pdf_bytes = self._download(url, extra_headers)
        return self.from_bytes(pdf_bytes)

    def from_urls(
        self,
        urls: list[str],
        extra_headers: Optional[dict] = None,
    ) -> list[dict]:
        """Extract label fields from several URLs, one after another.

        A failure on one URL is recorded and does not stop the batch.

        Args:
            urls: URLs to process, in order.
            extra_headers: Optional headers applied to every download in
                the batch, in addition to the instance-level defaults.

        Returns:
            One dict per URL with the keys ``url``, ``data`` (an
            ``AWBResult`` or ``None``) and ``error`` (``str(exception)`` or
            ``None``).
        """
        results = []
        for url in urls:
            # Deliberately broad: the batch is best-effort, so any failure
            # is reported in the result entry instead of being raised.
            try:
                data = self.from_url(url, extra_headers)
            except Exception as e:
                results.append({"url": url, "data": None, "error": str(e)})
            else:
                results.append({"url": url, "data": data, "error": None})
        return results

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _download(self, url: str, extra_headers: Optional[dict] = None) -> bytes:
        """Fetch ``url`` and return the response body.

        Raises:
            PDFDownloadError: On an HTTP error status, a transport-level
                failure, or a URL httpx rejects as invalid.
        """
        headers = {**self._http_headers, **(extra_headers or {})}
        try:
            resp = httpx.get(
                url,
                headers=headers,
                follow_redirects=True,
                timeout=self._timeout,
            )
            resp.raise_for_status()
        except httpx.HTTPStatusError as e:
            raise PDFDownloadError(url, status_code=e.response.status_code) from e
        except (httpx.RequestError, httpx.InvalidURL) as e:
            # InvalidURL does not derive from RequestError, so it is listed
            # explicitly to keep every download failure in one error type.
            raise PDFDownloadError(url, message=str(e)) from e
        return resp.content

    def _call_claude(self, pdf_bytes: bytes) -> dict:
        """Send the PDF to the Messages API and return the parsed JSON object.

        Raises:
            ExtractionError: If the reply has no text or is not a JSON object.
        """
        pdf_base64 = base64.standard_b64encode(pdf_bytes).decode("utf-8")

        response = self._client.messages.create(
            model=self._model,
            max_tokens=1024,
            system=_SYSTEM_PROMPT,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "document",
                            "source": {
                                "type": "base64",
                                "media_type": "application/pdf",
                                "data": pdf_base64,
                            },
                        },
                        {
                            "type": "text",
                            "text": "Extract all information from this shipping label.",
                        },
                    ],
                }
            ],
        )
        return self._parse_response(response)

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a wrapping Markdown code fence (optionally tagged ``json``)."""
        cleaned = text.strip()
        if cleaned.startswith("```"):
            cleaned = cleaned[3:]
            if cleaned[:4].lower() == "json":
                cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @classmethod
    def _parse_response(cls, response) -> dict:
        """Turn a Messages API response into the JSON object it contains."""
        raw = None
        # Use the first content block that carries string text; a reply with
        # no such block is reported as an extraction failure rather than
        # surfacing as an IndexError/AttributeError.
        for block in response.content or []:
            text = getattr(block, "text", None)
            if isinstance(text, str):
                raw = text
                break
        if raw is None:
            raise ExtractionError(raw_response=None)

        cleaned = cls._strip_code_fence(raw)
        try:
            data = json.loads(cleaned)
        except json.JSONDecodeError as e:
            raise ExtractionError(raw_response=cleaned) from e
        if not isinstance(data, dict):
            # Valid JSON, but not the object the prompt asks for.
            raise ExtractionError(raw_response=cleaned)
        return data
@@ -0,0 +1,37 @@
1
+ from dataclasses import dataclass, field, asdict
2
+ from typing import Optional
3
+ import json
4
+
5
+
6
@dataclass
class AWBResult:
    """Fields extracted from one AWB / shipping label.

    Every field is optional and defaults to ``None``.
    """

    tracking_number: Optional[str] = None
    recipient_name: Optional[str] = None
    recipient_phone: Optional[str] = None
    recipient_address: Optional[str] = None
    recipient_ward: Optional[str] = None
    recipient_district: Optional[str] = None
    recipient_province: Optional[str] = None
    sender_name: Optional[str] = None
    sender_address: Optional[str] = None
    cod: Optional[str] = None
    weight: Optional[str] = None
    order_id: Optional[str] = None

    @staticmethod
    def _normalize(value):
        """Map blank strings to ``None``; return any other value unchanged."""
        if isinstance(value, str) and not value.strip():
            return None
        return value

    @classmethod
    def from_dict(cls, data: dict) -> "AWBResult":
        """Build a result from a mapping, ignoring keys that are not fields.

        Blank strings become ``None``. Other values, including falsy ones
        such as a numeric ``0``, are kept as given.
        """
        valid_keys = cls.__dataclass_fields__.keys()
        filtered = {
            key: cls._normalize(value)
            for key, value in data.items()
            if key in valid_keys
        }
        return cls(**filtered)

    def to_dict(self) -> dict:
        """Return all fields, including ``None`` ones, as a new dict."""
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize the fields to JSON, keeping non-ASCII characters as-is."""
        return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent)

    def __repr__(self) -> str:
        """Show only the fields that are set, to keep the repr short."""
        fields = ", ".join(
            f"{k}={v!r}" for k, v in self.to_dict().items() if v is not None
        )
        return f"AWBResult({fields})"
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.4
2
+ Name: awb-extractor
3
+ Version: 0.1.1
4
+ Summary: Extract recipient address from AWB/shipping label PDF using Claude AI
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: anthropic>=0.40.0
9
+ Requires-Dist: httpx>=0.27.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=8.0; extra == "dev"
12
+ Requires-Dist: pytest-mock>=3.14; extra == "dev"
13
+
14
+ # AWB Extractor
15
+
16
+ Python SDK for extracting receiver and shipment information from AWB/shipping
17
+ label PDF files using Claude AI.
18
+
19
+ ## Features
20
+
21
+ - Extract from PDF bytes, local PDF files, or PDF URLs
22
+ - Batch extraction from multiple URLs
23
+ - Optional default HTTP headers for protected AWB URLs
24
+ - Typed `AWBResult` dataclass output
25
+ - Custom exceptions for API key, PDF download, and JSON parsing failures
26
+
27
+ ## Requirements
28
+
29
+ - Python 3.10+
30
+ - Anthropic API key
31
+
32
+ ## Installation
33
+
34
+ Install from PyPI:
35
+
36
+ ```bash
37
+ pip install awb-extractor
38
+ ```
39
+
40
+ For local development:
41
+
42
+ ```bash
43
+ pip install -e ".[dev]"
44
+ ```
45
+
46
+ ## Usage
47
+
48
+ ```python
49
+ from awb_extractor import AWBExtractor
50
+
51
+ extractor = AWBExtractor(api_key="sk-ant-...")
52
+ result = extractor.from_file("label.pdf")
53
+
54
+ print(result.recipient_name)
55
+ print(result.to_dict())
56
+ ```
57
+
58
+ Example result:
59
+
60
+ ```python
61
+ {
62
+ "tracking_number": "NHSVC972103440",
63
+ "recipient_name": "Nguyen Van A",
64
+ "recipient_phone": "(+84)03******37",
65
+ "recipient_address": "237 Nguyen Trai",
66
+ "recipient_ward": "Phuong Ben Thanh",
67
+ "recipient_district": "Quan 1",
68
+ "recipient_province": "TP. Ho Chi Minh",
69
+ "sender_name": "Onflow",
70
+ "sender_address": "TP. Ho Chi Minh",
71
+ "cod": "0",
72
+ "weight": "0.700 KG",
73
+ "order_id": "584425059595159079",
74
+ }
75
+ ```
76
+
77
+ ## Supported Inputs
78
+
79
+ ### PDF bytes
80
+
81
+ ```python
82
+ from awb_extractor import AWBExtractor
83
+
84
+ extractor = AWBExtractor(api_key="sk-ant-...")
85
+
86
+ with open("label.pdf", "rb") as file:
87
+ result = extractor.from_bytes(file.read())
88
+ ```
89
+
90
+ ### Local PDF file
91
+
92
+ ```python
93
+ from awb_extractor import AWBExtractor
94
+
95
+ extractor = AWBExtractor(api_key="sk-ant-...")
96
+ result = extractor.from_file("label.pdf")
97
+ ```
98
+
99
+ ### PDF URL
100
+
101
+ ```python
102
+ from awb_extractor import AWBExtractor
103
+
104
+ extractor = AWBExtractor(
105
+ api_key="sk-ant-...",
106
+ http_headers={"Authorization": "Bearer token"},
107
+ )
108
+
109
+ result = extractor.from_url("https://example.com/awb.pdf")
110
+ ```
111
+
112
+ You can pass request-specific headers with `extra_headers`:
113
+
114
+ ```python
115
+ result = extractor.from_url(
116
+ "https://example.com/awb.pdf",
117
+ extra_headers={"X-Request-ID": "request-123"},
118
+ )
119
+ ```
120
+
121
+ ### Multiple URLs
122
+
123
+ `from_urls()` returns a list of dictionaries with `url`, `data`, and `error`.
124
+ Failed URLs do not stop the whole batch.
125
+
126
+ ```python
127
+ from awb_extractor import AWBExtractor
128
+
129
+ extractor = AWBExtractor(api_key="sk-ant-...")
130
+ results = extractor.from_urls([
131
+ "https://example.com/good.pdf",
132
+ "https://example.com/bad.pdf",
133
+ ])
134
+ ```
135
+
136
+ ## Result Fields
137
+
138
+ `AWBResult` includes:
139
+
140
+ - `tracking_number`
141
+ - `recipient_name`
142
+ - `recipient_phone`
143
+ - `recipient_address`
144
+ - `recipient_ward`
145
+ - `recipient_district`
146
+ - `recipient_province`
147
+ - `sender_name`
148
+ - `sender_address`
149
+ - `cod`
150
+ - `weight`
151
+ - `order_id`
152
+
153
+ Use `to_dict()` or `to_json()` to serialize the result.
154
+
155
+ ## Exceptions
156
+
157
+ - `APIKeyError`: missing API key
158
+ - `PDFDownloadError`: PDF URL download failed
159
+ - `ExtractionError`: Claude response could not be parsed as JSON
160
+
161
+ ## Package Structure
162
+
163
+ - `awb_extractor/extractor.py`: public `AWBExtractor` class
164
+ - `awb_extractor/models.py`: `AWBResult` dataclass
165
+ - `awb_extractor/exceptions.py`: package exceptions
166
+
167
+ ## Publishing
168
+
169
+ GitHub Actions builds and publishes the package to PyPI on every push to `main`.
170
+
171
+ The repository must define this GitHub secret:
172
+
173
+ ```text
174
+ PYPI_API_TOKEN
175
+ ```
176
+
177
+ PyPI does not allow replacing an existing version. If a commit on `main` does not
178
+ bump `project.version` in `pyproject.toml`, the publish step skips the existing
179
+ distribution.
@@ -0,0 +1,12 @@
1
+ README.md
2
+ pyproject.toml
3
+ awb_extractor/__init__.py
4
+ awb_extractor/exceptions.py
5
+ awb_extractor/extractor.py
6
+ awb_extractor/models.py
7
+ awb_extractor.egg-info/PKG-INFO
8
+ awb_extractor.egg-info/SOURCES.txt
9
+ awb_extractor.egg-info/dependency_links.txt
10
+ awb_extractor.egg-info/requires.txt
11
+ awb_extractor.egg-info/top_level.txt
12
+ tests/test_extractor.py
@@ -0,0 +1,6 @@
1
+ anthropic>=0.40.0
2
+ httpx>=0.27.0
3
+
4
+ [dev]
5
+ pytest>=8.0
6
+ pytest-mock>=3.14
@@ -0,0 +1 @@
1
+ awb_extractor
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "awb-extractor"
7
+ version = "0.1.1"
8
+ description = "Extract recipient address from AWB/shipping label PDF using Claude AI"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ dependencies = [
13
+ "anthropic>=0.40.0",
14
+ "httpx>=0.27.0",
15
+ ]
16
+
17
+ [project.optional-dependencies]
18
+ dev = [
19
+ "pytest>=8.0",
20
+ "pytest-mock>=3.14",
21
+ ]
22
+
23
+ [tool.setuptools.packages.find]
24
+ where = ["."]
25
+ include = ["awb_extractor*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,170 @@
1
+ import json
2
+ import pytest
3
+ from unittest.mock import MagicMock, patch
4
+
5
+ from awb_extractor import AWBExtractor, AWBResult
6
+ from awb_extractor.exceptions import APIKeyError, ExtractionError, PDFDownloadError
7
+
8
# Minimal stand-in for PDF bytes; only passed through, never parsed as a PDF
# by these tests because the Claude client is always patched.
MOCK_PDF = b"%PDF-1.4 fake content"
# Canned extraction payload in the JSON shape requested by the system prompt.
# The non-ASCII values exercise Unicode handling in serialization.
MOCK_RESPONSE = {
    "tracking_number": "NHSVC972103440",
    "recipient_name": "Nguyễn Thanh Huyền",
    "recipient_phone": "(+84)03******37",
    "recipient_address": "Trường Mầm Non Trung Chính, 237HNC19",
    "recipient_ward": "Xã Trung Chính",
    "recipient_district": "Huyện Nông Cống",
    "recipient_province": "Thanh Hóa",
    "sender_name": "BZU BZU VIỆT NAM",
    "sender_address": "Số 3A đường An Phú Đông 13, Q.12, TP.HCM",
    "cod": "Được đồng kiểm",
    "weight": "0.700 KG",
    "order_id": "584425059595159079",
}
23
+
24
+
25
def make_extractor():
    """Build an ``AWBExtractor`` configured with a placeholder API key."""
    extractor = AWBExtractor(api_key="sk-ant-test")
    return extractor
27
+
28
+
29
def mock_claude_response(data: dict):
    """Return a fake API message whose single content block holds ``data`` as JSON text."""
    text_block = MagicMock(text=json.dumps(data))
    return MagicMock(content=[text_block])
33
+
34
+
35
+ # ------------------------------------------------------------------
36
+ # AWBResult
37
+ # ------------------------------------------------------------------
38
+
39
def test_awb_result_from_dict():
    """``from_dict`` copies known keys onto the matching attributes."""
    parsed = AWBResult.from_dict(MOCK_RESPONSE)
    expected = {
        "tracking_number": "NHSVC972103440",
        "recipient_name": "Nguyễn Thanh Huyền",
        "recipient_province": "Thanh Hóa",
    }
    for attribute, value in expected.items():
        assert getattr(parsed, attribute) == value
44
+
45
+
46
def test_awb_result_ignores_unknown_keys():
    """Keys that are not dataclass fields must not end up on the result."""
    payload = dict(MOCK_RESPONSE, unknown_field="should be ignored")
    parsed = AWBResult.from_dict(payload)
    assert hasattr(parsed, "unknown_field") is False
50
+
51
+
52
def test_awb_result_to_dict():
    """``to_dict`` exposes field values under their field names."""
    as_dict = AWBResult.from_dict(MOCK_RESPONSE).to_dict()
    assert as_dict["tracking_number"] == "NHSVC972103440"
56
+
57
+
58
def test_awb_result_to_json():
    """``to_json`` output parses back to the original field values."""
    serialized = AWBResult.from_dict(MOCK_RESPONSE).to_json()
    round_tripped = json.loads(serialized)
    assert round_tripped["order_id"] == "584425059595159079"
62
+
63
+
64
def test_awb_result_empty_string_becomes_none():
    """An empty-string field value is normalized to ``None``."""
    payload = dict(MOCK_RESPONSE)
    payload["cod"] = ""
    assert AWBResult.from_dict(payload).cod is None
68
+
69
+
70
+ # ------------------------------------------------------------------
71
+ # AWBExtractor init
72
+ # ------------------------------------------------------------------
73
+
74
def test_raises_if_no_api_key():
    """Constructing the extractor with an empty key is rejected."""
    empty_key = ""
    with pytest.raises(APIKeyError):
        AWBExtractor(api_key=empty_key)
77
+
78
+
79
+ # ------------------------------------------------------------------
80
+ # from_bytes
81
+ # ------------------------------------------------------------------
82
+
83
def test_from_bytes_returns_awb_result():
    """``from_bytes`` turns a well-formed model reply into an ``AWBResult``."""
    extractor = make_extractor()
    canned_reply = mock_claude_response(MOCK_RESPONSE)
    with patch.object(extractor._client.messages, "create", return_value=canned_reply):
        parsed = extractor.from_bytes(MOCK_PDF)
    assert isinstance(parsed, AWBResult)
    assert parsed.tracking_number == "NHSVC972103440"
89
+
90
+
91
def test_from_bytes_raises_extraction_error_on_bad_json():
    """A reply that is not JSON surfaces as ``ExtractionError``."""
    extractor = make_extractor()
    garbage_reply = MagicMock(content=[MagicMock(text="not json at all")])
    with patch.object(extractor._client.messages, "create", return_value=garbage_reply), \
            pytest.raises(ExtractionError):
        extractor.from_bytes(MOCK_PDF)
98
+
99
+
100
+ # ------------------------------------------------------------------
101
+ # from_file
102
+ # ------------------------------------------------------------------
103
+
104
def test_from_file(tmp_path):
    """``from_file`` reads the PDF from disk and extracts from its bytes."""
    label_path = tmp_path / "test.pdf"
    label_path.write_bytes(MOCK_PDF)
    extractor = make_extractor()
    canned_reply = mock_claude_response(MOCK_RESPONSE)
    with patch.object(extractor._client.messages, "create", return_value=canned_reply):
        parsed = extractor.from_file(str(label_path))
    assert parsed.recipient_name == "Nguyễn Thanh Huyền"
111
+
112
+
113
+ # ------------------------------------------------------------------
114
+ # from_url
115
+ # ------------------------------------------------------------------
116
+
117
def test_from_url_success():
    """``from_url`` downloads the PDF and returns the extracted result."""
    extractor = make_extractor()
    fake_http_reply = MagicMock(content=MOCK_PDF, raise_for_status=lambda: None)
    canned_reply = mock_claude_response(MOCK_RESPONSE)
    with patch("awb_extractor.extractor.httpx.get", return_value=fake_http_reply), \
            patch.object(extractor._client.messages, "create", return_value=canned_reply):
        parsed = extractor.from_url("https://example.com/awb.pdf")
    assert parsed.order_id == "584425059595159079"
124
+
125
+
126
def test_from_url_raises_download_error_on_http_error():
    """An HTTP error status is reported as ``PDFDownloadError`` with its code."""
    import httpx
    extractor = make_extractor()
    forbidden = httpx.HTTPStatusError(
        "403", request=MagicMock(), response=MagicMock(status_code=403)
    )
    with patch("awb_extractor.extractor.httpx.get") as fake_get:
        fake_get.return_value.raise_for_status.side_effect = forbidden
        with pytest.raises(PDFDownloadError) as caught:
            extractor.from_url("https://example.com/awb.pdf")
    assert caught.value.status_code == 403
138
+
139
+
140
+ # ------------------------------------------------------------------
141
+ # from_urls (batch)
142
+ # ------------------------------------------------------------------
143
+
144
def test_from_urls_handles_mixed_results():
    """A failing URL is recorded as an error without aborting the batch."""
    import httpx
    extractor = make_extractor()

    def fake_get(url, **kwargs):
        # URLs containing "good" download fine; every other URL answers 404.
        reply = MagicMock()
        if "good" not in url:
            reply.raise_for_status.side_effect = httpx.HTTPStatusError(
                "404", request=MagicMock(), response=MagicMock(status_code=404)
            )
            return reply
        reply.content = MOCK_PDF
        reply.raise_for_status = lambda: None
        return reply

    canned_reply = mock_claude_response(MOCK_RESPONSE)
    with patch("awb_extractor.extractor.httpx.get", side_effect=fake_get), \
            patch.object(extractor._client.messages, "create", return_value=canned_reply):
        results = extractor.from_urls([
            "https://example.com/good.pdf",
            "https://example.com/bad.pdf",
        ])

    succeeded, failed = results[0], results[1]
    assert succeeded["data"] is not None
    assert succeeded["error"] is None
    assert failed["data"] is None
    assert failed["error"] is not None