cli-web-amazon 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,416 @@
1
+ """HTTP client for cli-web-amazon.
2
+
3
+ Protocol: SSR HTML + REST JSON hybrid.
4
+ Library: curl_cffi — Amazon returns 503 to plain httpx; browser TLS
5
+ impersonation (curl_cffi) is required to reach the public endpoints.
6
+ """
7
+
8
+ import re
9
+ from typing import Any
10
+
11
+ from bs4 import BeautifulSoup
12
+ from curl_cffi import requests as curl_requests
13
+
14
+ from .exceptions import (
15
+ NetworkError,
16
+ NotFoundError,
17
+ ParsingError,
18
+ RateLimitError,
19
+ ServerError,
20
+ )
21
+ from .models import BestSeller, Product, SearchResult, Suggestion
22
+
23
# Site root used to build every page URL and the /suggestions endpoint URL.
BASE_URL = "https://www.amazon.com"
COMPLETION_URL = "https://completion.amazon.com"
# Sent as the "mid" query parameter of the suggestions request.
MERCHANT_ID = "ATVPDKIKX0DER"

# Browser-like headers installed on the session for HTML page requests.
_DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}

# Same headers, but asking for JSON and flagged as an XHR call.
_JSON_HEADERS = _DEFAULT_HEADERS | {
    "Accept": "application/json, text/plain, */*",
    "X-Requested-With": "XMLHttpRequest",
}
49
+
50
+
51
class AmazonClient:
    """Amazon web client with HTML scraping and JSON API support.

    Use it as a context manager: entering opens a browser-impersonating
    ``curl_cffi`` session, exiting closes it::

        with AmazonClient() as client:
            results = client.search("laptop")
    """

    # Phrases (lower-case) that Amazon shows instead of the buybox when the
    # product cannot be shipped to the visitor's location.
    _GEO_RESTRICTION_PHRASES = (
        "cannot be shipped to your selected delivery location",
        "item can't be shipped to your selected location",
    )

    # Matches priceAmount / displayPrice fields in JSON embedded in script tags.
    _PRICE_JSON_RE = re.compile(r'"(?:priceAmount|displayPrice)"\s*:\s*"?([^",}]+)"?')

    # Matches any aria-label that contains a digit (used for review counts).
    _HAS_DIGIT_RE = re.compile(r"\d")

    def __init__(self):
        """Initialize the client without opening a session."""
        # The session is created in __enter__ and cleared in __exit__.
        self._client: Any = None

    def __enter__(self):
        """Open the HTTP session and return the client."""
        self._client = curl_requests.Session(
            impersonate="chrome124",
            headers=_DEFAULT_HEADERS,
            timeout=30,
        )
        return self

    def __exit__(self, *args):
        """Close the HTTP session if one is open."""
        if self._client:
            self._client.close()
            self._client = None

    # ── Internal helpers ────────────────────────────────────────────────

    def _session(self) -> Any:
        """Return the open session.

        Raises:
            NetworkError: If no session is open (client used outside ``with``).
        """
        if self._client is None:
            # Without this guard the request would fail with an AttributeError
            # on None that gets reported as an opaque "Request failed" error.
            raise NetworkError(
                "Client session is not open — use 'with AmazonClient() as client:' "
                "before making requests."
            )
        return self._client

    def _get(
        self, url: str, params: dict | None = None, headers: dict | None = None
    ) -> Any:
        """Make a GET request with error mapping.

        Raises:
            NetworkError: If the session is not open or the request itself fails.
            NotFoundError, RateLimitError, ServerError: See ``_check_status``.
        """
        session = self._session()
        try:
            resp = session.get(url, params=params, headers=headers)
        except Exception as exc:
            raise NetworkError(f"Request failed: {url}: {exc}") from exc
        return self._check_status(resp, url)

    def _post(self, url: str, data: dict | None = None, json: dict | None = None) -> Any:
        """Make a POST request with error mapping.

        Raises:
            NetworkError: If the session is not open or the request itself fails.
            NotFoundError, RateLimitError, ServerError: See ``_check_status``.
        """
        session = self._session()
        try:
            resp = session.post(url, data=data, json=json)
        except Exception as exc:
            raise NetworkError(f"Request failed: {url}: {exc}") from exc
        return self._check_status(resp, url)

    def _check_status(self, resp: Any, url: str) -> Any:
        """Map HTTP status codes to typed exceptions.

        Returns:
            ``resp`` unchanged for 200 and for any status not listed below.

        Raises:
            ServerError: For 401/403 and for any status >= 500.
            NotFoundError: For 404.
            RateLimitError: For 429; carries ``retry_after`` when the
                Retry-After header holds a number of seconds.
        """
        status = resp.status_code
        if status == 200:
            return resp
        if status in (401, 403):
            raise ServerError(
                f"Access denied (HTTP {resp.status_code}) — unexpected on a public endpoint.",
                status_code=status,
            )
        if status == 404:
            raise NotFoundError(f"Resource not found: {url}")
        if status == 429:
            retry_after = None
            if "retry-after" in resp.headers:
                try:
                    retry_after = float(resp.headers["retry-after"])
                except ValueError:
                    # Non-numeric Retry-After values are ignored.
                    pass
            raise RateLimitError("Rate limited by Amazon", retry_after=retry_after)
        if status >= 500:
            raise ServerError(f"Amazon server error: {resp.status_code}", status_code=status)
        return resp

    def _soup(self, resp: Any) -> BeautifulSoup:
        """Parse HTML response as BeautifulSoup."""
        return BeautifulSoup(resp.text, "html.parser")

    @staticmethod
    def _absolute_url(href: str) -> str:
        """Return ``href`` as an absolute URL; site-relative paths get BASE_URL prefixed."""
        if not href or href.startswith("http"):
            return href
        return f"{BASE_URL}{href}"

    @staticmethod
    def _structured_price(node: Any) -> str:
        """Join the a-price-whole and a-price-fraction spans under ``node``.

        Returns an empty string when there is no a-price-whole span.
        """
        whole = node.find("span", class_="a-price-whole")
        if whole is None:
            return ""
        price = whole.get_text(strip=True)
        frac = node.find("span", class_="a-price-fraction")
        if frac is not None:
            price += frac.get_text(strip=True)
        return price

    def _first_link_url(self, node: Any) -> str:
        """Return the absolute URL of the first a-link-normal anchor under ``node``."""
        link_elem = node.find("a", class_="a-link-normal", href=True)
        if link_elem is None:
            return ""
        return self._absolute_url(link_elem.get("href", ""))

    # ── Autocomplete Suggestions ────────────────────────────────────────

    def get_suggestions(self, query: str, limit: int = 11) -> list[Suggestion]:
        """Get autocomplete suggestions for a query.

        Uses the /suggestions JSON endpoint.

        Args:
            query: Prefix typed by the user.
            limit: Maximum number of suggestions requested.

        Returns:
            Suggestions with a non-empty value, in response order.

        Raises:
            ParsingError: If the response is not JSON or not the expected shape.
        """
        params = {
            "limit": str(limit),
            "prefix": query,
            "suggestion-type": ["WIDGET", "KEYWORD"],
            "mid": MERCHANT_ID,
            "alias": "aps",
        }
        resp = self._get(
            f"{BASE_URL}/suggestions",
            params=params,
            headers=_JSON_HEADERS,
        )
        try:
            data = resp.json()
        except Exception as exc:
            raise ParsingError(f"Could not parse suggestions response: {exc}") from exc

        # Report shape changes as ParsingError instead of leaking
        # AttributeError/TypeError to callers.
        if not isinstance(data, dict):
            raise ParsingError(
                "Could not parse suggestions response: expected a JSON object, "
                f"got {type(data).__name__}"
            )
        items = data.get("suggestions") or []
        if not isinstance(items, list):
            raise ParsingError(
                "Could not parse suggestions response: 'suggestions' is not a list"
            )

        results = []
        for item in items:
            if not isinstance(item, dict):
                continue
            value = item.get("value", "")
            if value:
                results.append(Suggestion(value=value, type=item.get("type", "KEYWORD")))
        return results

    # ── Search ──────────────────────────────────────────────────────────

    def search(
        self, query: str, page: int = 1, department: str | None = None
    ) -> list[SearchResult]:
        """Search Amazon products.

        Args:
            query: Search keywords.
            page: Page number (default: 1).
            department: Optional department/node filter.

        Returns:
            List of SearchResult objects; empty when no result cards are found.
        """
        params: dict[str, Any] = {"k": query}
        if page > 1:
            params["page"] = str(page)
        if department:
            params["i"] = department

        resp = self._get(f"{BASE_URL}/s", params=params)
        soup = self._soup(resp)

        results = []
        cards = soup.find_all("div", attrs={"data-component-type": "s-search-result"})
        for card in cards:
            asin = card.get("data-asin", "")
            if not asin:
                continue

            # Title from h2
            title_elem = card.find("h2")
            title = title_elem.get_text(strip=True) if title_elem else ""

            # Price — try a-offscreen first (most reliable), then structured price
            offscreen = card.find("span", class_="a-offscreen")
            if offscreen:
                price = offscreen.get_text(strip=True)
            else:
                price = self._structured_price(card)

            # Rating from a-icon-alt
            rating_elem = card.find("span", class_="a-icon-alt")
            rating = rating_elem.get_text(strip=True) if rating_elem else ""

            # Review count — first span whose aria-label contains a digit
            review_elem = card.find("span", attrs={"aria-label": self._HAS_DIGIT_RE})
            review_count = review_elem.get("aria-label", "") if review_elem else ""

            results.append(
                SearchResult(
                    asin=asin,
                    title=title,
                    price=price,
                    rating=rating,
                    review_count=review_count,
                    # URL — first product link
                    url=self._first_link_url(card),
                )
            )
        return results

    # ── Product Detail ──────────────────────────────────────────────────

    def get_product(self, asin: str) -> Product:
        """Get product details by ASIN.

        Args:
            asin: Amazon Standard Identification Number.

        Returns:
            Product object with full details. ``price`` may be empty, in
            which case ``price_note`` says why.

        Raises:
            NotFoundError: If ASIN does not exist.
            ParsingError: If the page is a product page but has no title.
        """
        resp = self._get(f"{BASE_URL}/dp/{asin}")
        soup = self._soup(resp)
        html_text = resp.text

        # Title
        title_elem = soup.find("span", attrs={"id": "productTitle"})
        title = title_elem.get_text(strip=True) if title_elem else ""
        if not title:
            # Fallback: check if page actually has a product
            if "dp/" not in str(resp.url):
                raise NotFoundError(f"Product not found: {asin}")
            raise ParsingError(f"Could not parse product title for ASIN: {asin}")

        # Detect geo-restriction — Amazon replaces buybox with a "cannot ship"
        # message. Both phrases are matched case-insensitively.
        lowered_html = html_text.lower()
        geo_restricted = any(
            phrase in lowered_html for phrase in self._GEO_RESTRICTION_PHRASES
        )

        # Price — try a-offscreen (available in SSR when product ships to this region),
        # then a-price-whole, then embedded JSON blobs in script tags.
        # Note: price is empty when the product is geo-restricted or JS-rendered.
        price = ""
        price_elem = soup.find("span", class_="a-offscreen")
        if price_elem:
            price = price_elem.get_text(strip=True)
        if not price:
            price = self._structured_price(soup)
        if not price:
            # Fallback: scan embedded script tags for priceAmount / displayPrice JSON fields
            for match in self._PRICE_JSON_RE.finditer(html_text):
                candidate = match.group(1).strip()
                if candidate not in ("", "0"):
                    price = candidate
                    break

        # Build price_note when price is unavailable
        price_note = ""
        if not price:
            if geo_restricted:
                price_note = "Product not available in your region — price not shown"
            else:
                price_note = "Price JS-rendered, not available in SSR HTML"

        # Rating
        rating = ""
        rating_elem = soup.find("span", attrs={"id": "acrPopover"})
        if rating_elem:
            rating = rating_elem.get("title", "") or rating_elem.get_text(strip=True)

        # Review count
        review_elem = soup.find("span", attrs={"id": "acrCustomerReviewText"})
        review_count = review_elem.get_text(strip=True) if review_elem else ""

        # Brand
        brand_elem = soup.find(attrs={"id": "bylineInfo"})
        brand = brand_elem.get_text(strip=True) if brand_elem else ""

        # Image
        image_url = ""
        img_elem = soup.find("img", attrs={"id": "landingImage"})
        if img_elem:
            image_url = img_elem.get("src", "") or img_elem.get("data-old-hires", "")

        return Product(
            asin=asin,
            title=title,
            price=price,
            price_note=price_note,
            geo_restricted=geo_restricted,
            rating=rating,
            review_count=review_count,
            brand=brand,
            image_url=image_url,
            url=f"{BASE_URL}/dp/{asin}",
        )

    # ── Product Variants ────────────────────────────────────────────────

    # ── Best Sellers ────────────────────────────────────────────────────

    def get_bestsellers(self, category: str = "electronics", page: int = 1) -> list[BestSeller]:
        """Get Amazon Best Sellers for a category.

        Args:
            category: Category slug (e.g., "electronics", "books", "toys").
            page: Page number.

        Returns:
            List of BestSeller objects; ``rank`` is 0 when no rank badge
            could be parsed.
        """
        url = f"{BASE_URL}/Best-Sellers/zgbs/{category}"
        params = {"pg": str(page)} if page > 1 else None

        resp = self._get(url, params=params)
        soup = self._soup(resp)

        results = []
        # Best seller grid items — each has id="gridItemRoot"
        for container in soup.find_all("div", attrs={"id": "gridItemRoot"}):
            # ASIN from inner div
            asin_div = container.find("div", attrs={"data-asin": True})
            asin = asin_div.get("data-asin", "") if asin_div else ""
            if not asin:
                continue

            # Rank — badge text such as "#1"
            rank = 0
            rank_elem = container.find("span", class_="zg-bdg-text")
            if rank_elem:
                rank_text = rank_elem.get_text(strip=True).lstrip("#")
                try:
                    rank = int(rank_text)
                except ValueError:
                    # Keep rank 0 when the badge is not a plain integer.
                    pass

            # Title — from image alt or link text
            title = ""
            img = container.find("img")
            if img:
                title = img.get("alt", "")
            if not title:
                link = container.find("a", class_="a-link-normal")
                if link:
                    title = link.get_text(strip=True)

            # Price
            price_elem = container.find("span", class_="p13n-sc-price")
            price = price_elem.get_text(strip=True) if price_elem else ""

            results.append(
                BestSeller(
                    rank=rank,
                    asin=asin,
                    title=title,
                    price=price,
                    url=self._first_link_url(container),
                )
            )

        return results
@@ -0,0 +1,76 @@
1
+ """Domain-specific exception hierarchy for cli-web-amazon."""
2
+
3
+
4
+ class AmazonError(Exception):
5
+ """Base exception for all amazon CLI errors."""
6
+
7
+ def to_dict(self) -> dict:
8
+ return {"error": True, "code": "ERROR", "message": str(self)}
9
+
10
+
11
+ class NetworkError(AmazonError):
12
+ """Connection failed, DNS error, timeout."""
13
+
14
+ def to_dict(self) -> dict:
15
+ return {"error": True, "code": "NETWORK_ERROR", "message": str(self)}
16
+
17
+
18
+ class RateLimitError(AmazonError):
19
+ """HTTP 429 — too many requests."""
20
+
21
+ def __init__(self, message: str, retry_after: float | None = None):
22
+ self.retry_after = retry_after
23
+ super().__init__(message)
24
+
25
+ def to_dict(self) -> dict:
26
+ d = {"error": True, "code": "RATE_LIMITED", "message": str(self)}
27
+ if self.retry_after is not None:
28
+ d["retry_after"] = self.retry_after
29
+ return d
30
+
31
+
32
+ class ParsingError(AmazonError):
33
+ """HTML/JSON response could not be parsed — site structure may have changed."""
34
+
35
+ def to_dict(self) -> dict:
36
+ return {"error": True, "code": "PARSING_ERROR", "message": str(self)}
37
+
38
+
39
+ class NotFoundError(AmazonError):
40
+ """Resource not found (product ASIN, category, etc.)."""
41
+
42
+ def to_dict(self) -> dict:
43
+ return {"error": True, "code": "NOT_FOUND", "message": str(self)}
44
+
45
+
46
+ class ServerError(AmazonError):
47
+ """Amazon returned 5xx."""
48
+
49
+ def __init__(self, message: str, status_code: int = 500):
50
+ self.status_code = status_code
51
+ super().__init__(message)
52
+
53
+ def to_dict(self) -> dict:
54
+ return {
55
+ "error": True,
56
+ "code": "SERVER_ERROR",
57
+ "message": str(self),
58
+ "status_code": self.status_code,
59
+ }
60
+
61
+
62
+ EXCEPTION_CODE_MAP = {
63
+ RateLimitError: "RATE_LIMITED",
64
+ NotFoundError: "NOT_FOUND",
65
+ ServerError: "SERVER_ERROR",
66
+ NetworkError: "NETWORK_ERROR",
67
+ ParsingError: "PARSING_ERROR",
68
+ }
69
+
70
+
71
+ def error_code_for(exc: Exception) -> str:
72
+ """Get the JSON error code string for an exception."""
73
+ for exc_type, code in EXCEPTION_CODE_MAP.items():
74
+ if isinstance(exc, exc_type):
75
+ return code
76
+ return "UNKNOWN_ERROR"
@@ -0,0 +1,63 @@
1
+ """Data models for cli-web-amazon."""
2
+
3
+ from dataclasses import asdict, dataclass
4
+ from typing import Any
5
+
6
+
7
@dataclass
class SearchResult:
    """One product entry taken from an Amazon search results page."""

    # Product identifier (ASIN).
    asin: str
    # Product title text.
    title: str
    # Displayed price text; empty when no price was found.
    price: str = ""
    # Rating text; empty when absent.
    rating: str = ""
    # Review-count text; empty when absent.
    review_count: str = ""
    # Link to the product; empty when no link was found.
    url: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict keyed by field name."""
        payload: dict[str, Any] = asdict(self)
        return payload
20
+
21
+
22
@dataclass
class Product:
    """Detail record for a single Amazon product."""

    # Product identifier (ASIN).
    asin: str
    # Product title text.
    title: str
    # Displayed price text; empty when no price could be extracted.
    price: str = ""
    # Human-readable reason for a missing price; empty otherwise.
    price_note: str = ""
    # True when the page says the item cannot be shipped to the visitor.
    geo_restricted: bool = False
    # Rating text; empty when absent.
    rating: str = ""
    # Review-count text; empty when absent.
    review_count: str = ""
    # Brand / byline text; empty when absent.
    brand: str = ""
    # Main product image URL; empty when absent.
    image_url: str = ""
    # Product page URL.
    url: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict keyed by field name."""
        payload: dict[str, Any] = asdict(self)
        return payload
39
+
40
+
41
@dataclass
class BestSeller:
    """One entry of an Amazon Best Sellers list."""

    # Position in the list.
    rank: int
    # Product identifier (ASIN).
    asin: str
    # Product title text.
    title: str
    # Displayed price text; empty when absent.
    price: str = ""
    # Link to the product; empty when absent.
    url: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict keyed by field name."""
        payload: dict[str, Any] = asdict(self)
        return payload
53
+
54
+
55
@dataclass
class Suggestion:
    """A single autocomplete suggestion returned by Amazon."""

    # Suggested search text.
    value: str
    # Suggestion kind as reported by the endpoint; defaults to "KEYWORD".
    type: str = "KEYWORD"

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict keyed by field name."""
        payload: dict[str, Any] = asdict(self)
        return payload
@@ -0,0 +1,105 @@
1
+ ---
2
+ name: amazon-cli
3
+ description: Use cli-web-amazon to search Amazon products, get product details, check
4
+ prices, browse best sellers, and get autocomplete suggestions. Invoke this skill
5
+ whenever the user asks about Amazon products, prices, best sellers, or wants to
6
+ search Amazon. Always prefer cli-web-amazon over manually fetching the website.
7
+ No authentication required — fully public site.
8
+ ---
9
+
10
+ # cli-web-amazon
11
+
12
+ Search Amazon products, view details, browse Best Sellers, and get autocomplete suggestions. No authentication required.
13
+
14
+ ## Quick Start
15
+
16
+ ```bash
17
+ cli-web-amazon search "laptop" --json
18
+ cli-web-amazon product get B0GRZ78683 --json
19
+ cli-web-amazon bestsellers electronics --json
20
+ ```
21
+
22
+ Always use `--json` when parsing output programmatically.
23
+
24
+ ---
25
+
26
+ ## Commands
27
+
28
+ ### `search QUERY`
29
+ Search Amazon products by keyword.
30
+
31
+ ```bash
32
+ cli-web-amazon search "wireless headphones" --json
33
+ cli-web-amazon search "laptop" --page 2 --dept electronics --json
34
+ ```
35
+
36
+ **Key options:** `--page N` (default 1), `--dept <department>`
37
+
38
+ **Output fields:** `asin`, `title`, `price`, `rating`, `review_count`, `url`
39
+
40
+ ---
41
+
42
+ ### `suggest QUERY`
43
+ Autocomplete suggestions.
44
+
45
+ ```bash
46
+ cli-web-amazon suggest "iphone case" --json
47
+ ```
48
+
49
+ **Output fields:** `value`, `type`
50
+
51
+ ---
52
+
53
+ ### `product get ASIN`
54
+ Full product detail by ASIN.
55
+
56
+ ```bash
57
+ cli-web-amazon product get B0GRZ78683 --json
58
+ ```
59
+
60
+ **Output fields:** `asin`, `title`, `price`, `price_note`, `geo_restricted`, `rating`, `review_count`, `brand`, `image_url`, `url`
61
+
62
+ ---
63
+
64
+ ### `bestsellers [CATEGORY]`
65
+ Browse Amazon Best Sellers by category.
66
+
67
+ ```bash
68
+ cli-web-amazon bestsellers electronics --json
69
+ cli-web-amazon bestsellers books --page 2 --json
70
+ ```
71
+
72
+ **Categories:** `electronics`, `books`, `toys-and-games`, `music`, `kitchen`, `clothing`
73
+
74
+ **Key options:** `--page N`
75
+
76
+ **Output fields:** `rank`, `asin`, `title`, `price`, `url`
77
+
78
+ ---
79
+
80
+ ## Agent Patterns
81
+
82
+ ```bash
83
+ # Search then get full detail on top result
84
+ ASIN=$(cli-web-amazon search "headphones" --json | python -c "import json,sys; print(json.load(sys.stdin)[0]['asin'])")
85
+ cli-web-amazon product get "$ASIN" --json
86
+
87
+ # Top-5 bestsellers
88
+ cli-web-amazon bestsellers electronics --json | \
89
+ python -c "import json,sys; [print(p['rank'], p['title'], p['price']) for p in json.load(sys.stdin)[:5]]"
90
+
91
+ # Autocomplete then search
92
+ cli-web-amazon suggest "wireles" --json | \
93
+ python -c "import json,sys; print(json.load(sys.stdin)[0]['value'])"
94
+ ```
95
+
96
+ ---
97
+
98
+ ## Notes
99
+
100
+ - **Auth:** No authentication required — all commands work on public Amazon endpoints.
101
+ - **Price:** May be empty for some products because Amazon renders some prices client-side. Prefer `product get` for pricing; when its `price` is empty, `price_note` explains why.
102
+ - **ASIN:** 10-character alphanumeric identifier (e.g. `B0GRZ78683`).
103
+ - **Pagination:** Search supports `--page N` (typically 1–7 pages). Best sellers supports `--page N`.
104
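+ - **Errors in --json mode:** `{"error": true, "code": "NOT_FOUND|RATE_LIMITED|NETWORK_ERROR|SERVER_ERROR|PARSING_ERROR", "message": "..."}`. `RATE_LIMITED` errors may also include `retry_after`, and `SERVER_ERROR` errors include `status_code`.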
105
+ - **Installation:** `pip install cli-web-amazon`