amaprice 1.0.16 → 1.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -4
- package/package.json +6 -1
- package/src/db.js +61 -0
- package/src/orchestrator/runner.js +122 -0
- package/src/url.js +262 -21
package/README.md
CHANGED
|
@@ -187,7 +187,9 @@ Run this SQL in Supabase SQL Editor:
|
|
|
187
187
|
|
|
188
188
|
`supabase/migrations/20260223_enforce_collector_first_claiming.sql`
|
|
189
189
|
|
|
190
|
-
|
|
190
|
+
`supabase/migrations/20260225_add_web_track_requests.sql`
|
|
191
|
+
|
|
192
|
+
These migrations add tier fields, indexes, telemetry, worker health rollups, `price_history.currency`, collector orchestration tables/functions, strict collector-first claim policy, and website intake request tracking.
|
|
191
193
|
|
|
192
194
|
Note: these files are additive migrations and expect existing `products` + `price_history` tables.
|
|
193
195
|
|
|
@@ -260,6 +262,18 @@ Environment variables used by the npm package:
|
|
|
260
262
|
|
|
261
263
|
For production background workers, prefer the Supabase **service role key**.
|
|
262
264
|
|
|
265
|
+
Website API runtime (Next.js server routes):
|
|
266
|
+
|
|
267
|
+
| Variable | Default | Used by | Notes |
|
|
268
|
+
|---|---|---|---|
|
|
269
|
+
| `SUPABASE_URL` | none | website API routes | Server-side Supabase project URL |
|
|
270
|
+
| `SUPABASE_SERVICE_ROLE_KEY` | none | website API routes | Required for secure writes (`/api/v1/track-requests`) |
|
|
271
|
+
| `WEB_TRACK_RATE_LIMIT_WINDOW_SECONDS` | `300` | website API routes | Sliding rate-limit window |
|
|
272
|
+
| `WEB_TRACK_RATE_LIMIT_MAX_REQUESTS_PER_IP` | `8` | website API routes | Rate-limit threshold per IP hash |
|
|
273
|
+
| `WEB_TRACK_RATE_LIMIT_MAX_REQUESTS_PER_VISITOR` | `12` | website API routes | Rate-limit threshold per visitor cookie |
|
|
274
|
+
| `WEB_TRACK_IP_HASH_PEPPER` | `amaprice-default-pepper` | website API routes | Pepper for deterministic IP hashing |
|
|
275
|
+
| `TURNSTILE_SECRET_KEY` | none | website API routes | Optional CAPTCHA verification for intake endpoint |
|
|
276
|
+
|
|
263
277
|
## Railway Worker Deployment
|
|
264
278
|
|
|
265
279
|
This repo includes:
|
|
@@ -292,19 +306,28 @@ npm run worker:once
|
|
|
292
306
|
|
|
293
307
|
## Vercel Website Deployment (`amaprice.sh`)
|
|
294
308
|
|
|
295
|
-
|
|
309
|
+
The website is a server-capable Next.js app in `website/` with API routes.
|
|
296
310
|
|
|
297
311
|
Steps:
|
|
298
312
|
1. Import the repo in Vercel.
|
|
299
313
|
2. Leave the project at repo root (deployment is controlled by root `vercel.json`).
|
|
300
314
|
3. Set website env vars:
|
|
301
|
-
- `
|
|
302
|
-
- `
|
|
315
|
+
- `SUPABASE_URL`
|
|
316
|
+
- `SUPABASE_SERVICE_ROLE_KEY`
|
|
303
317
|
- `NEXT_PUBLIC_GOOGLE_SITE_VERIFICATION` (for Google Search Console verification meta tag)
|
|
304
318
|
4. Deploy.
|
|
305
319
|
5. Add domain `amaprice.sh` in Vercel Domains and assign to this project.
|
|
306
320
|
6. Set `www.amaprice.sh` redirect to `amaprice.sh`.
|
|
307
321
|
|
|
322
|
+
### Website API Endpoints
|
|
323
|
+
|
|
324
|
+
- `POST /api/v1/track-requests` - submit Amazon URL/ASIN intake request
|
|
325
|
+
- `GET /api/v1/track-requests/:id` - poll request state (`queued`, `collecting`, `live`, ...)
|
|
326
|
+
- `GET /api/v1/products` - live tracked products for website explorer
|
|
327
|
+
- `GET /api/v1/products/:productId/history` - historical points for one product
|
|
328
|
+
- `GET /api/v1/prices/recent` - recent global price points
|
|
329
|
+
- `GET /api/v1/worker-health` - worker health rollup row
|
|
330
|
+
|
|
308
331
|
Local website development:
|
|
309
332
|
|
|
310
333
|
```bash
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "amaprice",
|
|
3
|
-
"version": "1.0.16",
|
|
3
|
+
"version": "1.0.17",
|
|
4
4
|
"description": "CLI tool to scrape and track Amazon product prices",
|
|
5
5
|
"main": "src/scraper.js",
|
|
6
6
|
"type": "commonjs",
|
|
@@ -39,5 +39,10 @@
|
|
|
39
39
|
"playwright": "^1.58.2",
|
|
40
40
|
"commander": "^13.1.0",
|
|
41
41
|
"@supabase/supabase-js": "^2.49.4"
|
|
42
|
+
},
|
|
43
|
+
"devDependencies": {
|
|
44
|
+
"next": "16.1.6",
|
|
45
|
+
"react": "19.2.4",
|
|
46
|
+
"react-dom": "19.2.4"
|
|
42
47
|
}
|
|
43
48
|
}
|
package/src/db.js
CHANGED
|
@@ -54,6 +54,20 @@ function isMissingHybridSchema(error) {
|
|
|
54
54
|
);
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
+
function isMissingWebTrackSchema(error) {
|
|
58
|
+
const code = String(error?.code || '');
|
|
59
|
+
const message = String(error?.message || '');
|
|
60
|
+
return (
|
|
61
|
+
code === '42P01'
|
|
62
|
+
|| code === 'PGRST205'
|
|
63
|
+
|| code === '42703'
|
|
64
|
+
|| code === 'PGRST204'
|
|
65
|
+
|| /relation .* does not exist/i.test(message)
|
|
66
|
+
|| /column .* does not exist/i.test(message)
|
|
67
|
+
|| /web_track_requests/i.test(message)
|
|
68
|
+
);
|
|
69
|
+
}
|
|
70
|
+
|
|
57
71
|
/**
|
|
58
72
|
* Upsert a product by ASIN. Returns the product row.
|
|
59
73
|
*/
|
|
@@ -535,6 +549,51 @@ async function heartbeatCollector({
|
|
|
535
549
|
return data;
|
|
536
550
|
}
|
|
537
551
|
|
|
552
|
+
async function listPendingWebTrackRequests(limit = 20) {
|
|
553
|
+
const supabase = getClient();
|
|
554
|
+
const safeLimit = Math.max(1, Number(limit) || 20);
|
|
555
|
+
|
|
556
|
+
const { data, error } = await supabase
|
|
557
|
+
.from('web_track_requests')
|
|
558
|
+
.select('id, raw_input, status, status_reason, request_meta, queued_at, created_at')
|
|
559
|
+
.eq('status', 'queued')
|
|
560
|
+
.is('product_id', null)
|
|
561
|
+
.order('created_at', { ascending: true })
|
|
562
|
+
.limit(safeLimit);
|
|
563
|
+
|
|
564
|
+
if (error) {
|
|
565
|
+
if (isMissingWebTrackSchema(error)) {
|
|
566
|
+
return [];
|
|
567
|
+
}
|
|
568
|
+
throw new Error(`Supabase web_track_requests error: ${error.message}`);
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
return data || [];
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
async function updateWebTrackRequestById(requestId, patch) {
|
|
575
|
+
const supabase = getClient();
|
|
576
|
+
const payload = cleanPayload({
|
|
577
|
+
...patch,
|
|
578
|
+
updated_at: new Date().toISOString(),
|
|
579
|
+
});
|
|
580
|
+
|
|
581
|
+
const { data, error } = await supabase
|
|
582
|
+
.from('web_track_requests')
|
|
583
|
+
.update(payload)
|
|
584
|
+
.eq('id', requestId)
|
|
585
|
+
.select()
|
|
586
|
+
.single();
|
|
587
|
+
|
|
588
|
+
if (error) {
|
|
589
|
+
if (isMissingWebTrackSchema(error)) {
|
|
590
|
+
return null;
|
|
591
|
+
}
|
|
592
|
+
throw new Error(`Supabase web_track_requests error: ${error.message}`);
|
|
593
|
+
}
|
|
594
|
+
return data;
|
|
595
|
+
}
|
|
596
|
+
|
|
538
597
|
async function enqueueDueCollectionJobs(limit = 20) {
|
|
539
598
|
const supabase = getClient();
|
|
540
599
|
const safeLimit = Math.max(1, Number(limit) || 20);
|
|
@@ -753,6 +812,8 @@ module.exports = {
|
|
|
753
812
|
upsertCollector,
|
|
754
813
|
getCollectorById,
|
|
755
814
|
heartbeatCollector,
|
|
815
|
+
listPendingWebTrackRequests,
|
|
816
|
+
updateWebTrackRequestById,
|
|
756
817
|
enqueueDueCollectionJobs,
|
|
757
818
|
claimCollectionJobs,
|
|
758
819
|
completeCollectionJob,
|
package/src/orchestrator/runner.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
const { runCollectionPipeline } = require('../extractors/pipeline');
|
|
2
2
|
const {
|
|
3
|
+
getProductByAsin,
|
|
3
4
|
enqueueDueCollectionJobs,
|
|
4
5
|
claimCollectionJobs,
|
|
5
6
|
completeCollectionJob,
|
|
@@ -7,10 +8,14 @@ const {
|
|
|
7
8
|
insertCollectionAttempt,
|
|
8
9
|
insertPrice,
|
|
9
10
|
insertScrapeAttempt,
|
|
11
|
+
listPendingWebTrackRequests,
|
|
10
12
|
upsertProductLatestPrice,
|
|
13
|
+
upsertProduct,
|
|
14
|
+
updateWebTrackRequestById,
|
|
11
15
|
updateProductById,
|
|
12
16
|
getRecentPrices,
|
|
13
17
|
} = require('../db');
|
|
18
|
+
const { normalizeAmazonInput, isAmazonUrl, extractDomain } = require('../url');
|
|
14
19
|
const {
|
|
15
20
|
normalizeTier,
|
|
16
21
|
computeNextScrapeAt,
|
|
@@ -78,6 +83,119 @@ function nextJobStateAfterFailure(job) {
|
|
|
78
83
|
return attempts >= maxAttempts ? 'dead' : 'queued';
|
|
79
84
|
}
|
|
80
85
|
|
|
86
|
+
function nowIso() {
|
|
87
|
+
return new Date().toISOString();
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
function cleanJsonObject(value) {
|
|
91
|
+
if (!value || typeof value !== 'object' || Array.isArray(value)) {
|
|
92
|
+
return {};
|
|
93
|
+
}
|
|
94
|
+
return { ...value };
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
async function resolvePendingWebTrackRequests({ limit = 20 } = {}) {
|
|
98
|
+
const safeLimit = Math.max(1, Number(limit) || 20);
|
|
99
|
+
const pending = await listPendingWebTrackRequests(safeLimit);
|
|
100
|
+
if (!pending || pending.length === 0) {
|
|
101
|
+
return { processed: 0, resolved: 0, rejected: 0 };
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
let resolved = 0;
|
|
105
|
+
let rejected = 0;
|
|
106
|
+
|
|
107
|
+
for (const request of pending) {
|
|
108
|
+
const rawInput = String(request.raw_input || '').trim();
|
|
109
|
+
if (!rawInput) {
|
|
110
|
+
rejected += 1;
|
|
111
|
+
await updateWebTrackRequestById(request.id, {
|
|
112
|
+
status: 'rejected',
|
|
113
|
+
status_reason: 'invalid_amazon_input',
|
|
114
|
+
last_error: 'Input is empty and could not be resolved.',
|
|
115
|
+
completed_at: nowIso(),
|
|
116
|
+
request_meta: {
|
|
117
|
+
...cleanJsonObject(request.request_meta),
|
|
118
|
+
resolution_stage: 'collector',
|
|
119
|
+
resolution_error: 'empty_input',
|
|
120
|
+
resolution_finished_at: nowIso(),
|
|
121
|
+
},
|
|
122
|
+
}).catch(() => {});
|
|
123
|
+
continue;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
const defaultDomain = isAmazonUrl(rawInput) ? extractDomain(rawInput) : 'amazon.de';
|
|
127
|
+
let normalized = null;
|
|
128
|
+
try {
|
|
129
|
+
normalized = await normalizeAmazonInput(rawInput, defaultDomain);
|
|
130
|
+
} catch {
|
|
131
|
+
normalized = null;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
if (!normalized) {
|
|
135
|
+
rejected += 1;
|
|
136
|
+
await updateWebTrackRequestById(request.id, {
|
|
137
|
+
status: 'rejected',
|
|
138
|
+
status_reason: 'invalid_amazon_input',
|
|
139
|
+
last_error: 'Could not resolve Amazon URL to a product ASIN.',
|
|
140
|
+
completed_at: nowIso(),
|
|
141
|
+
request_meta: {
|
|
142
|
+
...cleanJsonObject(request.request_meta),
|
|
143
|
+
resolution_stage: 'collector',
|
|
144
|
+
resolution_error: 'asin_not_found',
|
|
145
|
+
resolution_finished_at: nowIso(),
|
|
146
|
+
},
|
|
147
|
+
}).catch(() => {});
|
|
148
|
+
continue;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
const existingProduct = await getProductByAsin(normalized.asin);
|
|
152
|
+
const now = nowIso();
|
|
153
|
+
|
|
154
|
+
const product = existingProduct
|
|
155
|
+
? await updateProductById(existingProduct.id, {
|
|
156
|
+
is_active: true,
|
|
157
|
+
next_scrape_at: now,
|
|
158
|
+
domain: normalized.domain,
|
|
159
|
+
url: normalized.url,
|
|
160
|
+
})
|
|
161
|
+
: await upsertProduct({
|
|
162
|
+
asin: normalized.asin,
|
|
163
|
+
title: `ASIN ${normalized.asin}`,
|
|
164
|
+
url: normalized.url,
|
|
165
|
+
domain: normalized.domain,
|
|
166
|
+
tier: 'daily',
|
|
167
|
+
tierMode: 'auto',
|
|
168
|
+
isActive: true,
|
|
169
|
+
nextScrapeAt: now,
|
|
170
|
+
});
|
|
171
|
+
|
|
172
|
+
resolved += 1;
|
|
173
|
+
await updateWebTrackRequestById(request.id, {
|
|
174
|
+
asin: normalized.asin,
|
|
175
|
+
domain: normalized.domain,
|
|
176
|
+
normalized_url: normalized.url,
|
|
177
|
+
product_id: product.id,
|
|
178
|
+
status: 'queued',
|
|
179
|
+
status_reason: null,
|
|
180
|
+
last_error: null,
|
|
181
|
+
queued_at: request.queued_at || now,
|
|
182
|
+
completed_at: null,
|
|
183
|
+
request_meta: {
|
|
184
|
+
...cleanJsonObject(request.request_meta),
|
|
185
|
+
resolution_stage: 'collector',
|
|
186
|
+
resolution_source: 'normalize_amazon_input',
|
|
187
|
+
resolution_finished_at: now,
|
|
188
|
+
},
|
|
189
|
+
}).catch(() => {});
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
return {
|
|
193
|
+
processed: pending.length,
|
|
194
|
+
resolved,
|
|
195
|
+
rejected,
|
|
196
|
+
};
|
|
197
|
+
}
|
|
198
|
+
|
|
81
199
|
async function processClaimedJob(job, {
|
|
82
200
|
collectorId = null,
|
|
83
201
|
executor = 'railway',
|
|
@@ -267,6 +385,9 @@ async function runOrchestratedSync({
|
|
|
267
385
|
} = {}) {
|
|
268
386
|
const safeLimit = Math.max(1, Number(limit) || 20);
|
|
269
387
|
|
|
388
|
+
await resolvePendingWebTrackRequests({
|
|
389
|
+
limit: Math.max(safeLimit * 2, safeLimit),
|
|
390
|
+
}).catch(() => {});
|
|
270
391
|
await requeueExpiredCollectionJobs(200).catch(() => {});
|
|
271
392
|
await enqueueDueCollectionJobs(Math.max(safeLimit * 2, safeLimit)).catch(() => {});
|
|
272
393
|
|
|
@@ -319,4 +440,5 @@ module.exports.__test = {
|
|
|
319
440
|
classifyFailure,
|
|
320
441
|
buildNoPriceErrorMessage,
|
|
321
442
|
nextJobStateAfterFailure,
|
|
443
|
+
cleanJsonObject,
|
|
322
444
|
};
|
package/src/url.js
CHANGED
|
@@ -5,7 +5,37 @@ const AMAZON_DOMAINS = [
|
|
|
5
5
|
];
|
|
6
6
|
const AMAZON_SHORT_DOMAINS = ['amzn.eu', 'amzn.to', 'a.co'];
|
|
7
7
|
|
|
8
|
-
const
|
|
8
|
+
const ASIN_PATH_REGEX = /(?:\/(?:dp|gp\/product|gp\/aw\/d|ASIN)\/)([A-Z0-9]{10})(?=[/?]|$)/i;
|
|
9
|
+
const DIRECT_ASIN_REGEX = /^[A-Z0-9]{10}$/i;
|
|
10
|
+
const QUERY_ASIN_KEYS = new Set(['asin', 'pd_rd_i']);
|
|
11
|
+
const QUERY_NESTED_URL_KEYS = new Set(['url', 'u', 'redirecturl', 'path']);
|
|
12
|
+
const HTML_ASIN_PATTERNS = [
|
|
13
|
+
/<link[^>]+rel=["']canonical["'][^>]+href=["'][^"']*(?:\/dp\/|\/gp\/product\/|\/gp\/aw\/d\/)([A-Z0-9]{10})(?=[/?"'&]|$)/i,
|
|
14
|
+
/<link[^>]+href=["'][^"']*(?:\/dp\/|\/gp\/product\/|\/gp\/aw\/d\/)([A-Z0-9]{10})(?=[/?"'&]|$)[^>]*rel=["']canonical["']/i,
|
|
15
|
+
/"(?:currentAsin|parentAsin|landingAsin)"\s*:\s*"([A-Z0-9]{10})"/i,
|
|
16
|
+
/"(?:canonicalUrl|productUrl|dpUrl|redirectUrl)"\s*:\s*"[^"]*(?:\/dp\/|\/gp\/product\/|\/gp\/aw\/d\/)([A-Z0-9]{10})(?=[/?"&]|$)/i,
|
|
17
|
+
];
|
|
18
|
+
const FETCH_TIMEOUT_MS = 9000;
|
|
19
|
+
const BROWSERISH_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36';
|
|
20
|
+
|
|
21
|
+
const DOMAIN_ACCEPT_LANGUAGE = {
|
|
22
|
+
'amazon.de': 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
|
|
23
|
+
'amazon.fr': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
|
|
24
|
+
'amazon.it': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7',
|
|
25
|
+
'amazon.es': 'es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7',
|
|
26
|
+
'amazon.nl': 'nl-NL,nl;q=0.9,en-US;q=0.8,en;q=0.7',
|
|
27
|
+
'amazon.co.jp': 'ja-JP,ja;q=0.9,en-US;q=0.7,en;q=0.6',
|
|
28
|
+
'amazon.com.br': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
|
|
29
|
+
'amazon.co.uk': 'en-GB,en;q=0.9',
|
|
30
|
+
'amazon.com': 'en-US,en;q=0.9',
|
|
31
|
+
'amazon.ca': 'en-CA,en;q=0.9,fr-CA;q=0.6',
|
|
32
|
+
'amazon.com.au': 'en-AU,en;q=0.9',
|
|
33
|
+
'amazon.in': 'en-IN,en;q=0.9',
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
function safeTrim(value) {
|
|
37
|
+
return String(value || '').trim();
|
|
38
|
+
}
|
|
9
39
|
|
|
10
40
|
function isAmazonUrl(url) {
|
|
11
41
|
try {
|
|
@@ -19,22 +49,97 @@ function isAmazonUrl(url) {
|
|
|
19
49
|
function isAmazonShortUrl(url) {
|
|
20
50
|
try {
|
|
21
51
|
const parsed = new URL(url);
|
|
22
|
-
const hostname = parsed.hostname.replace(/^www
|
|
52
|
+
const hostname = parsed.hostname.replace(/^www\./i, '').toLowerCase();
|
|
23
53
|
return AMAZON_SHORT_DOMAINS.includes(hostname);
|
|
24
54
|
} catch {
|
|
25
55
|
return false;
|
|
26
56
|
}
|
|
27
57
|
}
|
|
28
58
|
|
|
29
|
-
function
|
|
30
|
-
|
|
31
|
-
if (
|
|
32
|
-
return
|
|
59
|
+
function extractAsinFromPathLikeValue(value) {
|
|
60
|
+
const trimmed = safeTrim(value);
|
|
61
|
+
if (!trimmed) {
|
|
62
|
+
return null;
|
|
33
63
|
}
|
|
34
|
-
|
|
64
|
+
|
|
65
|
+
if (DIRECT_ASIN_REGEX.test(trimmed)) {
|
|
66
|
+
return trimmed.toUpperCase();
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
const match = trimmed.match(ASIN_PATH_REGEX);
|
|
35
70
|
return match ? match[1].toUpperCase() : null;
|
|
36
71
|
}
|
|
37
72
|
|
|
73
|
+
function extractAsinFromQueryParams(parsed) {
|
|
74
|
+
for (const [rawKey, rawValue] of parsed.searchParams.entries()) {
|
|
75
|
+
const key = safeTrim(rawKey).toLowerCase();
|
|
76
|
+
const value = safeTrim(rawValue);
|
|
77
|
+
if (!key || !value) {
|
|
78
|
+
continue;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
if (QUERY_ASIN_KEYS.has(key) && DIRECT_ASIN_REGEX.test(value)) {
|
|
82
|
+
return value.toUpperCase();
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
if (!QUERY_NESTED_URL_KEYS.has(key)) {
|
|
86
|
+
continue;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
const nestedDirect = extractAsinFromPathLikeValue(value);
|
|
90
|
+
if (nestedDirect) {
|
|
91
|
+
return nestedDirect;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
try {
|
|
95
|
+
const decoded = decodeURIComponent(value);
|
|
96
|
+
const nestedDecoded = extractAsinFromPathLikeValue(decoded);
|
|
97
|
+
if (nestedDecoded) {
|
|
98
|
+
return nestedDecoded;
|
|
99
|
+
}
|
|
100
|
+
} catch {
|
|
101
|
+
// Ignore decode failures for non-encoded payloads.
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
return null;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
function extractAsinFromHtml(html) {
|
|
109
|
+
const body = safeTrim(html);
|
|
110
|
+
if (!body) {
|
|
111
|
+
return null;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
for (const pattern of HTML_ASIN_PATTERNS) {
|
|
115
|
+
const match = body.match(pattern);
|
|
116
|
+
if (match?.[1]) {
|
|
117
|
+
return match[1].toUpperCase();
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
return null;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
function extractAsin(urlOrAsin) {
|
|
125
|
+
const value = safeTrim(urlOrAsin);
|
|
126
|
+
if (!value) {
|
|
127
|
+
return null;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
const direct = extractAsinFromPathLikeValue(value);
|
|
131
|
+
if (direct) {
|
|
132
|
+
return direct;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
try {
|
|
136
|
+
const parsed = new URL(value);
|
|
137
|
+
return extractAsinFromQueryParams(parsed);
|
|
138
|
+
} catch {
|
|
139
|
+
return null;
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
38
143
|
function extractDomain(url) {
|
|
39
144
|
try {
|
|
40
145
|
const parsed = new URL(url);
|
|
@@ -48,39 +153,175 @@ function canonicalUrl(asin, domain = 'amazon.de') {
|
|
|
48
153
|
return `https://www.${domain}/dp/${asin}`;
|
|
49
154
|
}
|
|
50
155
|
|
|
156
|
+
function isResolvableAmazonUrl(url) {
|
|
157
|
+
return isAmazonShortUrl(url) || isAmazonUrl(url);
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
function defaultDomainForUrl(url) {
|
|
161
|
+
return isAmazonUrl(url) ? extractDomain(url) : 'amazon.de';
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
function buildResolveHeaders(url) {
|
|
165
|
+
const domain = defaultDomainForUrl(url);
|
|
166
|
+
return {
|
|
167
|
+
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
168
|
+
'accept-language': DOMAIN_ACCEPT_LANGUAGE[domain] || 'en-US,en;q=0.9',
|
|
169
|
+
'cache-control': 'no-cache',
|
|
170
|
+
pragma: 'no-cache',
|
|
171
|
+
'user-agent': BROWSERISH_USER_AGENT,
|
|
172
|
+
};
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
async function fetchWithTimeout(url, options = {}) {
|
|
176
|
+
const controller = new AbortController();
|
|
177
|
+
const timeoutId = setTimeout(() => {
|
|
178
|
+
controller.abort();
|
|
179
|
+
}, FETCH_TIMEOUT_MS);
|
|
180
|
+
|
|
181
|
+
try {
|
|
182
|
+
return await fetch(url, {
|
|
183
|
+
...options,
|
|
184
|
+
signal: controller.signal,
|
|
185
|
+
});
|
|
186
|
+
} catch {
|
|
187
|
+
return null;
|
|
188
|
+
} finally {
|
|
189
|
+
clearTimeout(timeoutId);
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
async function extractAsinFromResponse(response) {
|
|
194
|
+
const contentType = response.headers?.get
|
|
195
|
+
? String(response.headers.get('content-type') || '').toLowerCase()
|
|
196
|
+
: '';
|
|
197
|
+
if (contentType && !contentType.includes('text/html')) {
|
|
198
|
+
return null;
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
if (typeof response.text !== 'function') {
|
|
202
|
+
return null;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
try {
|
|
206
|
+
const html = await response.text();
|
|
207
|
+
return extractAsinFromHtml(String(html || '').slice(0, 512_000));
|
|
208
|
+
} catch {
|
|
209
|
+
return null;
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
function resolveAsinFromAmazonUrl(url, fallbackDomain = 'amazon.de') {
|
|
214
|
+
const asin = extractAsin(url);
|
|
215
|
+
if (!asin || !isAmazonUrl(url)) {
|
|
216
|
+
return null;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
const domain = extractDomain(url) || fallbackDomain;
|
|
220
|
+
return canonicalUrl(asin, domain);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
async function resolveByFollowingRedirects(url) {
|
|
224
|
+
const originDomain = defaultDomainForUrl(url);
|
|
225
|
+
const response = await fetchWithTimeout(url, {
|
|
226
|
+
method: 'GET',
|
|
227
|
+
redirect: 'follow',
|
|
228
|
+
headers: buildResolveHeaders(url),
|
|
229
|
+
});
|
|
230
|
+
if (!response) {
|
|
231
|
+
return null;
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
const responseUrl = safeTrim(response.url);
|
|
235
|
+
const fromUrl = resolveAsinFromAmazonUrl(responseUrl, originDomain);
|
|
236
|
+
if (fromUrl) {
|
|
237
|
+
try {
|
|
238
|
+
await response.body?.cancel?.();
|
|
239
|
+
} catch {
|
|
240
|
+
// Ignore body cancellation failures.
|
|
241
|
+
}
|
|
242
|
+
return fromUrl;
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
const htmlAsin = await extractAsinFromResponse(response);
|
|
246
|
+
if (!htmlAsin) {
|
|
247
|
+
return null;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
const domain = isAmazonUrl(responseUrl)
|
|
251
|
+
? extractDomain(responseUrl)
|
|
252
|
+
: originDomain;
|
|
253
|
+
return canonicalUrl(htmlAsin, domain);
|
|
254
|
+
}
|
|
255
|
+
|
|
51
256
|
async function resolveAmazonShortUrl(url, maxRedirects = 8) {
|
|
52
|
-
if (!
|
|
257
|
+
if (!isResolvableAmazonUrl(url)) {
|
|
258
|
+
return url;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
const followed = await resolveByFollowingRedirects(url);
|
|
262
|
+
if (followed) {
|
|
263
|
+
return followed;
|
|
264
|
+
}
|
|
53
265
|
|
|
54
266
|
let current = url;
|
|
267
|
+
const originDomain = defaultDomainForUrl(url);
|
|
268
|
+
const visited = new Set();
|
|
55
269
|
for (let i = 0; i < maxRedirects; i += 1) {
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
270
|
+
if (visited.has(current)) {
|
|
271
|
+
return current;
|
|
272
|
+
}
|
|
273
|
+
visited.add(current);
|
|
274
|
+
|
|
275
|
+
const response = await fetchWithTimeout(current, {
|
|
276
|
+
method: 'GET',
|
|
277
|
+
redirect: 'manual',
|
|
278
|
+
headers: buildResolveHeaders(current),
|
|
279
|
+
});
|
|
280
|
+
if (!response) {
|
|
63
281
|
return current;
|
|
64
282
|
}
|
|
65
283
|
|
|
284
|
+
const fromResponseUrl = resolveAsinFromAmazonUrl(
|
|
285
|
+
safeTrim(response.url),
|
|
286
|
+
originDomain,
|
|
287
|
+
);
|
|
288
|
+
if (fromResponseUrl) {
|
|
289
|
+
try {
|
|
290
|
+
await response.body?.cancel?.();
|
|
291
|
+
} catch {
|
|
292
|
+
// Ignore body cancellation failures.
|
|
293
|
+
}
|
|
294
|
+
return fromResponseUrl;
|
|
295
|
+
}
|
|
296
|
+
|
|
66
297
|
const location = response.headers?.get ? response.headers.get('location') : null;
|
|
298
|
+
if (!location || response.status < 300 || response.status > 399) {
|
|
299
|
+
const htmlAsin = await extractAsinFromResponse(response);
|
|
300
|
+
if (htmlAsin) {
|
|
301
|
+
const domain = isAmazonUrl(current) ? extractDomain(current) : originDomain;
|
|
302
|
+
return canonicalUrl(htmlAsin, domain);
|
|
303
|
+
}
|
|
304
|
+
return current;
|
|
305
|
+
}
|
|
306
|
+
|
|
67
307
|
try {
|
|
68
308
|
await response.body?.cancel?.();
|
|
69
309
|
} catch {
|
|
70
310
|
// Ignore body cancellation failures.
|
|
71
311
|
}
|
|
72
312
|
|
|
73
|
-
if (!location || response.status < 300 || response.status > 399) {
|
|
74
|
-
return current;
|
|
75
|
-
}
|
|
76
|
-
|
|
77
313
|
try {
|
|
78
314
|
current = new URL(location, current).toString();
|
|
79
315
|
} catch {
|
|
80
316
|
return current;
|
|
81
317
|
}
|
|
82
318
|
|
|
83
|
-
|
|
319
|
+
const fromLocation = resolveAsinFromAmazonUrl(current, originDomain);
|
|
320
|
+
if (fromLocation) {
|
|
321
|
+
return fromLocation;
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
if (!isResolvableAmazonUrl(current)) {
|
|
84
325
|
return current;
|
|
85
326
|
}
|
|
86
327
|
}
|
|
@@ -89,7 +330,7 @@ async function resolveAmazonShortUrl(url, maxRedirects = 8) {
|
|
|
89
330
|
}
|
|
90
331
|
|
|
91
332
|
async function normalizeAmazonInput(input, defaultDomain = 'amazon.de') {
|
|
92
|
-
const raw =
|
|
333
|
+
const raw = safeTrim(input);
|
|
93
334
|
if (!raw) return null;
|
|
94
335
|
|
|
95
336
|
const asin = extractAsin(raw);
|