channel-worker 2.5.13 → 2.5.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -180,18 +180,26 @@ class CommandPoller {
180
180
  let conn;
181
181
  try {
182
182
  conn = await this._connectNstProfileByName(profileName);
183
- const products = await scraper.scrapeOffers(conn.browser, {
184
- sort_type: payload.sort_type ?? 2, // 2 = commission desc
183
+ const res = await scraper.scrapeOffers(conn.browser, {
184
+ category: payload.category || 'women_clothes', // Tier-1 default: quần áo nữ
185
+ match_id: payload.match_id || null,
186
+ sort_types: payload.sort_types || [2, 3], // commission + sales, merged
187
+ pages: payload.pages ?? 2,
185
188
  page_limit: payload.page_limit ?? 20,
186
- pages: payload.pages ?? 3,
187
- list_type: payload.list_type ?? 0,
189
+ delay_ms: payload.delay_ms ?? 2500, // human-like pacing (anti-bot)
190
+ filters: payload.filters || {},
188
191
  });
189
- const filtered = payload.category_group
190
- ? products.filter(p => p.category_group === payload.category_group)
191
- : products;
192
- console.log(`[shopee] scraped ${products.length} offers (${filtered.length} after filter) from ${profileName}`);
193
- await this.api.upsertAffiliateProducts(filtered, { user_id: command.user_id });
194
- await this.api.updateCommand(command._id, { status: 'done', result: { count: filtered.length, total: products.length } });
192
+ if (res.status === 'needs_verify') {
193
+ // Anti-bot bounce — DON'T fail/retry into more captchas. Report cleanly
194
+ // so the dashboard can prompt the operator to re-verify via VNC.
195
+ console.warn(`[shopee] needs_verify: ${res.reason}`);
196
+ if (res.products?.length) await this.api.upsertAffiliateProducts(res.products, { user_id: command.user_id });
197
+ await this.api.updateCommand(command._id, { status: 'done', result: { needs_verify: true, reason: res.reason, count: res.products?.length || 0 } });
198
+ return;
199
+ }
200
+ console.log(`[shopee] scraped ${res.raw_count} raw → ${res.products.length} after filters from ${profileName}`);
201
+ await this.api.upsertAffiliateProducts(res.products, { user_id: command.user_id });
202
+ await this.api.updateCommand(command._id, { status: 'done', result: { count: res.products.length, raw: res.raw_count } });
195
203
  } catch (err) {
196
204
  console.error(`[shopee] scrape failed: ${err.message}`);
197
205
  await this.api.updateCommand(command._id, { status: 'failed', error: String(err.message || err).slice(0, 500) });
@@ -120,33 +120,115 @@ function parseOfferRow(row) {
120
120
  };
121
121
  }
122
122
 
123
- // In-page fetch the affiliate offer list. sort_type: 1=relevance, 2=commission,
124
- // 3=sales (Shopee's internal codes). Loops `pages` times, page_limit per page.
125
- async function scrapeOffers(browser, { sort_type = 2, page_limit = 20, pages = 1, list_type = 0, timeoutMs = 45000 } = {}) {
123
+ // match_id of each affiliate offer-page category tab (read from rc-tabs node keys,
124
+ // confirmed live). Tier-1 fashion focus first; the offer page has NO dedicated
125
+ // Shoes/Bags/Accessories tab; those come later via keyword search.
126
+ const AFFILIATE_CATEGORIES = {
127
+ women_clothes: { match_id: 100017, label: 'Quần áo nữ', group: 'fashion' },
128
+ beauty: { match_id: 100630, label: 'Làm đẹp', group: 'beauty' },
129
+ home_living: { match_id: 100636, label: 'Nhà cửa', group: 'other' },
130
+ grocery: { match_id: 100629, label: 'Tạp hoá', group: 'other' },
131
+ };
132
+
133
+ const sleep = (ms) => new Promise((r) => setTimeout(r, ms));
134
+
135
+ // True when the session got bounced to the anti-bot captcha (scene=crawler_item)
136
+ // or logged out — in either case the offer API returns 404/empty and we must
137
+ // STOP (not keep hammering) and ask the operator to re-verify via VNC.
138
+ function isBlockedState(pageUrl, res) {
139
+ if (/\/verify\/captcha|\/buyer\/login|is_from_login/.test(pageUrl || '')) return true;
140
+ if (res && (res.status === 403 || res.status === 404)) return true;
141
+ return false;
142
+ }
143
+
144
+ // Apply the Flow-1 hard filters (criteria) to parsed rows. Defaults are lenient;
145
+ // the controller passes the agreed thresholds (commission/price/sold/rating/imgs).
146
+ function applyFilters(rows, f = {}) {
147
+ return rows.filter((p) => {
148
+ if (f.category_groups?.length && !f.category_groups.includes(p.category_group)) return false;
149
+ if (f.min_commission != null && (p.commission_rate || 0) < f.min_commission) return false;
150
+ if (f.price_min != null && (p.price || 0) < f.price_min) return false;
151
+ if (f.price_max != null && (p.price || 0) > f.price_max) return false;
152
+ if (f.min_sold != null && (p.sold_count || 0) < f.min_sold) return false;
153
+ if (f.min_rating != null && p.rating != null && p.rating < f.min_rating) return false;
154
+ if (f.min_images != null && (p.images?.length || 0) < f.min_images) return false;
155
+ return true;
156
+ });
157
+ }
158
+
159
+ // In-page fetch the affiliate offer list — anti-bot aware. Pulls one or more
160
+ // sort_types (2=commission, 3=sales), paginates gently with a human-like delay,
161
+ // merges + dedups by item_id, then applies hard filters. Returns
162
+ // { status: 'ok'|'needs_verify', products: [...], raw_count } (plus `reason` when 'needs_verify')
163
+ // Never throws on a blocked session — returns status:'needs_verify' so the
164
+ // worker reports it cleanly instead of failing + retrying into more captchas.
165
+ async function scrapeOffers(browser, {
166
+ category = null, // key in AFFILIATE_CATEGORIES (e.g. 'women_clothes')
167
+ match_id = null, // explicit category id override
168
+ sort_types = [2], // [2]=commission; pass [2,3] to merge commission+sales
169
+ pages = 2,
170
+ page_limit = 20,
171
+ delay_ms = 2500, // human-like pause between calls (anti-bot)
172
+ filters = {},
173
+ timeoutMs = 45000,
174
+ // legacy single-sort param (back-compat with the first build)
175
+ sort_type = null,
176
+ } = {}) {
177
+ if (sort_type != null) sort_types = [sort_type];
178
+ const cat = category && AFFILIATE_CATEGORIES[category] ? AFFILIATE_CATEGORIES[category] : null;
179
+ const mid = match_id || cat?.match_id || null;
180
+ // Default the category_groups filter to the category's own group (so a
181
+ // Women-Clothes scrape keeps only fashion rows even though the tab mixes in
182
+ // a few accessories), unless the caller overrides.
183
+ if (cat && !filters.category_groups) filters = { ...filters, category_groups: [cat.group] };
184
+
126
185
  const pagesOpen = await browser.pages();
127
186
  let page = pagesOpen.find((p) => p.url().includes('affiliate.shopee.vn'));
128
187
  let opened = false;
129
188
  if (!page) { page = await browser.newPage(); opened = true; }
130
189
  try {
131
- if (!page.url().includes('affiliate.shopee.vn')) {
132
- await page.goto('https://affiliate.shopee.vn/offer/product_offer', { waitUntil: 'networkidle2', timeout: timeoutMs });
190
+ if (!page.url().includes('affiliate.shopee.vn/offer')) {
191
+ await page.goto('https://affiliate.shopee.vn/offer/product_offer', { waitUntil: 'networkidle2', timeout: timeoutMs }).catch(() => {});
192
+ await sleep(1500);
193
+ }
194
+ if (isBlockedState(page.url(), null)) {
195
+ return { status: 'needs_verify', products: [], raw_count: 0, reason: 'session at captcha/login — re-verify via VNC' };
133
196
  }
134
- const all = [];
135
- for (let i = 0; i < pages; i++) {
136
- const offset = i * page_limit;
137
- const res = await page.evaluate(async (q) => {
138
- const url = `/api/v3/offer/product/list?list_type=${q.list_type}&sort_type=${q.sort_type}&page_offset=${q.offset}&page_limit=${q.page_limit}&client_type=1`;
139
- const r = await fetch(url, { credentials: 'include' });
140
- return { status: r.status, json: await r.json().catch(() => null) };
141
- }, { list_type, sort_type, offset, page_limit });
142
- if (res.status !== 200 || res.json?.code !== 0) {
143
- throw new Error(`offer list failed (http ${res.status}, code ${res.json?.code}) — session may be expired`);
197
+
198
+ const byId = new Map();
199
+ let rawCount = 0;
200
+ for (const st of sort_types) {
201
+ for (let i = 0; i < pages; i++) {
202
+ const offset = i * page_limit;
203
+ const res = await page.evaluate(async (q) => {
204
+ const cat = q.mid ? `&list_type=3&match_type=2&match_id=${q.mid}` : '&list_type=0';
205
+ const url = `/api/v3/offer/product/list?sort_type=${q.st}&page_offset=${q.offset}&page_limit=${q.page_limit}&client_type=1${cat}`;
206
+ try {
207
+ const r = await fetch(url, { credentials: 'include' });
208
+ return { status: r.status, json: await r.json().catch(() => null) };
209
+ } catch (e) { return { status: 0, error: String(e.message) }; }
210
+ }, { st, offset, page_limit, mid });
211
+
212
+ if (isBlockedState(page.url(), res) || res.json?.code === 90309999) {
213
+ return { status: 'needs_verify', products: [...byId.values()], raw_count: rawCount, reason: `blocked mid-scrape (http ${res.status}, code ${res.json?.code})` };
214
+ }
215
+ if (res.status !== 200 || res.json?.code !== 0) {
216
+ // soft stop — treat as end-of-data for this sort, move on
217
+ break;
218
+ }
219
+ const list = res.json?.data?.list || [];
220
+ rawCount += list.length;
221
+ for (const row of list.map(parseOfferRow)) {
222
+ if (row.item_id && !byId.has(row.item_id)) byId.set(row.item_id, row);
223
+ }
224
+ if (list.length < page_limit) break; // ran out
225
+ await sleep(delay_ms); // pace between page fetches
144
226
  }
145
- const list = res.json?.data?.list || [];
146
- all.push(...list.map(parseOfferRow).filter((p) => p.item_id));
147
- if (list.length < page_limit) break; // ran out
227
+ await sleep(delay_ms); // pace between sort passes
148
228
  }
149
- return all;
229
+
230
+ const products = applyFilters([...byId.values()], filters);
231
+ return { status: 'ok', products, raw_count: rawCount };
150
232
  } finally {
151
233
  if (opened) await page.close().catch(() => {});
152
234
  }
@@ -162,4 +244,4 @@ function parseProductUrl(url = '') {
162
244
  return null;
163
245
  }
164
246
 
165
- module.exports = { ingestProduct, scrapeOffers, parseProductUrl, parseOfferRow, parsePdpItem, categoryGroup, pctToNumber };
247
+ module.exports = { ingestProduct, scrapeOffers, parseProductUrl, parseOfferRow, parsePdpItem, categoryGroup, pctToNumber, AFFILIATE_CATEGORIES, applyFilters };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "channel-worker",
3
- "version": "2.5.13",
3
+ "version": "2.5.14",
4
4
  "description": "Channel Manager worker daemon — runs on remote machines to execute video pipeline jobs",
5
5
  "main": "lib/daemon.js",
6
6
  "bin": {
@@ -187,7 +187,7 @@ async function run({ page, payload, log }) {
187
187
  } = payload || {};
188
188
  if (!video_url) throw new Error('No video_url provided');
189
189
 
190
- log('info', '[fb-pw] selectors version=2026.06.05a-pagewall-no-double-post');
190
+ log('info', '[fb-pw] selectors version=2026.06.12a-thumb-miss-soft');
191
191
 
192
192
  page.on('dialog', (d) => { d.accept().catch(() => {}); });
193
193
 
@@ -1326,9 +1326,12 @@ async function run({ page, payload, log }) {
1326
1326
  // output. (Observed on reel 1506614811005729: step 1 → click Tiếp
1327
1327
  // → publish, the thumb-edit pill never appeared.)
1328
1328
  if (thumbPath && !customThumbDone) {
1329
- await dumpInventory(page, log, `thumb-step-missed-${step + 1}`);
1330
- await dumpFailure(page, `thumb-step-missed-${step + 1}`, log);
1331
- throw new Error(`FB publish: thumbnail_url provided but the "Chỉnh sửa hình thu nhỏ" overlay was never detected before publish (step ${step + 1}). Refusing to ship without custom thumb. Inspect dump screenshots in Temp\\cm-worker-pw\\.`);
1329
+ // Custom thumb couldn't be applied (FB didn't render the "Chỉnh sửa
1330
+ // hình thu nhỏ" overlay). DON'T refuse to publish — the strict
1331
+ // failure shipped false "failed" results (the reel still published)
1332
+ // and caused double-posts on retry. Publish with FB's auto-thumbnail; the
1333
+ // custom artwork is a nice-to-have, not worth a hard failure.
1334
+ log('warn', `[fb-pw] custom thumb skipped — "Chỉnh sửa hình thu nhỏ" overlay never appeared (step ${step + 1}); publishing with FB auto-thumb`);
1332
1335
  }
1333
1336
  log('info', `[fb-pw] click publish "${pub.verb}" via "${pub.sel}" (step ${step + 1})`);
1334
1337
  // Snapshot the captured-IDs list RIGHT BEFORE the publish click. The