channel-worker 2.5.13 → 2.5.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -180,18 +180,26 @@ class CommandPoller {
180
180
  let conn;
181
181
  try {
182
182
  conn = await this._connectNstProfileByName(profileName);
183
- const products = await scraper.scrapeOffers(conn.browser, {
184
- sort_type: payload.sort_type ?? 2, // 2 = commission desc
183
+ const res = await scraper.scrapeOffers(conn.browser, {
184
+ category: payload.category || 'women_clothes', // Tier-1 default: quần áo nữ
185
+ match_id: payload.match_id || null,
186
+ sort_types: payload.sort_types || [2, 3], // commission + sales, merged
187
+ pages: payload.pages ?? 2,
185
188
  page_limit: payload.page_limit ?? 20,
186
- pages: payload.pages ?? 3,
187
- list_type: payload.list_type ?? 0,
189
+ delay_ms: payload.delay_ms ?? 2500, // human-like pacing (anti-bot)
190
+ filters: payload.filters || {},
188
191
  });
189
- const filtered = payload.category_group
190
- ? products.filter(p => p.category_group === payload.category_group)
191
- : products;
192
- console.log(`[shopee] scraped ${products.length} offers (${filtered.length} after filter) from ${profileName}`);
193
- await this.api.upsertAffiliateProducts(filtered, { user_id: command.user_id });
194
- await this.api.updateCommand(command._id, { status: 'done', result: { count: filtered.length, total: products.length } });
192
+ if (res.status === 'needs_verify') {
193
+ // Anti-bot bounce — DON'T fail/retry into more captchas. Report cleanly
194
+ // so the dashboard can prompt the operator to re-verify via VNC.
195
+ console.warn(`[shopee] needs_verify: ${res.reason}`);
196
+ if (res.products?.length) await this.api.upsertAffiliateProducts(res.products, { user_id: command.user_id });
197
+ await this.api.updateCommand(command._id, { status: 'done', result: { needs_verify: true, reason: res.reason, count: res.products?.length || 0 } });
198
+ return;
199
+ }
200
+ console.log(`[shopee] scraped ${res.raw_count} raw → ${res.products.length} after filters from ${profileName}`);
201
+ await this.api.upsertAffiliateProducts(res.products, { user_id: command.user_id });
202
+ await this.api.updateCommand(command._id, { status: 'done', result: { count: res.products.length, raw: res.raw_count } });
195
203
  } catch (err) {
196
204
  console.error(`[shopee] scrape failed: ${err.message}`);
197
205
  await this.api.updateCommand(command._id, { status: 'failed', error: String(err.message || err).slice(0, 500) });
@@ -120,33 +120,115 @@ function parseOfferRow(row) {
120
120
  };
121
121
  }
122
122
 
123
- // In-page fetch the affiliate offer list. sort_type: 1=relevance, 2=commission,
124
- // 3=sales (Shopee's internal codes). Loops `pages` times, page_limit per page.
125
- async function scrapeOffers(browser, { sort_type = 2, page_limit = 20, pages = 1, list_type = 0, timeoutMs = 45000 } = {}) {
123
+ // Affiliate offer-page category tabs → match_id (read from rc-tabs node keys,
124
+ // confirmed live). Tier-1 fashion focus first; the offer page has NO dedicated
125
+ // Shoes/Bags/Accessories tab — those come later via keyword search.
126
// Lookup table keyed by the `category` option accepted by scrapeOffers.
//   match_id - numeric id sent as the `match_id` query parameter of the
//              offer-list request when a category is selected.
//   label    - display name of the category tab (Vietnamese).
//   group    - value used as the default `category_groups` filter when the
//              caller supplies none.
+ const AFFILIATE_CATEGORIES = {
127
+ women_clothes: { match_id: 100017, label: 'Quần áo nữ', group: 'fashion' },
128
+ beauty: { match_id: 100630, label: 'Làm đẹp', group: 'beauty' },
129
+ home_living: { match_id: 100636, label: 'Nhà cửa', group: 'other' },
130
+ grocery: { match_id: 100629, label: 'Tạp hoá', group: 'other' },
131
+ };
132
+
133
// Promise-based pause: resolves (with no value) once `ms` milliseconds elapse.
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
134
+
135
/**
 * Report whether the scraping session looks blocked: either the tab sits on
 * a captcha/login URL, or the last offer-list response came back 403/404.
 * Callers are expected to stop fetching and ask the operator to re-verify
 * (via VNC) rather than keep hitting the API.
 *
 * @param {string} [pageUrl] - Current URL of the affiliate tab; falsy is treated as ''.
 * @param {{status?: number}|null} [res] - Result of the latest fetch, if any.
 * @returns {boolean} True when the URL or the HTTP status indicates a block.
 */
function isBlockedState(pageUrl, res) {
  const blockedUrlPattern = /\/verify\/captcha|\/buyer\/login|is_from_login/;
  const blockedStatuses = [403, 404];

  const onBlockedUrl = blockedUrlPattern.test(pageUrl || '');
  const gotBlockedStatus = Boolean(res) && blockedStatuses.includes(res.status);

  return onBlockedUrl || gotBlockedStatus;
}
143
+
144
/**
 * Apply the Flow-1 hard filters (criteria) to parsed offer rows. Every
 * criterion is optional; one that is null/undefined is not enforced.
 *
 * @param {Array<object>} rows - Parsed offer rows.
 * @param {object|null} [f] - Filter thresholds; null/undefined means "no filters".
 * @param {string[]} [f.category_groups] - Keep rows whose category_group is listed.
 * @param {number} [f.min_commission] - Minimum commission_rate (missing counts as 0).
 * @param {number} [f.price_min] - Minimum price (missing counts as 0).
 * @param {number} [f.price_max] - Maximum price (missing counts as 0).
 * @param {number} [f.min_sold] - Minimum sold_count (missing counts as 0).
 * @param {number} [f.min_rating] - Minimum rating; rows with no rating are kept.
 * @param {number} [f.min_images] - Minimum number of images (missing counts as 0).
 * @returns {Array<object>} New array with the rows that pass every active criterion.
 */
function applyFilters(rows, f = {}) {
  // A default parameter only replaces `undefined`. An explicit `null` (for
  // example a JSON payload carrying "filters": null) used to throw on the
  // first property read; treat it as "no filters" as well.
  const criteria = f ?? {};
  return rows.filter((p) => {
    if (criteria.category_groups?.length && !criteria.category_groups.includes(p.category_group)) return false;
    if (criteria.min_commission != null && (p.commission_rate || 0) < criteria.min_commission) return false;
    if (criteria.price_min != null && (p.price || 0) < criteria.price_min) return false;
    if (criteria.price_max != null && (p.price || 0) > criteria.price_max) return false;
    if (criteria.min_sold != null && (p.sold_count || 0) < criteria.min_sold) return false;
    // Lenient on purpose: a row without a rating is not rejected by min_rating.
    if (criteria.min_rating != null && p.rating != null && p.rating < criteria.min_rating) return false;
    if (criteria.min_images != null && (p.images?.length || 0) < criteria.min_images) return false;
    return true;
  });
}
158
+
159
+ // In-page fetch the affiliate offer list — anti-bot aware. Pulls one or more
160
+ // sort_types (2=commission, 3=sales), paginates gently with a human-like delay,
161
+ // merges + dedups by item_id, then applies hard filters. Returns
162
+ // { status: 'ok'|'needs_verify', products: [...], raw_count }
163
+ // Never throws on a blocked session — returns status:'needs_verify' so the
164
+ // worker reports it cleanly instead of failing + retrying into more captchas.
165
+ async function scrapeOffers(browser, {
166
+ category = null, // key in AFFILIATE_CATEGORIES (e.g. 'women_clothes')
167
+ match_id = null, // explicit category id override
168
+ sort_types = [2], // [2]=commission; pass [2,3] to merge commission+sales
169
+ pages = 2,
170
+ page_limit = 20,
171
+ delay_ms = 2500, // human-like pause between calls (anti-bot)
172
+ filters = {},
173
+ timeoutMs = 45000,
174
+ // legacy single-sort param (back-compat with the first build)
175
+ sort_type = null,
176
+ } = {}) {
177
+ if (sort_type != null) sort_types = [sort_type];
178
+ const cat = category && AFFILIATE_CATEGORIES[category] ? AFFILIATE_CATEGORIES[category] : null;
179
+ const mid = match_id || cat?.match_id || null;
180
+ // Default the category_groups filter to the category's own group (so a
181
+ // Women-Clothes scrape keeps only fashion rows even though the tab mixes in
182
+ // a few accessories), unless the caller overrides.
183
+ if (cat && !filters.category_groups) filters = { ...filters, category_groups: [cat.group] };
184
+
126
185
  const pagesOpen = await browser.pages();
127
186
  let page = pagesOpen.find((p) => p.url().includes('affiliate.shopee.vn'));
128
187
  let opened = false;
129
188
  if (!page) { page = await browser.newPage(); opened = true; }
130
189
  try {
131
- if (!page.url().includes('affiliate.shopee.vn')) {
132
- await page.goto('https://affiliate.shopee.vn/offer/product_offer', { waitUntil: 'networkidle2', timeout: timeoutMs });
190
+ if (!page.url().includes('affiliate.shopee.vn/offer')) {
191
+ await page.goto('https://affiliate.shopee.vn/offer/product_offer', { waitUntil: 'networkidle2', timeout: timeoutMs }).catch(() => {});
192
+ await sleep(1500);
193
+ }
194
+ if (isBlockedState(page.url(), null)) {
195
+ return { status: 'needs_verify', products: [], raw_count: 0, reason: 'session at captcha/login — re-verify via VNC' };
133
196
  }
134
- const all = [];
135
- for (let i = 0; i < pages; i++) {
136
- const offset = i * page_limit;
137
- const res = await page.evaluate(async (q) => {
138
- const url = `/api/v3/offer/product/list?list_type=${q.list_type}&sort_type=${q.sort_type}&page_offset=${q.offset}&page_limit=${q.page_limit}&client_type=1`;
139
- const r = await fetch(url, { credentials: 'include' });
140
- return { status: r.status, json: await r.json().catch(() => null) };
141
- }, { list_type, sort_type, offset, page_limit });
142
- if (res.status !== 200 || res.json?.code !== 0) {
143
- throw new Error(`offer list failed (http ${res.status}, code ${res.json?.code}) — session may be expired`);
197
+
198
+ const byId = new Map();
199
+ let rawCount = 0;
200
+ for (const st of sort_types) {
201
+ for (let i = 0; i < pages; i++) {
202
+ const offset = i * page_limit;
203
+ const res = await page.evaluate(async (q) => {
204
+ const cat = q.mid ? `&list_type=3&match_type=2&match_id=${q.mid}` : '&list_type=0';
205
+ const url = `/api/v3/offer/product/list?sort_type=${q.st}&page_offset=${q.offset}&page_limit=${q.page_limit}&client_type=1${cat}`;
206
+ try {
207
+ const r = await fetch(url, { credentials: 'include' });
208
+ return { status: r.status, json: await r.json().catch(() => null) };
209
+ } catch (e) { return { status: 0, error: String(e.message) }; }
210
+ }, { st, offset, page_limit, mid });
211
+
212
+ if (isBlockedState(page.url(), res) || res.json?.code === 90309999) {
213
+ return { status: 'needs_verify', products: [...byId.values()], raw_count: rawCount, reason: `blocked mid-scrape (http ${res.status}, code ${res.json?.code})` };
214
+ }
215
+ if (res.status !== 200 || res.json?.code !== 0) {
216
+ // soft stop — treat as end-of-data for this sort, move on
217
+ break;
218
+ }
219
+ const list = res.json?.data?.list || [];
220
+ rawCount += list.length;
221
+ for (const row of list.map(parseOfferRow)) {
222
+ if (row.item_id && !byId.has(row.item_id)) byId.set(row.item_id, row);
223
+ }
224
+ if (list.length < page_limit) break; // ran out
225
+ await sleep(delay_ms); // pace between page fetches
144
226
  }
145
- const list = res.json?.data?.list || [];
146
- all.push(...list.map(parseOfferRow).filter((p) => p.item_id));
147
- if (list.length < page_limit) break; // ran out
227
+ await sleep(delay_ms); // pace between sort passes
148
228
  }
149
- return all;
229
+
230
+ const products = applyFilters([...byId.values()], filters);
231
+ return { status: 'ok', products, raw_count: rawCount };
150
232
  } finally {
151
233
  if (opened) await page.close().catch(() => {});
152
234
  }
@@ -162,4 +244,4 @@ function parseProductUrl(url = '') {
162
244
  return null;
163
245
  }
164
246
 
165
- module.exports = { ingestProduct, scrapeOffers, parseProductUrl, parseOfferRow, parsePdpItem, categoryGroup, pctToNumber };
247
+ module.exports = { ingestProduct, scrapeOffers, parseProductUrl, parseOfferRow, parsePdpItem, categoryGroup, pctToNumber, AFFILIATE_CATEGORIES, applyFilters };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "channel-worker",
3
- "version": "2.5.13",
3
+ "version": "2.5.15",
4
4
  "description": "Channel Manager worker daemon — runs on remote machines to execute video pipeline jobs",
5
5
  "main": "lib/daemon.js",
6
6
  "bin": {
@@ -180,6 +180,54 @@ async function dismissBsOnboarding(page, log) {
180
180
  }
181
181
  }
182
182
 
183
+ // ── Large-file upload bypass ───────────────────────────────────────────────
184
+ // Playwright caps file transfers at 50MB when the browser is reached over
185
+ // connectOverCDP (our Nstbrowser case):
186
+ // "Cannot transfer files larger than 50Mb to a browser not connected to the
187
+ // server"
188
+ // Reup videos are routinely 60–200MB, so the native setInputFiles()/
189
+ // fileChooser.setFiles() throws and the upload never starts. Bypass via the raw
190
+ // CDP command DOM.setFileInputFiles: the browser reads the path off its OWN
191
+ // local disk (worker + Nstbrowser are co-located on the same machine), so no
192
+ // bytes cross the wire and there is no size cap. Unlike an in-page fetch() this
193
+ // is also immune to facebook.com's strict connect-src CSP. We tag the exact
194
+ // <input> with a unique attribute so CDP resolves its nodeId unambiguously
195
+ // (the page carries several file inputs).
196
/**
 * Attach a file to a file <input> through the raw CDP command
 * DOM.setFileInputFiles instead of Playwright's setInputFiles().
 *
 * The input is tagged with a marker attribute so CDP can resolve its nodeId
 * with DOM.querySelector; the marker is removed and the CDP session detached
 * in all cases (best effort).
 *
 * @param {object} page - Page that owns the input; used to open the CDP session.
 * @param {object} inputHandle - Element handle of the target file input.
 * @param {string} filePath - Path handed unchanged to DOM.setFileInputFiles.
 * @param {(level: string, message: string) => void} log - Logger callback.
 * @param {string} tag - Prefix placed in square brackets in the log line.
 * @returns {Promise<void>}
 * @throws {Error} If the tagged input cannot be resolved to a nodeId, or if
 *   tagging or any CDP call rejects.
 */
async function cdpSetInputFiles(page, inputHandle, filePath, log, tag) {
  const ATTR = 'data-cm-bigfile';
  // Per-call marker value. A constant value is not actually unique: a marker
  // left behind by a failed (swallowed) cleanup, or by an overlapping call,
  // would make DOM.querySelector return the first tagged input, which may be
  // the wrong one. Base-36 output is alphanumeric, so it is selector-safe.
  const token = `${Date.now().toString(36)}${Math.random().toString(36).slice(2)}`;
  let cdp;
  try {
    await inputHandle.evaluate(
      (el, [name, value]) => el.setAttribute(name, value),
      [ATTR, token],
    );
    cdp = await page.context().newCDPSession(page);
    const { root } = await cdp.send('DOM.getDocument', { depth: 0 });
    const { nodeId } = await cdp.send('DOM.querySelector', {
      nodeId: root.nodeId,
      selector: `input[${ATTR}="${token}"]`,
    });
    // CDP reports "no match" as nodeId 0.
    if (!nodeId) throw new Error('CDP querySelector: tagged input not found');
    await cdp.send('DOM.setFileInputFiles', { files: [filePath], nodeId });
    log('info', `[${tag}] file set via CDP DOM.setFileInputFiles (>50MB bypass)`);
  } finally {
    // Best-effort cleanup: the element or session may already be gone.
    await inputHandle.evaluate((el, name) => el.removeAttribute(name), ATTR).catch(() => {});
    if (cdp) await cdp.detach().catch(() => {});
  }
}
215
+
216
/**
 * Put a video file on a file <input>. The native setInputFiles() call is
 * attempted first; only when it fails with the 50MB transfer-cap error does
 * the function fall back to cdpSetInputFiles(). Any other error is rethrown
 * untouched.
 *
 * @param {object} page - Page owning the input (forwarded to the CDP fallback).
 * @param {object} inputHandle - Element handle of the file input; required.
 * @param {string} filePath - Path of the file to attach.
 * @param {(level: string, message: string) => void} log - Logger callback.
 * @param {string} [tag='fb-pw'] - Prefix placed in square brackets in log lines.
 * @returns {Promise<void>}
 * @throws {Error} When no input handle is given, or on any non-size-cap failure.
 */
async function setVideoFile(page, inputHandle, filePath, log, tag = 'fb-pw') {
  if (!inputHandle) {
    throw new Error('setVideoFile: no input handle');
  }

  const sizeCapPattern = /larger than 50\s?mb|not connected to the server/i;
  let needsCdpFallback = false;

  try {
    // Native path: sufficient for files under the cap.
    await inputHandle.setInputFiles(filePath);
  } catch (e) {
    const message = String(e.message || '');
    if (!sizeCapPattern.test(message)) throw e;
    needsCdpFallback = true;
  }

  if (!needsCdpFallback) return;

  log('info', `[${tag}] native setInputFiles hit the 50MB CDP cap — switching to DOM.setFileInputFiles…`);
  await cdpSetInputFiles(page, inputHandle, filePath, log, tag);
}
230
+
183
231
  async function run({ page, payload, log }) {
184
232
  const {
185
233
  video_url, title, description = '', tags = [],
@@ -187,7 +235,7 @@ async function run({ page, payload, log }) {
187
235
  } = payload || {};
188
236
  if (!video_url) throw new Error('No video_url provided');
189
237
 
190
- log('info', '[fb-pw] selectors version=2026.06.05a-pagewall-no-double-post');
238
+ log('info', '[fb-pw] selectors version=2026.06.14a-bigfile-cdp');
191
239
 
192
240
  page.on('dialog', (d) => { d.accept().catch(() => {}); });
193
241
 
@@ -389,7 +437,7 @@ async function run({ page, payload, log }) {
389
437
  page.waitForEvent('filechooser', { timeout: 8000 }),
390
438
  btn.click({ timeout: 3000 }),
391
439
  ]);
392
- await chooser.setFiles(videoPath);
440
+ await setVideoFile(page, chooser.element(), videoPath, log);
393
441
  videoSet = true;
394
442
  log('info', `[fb-pw] video file set via modal-scoped "${(await btn.innerText().catch(() => '')).slice(0, 30)}" button`);
395
443
  } catch (e) {
@@ -402,7 +450,7 @@ async function run({ page, payload, log }) {
402
450
  const fi = reelsDialog.locator("input[type='file']").last();
403
451
  if (await fi.count().catch(() => 0) > 0) {
404
452
  try {
405
- await fi.setInputFiles(videoPath);
453
+ await setVideoFile(page, await fi.elementHandle(), videoPath, log);
406
454
  videoSet = true;
407
455
  log('info', '[fb-pw] video file set via modal-scoped fallback input[type=file]');
408
456
  } catch (e) { log('info', `[fb-pw] modal fallback input setInputFiles failed: ${e.message.slice(0, 80)}`); }
@@ -1326,9 +1374,12 @@ async function run({ page, payload, log }) {
1326
1374
  // output. (Observed on reel 1506614811005729: step 1 → click Tiếp
1327
1375
  // → publish, the thumb-edit pill never appeared.)
1328
1376
  if (thumbPath && !customThumbDone) {
1329
- await dumpInventory(page, log, `thumb-step-missed-${step + 1}`);
1330
- await dumpFailure(page, `thumb-step-missed-${step + 1}`, log);
1331
- throw new Error(`FB publish: thumbnail_url provided but the "Chỉnh sửa hình thu nhỏ" overlay was never detected before publish (step ${step + 1}). Refusing to ship without custom thumb. Inspect dump screenshots in Temp\\cm-worker-pw\\.`);
1377
+ // Custom thumb couldn't be applied (FB didn't render the "Chỉnh sửa
1378
+ // hình thu nhỏ" overlay). DON'T refuse to publish — the strict
1379
+ // failure shipped false "failed" results (the reel still published)
1380
+ // and double-posts on retry. Publish with FB's auto-thumbnail; the
1381
+ // custom artwork is a nice-to-have, not worth a hard failure.
1382
+ log('warn', `[fb-pw] custom thumb skipped — "Chỉnh sửa hình thu nhỏ" overlay never appeared (step ${step + 1}); publishing with FB auto-thumb`);
1332
1383
  }
1333
1384
  log('info', `[fb-pw] click publish "${pub.verb}" via "${pub.sel}" (step ${step + 1})`);
1334
1385
  // Snapshot the captured-IDs list RIGHT BEFORE the publish click. The