channel-worker 2.5.13 → 2.5.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/command-poller.js +18 -10
- package/lib/shopee-scraper.js +102 -20
- package/package.json +1 -1
- package/scripts/upload_facebook.js +57 -6
package/lib/command-poller.js
CHANGED
|
@@ -180,18 +180,26 @@ class CommandPoller {
|
|
|
180
180
|
let conn;
|
|
181
181
|
try {
|
|
182
182
|
conn = await this._connectNstProfileByName(profileName);
|
|
183
|
-
const
|
|
184
|
-
|
|
183
|
+
const res = await scraper.scrapeOffers(conn.browser, {
|
|
184
|
+
category: payload.category || 'women_clothes', // Tier-1 default: quần áo nữ
|
|
185
|
+
match_id: payload.match_id || null,
|
|
186
|
+
sort_types: payload.sort_types || [2, 3], // commission + sales, merged
|
|
187
|
+
pages: payload.pages ?? 2,
|
|
185
188
|
page_limit: payload.page_limit ?? 20,
|
|
186
|
-
|
|
187
|
-
|
|
189
|
+
delay_ms: payload.delay_ms ?? 2500, // human-like pacing (anti-bot)
|
|
190
|
+
filters: payload.filters || {},
|
|
188
191
|
});
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
192
|
+
if (res.status === 'needs_verify') {
|
|
193
|
+
// Anti-bot bounce — DON'T fail/retry into more captchas. Report cleanly
|
|
194
|
+
// so the dashboard can prompt the operator to re-verify via VNC.
|
|
195
|
+
console.warn(`[shopee] needs_verify: ${res.reason}`);
|
|
196
|
+
if (res.products?.length) await this.api.upsertAffiliateProducts(res.products, { user_id: command.user_id });
|
|
197
|
+
await this.api.updateCommand(command._id, { status: 'done', result: { needs_verify: true, reason: res.reason, count: res.products?.length || 0 } });
|
|
198
|
+
return;
|
|
199
|
+
}
|
|
200
|
+
console.log(`[shopee] scraped ${res.raw_count} raw → ${res.products.length} after filters from ${profileName}`);
|
|
201
|
+
await this.api.upsertAffiliateProducts(res.products, { user_id: command.user_id });
|
|
202
|
+
await this.api.updateCommand(command._id, { status: 'done', result: { count: res.products.length, raw: res.raw_count } });
|
|
195
203
|
} catch (err) {
|
|
196
204
|
console.error(`[shopee] scrape failed: ${err.message}`);
|
|
197
205
|
await this.api.updateCommand(command._id, { status: 'failed', error: String(err.message || err).slice(0, 500) });
|
package/lib/shopee-scraper.js
CHANGED
|
@@ -120,33 +120,115 @@ function parseOfferRow(row) {
|
|
|
120
120
|
};
|
|
121
121
|
}
|
|
122
122
|
|
|
123
|
-
//
|
|
124
|
-
//
|
|
125
|
-
|
|
123
|
+
// Affiliate offer-page category tabs → match_id (read from rc-tabs node keys,
|
|
124
|
+
// confirmed live). Tier-1 fashion focus first; the offer page has NO dedicated
|
|
125
|
+
// Shoes/Bags/Accessories tab → those come later via keyword search.
|
|
126
|
+
// Lookup table: category key → { match_id, label, group }.
//
// The table is indexed with a caller-supplied `category` string, so it is built
// on a null prototype: a key such as "constructor" or "toString" must resolve
// to `undefined` (unknown category) instead of an inherited Object.prototype
// member, which would be truthy but carry no match_id/group. The table and its
// entries are frozen because the object is shared via module.exports.
const AFFILIATE_CATEGORIES = Object.freeze(Object.assign(Object.create(null), {
  women_clothes: Object.freeze({ match_id: 100017, label: 'Quần áo nữ', group: 'fashion' }),
  beauty: Object.freeze({ match_id: 100630, label: 'Làm đẹp', group: 'beauty' }),
  home_living: Object.freeze({ match_id: 100636, label: 'Nhà cửa', group: 'other' }),
  grocery: Object.freeze({ match_id: 100629, label: 'Tạp hoá', group: 'other' }),
}));
|
|
132
|
+
|
|
133
|
+
// Promise-based delay: the returned promise settles (with no value) once the
// `ms` millisecond timer fires.
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
|
|
134
|
+
|
|
135
|
+
// True when the session got bounced to the anti-bot captcha (scene=crawler_item)
|
|
136
|
+
// or logged out — in either case the offer API returns 404/empty and we must
|
|
137
|
+
// STOP (not keep hammering) and ask the operator to re-verify via VNC.
|
|
138
|
+
/**
 * Decide whether the session should be treated as blocked.
 *
 * The session counts as blocked when the page URL contains a captcha/login
 * marker (`/verify/captcha`, `/buyer/login`, `is_from_login`) or when the
 * given response carries HTTP status 403 or 404.
 *
 * @param {string|null|undefined} pageUrl - Current page URL; a falsy value is treated as ''.
 * @param {{status?: number}|null|undefined} res - Optional response summary; ignored when falsy.
 * @returns {boolean} `true` when either signal indicates a blocked session.
 */
function isBlockedState(pageUrl, res) {
  const blockedUrlPattern = /\/verify\/captcha|\/buyer\/login|is_from_login/;
  if (blockedUrlPattern.test(pageUrl || '')) {
    return true;
  }

  // Only these two statuses are read as "blocked"; anything else is left to the caller.
  const blockedStatuses = [403, 404];
  return Boolean(res) && blockedStatuses.includes(res.status);
}
|
|
143
|
+
|
|
144
|
+
// Apply the Flow-1 hard filters (criteria) to parsed rows. Defaults are lenient;
|
|
145
|
+
// the controller passes the agreed thresholds (commission/price/sold/rating/imgs).
|
|
146
|
+
/**
 * Keep only the parsed offer rows that satisfy every configured hard filter.
 *
 * A criterion that is `null`/`undefined` is not applied. Numeric fields missing
 * from a row (commission_rate, price, sold_count, image count) are compared as
 * 0. A row without a rating is never rejected by `min_rating`.
 *
 * @param {Array<object>|null|undefined} rows - Parsed rows; `null`/`undefined` yields `[]`.
 * @param {object|null} [f] - Filter criteria; `null`/`undefined` means "no filters".
 * @param {string[]} [f.category_groups] - Allowed `category_group` values (ignored when empty).
 * @param {number} [f.min_commission] - Minimum `commission_rate`.
 * @param {number} [f.price_min] - Minimum `price`.
 * @param {number} [f.price_max] - Maximum `price`.
 * @param {number} [f.min_sold] - Minimum `sold_count`.
 * @param {number} [f.min_rating] - Minimum `rating`, applied only to rated rows.
 * @param {number} [f.min_images] - Minimum number of entries in `images`.
 * @returns {Array<object>} A new array with the rows that passed, in input order.
 */
function applyFilters(rows, f = {}) {
  // The default parameter only covers `undefined`; an explicit `null` (e.g. from
  // a JSON payload) must also mean "no filters" instead of throwing.
  const criteria = f ?? {};
  return (rows ?? []).filter((p) => {
    if (criteria.category_groups?.length && !criteria.category_groups.includes(p.category_group)) return false;
    if (criteria.min_commission != null && (p.commission_rate || 0) < criteria.min_commission) return false;
    if (criteria.price_min != null && (p.price || 0) < criteria.price_min) return false;
    if (criteria.price_max != null && (p.price || 0) > criteria.price_max) return false;
    if (criteria.min_sold != null && (p.sold_count || 0) < criteria.min_sold) return false;
    // Unrated rows pass on purpose: only a known rating can fail the threshold.
    if (criteria.min_rating != null && p.rating != null && p.rating < criteria.min_rating) return false;
    if (criteria.min_images != null && (p.images?.length || 0) < criteria.min_images) return false;
    return true;
  });
}
|
|
158
|
+
|
|
159
|
+
// In-page fetch the affiliate offer list — anti-bot aware. Pulls one or more
|
|
160
|
+
// sort_types (2=commission, 3=sales), paginates gently with a human-like delay,
|
|
161
|
+
// merges + dedups by item_id, then applies hard filters. Returns
|
|
162
|
+
// { status: 'ok'|'needs_verify', products: [...], raw_count }
|
|
163
|
+
// Never throws on a blocked session — returns status:'needs_verify' so the
|
|
164
|
+
// worker reports it cleanly instead of failing + retrying into more captchas.
|
|
165
|
+
async function scrapeOffers(browser, {
|
|
166
|
+
category = null, // key in AFFILIATE_CATEGORIES (e.g. 'women_clothes')
|
|
167
|
+
match_id = null, // explicit category id override
|
|
168
|
+
sort_types = [2], // [2]=commission; pass [2,3] to merge commission+sales
|
|
169
|
+
pages = 2,
|
|
170
|
+
page_limit = 20,
|
|
171
|
+
delay_ms = 2500, // human-like pause between calls (anti-bot)
|
|
172
|
+
filters = {},
|
|
173
|
+
timeoutMs = 45000,
|
|
174
|
+
// legacy single-sort param (back-compat with the first build)
|
|
175
|
+
sort_type = null,
|
|
176
|
+
} = {}) {
|
|
177
|
+
if (sort_type != null) sort_types = [sort_type];
|
|
178
|
+
const cat = category && AFFILIATE_CATEGORIES[category] ? AFFILIATE_CATEGORIES[category] : null;
|
|
179
|
+
const mid = match_id || cat?.match_id || null;
|
|
180
|
+
// Default the category_groups filter to the category's own group (so a
|
|
181
|
+
// Women-Clothes scrape keeps only fashion rows even though the tab mixes in
|
|
182
|
+
// a few accessories), unless the caller overrides.
|
|
183
|
+
if (cat && !filters.category_groups) filters = { ...filters, category_groups: [cat.group] };
|
|
184
|
+
|
|
126
185
|
const pagesOpen = await browser.pages();
|
|
127
186
|
let page = pagesOpen.find((p) => p.url().includes('affiliate.shopee.vn'));
|
|
128
187
|
let opened = false;
|
|
129
188
|
if (!page) { page = await browser.newPage(); opened = true; }
|
|
130
189
|
try {
|
|
131
|
-
if (!page.url().includes('affiliate.shopee.vn')) {
|
|
132
|
-
await page.goto('https://affiliate.shopee.vn/offer/product_offer', { waitUntil: 'networkidle2', timeout: timeoutMs });
|
|
190
|
+
if (!page.url().includes('affiliate.shopee.vn/offer')) {
|
|
191
|
+
await page.goto('https://affiliate.shopee.vn/offer/product_offer', { waitUntil: 'networkidle2', timeout: timeoutMs }).catch(() => {});
|
|
192
|
+
await sleep(1500);
|
|
193
|
+
}
|
|
194
|
+
if (isBlockedState(page.url(), null)) {
|
|
195
|
+
return { status: 'needs_verify', products: [], raw_count: 0, reason: 'session at captcha/login — re-verify via VNC' };
|
|
133
196
|
}
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
const
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
197
|
+
|
|
198
|
+
const byId = new Map();
|
|
199
|
+
let rawCount = 0;
|
|
200
|
+
for (const st of sort_types) {
|
|
201
|
+
for (let i = 0; i < pages; i++) {
|
|
202
|
+
const offset = i * page_limit;
|
|
203
|
+
const res = await page.evaluate(async (q) => {
|
|
204
|
+
const cat = q.mid ? `&list_type=3&match_type=2&match_id=${q.mid}` : '&list_type=0';
|
|
205
|
+
const url = `/api/v3/offer/product/list?sort_type=${q.st}&page_offset=${q.offset}&page_limit=${q.page_limit}&client_type=1${cat}`;
|
|
206
|
+
try {
|
|
207
|
+
const r = await fetch(url, { credentials: 'include' });
|
|
208
|
+
return { status: r.status, json: await r.json().catch(() => null) };
|
|
209
|
+
} catch (e) { return { status: 0, error: String(e.message) }; }
|
|
210
|
+
}, { st, offset, page_limit, mid });
|
|
211
|
+
|
|
212
|
+
if (isBlockedState(page.url(), res) || res.json?.code === 90309999) {
|
|
213
|
+
return { status: 'needs_verify', products: [...byId.values()], raw_count: rawCount, reason: `blocked mid-scrape (http ${res.status}, code ${res.json?.code})` };
|
|
214
|
+
}
|
|
215
|
+
if (res.status !== 200 || res.json?.code !== 0) {
|
|
216
|
+
// soft stop — treat as end-of-data for this sort, move on
|
|
217
|
+
break;
|
|
218
|
+
}
|
|
219
|
+
const list = res.json?.data?.list || [];
|
|
220
|
+
rawCount += list.length;
|
|
221
|
+
for (const row of list.map(parseOfferRow)) {
|
|
222
|
+
if (row.item_id && !byId.has(row.item_id)) byId.set(row.item_id, row);
|
|
223
|
+
}
|
|
224
|
+
if (list.length < page_limit) break; // ran out
|
|
225
|
+
await sleep(delay_ms); // pace between page fetches
|
|
144
226
|
}
|
|
145
|
-
|
|
146
|
-
all.push(...list.map(parseOfferRow).filter((p) => p.item_id));
|
|
147
|
-
if (list.length < page_limit) break; // ran out
|
|
227
|
+
await sleep(delay_ms); // pace between sort passes
|
|
148
228
|
}
|
|
149
|
-
|
|
229
|
+
|
|
230
|
+
const products = applyFilters([...byId.values()], filters);
|
|
231
|
+
return { status: 'ok', products, raw_count: rawCount };
|
|
150
232
|
} finally {
|
|
151
233
|
if (opened) await page.close().catch(() => {});
|
|
152
234
|
}
|
|
@@ -162,4 +244,4 @@ function parseProductUrl(url = '') {
|
|
|
162
244
|
return null;
|
|
163
245
|
}
|
|
164
246
|
|
|
165
|
-
module.exports = { ingestProduct, scrapeOffers, parseProductUrl, parseOfferRow, parsePdpItem, categoryGroup, pctToNumber };
|
|
247
|
+
module.exports = { ingestProduct, scrapeOffers, parseProductUrl, parseOfferRow, parsePdpItem, categoryGroup, pctToNumber, AFFILIATE_CATEGORIES, applyFilters };
|
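package/package.json
CHANGED
(hunk not captured in this extract: +1 -1, presumably the version bump 2.5.13 → 2.5.15)
package/scripts/upload_facebook.js
CHANGED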
|
@@ -180,6 +180,54 @@ async function dismissBsOnboarding(page, log) {
|
|
|
180
180
|
}
|
|
181
181
|
}
|
|
182
182
|
|
|
183
|
+
// ── Large-file upload bypass ───────────────────────────────────────────────
|
|
184
|
+
// Playwright caps file transfers at 50MB when the browser is reached over
|
|
185
|
+
// connectOverCDP (our Nstbrowser case):
|
|
186
|
+
// "Cannot transfer files larger than 50Mb to a browser not connected to the
|
|
187
|
+
// server"
|
|
188
|
+
// Reup videos are routinely 60–200MB, so the native setInputFiles()/
|
|
189
|
+
// fileChooser.setFiles() throws and the upload never starts. Bypass via the raw
|
|
190
|
+
// CDP command DOM.setFileInputFiles: the browser reads the path off its OWN
|
|
191
|
+
// local disk (worker + Nstbrowser are co-located on the same machine), so no
|
|
192
|
+
// bytes cross the wire and there is no size cap. Unlike an in-page fetch() this
|
|
193
|
+
// is also immune to facebook.com's strict connect-src CSP. We tag the exact
|
|
194
|
+
// <input> with a unique attribute so CDP resolves its nodeId unambiguously
|
|
195
|
+
// (the page carries several file inputs).
|
|
196
|
+
/**
 * Attach a file to a file <input> through the raw CDP command
 * DOM.setFileInputFiles instead of Playwright's own file transfer.
 *
 * The input is temporarily tagged with a marker attribute so a CDP
 * DOM.querySelector from the document root can resolve its nodeId. The marker
 * value is a fresh token per call: with a constant value, a marker left behind
 * by an earlier call (cleanup errors are swallowed below) or an overlapping
 * call on the same page could make the selector resolve a different input.
 *
 * @param {object} page - Playwright page owning the input; used to open the CDP session.
 * @param {object} inputHandle - Element handle of the target file <input>.
 * @param {string} filePath - Path handed to the browser as-is in `files`.
 * @param {(level: string, message: string) => void} log - Logger callback.
 * @param {string} tag - Prefix used in the log message.
 * @returns {Promise<void>}
 * @throws {Error} When the tagged input cannot be resolved, or when any CDP /
 *   evaluate call before the cleanup rejects.
 */
async function cdpSetInputFiles(page, inputHandle, filePath, log, tag) {
  const ATTR = 'data-cm-bigfile';
  // Base-36 characters and '-' only, so the token is safe inside the quoted
  // attribute selector below.
  const token = `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
  let cdp;
  try {
    await inputHandle.evaluate((el, [name, value]) => el.setAttribute(name, value), [ATTR, token]);
    cdp = await page.context().newCDPSession(page);
    const { root } = await cdp.send('DOM.getDocument', { depth: 0 });
    const { nodeId } = await cdp.send('DOM.querySelector', {
      nodeId: root.nodeId,
      selector: `input[${ATTR}="${token}"]`,
    });
    // A nodeId of 0 means the selector matched nothing.
    if (!nodeId) throw new Error('CDP querySelector: tagged input not found');
    await cdp.send('DOM.setFileInputFiles', { files: [filePath], nodeId });
    log('info', `[${tag}] file set via CDP DOM.setFileInputFiles (>50MB bypass)`);
  } finally {
    // Best-effort cleanup: never let marker removal or session detach mask the
    // real outcome of the upload attempt.
    await inputHandle.evaluate((el, a) => el.removeAttribute(a), ATTR).catch(() => {});
    if (cdp) await cdp.detach().catch(() => {});
  }
}
|
|
215
|
+
|
|
216
|
+
// Set a video file on a file <input>, transparently bypassing the CDP 50MB cap.
|
|
217
|
+
// Tries Playwright's native path first (fine for <50MB / thumbnails), then
|
|
218
|
+
// falls back to the CDP command only on the size error.
|
|
219
|
+
/**
 * Put a file on a file <input>, trying the native Playwright call first and
 * falling back to cdpSetInputFiles() only when the native call fails with the
 * 50MB transfer-cap error.
 *
 * @param {object} page - Playwright page, forwarded to the CDP fallback.
 * @param {object} inputHandle - Element handle of the target file <input>.
 * @param {string} filePath - Path of the file to attach.
 * @param {(level: string, message: string) => void} log - Logger callback.
 * @param {string} [tag='fb-pw'] - Prefix used in log messages.
 * @returns {Promise<void>}
 * @throws {Error} When no handle is given, when the native call fails for any
 *   reason other than the size cap, or when the fallback itself fails.
 */
async function setVideoFile(page, inputHandle, filePath, log, tag = 'fb-pw') {
  if (!inputHandle) {
    throw new Error('setVideoFile: no input handle');
  }

  const sizeCapPattern = /larger than 50\s?mb|not connected to the server/i;
  let needsCdpFallback = false;
  try {
    await inputHandle.setInputFiles(filePath);
  } catch (nativeErr) {
    const message = String(nativeErr.message || '');
    // Anything other than the transfer-cap error is a genuine failure.
    if (!sizeCapPattern.test(message)) {
      throw nativeErr;
    }
    needsCdpFallback = true;
  }

  if (!needsCdpFallback) {
    return;
  }

  log('info', `[${tag}] native setInputFiles hit the 50MB CDP cap — switching to DOM.setFileInputFiles…`);
  await cdpSetInputFiles(page, inputHandle, filePath, log, tag);
}
|
|
230
|
+
|
|
183
231
|
async function run({ page, payload, log }) {
|
|
184
232
|
const {
|
|
185
233
|
video_url, title, description = '', tags = [],
|
|
@@ -187,7 +235,7 @@ async function run({ page, payload, log }) {
|
|
|
187
235
|
} = payload || {};
|
|
188
236
|
if (!video_url) throw new Error('No video_url provided');
|
|
189
237
|
|
|
190
|
-
log('info', '[fb-pw] selectors version=2026.06.
|
|
238
|
+
log('info', '[fb-pw] selectors version=2026.06.14a-bigfile-cdp');
|
|
191
239
|
|
|
192
240
|
page.on('dialog', (d) => { d.accept().catch(() => {}); });
|
|
193
241
|
|
|
@@ -389,7 +437,7 @@ async function run({ page, payload, log }) {
|
|
|
389
437
|
page.waitForEvent('filechooser', { timeout: 8000 }),
|
|
390
438
|
btn.click({ timeout: 3000 }),
|
|
391
439
|
]);
|
|
392
|
-
await chooser.
|
|
440
|
+
await setVideoFile(page, chooser.element(), videoPath, log);
|
|
393
441
|
videoSet = true;
|
|
394
442
|
log('info', `[fb-pw] video file set via modal-scoped "${(await btn.innerText().catch(() => '')).slice(0, 30)}" button`);
|
|
395
443
|
} catch (e) {
|
|
@@ -402,7 +450,7 @@ async function run({ page, payload, log }) {
|
|
|
402
450
|
const fi = reelsDialog.locator("input[type='file']").last();
|
|
403
451
|
if (await fi.count().catch(() => 0) > 0) {
|
|
404
452
|
try {
|
|
405
|
-
await fi.
|
|
453
|
+
await setVideoFile(page, await fi.elementHandle(), videoPath, log);
|
|
406
454
|
videoSet = true;
|
|
407
455
|
log('info', '[fb-pw] video file set via modal-scoped fallback input[type=file]');
|
|
408
456
|
} catch (e) { log('info', `[fb-pw] modal fallback input setInputFiles failed: ${e.message.slice(0, 80)}`); }
|
|
@@ -1326,9 +1374,12 @@ async function run({ page, payload, log }) {
|
|
|
1326
1374
|
// output. (Observed on reel 1506614811005729: step 1 → click Tiếp
|
|
1327
1375
|
// → publish, the thumb-edit pill never appeared.)
|
|
1328
1376
|
if (thumbPath && !customThumbDone) {
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1377
|
+
// Custom thumb couldn't be applied (FB didn't render the "Chỉnh sửa
|
|
1378
|
+
// hình thu nhỏ" overlay). DON'T refuse to publish — the strict
|
|
1379
|
+
// failure shipped false "failed" results (the reel still published)
|
|
1380
|
+
// and double-posts on retry. Publish with FB's auto-thumbnail; the
|
|
1381
|
+
// custom artwork is a nice-to-have, not worth a hard failure.
|
|
1382
|
+
log('warn', `[fb-pw] custom thumb skipped — "Chỉnh sửa hình thu nhỏ" overlay never appeared (step ${step + 1}); publishing with FB auto-thumb`);
|
|
1332
1383
|
}
|
|
1333
1384
|
log('info', `[fb-pw] click publish "${pub.verb}" via "${pub.sel}" (step ${step + 1})`);
|
|
1334
1385
|
// Snapshot the captured-IDs list RIGHT BEFORE the publish click. The
|