optimal-cli 0.1.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agents/.gitkeep +0 -0
- package/agents/content-ops.md +227 -0
- package/agents/financial-ops.md +184 -0
- package/agents/infra-ops.md +206 -0
- package/agents/profiles.json +5 -0
- package/bin/optimal.ts +1731 -0
- package/docs/CLI-REFERENCE.md +361 -0
- package/lib/assets/index.ts +225 -0
- package/lib/assets.ts +124 -0
- package/lib/auth/index.ts +189 -0
- package/lib/board/index.ts +309 -0
- package/lib/board/types.ts +124 -0
- package/lib/bot/claim.ts +43 -0
- package/lib/bot/coordinator.ts +254 -0
- package/lib/bot/heartbeat.ts +37 -0
- package/lib/bot/index.ts +9 -0
- package/lib/bot/protocol.ts +99 -0
- package/lib/bot/reporter.ts +42 -0
- package/lib/bot/skills.ts +81 -0
- package/lib/budget/projections.ts +561 -0
- package/lib/budget/scenarios.ts +312 -0
- package/lib/cms/publish-blog.ts +129 -0
- package/lib/cms/strapi-client.ts +302 -0
- package/lib/config/registry.ts +228 -0
- package/lib/config/schema.ts +58 -0
- package/lib/config.ts +247 -0
- package/lib/errors.ts +129 -0
- package/lib/format.ts +120 -0
- package/lib/infra/.gitkeep +0 -0
- package/lib/infra/deploy.ts +70 -0
- package/lib/infra/migrate.ts +141 -0
- package/lib/newsletter/.gitkeep +0 -0
- package/lib/newsletter/distribute.ts +256 -0
- package/{dist/lib/newsletter/generate-insurance.d.ts → lib/newsletter/generate-insurance.ts} +24 -7
- package/lib/newsletter/generate.ts +735 -0
- package/lib/returnpro/.gitkeep +0 -0
- package/lib/returnpro/anomalies.ts +258 -0
- package/lib/returnpro/audit.ts +194 -0
- package/lib/returnpro/diagnose.ts +400 -0
- package/lib/returnpro/kpis.ts +255 -0
- package/lib/returnpro/templates.ts +323 -0
- package/lib/returnpro/upload-income.ts +311 -0
- package/lib/returnpro/upload-netsuite.ts +696 -0
- package/lib/returnpro/upload-r1.ts +563 -0
- package/lib/returnpro/validate.ts +154 -0
- package/lib/social/meta.ts +228 -0
- package/lib/social/post-generator.ts +468 -0
- package/lib/social/publish.ts +301 -0
- package/lib/social/scraper.ts +503 -0
- package/lib/supabase.ts +25 -0
- package/lib/transactions/delete-batch.ts +258 -0
- package/lib/transactions/ingest.ts +659 -0
- package/lib/transactions/stamp.ts +654 -0
- package/package.json +15 -25
- package/dist/bin/optimal.d.ts +0 -2
- package/dist/bin/optimal.js +0 -995
- package/dist/lib/budget/projections.d.ts +0 -115
- package/dist/lib/budget/projections.js +0 -384
- package/dist/lib/budget/scenarios.d.ts +0 -93
- package/dist/lib/budget/scenarios.js +0 -214
- package/dist/lib/cms/publish-blog.d.ts +0 -62
- package/dist/lib/cms/publish-blog.js +0 -74
- package/dist/lib/cms/strapi-client.d.ts +0 -123
- package/dist/lib/cms/strapi-client.js +0 -213
- package/dist/lib/config.d.ts +0 -55
- package/dist/lib/config.js +0 -206
- package/dist/lib/infra/deploy.d.ts +0 -29
- package/dist/lib/infra/deploy.js +0 -58
- package/dist/lib/infra/migrate.d.ts +0 -34
- package/dist/lib/infra/migrate.js +0 -103
- package/dist/lib/kanban.d.ts +0 -46
- package/dist/lib/kanban.js +0 -118
- package/dist/lib/newsletter/distribute.d.ts +0 -52
- package/dist/lib/newsletter/distribute.js +0 -193
- package/dist/lib/newsletter/generate-insurance.js +0 -36
- package/dist/lib/newsletter/generate.d.ts +0 -104
- package/dist/lib/newsletter/generate.js +0 -571
- package/dist/lib/returnpro/anomalies.d.ts +0 -64
- package/dist/lib/returnpro/anomalies.js +0 -166
- package/dist/lib/returnpro/audit.d.ts +0 -32
- package/dist/lib/returnpro/audit.js +0 -147
- package/dist/lib/returnpro/diagnose.d.ts +0 -52
- package/dist/lib/returnpro/diagnose.js +0 -281
- package/dist/lib/returnpro/kpis.d.ts +0 -32
- package/dist/lib/returnpro/kpis.js +0 -192
- package/dist/lib/returnpro/templates.d.ts +0 -48
- package/dist/lib/returnpro/templates.js +0 -229
- package/dist/lib/returnpro/upload-income.d.ts +0 -25
- package/dist/lib/returnpro/upload-income.js +0 -235
- package/dist/lib/returnpro/upload-netsuite.d.ts +0 -37
- package/dist/lib/returnpro/upload-netsuite.js +0 -566
- package/dist/lib/returnpro/upload-r1.d.ts +0 -48
- package/dist/lib/returnpro/upload-r1.js +0 -398
- package/dist/lib/social/post-generator.d.ts +0 -83
- package/dist/lib/social/post-generator.js +0 -333
- package/dist/lib/social/publish.d.ts +0 -66
- package/dist/lib/social/publish.js +0 -226
- package/dist/lib/social/scraper.d.ts +0 -67
- package/dist/lib/social/scraper.js +0 -361
- package/dist/lib/supabase.d.ts +0 -4
- package/dist/lib/supabase.js +0 -20
- package/dist/lib/transactions/delete-batch.d.ts +0 -60
- package/dist/lib/transactions/delete-batch.js +0 -203
- package/dist/lib/transactions/ingest.d.ts +0 -43
- package/dist/lib/transactions/ingest.js +0 -555
- package/dist/lib/transactions/stamp.d.ts +0 -51
- package/dist/lib/transactions/stamp.js +0 -524
|
@@ -0,0 +1,503 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Meta Ad Library Scraper
|
|
3
|
+
*
|
|
4
|
+
* Ported from Python: ~/projects/meta-ad-scraper/scripts/meta_ad_scraper_v2.py
|
|
5
|
+
*
|
|
6
|
+
* Scrapes Facebook Ad Library for competitor ad intelligence.
|
|
7
|
+
* Uses Playwright headless Chromium with anti-detection measures.
|
|
8
|
+
* Splits ads by Library ID pattern, extracts metadata via regex.
|
|
9
|
+
*
|
|
10
|
+
* Functions:
|
|
11
|
+
* buildUrl() — construct Facebook Ad Library URL for a company
|
|
12
|
+
* scrollAndLoad() — auto-scroll page to load all ads (max 15 scrolls)
|
|
13
|
+
* extractAds() — two-stage extraction: DOM containers, then text split fallback
|
|
14
|
+
* parseAdText() — regex extraction of ad metadata from text blocks
|
|
15
|
+
* extractLandingUrls() — find landing page URLs from DOM links
|
|
16
|
+
* scrapeCompany() — orchestrate single company scrape
|
|
17
|
+
* scrapeCompanies() — batch-scrape multiple companies with configurable parallelism
|
|
18
|
+
* formatCsv() — convert ad records to CSV string
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
import { chromium, type Browser, type Page, type BrowserContext } from 'playwright'
|
|
22
|
+
import { writeFileSync } from 'node:fs'
|
|
23
|
+
|
|
24
|
+
// ── Types ────────────────────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
/** One scraped ad from the Facebook Ad Library — one CSV row in the output. */
export interface AdRecord {
  /** Search term this ad was scraped under (may differ from page_name). */
  company_searched: string
  /** Facebook "Library ID" uniquely identifying the ad. */
  ad_id: string
  /** Name of the page running the ad; '' when not detected. */
  page_name: string
  /** Ad creative text, truncated to 500 chars; '' when not detected. */
  ad_text: string
  /** 'Active' | 'Inactive' | 'Unknown' (kept as a plain string, not a union). */
  status: string
  /** "Started running on" date as shown on the page; '' when absent. */
  start_date: string
  /** Impressions range as raw text, e.g. "1,000 - 5,000"; '' when absent. */
  impressions: string
  /** Spend range as raw text; '' when absent. */
  spend: string
  /** 'video' | 'carousel/multiple' | 'image' — keyword heuristic. */
  media_type: string
  /** Comma-separated platform names detected in the ad text. */
  platforms: string
  /** Outbound landing-page URL resolved from DOM links; '' when unresolved. */
  landing_page_url: string
  /** First 500 chars of the raw scraped text, kept for reference/debugging. */
  full_text_snippet: string
}
|
|
40
|
+
|
|
41
|
+
/** Configuration for a scrapeCompanies() run. */
export interface ScrapeOptions {
  /** Companies to scrape */
  companies: string[]
  /** Output file path (if undefined, return results only) */
  outputPath?: string
  /** Companies per batch (default: 6); batches are processed sequentially */
  batchSize?: number
  /** Maximum scrolls per page (default: 15) */
  maxScrolls?: number
  /** Delay between companies in ms (default: 4000) */
  companyDelay?: number
  /** Run headless (default: true) */
  headless?: boolean
}
|
|
55
|
+
|
|
56
|
+
/** Aggregate result of a scrapeCompanies() run. */
export interface ScrapeResult {
  /** All ads collected across every company. */
  ads: AdRecord[]
  /** Number of companies requested. */
  totalCompanies: number
  /** Companies actually processed (may be lower if the browser fails mid-run). */
  companiesScraped: number
  /** Echo of ScrapeOptions.outputPath when CSV output was written. */
  outputPath?: string
}
|
|
62
|
+
|
|
63
|
+
/** Intermediate shape produced by the in-page DOM extraction in extractAds(). */
interface DomAdContainer {
  /** innerText of the candidate ad container element. */
  text: string
  /** Cached text length, used to sort containers smallest-first for dedupe. */
  textLen: number
  /** Element tag name (always 'DIV' given the current 'div' selector). */
  tag: string
}
|
|
68
|
+
|
|
69
|
+
// ── CSV Column Order ────────────────────────────────────────────────
|
|
70
|
+
|
|
71
|
+
// Output column order for formatCsv(). Typed as keyof AdRecord so each entry
// is spell-checked against the interface (completeness is not enforced).
const CSV_FIELDS: (keyof AdRecord)[] = [
  'company_searched',
  'ad_id',
  'page_name',
  'ad_text',
  'status',
  'start_date',
  'impressions',
  'spend',
  'media_type',
  'platforms',
  'landing_page_url',
  'full_text_snippet',
]
|
|
85
|
+
|
|
86
|
+
// ── URL Builder ─────────────────────────────────────────────────────
|
|
87
|
+
|
|
88
|
+
export function buildUrl(companyName: string): string {
|
|
89
|
+
const base = 'https://www.facebook.com/ads/library/'
|
|
90
|
+
const params =
|
|
91
|
+
`?active_status=active` +
|
|
92
|
+
`&ad_type=all` +
|
|
93
|
+
`&country=US` +
|
|
94
|
+
`&is_targeted_country=false` +
|
|
95
|
+
`&media_type=all` +
|
|
96
|
+
`&sort_data[mode]=total_impressions` +
|
|
97
|
+
`&sort_data[direction]=desc` +
|
|
98
|
+
`&q=${encodeURIComponent(companyName)}`
|
|
99
|
+
return base + params
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
// ── Scroll & Load ───────────────────────────────────────────────────
|
|
103
|
+
|
|
104
|
+
export async function scrollAndLoad(
|
|
105
|
+
page: Page,
|
|
106
|
+
maxScrolls = 15,
|
|
107
|
+
): Promise<void> {
|
|
108
|
+
let prevHeight = 0
|
|
109
|
+
for (let i = 0; i < maxScrolls; i++) {
|
|
110
|
+
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight))
|
|
111
|
+
await page.waitForTimeout(2000)
|
|
112
|
+
const currHeight = await page.evaluate(() => document.body.scrollHeight)
|
|
113
|
+
if (currHeight === prevHeight && i > 1) break
|
|
114
|
+
prevHeight = currHeight
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// ── Parse Ad Text ───────────────────────────────────────────────────
|
|
119
|
+
|
|
120
|
+
export function parseAdText(
|
|
121
|
+
text: string,
|
|
122
|
+
companyName: string,
|
|
123
|
+
): AdRecord | null {
|
|
124
|
+
if (!text || text.length < 20) return null
|
|
125
|
+
|
|
126
|
+
const ad: Partial<AdRecord> = { company_searched: companyName }
|
|
127
|
+
|
|
128
|
+
// Library ID
|
|
129
|
+
const idMatch = text.match(/Library ID:\s*(\d+)/)
|
|
130
|
+
if (idMatch) {
|
|
131
|
+
ad.ad_id = idMatch[1]
|
|
132
|
+
} else {
|
|
133
|
+
return null // Skip blocks without a Library ID
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
// Start date
|
|
137
|
+
const dateMatch = text.match(/Started running on\s+(\w+ \d+,?\s*\d*)/)
|
|
138
|
+
if (dateMatch) {
|
|
139
|
+
ad.start_date = dateMatch[1].trim()
|
|
140
|
+
} else {
|
|
141
|
+
ad.start_date = ''
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// Status (Active/Inactive)
|
|
145
|
+
if (text.includes('Active')) {
|
|
146
|
+
ad.status = 'Active'
|
|
147
|
+
} else if (text.includes('Inactive')) {
|
|
148
|
+
ad.status = 'Inactive'
|
|
149
|
+
} else {
|
|
150
|
+
ad.status = 'Unknown'
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// Page name - look for "Sponsored" text preceded by the page name
|
|
154
|
+
const sponsorMatch = text.match(/(?:^|\n)([^\n]+)\nSponsored/)
|
|
155
|
+
if (sponsorMatch) {
|
|
156
|
+
ad.page_name = sponsorMatch[1].trim()
|
|
157
|
+
} else {
|
|
158
|
+
ad.page_name = ''
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// Ad creative text - text after "Sponsored" and before common end markers
|
|
162
|
+
const creativeMatch = text.match(
|
|
163
|
+
/Sponsored\n(.+?)(?:\n(?:Learn More|Sign Up|Shop Now|Get Offer|Download|Apply Now|Book Now|Contact Us|Send Message|Watch More|See Menu|Get Quote|Subscribe|Get Showtimes)|\Z)/s,
|
|
164
|
+
)
|
|
165
|
+
if (creativeMatch) {
|
|
166
|
+
ad.ad_text = creativeMatch[1].trim().slice(0, 500)
|
|
167
|
+
} else {
|
|
168
|
+
ad.ad_text = ''
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
// Impressions
|
|
172
|
+
const impMatch = text.match(
|
|
173
|
+
/(?:impressions?)\s*[:\s]*([\d,.]+\s*[-\u2013]\s*[\d,.]+)/i,
|
|
174
|
+
)
|
|
175
|
+
if (impMatch) {
|
|
176
|
+
ad.impressions = impMatch[1]
|
|
177
|
+
} else {
|
|
178
|
+
ad.impressions = ''
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
// Spend
|
|
182
|
+
const spendMatch = text.match(
|
|
183
|
+
/(?:spend|spent)\s*[:\s]*\$?([\d,.]+\s*[-\u2013]\s*\$?[\d,.]+)/i,
|
|
184
|
+
)
|
|
185
|
+
if (spendMatch) {
|
|
186
|
+
ad.spend = spendMatch[1]
|
|
187
|
+
} else {
|
|
188
|
+
ad.spend = ''
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// Media type
|
|
192
|
+
const textLower = text.toLowerCase()
|
|
193
|
+
if (['video', '0:00', 'play'].some((kw) => textLower.includes(kw))) {
|
|
194
|
+
ad.media_type = 'video'
|
|
195
|
+
} else if (
|
|
196
|
+
textLower.includes('carousel') ||
|
|
197
|
+
textLower.includes('multiple versions')
|
|
198
|
+
) {
|
|
199
|
+
ad.media_type = 'carousel/multiple'
|
|
200
|
+
} else {
|
|
201
|
+
ad.media_type = 'image'
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
// Platforms
|
|
205
|
+
const platformNames = ['Facebook', 'Instagram', 'Messenger', 'Audience Network']
|
|
206
|
+
const platforms = platformNames.filter((p) =>
|
|
207
|
+
textLower.includes(p.toLowerCase()),
|
|
208
|
+
)
|
|
209
|
+
ad.platforms = platforms.join(', ')
|
|
210
|
+
|
|
211
|
+
// Landing page URL (not available from text, would need DOM links)
|
|
212
|
+
ad.landing_page_url = ''
|
|
213
|
+
|
|
214
|
+
// Full text snippet for reference
|
|
215
|
+
ad.full_text_snippet = text.slice(0, 500)
|
|
216
|
+
|
|
217
|
+
return ad as AdRecord
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// ── Extract Ads ─────────────────────────────────────────────────────
|
|
221
|
+
|
|
222
|
+
/**
 * Extract ad records from a loaded Ad Library results page.
 *
 * Two-stage strategy:
 *   1. DOM pass — find <div> elements whose innerText contains exactly one
 *      "Library ID:" marker (i.e. one ad each), dedupe by ID keeping the
 *      smallest matching container, then parse each container's text.
 *   2. Text fallback — if the DOM pass finds nothing, split the page's full
 *      innerText on "Library ID:" boundaries and parse each section.
 *
 * @param page        Playwright page already navigated to the Ad Library
 * @param companyName search term, stamped onto every parsed record
 * @param maxScrolls  scroll budget forwarded to scrollAndLoad()
 * @returns parsed ads; empty when the page reports no results
 */
export async function extractAds(
  page: Page,
  companyName: string,
  maxScrolls = 15,
): Promise<AdRecord[]> {
  const ads: AdRecord[] = []

  // Wait for content
  try {
    await page.waitForLoadState('networkidle', { timeout: 15000 })
  } catch {
    // Timeout is acceptable — continue with what loaded
  }

  // Extra settle time for late client-side rendering.
  await page.waitForTimeout(3000)

  // Check for no results — bail out before scrolling if the library says so.
  const pageText = await page.evaluate(() => document.body.innerText)
  if (
    !pageText ||
    pageText.toLowerCase().includes('no results') ||
    pageText.toLowerCase().includes('no ads match')
  ) {
    console.log(` [INFO] No ads found for ${companyName}`)
    return ads
  }

  // Scroll to load all ads
  await scrollAndLoad(page, maxScrolls)

  // DOM pass. NOTE: this callback runs inside the browser context, so it may
  // only use in-page globals (document); DomAdContainer is erased at runtime.
  const domAds = await page.evaluate(() => {
    const results: DomAdContainer[] = []

    // Find all Library ID occurrences via DOM containers
    const allElements = document.querySelectorAll('div')
    const adContainers: DomAdContainer[] = []

    allElements.forEach((el) => {
      const text = el.innerText || ''
      // An ad container typically has EXACTLY ONE Library ID
      const idMatches = text.match(/Library ID:\s*\d+/g)
      if (idMatches && idMatches.length === 1) {
        // Check it's not too small (just a label) or too large (parent of multiple ads)
        const textLen = text.length
        if (textLen > 50 && textLen < 5000) {
          adContainers.push({
            text,
            textLen,
            tag: el.tagName,
          })
        }
      }
    })

    // Deduplicate - remove containers that are subsets of other containers
    // Sort by text length (smallest first - these are the most specific)
    adContainers.sort((a, b) => a.textLen - b.textLen)

    // Keep only the first (smallest) container seen for each Library ID.
    const seen = new Set<string>()
    adContainers.forEach((container) => {
      const idMatch = container.text.match(/Library ID:\s*(\d+)/)
      if (idMatch && !seen.has(idMatch[1])) {
        seen.add(idMatch[1])
        results.push(container)
      }
    })

    return results
  })

  if (domAds && domAds.length > 0) {
    console.log(` [DOM] Found ${domAds.length} individual ad containers`)
    for (const raw of domAds) {
      const ad = parseAdText(raw.text, companyName)
      if (ad) ads.push(ad)
    }
  } else {
    // Fallback: split page text by "Library ID:" pattern
    console.log(` [TEXT] Falling back to text-based splitting`)
    const fullText = await page.evaluate(() => document.body.innerText)
    // Lookahead split keeps the "Library ID:" prefix at the head of each section.
    const sections = fullText.split(/(?=Library ID:\s*\d+)/)
    for (const section of sections) {
      const trimmed = section.trim()
      if (!trimmed || trimmed.length < 30) continue
      const ad = parseAdText(trimmed, companyName)
      if (ad) ads.push(ad)
    }
  }

  return ads
}
|
|
314
|
+
|
|
315
|
+
// ── Extract Landing URLs ────────────────────────────────────────────
|
|
316
|
+
|
|
317
|
+
/**
 * Map ad Library IDs to outbound landing-page URLs.
 *
 * Runs in the browser context: for each l.facebook.com redirect link on the
 * page, takes its nearest <div> ancestor and attributes the link's href to
 * the first requested ad ID found in that ancestor's text. First link per
 * ID wins; IDs with no matching link are absent from the result.
 *
 * NOTE(review): closest('div') may select an ancestor too small to contain
 * the "Library ID" text, in which case the URL is silently missed — confirm
 * against real pages.
 *
 * @param page  Playwright page with ad results rendered
 * @param adIds Library IDs to resolve
 * @returns map of ad ID → landing-page redirect URL (partial)
 */
export async function extractLandingUrls(
  page: Page,
  adIds: string[],
): Promise<Record<string, string>> {
  return page.evaluate((ids: string[]) => {
    const result: Record<string, string> = {}
    const links = document.querySelectorAll('a[href*="l.facebook.com"]')
    links.forEach((link) => {
      const href = (link as HTMLAnchorElement).href || ''
      const parent = link.closest('div')
      if (parent) {
        const text = parent.innerText || ''
        for (const id of ids) {
          if (text.includes(id) && !result[id]) {
            result[id] = href
          }
        }
      }
    })
    return result
  }, adIds)
}
|
|
339
|
+
|
|
340
|
+
// ── Scrape Single Company ───────────────────────────────────────────
|
|
341
|
+
|
|
342
|
+
/**
 * Scrape all currently listed ads for a single company.
 *
 * Navigates to the Ad Library search URL, extracts ad records, then makes a
 * best-effort pass to attach landing-page URLs. A navigation timeout is not
 * fatal: it logs an error and returns an empty list.
 *
 * @param page        reusable Playwright page (shared across companies)
 * @param companyName search term to scrape
 * @param maxScrolls  scroll budget forwarded to extractAds()
 */
export async function scrapeCompany(
  page: Page,
  companyName: string,
  maxScrolls = 15,
): Promise<AdRecord[]> {
  const url = buildUrl(companyName)
  console.log(`\n${'='.repeat(60)}`)
  console.log(`Scraping: ${companyName}`)
  console.log(`URL: ${url}`)
  console.log(`${'='.repeat(60)}`)

  try {
    await page.goto(url, { timeout: 30000, waitUntil: 'domcontentloaded' })
  } catch {
    console.log(` [ERROR] Page load timeout for ${companyName}`)
    return []
  }

  const ads = await extractAds(page, companyName, maxScrolls)

  // Try to get landing URLs
  if (ads.length > 0) {
    const adIds = ads.map((a) => a.ad_id).filter(Boolean)
    if (adIds.length > 0) {
      const urls = await extractLandingUrls(page, adIds)
      // Only overwrite the default '' when a URL was actually resolved.
      for (const ad of ads) {
        if (ad.ad_id in urls) {
          ad.landing_page_url = urls[ad.ad_id]
        }
      }
    }
  }

  console.log(` [DONE] Extracted ${ads.length} individual ads for ${companyName}`)
  return ads
}
|
|
378
|
+
|
|
379
|
+
// ── Batch Scraper ───────────────────────────────────────────────────
|
|
380
|
+
|
|
381
|
+
/**
 * Scrape multiple companies in batches.
 *
 * Batches are processed SEQUENTIALLY, with one fresh browser context (and
 * page) per batch; companies within a batch are also scraped one at a time
 * with a delay between them. NOTE(review): the original comment claimed
 * "3 parallel batches", but no parallelism exists in this implementation —
 * confirm intent before relying on that.
 *
 * The Chromium instance is launched once and always closed (finally), even
 * if a batch throws. When outputPath is set, all collected ads are written
 * as CSV before returning.
 */
export async function scrapeCompanies(
  opts: ScrapeOptions,
): Promise<ScrapeResult> {
  const {
    companies,
    outputPath,
    batchSize = 6,
    maxScrolls = 15,
    companyDelay = 4000,
    headless = true,
  } = opts

  console.log(
    `Starting Meta Ad Library scraper for ${companies.length} companies`,
  )
  if (outputPath) console.log(`Output: ${outputPath}`)

  const allAds: AdRecord[] = []
  let companiesScraped = 0

  // Split into batches
  const batches: string[][] = []
  for (let i = 0; i < companies.length; i += batchSize) {
    batches.push(companies.slice(i, i + batchSize))
  }

  console.log(
    `Processing ${batches.length} batch(es) of up to ${batchSize} companies each`,
  )

  let browser: Browser | null = null
  try {
    // Anti-detection launch flags (e.g. AutomationControlled disabled).
    browser = await chromium.launch({
      headless,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
      ],
    })

    for (let bi = 0; bi < batches.length; bi++) {
      const batch = batches[bi]
      console.log(
        `\nBatch ${bi + 1}/${batches.length}: ${batch.length} companies`,
      )

      // Fresh context per batch: isolated cookies/storage, fixed UA/viewport.
      const context: BrowserContext = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        userAgent:
          'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      })

      const page = await context.newPage()

      for (let ci = 0; ci < batch.length; ci++) {
        const company = batch[ci]

        // Rate-limit between companies (skipped for the first in a batch).
        if (ci > 0) {
          console.log(
            `\n [WAIT] Waiting ${companyDelay / 1000}s before next company...`,
          )
          await page.waitForTimeout(companyDelay)
        }

        const ads = await scrapeCompany(page, company, maxScrolls)
        allAds.push(...ads)
        companiesScraped++
      }

      await context.close()
    }
  } finally {
    // Always release the browser, even on mid-batch failure.
    if (browser) await browser.close()
  }

  // Write CSV output if path specified
  if (outputPath) {
    const csv = formatCsv(allAds)
    writeFileSync(outputPath, csv, 'utf-8')
    console.log(`\nSaved ${allAds.length} ads to ${outputPath}`)
  }

  console.log(
    `\nBatch complete: ${allAds.length} total ads from ${companiesScraped} companies`,
  )

  return {
    ads: allAds,
    totalCompanies: companies.length,
    companiesScraped,
    outputPath,
  }
}
|
|
480
|
+
|
|
481
|
+
// ── CSV Formatter ───────────────────────────────────────────────────
|
|
482
|
+
|
|
483
|
+
/** Escape a value for CSV (double-quote wrapping, escape inner quotes) */
|
|
484
|
+
function escapeCsvField(value: string): string {
|
|
485
|
+
if (
|
|
486
|
+
value.includes(',') ||
|
|
487
|
+
value.includes('"') ||
|
|
488
|
+
value.includes('\n') ||
|
|
489
|
+
value.includes('\r')
|
|
490
|
+
) {
|
|
491
|
+
return `"${value.replace(/"/g, '""')}"`
|
|
492
|
+
}
|
|
493
|
+
return value
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
/** Convert ad records to CSV string */
|
|
497
|
+
export function formatCsv(ads: AdRecord[]): string {
|
|
498
|
+
const header = CSV_FIELDS.join(',')
|
|
499
|
+
const rows = ads.map((ad) =>
|
|
500
|
+
CSV_FIELDS.map((field) => escapeCsvField(ad[field] ?? '')).join(','),
|
|
501
|
+
)
|
|
502
|
+
return [header, ...rows].join('\n') + '\n'
|
|
503
|
+
}
|
package/lib/supabase.ts
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { createClient, SupabaseClient } from '@supabase/supabase-js'
|
|
2
|
+
import 'dotenv/config'
|
|
3
|
+
|
|
4
|
+
/** Named Supabase projects this CLI can talk to. */
export type SupabaseInstance = 'optimal' | 'returnpro'

// Per-instance names of the env vars holding the project URL and service key.
const configs: Record<SupabaseInstance, { urlEnv: string; keyEnv: string }> = {
  optimal: { urlEnv: 'OPTIMAL_SUPABASE_URL', keyEnv: 'OPTIMAL_SUPABASE_SERVICE_KEY' },
  returnpro: { urlEnv: 'RETURNPRO_SUPABASE_URL', keyEnv: 'RETURNPRO_SUPABASE_SERVICE_KEY' },
}

// Lazily created client singletons, one per instance (see getSupabase).
const clients = new Map<SupabaseInstance, SupabaseClient>()
|
|
12
|
+
|
|
13
|
+
export function getSupabase(instance: SupabaseInstance): SupabaseClient {
|
|
14
|
+
const existing = clients.get(instance)
|
|
15
|
+
if (existing) return existing
|
|
16
|
+
|
|
17
|
+
const config = configs[instance]
|
|
18
|
+
const url = process.env[config.urlEnv]
|
|
19
|
+
const key = process.env[config.keyEnv]
|
|
20
|
+
if (!url || !key) throw new Error(`Missing env vars: ${config.urlEnv}, ${config.keyEnv}`)
|
|
21
|
+
|
|
22
|
+
const client = createClient(url, key)
|
|
23
|
+
clients.set(instance, client)
|
|
24
|
+
return client
|
|
25
|
+
}
|