web-agent-bridge 3.0.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +72 -21
- package/README.ar.md +1286 -1073
- package/README.md +1764 -1535
- package/bin/agent-runner.js +474 -474
- package/bin/cli.js +237 -138
- package/bin/wab.js +80 -80
- package/examples/bidi-agent.js +119 -119
- package/examples/cross-site-agent.js +91 -91
- package/examples/mcp-agent.js +94 -94
- package/examples/next-app-router/README.md +44 -44
- package/examples/puppeteer-agent.js +108 -108
- package/examples/saas-dashboard/README.md +55 -55
- package/examples/shopify-hydrogen/README.md +74 -74
- package/examples/vision-agent.js +171 -171
- package/examples/wordpress-elementor/README.md +77 -77
- package/package.json +17 -3
- package/public/.well-known/agent-tools.json +180 -180
- package/public/.well-known/ai-assets.json +59 -59
- package/public/.well-known/ai-plugin.json +28 -0
- package/public/.well-known/security.txt +8 -0
- package/public/agent-workspace.html +349 -347
- package/public/ai.html +198 -196
- package/public/api.html +413 -0
- package/public/browser.html +486 -484
- package/public/commander-dashboard.html +243 -243
- package/public/cookies.html +210 -208
- package/public/css/agent-workspace.css +1713 -1713
- package/public/css/premium.css +317 -317
- package/public/css/styles.css +1235 -1235
- package/public/dashboard.html +706 -704
- package/public/demo.html +1770 -1
- package/public/dns.html +507 -0
- package/public/docs.html +587 -585
- package/public/feed.xml +89 -89
- package/public/growth.html +463 -0
- package/public/index.html +341 -9
- package/public/integrations.html +556 -0
- package/public/js/agent-workspace.js +1740 -1740
- package/public/js/auth-nav.js +31 -31
- package/public/js/auth-redirect.js +12 -12
- package/public/js/cookie-consent.js +56 -56
- package/public/js/wab-demo-page.js +721 -721
- package/public/js/ws-client.js +74 -74
- package/public/llms-full.txt +360 -309
- package/public/llms.txt +125 -86
- package/public/login.html +85 -83
- package/public/mesh-dashboard.html +328 -328
- package/public/openapi.json +580 -580
- package/public/phone-shield.html +281 -0
- package/public/premium-dashboard.html +2489 -2487
- package/public/premium.html +793 -791
- package/public/privacy.html +297 -295
- package/public/register.html +105 -103
- package/public/robots.txt +87 -87
- package/public/script/wab-consent.d.ts +36 -36
- package/public/script/wab-consent.js +104 -104
- package/public/script/wab-schema.js +131 -131
- package/public/script/wab.d.ts +108 -108
- package/public/script/wab.min.js +580 -580
- package/public/security.txt +8 -0
- package/public/terms.html +256 -254
- package/script/ai-agent-bridge.js +1754 -1754
- package/sdk/README.md +99 -99
- package/sdk/agent-mesh.js +449 -449
- package/sdk/commander.js +262 -262
- package/sdk/index.d.ts +464 -464
- package/sdk/index.js +18 -1
- package/sdk/multi-agent.js +318 -318
- package/sdk/package.json +12 -1
- package/sdk/safety-shield.js +219 -0
- package/sdk/schema-discovery.js +83 -83
- package/server/adapters/index.js +520 -520
- package/server/config/plans.js +367 -367
- package/server/config/secrets.js +102 -102
- package/server/control-plane/index.js +301 -301
- package/server/data-plane/index.js +354 -354
- package/server/index.js +175 -19
- package/server/llm/index.js +404 -404
- package/server/middleware/adminAuth.js +35 -35
- package/server/middleware/auth.js +50 -50
- package/server/middleware/featureGate.js +88 -88
- package/server/middleware/rateLimits.js +100 -100
- package/server/middleware/sensitiveAction.js +157 -0
- package/server/migrations/001_add_analytics_indexes.sql +7 -7
- package/server/migrations/002_premium_features.sql +418 -418
- package/server/migrations/003_ads_integer_cents.sql +33 -33
- package/server/migrations/004_agent_os.sql +158 -158
- package/server/migrations/005_marketplace_metering.sql +126 -126
- package/server/models/adapters/index.js +33 -33
- package/server/models/adapters/mysql.js +183 -183
- package/server/models/adapters/postgresql.js +172 -172
- package/server/models/adapters/sqlite.js +7 -7
- package/server/models/db.js +681 -681
- package/server/observability/failure-analysis.js +337 -337
- package/server/observability/index.js +394 -394
- package/server/protocol/capabilities.js +223 -223
- package/server/protocol/index.js +243 -243
- package/server/protocol/schema.js +584 -584
- package/server/registry/certification.js +271 -271
- package/server/registry/index.js +326 -326
- package/server/routes/admin-premium.js +671 -671
- package/server/routes/admin.js +261 -261
- package/server/routes/ads.js +130 -130
- package/server/routes/agent-workspace.js +540 -378
- package/server/routes/api.js +150 -150
- package/server/routes/auth.js +71 -71
- package/server/routes/billing.js +45 -45
- package/server/routes/commander.js +316 -316
- package/server/routes/demo-showcase.js +332 -0
- package/server/routes/demo-store.js +154 -0
- package/server/routes/discovery.js +417 -406
- package/server/routes/gateway.js +173 -0
- package/server/routes/license.js +251 -240
- package/server/routes/mesh.js +469 -469
- package/server/routes/noscript.js +543 -543
- package/server/routes/premium-v2.js +686 -686
- package/server/routes/premium.js +724 -724
- package/server/routes/runtime.js +2148 -2147
- package/server/routes/sovereign.js +465 -385
- package/server/routes/universal.js +200 -177
- package/server/routes/wab-api.js +850 -491
- package/server/runtime/container-worker.js +111 -111
- package/server/runtime/container.js +448 -448
- package/server/runtime/distributed-worker.js +362 -362
- package/server/runtime/event-bus.js +210 -210
- package/server/runtime/index.js +253 -253
- package/server/runtime/queue.js +599 -599
- package/server/runtime/replay.js +666 -666
- package/server/runtime/sandbox.js +266 -266
- package/server/runtime/scheduler.js +534 -534
- package/server/runtime/session-engine.js +293 -293
- package/server/runtime/state-manager.js +188 -188
- package/server/security/cross-site-redactor.js +196 -0
- package/server/security/dry-run.js +180 -0
- package/server/security/human-gate-rate-limit.js +147 -0
- package/server/security/human-gate-transports.js +178 -0
- package/server/security/human-gate.js +281 -0
- package/server/security/index.js +368 -368
- package/server/security/intent-engine.js +245 -0
- package/server/security/reward-guard.js +171 -0
- package/server/security/rollback-store.js +239 -0
- package/server/security/token-scope.js +404 -0
- package/server/security/url-policy.js +139 -0
- package/server/services/agent-chat.js +506 -506
- package/server/services/agent-learning.js +601 -575
- package/server/services/agent-memory.js +625 -625
- package/server/services/agent-mesh.js +555 -539
- package/server/services/agent-symphony.js +717 -717
- package/server/services/agent-tasks.js +1807 -1807
- package/server/services/api-key-engine.js +292 -0
- package/server/services/cluster.js +894 -894
- package/server/services/commander.js +738 -738
- package/server/services/edge-compute.js +440 -440
- package/server/services/email.js +204 -204
- package/server/services/hosted-runtime.js +205 -205
- package/server/services/lfd.js +635 -616
- package/server/services/local-ai.js +389 -389
- package/server/services/marketplace.js +270 -270
- package/server/services/metering.js +182 -182
- package/server/services/modules/affiliate-intelligence.js +93 -0
- package/server/services/modules/agent-firewall.js +90 -0
- package/server/services/modules/bounty.js +89 -0
- package/server/services/modules/collective-bargaining.js +92 -0
- package/server/services/modules/dark-pattern.js +66 -0
- package/server/services/modules/gov-intelligence.js +45 -0
- package/server/services/modules/neural.js +55 -0
- package/server/services/modules/notary.js +49 -0
- package/server/services/modules/price-time-machine.js +86 -0
- package/server/services/modules/protocol.js +104 -0
- package/server/services/negotiation.js +439 -439
- package/server/services/plugins.js +771 -771
- package/server/services/premium.js +1 -1
- package/server/services/price-intelligence.js +566 -565
- package/server/services/price-shield.js +1137 -1137
- package/server/services/reputation.js +465 -465
- package/server/services/search-engine.js +357 -357
- package/server/services/security.js +513 -513
- package/server/services/self-healing.js +843 -843
- package/server/services/sovereign-shield.js +542 -0
- package/server/services/stripe.js +192 -192
- package/server/services/swarm.js +788 -788
- package/server/services/universal-scraper.js +662 -661
- package/server/services/verification.js +481 -481
- package/server/services/vision.js +1163 -1163
- package/server/utils/cache.js +125 -125
- package/server/utils/migrate.js +81 -81
- package/server/utils/safe-fetch.js +228 -0
- package/server/utils/secureFields.js +50 -50
- package/server/ws.js +161 -161
- package/templates/artisan-marketplace.yaml +104 -104
- package/templates/book-price-scout.yaml +98 -98
- package/templates/electronics-price-tracker.yaml +108 -108
- package/templates/flight-deal-hunter.yaml +113 -113
- package/templates/freelancer-direct.yaml +116 -116
- package/templates/grocery-price-compare.yaml +93 -93
- package/templates/hotel-direct-booking.yaml +113 -113
- package/templates/local-services.yaml +98 -98
- package/templates/olive-oil-tunisia.yaml +88 -88
- package/templates/organic-farm-fresh.yaml +101 -101
- package/templates/restaurant-direct.yaml +97 -97
- package/server/services/fairness-engine.js +0 -409
- package/server/services/fairness.js +0 -420
|
@@ -1,661 +1,662 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* WAB Universal Scraper Engine
|
|
3
|
-
* ═══════════════════════════════════════════════════════════════════
|
|
4
|
-
* Works with ANY website — no script installation required.
|
|
5
|
-
* Extracts prices, products, availability from raw HTML/DOM.
|
|
6
|
-
*
|
|
7
|
-
* Three extraction modes:
|
|
8
|
-
* 1. Schema.org JSON-LD (structured, most reliable)
|
|
9
|
-
* 2. Open Graph / Meta tags (semi-structured)
|
|
10
|
-
* 3. DOM pattern matching (heuristic, any site)
|
|
11
|
-
*
|
|
12
|
-
* Used by: WAB Browser (webview), Chrome Extension (content script),
|
|
13
|
-
* Server-side fetch (Node.js)
|
|
14
|
-
*/
|
|
15
|
-
|
|
16
|
-
const crypto = require('crypto');
|
|
17
|
-
const { db } = require('../models/db');
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
CREATE INDEX IF NOT EXISTS
|
|
52
|
-
CREATE INDEX IF NOT EXISTS
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
'
|
|
72
|
-
'
|
|
73
|
-
'
|
|
74
|
-
'
|
|
75
|
-
'
|
|
76
|
-
'
|
|
77
|
-
'
|
|
78
|
-
'
|
|
79
|
-
'
|
|
80
|
-
'
|
|
81
|
-
'
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
]
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
const
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
const
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
const
|
|
136
|
-
|
|
137
|
-
seen.
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
const
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
const
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
}
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
const
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
const
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
if (meta['
|
|
227
|
-
if (meta['product:price:
|
|
228
|
-
if (meta['
|
|
229
|
-
if (meta['og:
|
|
230
|
-
if (meta['
|
|
231
|
-
if (meta['product:
|
|
232
|
-
if (meta['
|
|
233
|
-
if (meta['og:
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
//
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
const
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
const
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
/class="[^"]*(?:
|
|
270
|
-
/class="[^"]*(?:
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
/data-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
.replace(/<
|
|
291
|
-
.replace(/<[
|
|
292
|
-
.replace(
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
'Mozilla/5.0 (
|
|
326
|
-
'Mozilla/5.0 (
|
|
327
|
-
'Mozilla/5.0 (
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
const
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
'
|
|
348
|
-
'Accept': '
|
|
349
|
-
'Accept-
|
|
350
|
-
'
|
|
351
|
-
'Sec-Fetch-
|
|
352
|
-
'Sec-Fetch-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
const
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
p.
|
|
377
|
-
p.
|
|
378
|
-
p.
|
|
379
|
-
p.
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
}
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
//
|
|
398
|
-
//
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
if (data
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
const
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
const
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
if (m['
|
|
457
|
-
if (m['product:price:
|
|
458
|
-
if (m['
|
|
459
|
-
if (m['og:
|
|
460
|
-
if (
|
|
461
|
-
return
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
/(
|
|
469
|
-
/(
|
|
470
|
-
/([\\d,]+\\.?\\d*)
|
|
471
|
-
/(
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
const
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
'[
|
|
480
|
-
'[
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
const
|
|
504
|
-
|
|
505
|
-
'[class*="
|
|
506
|
-
'[class*="
|
|
507
|
-
'[
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
allCards.
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
const
|
|
514
|
-
const
|
|
515
|
-
const
|
|
516
|
-
const
|
|
517
|
-
const
|
|
518
|
-
const
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
:
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
if
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
}
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
//
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
const
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
p.
|
|
631
|
-
p.
|
|
632
|
-
p.
|
|
633
|
-
p.
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
}
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
1
|
+
/**
|
|
2
|
+
* WAB Universal Scraper Engine
|
|
3
|
+
* ═══════════════════════════════════════════════════════════════════
|
|
4
|
+
* Works with ANY website — no script installation required.
|
|
5
|
+
* Extracts prices, products, availability from raw HTML/DOM.
|
|
6
|
+
*
|
|
7
|
+
* Three extraction modes:
|
|
8
|
+
* 1. Schema.org JSON-LD (structured, most reliable)
|
|
9
|
+
* 2. Open Graph / Meta tags (semi-structured)
|
|
10
|
+
* 3. DOM pattern matching (heuristic, any site)
|
|
11
|
+
*
|
|
12
|
+
* Used by: WAB Browser (webview), Chrome Extension (content script),
|
|
13
|
+
* Server-side fetch (Node.js)
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
const crypto = require('crypto');
|
|
17
|
+
const { db } = require('../models/db');
|
|
18
|
+
const { safeFetch } = require('../utils/safe-fetch');
|
|
19
|
+
|
|
20
|
+
// ─── Schema ──────────────────────────────────────────────────────────
|
|
21
|
+
|
|
22
|
+
// Storage for the scraper, created on module load if it does not exist yet:
//   scraped_prices           – one row per stored extraction result
//   universal_price_history  – price observations keyed by url_hash
// plus the lookup indexes used by the prepared statements in this module.
const SCRAPER_SCHEMA_SQL = `
  CREATE TABLE IF NOT EXISTS scraped_prices (
    id TEXT PRIMARY KEY,
    url TEXT NOT NULL,
    domain TEXT NOT NULL,
    product_name TEXT,
    price REAL,
    currency TEXT DEFAULT 'USD',
    original_price REAL,
    availability TEXT,
    rating REAL,
    review_count INTEGER,
    seller TEXT,
    category TEXT,
    extraction_method TEXT,
    raw_data TEXT DEFAULT '{}',
    scraped_at TEXT DEFAULT (datetime('now'))
  );

  CREATE TABLE IF NOT EXISTS universal_price_history (
    id TEXT PRIMARY KEY,
    url_hash TEXT NOT NULL,
    domain TEXT NOT NULL,
    product_name TEXT,
    price REAL,
    currency TEXT DEFAULT 'USD',
    recorded_at TEXT DEFAULT (datetime('now'))
  );

  CREATE INDEX IF NOT EXISTS idx_scraped_domain ON scraped_prices(domain);
  CREATE INDEX IF NOT EXISTS idx_scraped_url ON scraped_prices(url);
  CREATE INDEX IF NOT EXISTS idx_uph_hash ON universal_price_history(url_hash);
`;

db.exec(SCRAPER_SCHEMA_SQL);
|
|
55
|
+
|
|
56
|
+
// Prepared statements shared by the scraper. They are compiled once at module
// load against the tables created above.
const stmts = {
  // Upsert a scraped result (14 positional parameters, in column order).
  insertScraped: db.prepare(`INSERT OR REPLACE INTO scraped_prices
    (id, url, domain, product_name, price, currency, original_price,
     availability, rating, review_count, seller, category, extraction_method, raw_data)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`),

  // Most recent row for an exact URL.
  getByUrl: db.prepare('SELECT * FROM scraped_prices WHERE url = ? ORDER BY scraped_at DESC LIMIT 1'),

  // Most recent rows for a domain; second parameter is the row limit.
  getByDomain: db.prepare('SELECT * FROM scraped_prices WHERE domain = ? ORDER BY scraped_at DESC LIMIT ?'),

  // Append one price observation to the history table.
  insertHistory: db.prepare(`INSERT INTO universal_price_history
    (id, url_hash, domain, product_name, price, currency) VALUES (?, ?, ?, ?, ?, ?)`),

  // Most recent observations for a URL hash; second parameter is the row limit.
  getHistory: db.prepare('SELECT * FROM universal_price_history WHERE url_hash = ? ORDER BY recorded_at DESC LIMIT ?'),
};
|
|
67
|
+
|
|
68
|
+
// ─── Currency Normalization ──────────────────────────────────────────
|
|
69
|
+
|
|
70
|
+
/**
 * Maps currency symbols, ISO 4217 codes and Arabic currency names or
 * abbreviations to the ISO code used throughout this module.
 */
const CURRENCY_MAP = {
  '$': 'USD', 'USD': 'USD', 'US$': 'USD',
  '€': 'EUR', 'EUR': 'EUR',
  '£': 'GBP', 'GBP': 'GBP',
  'SAR': 'SAR', 'ريال': 'SAR', 'ر.س': 'SAR',
  'AED': 'AED', 'درهم': 'AED', 'د.إ': 'AED',
  'TND': 'TND', 'دينار': 'TND', 'د.ت': 'TND',
  'EGP': 'EGP', 'ج.م': 'EGP',
  'MAD': 'MAD', 'د.م': 'MAD',
  'TRY': 'TRY', '₺': 'TRY',
  'JPY': 'JPY', '¥': 'JPY',
  'INR': 'INR', '₹': 'INR',
  'KRW': 'KRW', '₩': 'KRW',
};

// Approximate USD rates for comparison
const TO_USD = {
  USD: 1, EUR: 1.08, GBP: 1.27, SAR: 0.27, AED: 0.27,
  TND: 0.32, EGP: 0.032, MAD: 0.10, TRY: 0.031,
  JPY: 0.0067, INR: 0.012, KRW: 0.00074,
};

/**
 * Reads `key` from `table` only when it is an own property.
 * A plain bracket lookup would also resolve inherited names such as
 * "constructor" or "toString", which scraped text can easily contain.
 *
 * @param {object} table - CURRENCY_MAP or TO_USD.
 * @param {*} key - Candidate key.
 * @returns {*} The stored value, or undefined when the key is not in the table.
 */
function currencyTableLookup(table, key) {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
}

/**
 * Converts a currency symbol, code or name into an ISO currency code.
 *
 * The upper-cased form is tried first (so "usd" matches "USD"), then the
 * trimmed original. Missing, non-string or unknown input falls back to 'USD'.
 *
 * @param {string} symbol - Symbol or code as found in page text or metadata.
 * @returns {string} ISO currency code; 'USD' when nothing matches.
 */
function normalizeCurrency(symbol) {
  if (!symbol || typeof symbol !== 'string') return 'USD';
  const trimmed = symbol.trim();
  return currencyTableLookup(CURRENCY_MAP, trimmed.toUpperCase())
    || currencyTableLookup(CURRENCY_MAP, trimmed)
    || 'USD';
}

/**
 * Converts an amount to approximate US dollars, rounded to two decimals.
 * Currencies without an entry in TO_USD are treated as having rate 1.
 *
 * @param {number} price - Amount in `currency`.
 * @param {string} currency - ISO currency code.
 * @returns {number} Approximate USD value.
 */
function toUSD(price, currency) {
  const rate = currencyTableLookup(TO_USD, currency) || 1;
  return Math.round(price * rate * 100) / 100;
}
|
|
102
|
+
|
|
103
|
+
// ─── Price Extraction ────────────────────────────────────────────────
|
|
104
|
+
|
|
105
|
+
// Each pattern exposes the named groups `currency` and `price`.
// They are global regexes, so lastIndex must be reset before every scan.
const PRICE_PATTERNS = [
  // $123.45 or $ 123.45
  /(?<currency>\$|USD|US\$)\s*(?<price>[\d,]+\.?\d*)/gi,
  // €123.45
  /(?<currency>€|EUR)\s*(?<price>[\d,]+\.?\d*)/gi,
  // £123.45
  /(?<currency>£|GBP)\s*(?<price>[\d,]+\.?\d*)/gi,
  // 123.45 SAR / ريال
  /(?<price>[\d,]+\.?\d*)\s*(?<currency>SAR|ريال|ر\.س|AED|درهم|د\.إ|TND|دينار|د\.ت|EGP|ج\.م|MAD|د\.م)/gi,
  // ₺ ₹ ¥ ₩ prefixed
  /(?<currency>[₺₹¥₩])\s*(?<price>[\d,]+\.?\d*)/gi,
  // 123.45$ (suffix dollar)
  /(?<price>[\d,]+\.?\d*)\s*(?<currency>\$|€|£)/gi,
];

/**
 * Finds every price-looking token in `text`.
 *
 * Commas are stripped before parsing. Amounts that are not positive or exceed
 * 1,000,000 are dropped, and only the first hit for each amount/currency pair
 * is kept.
 *
 * @param {string} text - Text to scan.
 * @returns {Array<{price: number, currency: string, usd: number, raw: string}>}
 *   Matches ordered by ascending approximate USD value.
 */
function extractPrices(text) {
  // Map keeps insertion order, so it doubles as the "seen" set and result list.
  const byKey = new Map();

  PRICE_PATTERNS.forEach((pattern) => {
    pattern.lastIndex = 0;

    for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
      const amountText = hit.groups?.price || hit[2] || hit[1];
      const currencyText = hit.groups?.currency || hit[1] || hit[2];
      if (!amountText) continue;

      const amount = parseFloat(amountText.replace(/,/g, ''));
      const outOfRange = isNaN(amount) || amount <= 0 || amount > 1000000;
      if (outOfRange) continue;

      const currency = normalizeCurrency(currencyText);
      const key = `${amount}-${currency}`;
      if (byKey.has(key)) continue;

      byKey.set(key, {
        price: amount,
        currency,
        usd: toUSD(amount, currency),
        raw: hit[0].trim(),
      });
    }
  });

  const prices = Array.from(byKey.values());
  prices.sort((left, right) => left.usd - right.usd);
  return prices;
}
|
|
146
|
+
|
|
147
|
+
// ─── Schema.org JSON-LD Extraction ───────────────────────────────────
|
|
148
|
+
|
|
149
|
+
/**
 * Collects product-like entries from every
 * `<script type="application/ld+json">` element in the markup.
 *
 * @param {string} html - Raw page HTML.
 * @returns {Array<object>} Entries gathered by processJsonLd, in document order.
 */
function extractJsonLd(html) {
  const collected = [];
  const scriptPattern = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

  for (let hit = scriptPattern.exec(html); hit !== null; hit = scriptPattern.exec(html)) {
    try {
      const parsed = JSON.parse(hit[1].trim());
      const nodes = Array.isArray(parsed) ? parsed : [parsed];
      nodes.forEach((node) => processJsonLd(node, collected));
    } catch (_) {
      // Best effort: a script whose payload cannot be parsed or processed is skipped.
    }
  }

  return collected;
}
|
|
164
|
+
|
|
165
|
+
// Lower-cased schema.org types that are turned into product entries.
const JSON_LD_PRICED_TYPES = new Set(['product', 'hotel', 'hotelroom', 'lodgingbusiness', 'offer']);

/**
 * Walks one parsed JSON-LD value and appends a product entry to `products`
 * for every node whose `@type` is Product, Hotel, HotelRoom, LodgingBusiness
 * or Offer. Nested objects and arrays are visited recursively.
 *
 * Differences from a naive reading of the node:
 *  - `@type` may be a string or an array of strings; non-string values are ignored.
 *  - A node that carries its own `priceCurrency` / `availability` (a bare Offer)
 *    uses them when no nested offer supplies one.
 *  - Non-string `availability` / `description` values yield null / '' instead
 *    of throwing.
 *  - When `image` is an array, its first element is used.
 *
 * @param {*} data - Parsed JSON-LD value; anything that is not an object is ignored.
 * @param {Array<object>} products - Output array, mutated in place.
 * @returns {void}
 */
function processJsonLd(data, products) {
  if (!data || typeof data !== 'object') return;

  // Handle @graph arrays
  if (data['@graph'] && Array.isArray(data['@graph'])) {
    data['@graph'].forEach((item) => processJsonLd(item, products));
    return;
  }

  const declaredTypes = []
    .concat(data['@type'] ?? [])
    .filter((t) => typeof t === 'string')
    .map((t) => t.toLowerCase());

  if (declaredTypes.some((t) => JSON_LD_PRICED_TYPES.has(t))) {
    const offers = data.offers || data.priceSpecification || {};
    const offer = Array.isArray(offers) ? offers[0] : offers;
    const availability = offer?.availability ?? data.availability;
    const image = Array.isArray(data.image) ? data.image[0] : data.image;

    products.push({
      name: data.name || data.headline || null,
      price: parseFloat(offer?.price || offer?.lowPrice || data.price) || null,
      originalPrice: parseFloat(offer?.highPrice) || null,
      currency: offer?.priceCurrency || data.priceCurrency || 'USD',
      availability: typeof availability === 'string'
        ? availability.replace('https://schema.org/', '').replace('http://schema.org/', '') || null
        : null,
      rating: parseFloat(data.aggregateRating?.ratingValue) || null,
      reviewCount: parseInt(data.aggregateRating?.reviewCount || data.aggregateRating?.ratingCount, 10) || null,
      image: image?.url || (typeof image === 'string' ? image : null),
      description: typeof data.description === 'string' ? data.description.slice(0, 500) : '',
      brand: data.brand?.name || data.brand || null,
      sku: data.sku || null,
      url: data.url || null,
      method: 'json-ld',
    });
  }

  // Recurse into nested objects
  for (const key of Object.keys(data)) {
    const child = data[key];
    if (typeof child !== 'object' || child === null || key === '@context') continue;

    if (Array.isArray(child)) {
      child.forEach((item) => {
        if (typeof item === 'object') processJsonLd(item, products);
      });
    } else {
      processJsonLd(child, products);
    }
  }
}
|
|
211
|
+
|
|
212
|
+
// ─── Open Graph / Meta Tag Extraction ────────────────────────────────
|
|
213
|
+
|
|
214
|
+
/**
 * Builds a product description from Open Graph / product `<meta>` tags.
 *
 * Each `<meta>` tag is parsed attribute by attribute. A double-quoted value may
 * contain apostrophes and a single-quoted value may contain double quotes.
 * Only the real `property` / `name` attributes are treated as the key, so
 * attributes such as `data-name` are not mistaken for it. When a key occurs in
 * several tags, the last non-empty value wins.
 *
 * @param {string} html - Raw page HTML.
 * @returns {object|null} Product fields found (name, price, currency,
 *   description, image, availability, brand, type, url) plus
 *   `method: 'meta-tags'`, or null when neither a name nor a price was found.
 */
function extractMetaTags(html) {
  const meta = Object.create(null);
  const tagPattern = /<meta\s+[^>]*>/gi;
  const attrPattern = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;

  for (const [tag] of String(html ?? '').matchAll(tagPattern)) {
    const attrs = Object.create(null);
    for (const [, rawName, doubleQuoted, singleQuoted] of tag.matchAll(attrPattern)) {
      const attrName = rawName.toLowerCase();
      // Duplicate attributes on one tag: keep the first one.
      if (!(attrName in attrs)) attrs[attrName] = doubleQuoted ?? singleQuoted;
    }

    const key = (attrs.property || attrs.name || '').toLowerCase();
    const value = attrs.content || '';
    if (key && value) meta[key] = value;
  }

  const product = {};
  if (meta['og:title']) product.name = meta['og:title'];

  // Store the price only when it parses to a real number.
  const amount = Number.parseFloat(meta['product:price:amount']);
  if (Number.isFinite(amount)) product.price = amount;

  if (meta['product:price:currency']) product.currency = meta['product:price:currency'];
  if (meta['og:description']) product.description = meta['og:description'].slice(0, 500);
  if (meta['og:image']) product.image = meta['og:image'];
  if (meta['product:availability']) product.availability = meta['product:availability'];
  if (meta['product:brand']) product.brand = meta['product:brand'];
  if (meta['og:type']) product.type = meta['og:type'];
  if (meta['og:url']) product.url = meta['og:url'];

  if (product.name || product.price) {
    product.method = 'meta-tags';
    return product;
  }
  return null;
}
|
|
242
|
+
|
|
243
|
+
// ─── DOM Heuristic Extraction ────────────────────────────────────────
|
|
244
|
+
// This runs either on server (from fetched HTML) or in browser (content script)
|
|
245
|
+
|
|
246
|
+
/**
 * Extracts product/price entries from raw HTML using three strategies:
 *   1. JSON-LD (extractJsonLd): every entry found is returned.
 *   2. Open Graph / product meta tags: added when they carry a price.
 *   3. Heuristics: used only when 1 and 2 produced nothing. Prices are taken
 *      from elements whose class name looks price-related and from
 *      `data-price` / `data-product-price` attributes. If that also finds
 *      nothing, the visible text of the whole page is scanned.
 *
 * The heuristic entry reports the cheapest plausible price as `price`, the
 * most expensive one as `originalPrice`, and up to ten distinct candidates in
 * `allPrices`.
 *
 * @param {string} html - Raw page HTML.
 * @param {string} url - Page URL; only used to derive a fallback name.
 * @returns {Array<object>} Extracted entries (possibly empty).
 */
function extractFromHtml(html, url) {
  const results = [];
  const domain = _extractDomain(url);

  // 1. Try JSON-LD first (most reliable)
  const jsonLdProducts = extractJsonLd(html);
  if (jsonLdProducts.length > 0) {
    results.push(...jsonLdProducts);
  }

  // 2. Try Open Graph meta tags
  const metaProduct = extractMetaTags(html);
  if (metaProduct && metaProduct.price) {
    results.push(metaProduct);
  }

  // 3. Heuristic price extraction from common patterns
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  const pageTitle = titleMatch ? titleMatch[1].replace(/\s+/g, ' ').trim() : '';

  // Extract potential product containers
  const priceContainerPatterns = [
    // Common e-commerce price selectors reflected as class names
    /class="[^"]*(?:price|cost|amount|tarif|سعر|ثمن)[^"]*"[^>]*>([^<]{1,100})</g,
    /class="[^"]*(?:product-price|item-price|sale-price|offer-price|current-price)[^"]*"[^>]*>([^<]{1,100})</g,
    /class="[^"]*(?:room-price|rate-price|nightly-rate|total-price)[^"]*"[^>]*>([^<]{1,100})</g,
    // data-price attributes
    // NOTE(review): extractPrices appears to require a currency marker, so bare
    // numeric values such as data-price="19.99" likely yield nothing — confirm.
    /data-price=["']([^"']+)["']/gi,
    /data-product-price=["']([^"']+)["']/gi,
  ];

  const rawPrices = [];
  for (const pattern of priceContainerPatterns) {
    let m;
    while ((m = pattern.exec(html)) !== null) {
      rawPrices.push(...extractPrices(m[1]));
    }
  }

  // If no structured prices found, do a broad sweep
  if (results.length === 0 && rawPrices.length === 0) {
    // Extract from visible text areas (skip scripts/styles)
    const cleaned = html
      .replace(/<script[\s\S]*?<\/script>/gi, '')
      .replace(/<style[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ');
    rawPrices.push(...extractPrices(cleaned));
  }

  // Deduplicate and create heuristic results
  if (rawPrices.length > 0 && results.length === 0) {
    // The class patterns overlap (anything matching the 2nd or 3rd also matches
    // the 1st), so the same element is usually seen more than once. Keep one
    // entry per amount/currency pair.
    const seen = new Set();
    const validPrices = rawPrices.filter((p) => {
      // Filter: likely product prices (not phone numbers, years, etc.)
      const plausible = p.usd >= 1 && p.usd <= 50000 &&
        !`${p.price}`.match(/^(19|20)\d{2}$/); // not a year
      if (!plausible) return false;

      const key = `${p.price}-${p.currency}`;
      if (seen.has(key)) return false;
      seen.add(key);
      return true;
    });

    if (validPrices.length > 0) {
      const sorted = validPrices.sort((a, b) => a.usd - b.usd);
      results.push({
        name: pageTitle || domain,
        price: sorted[0].price,
        currency: sorted[0].currency,
        originalPrice: sorted.length > 1 ? sorted[sorted.length - 1].price : null,
        method: 'heuristic',
        allPrices: sorted.slice(0, 10),
      });
    }
  }

  return results;
}
|
|
321
|
+
|
|
322
|
+
// ─── Server-side Fetch & Extract ─────────────────────────────────────
|
|
323
|
+
|
|
324
|
+
/**
 * Desktop browser User-Agent strings. fetchAndExtract() picks one at random
 * for every outgoing request.
 */
const UA_LIST = [
  // Chrome 131 on Windows 10
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  // Chrome 131 on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  // Firefox 133 on Windows 10
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
  // Safari 18.2 on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15',
];
|
|
330
|
+
|
|
331
|
+
/**
 * Fetch a page server-side, extract product/price data from its HTML and
 * persist every extracted product plus a price-history row.
 *
 * Failures are reported in the return value instead of rejecting:
 * `{ error, products: [] }`.
 *
 * @param {string} url - Page to fetch.
 * @param {object} [options]
 * @param {boolean} [options.force] - Ignore any cached row and fetch again.
 * @param {number} [options.timeout] - Request timeout in ms (default 10000).
 * @param {number} [options.maxBytes] - Response size cap (default 5 MiB).
 * @returns {Promise<object>} On a cache hit: `{ cached: true, ...rawData, products: [row] }`.
 *   On a fresh fetch: `{ products, domain, url, fetchedAt }`.
 *   On failure: `{ error, products: [] }`.
 */
async function fetchAndExtract(url, options = {}) {
  const CACHE_MAX_AGE_MS = 3600000; // 1 hour
  const domain = _extractDomain(url);

  // Serve from cache when the stored row is less than an hour old.
  if (!options.force) {
    try {
      const cached = stmts.getByUrl.get(url);
      if (cached) {
        const age = Date.now() - new Date(cached.scraped_at).getTime();
        if (age < CACHE_MAX_AGE_MS) {
          return { cached: true, ...JSON.parse(cached.raw_data), products: [cached] };
        }
      }
    } catch (_) {
      // An unreadable cache entry (e.g. corrupt raw_data) must not reject the
      // call: treat it as a cache miss and fall through to a fresh fetch.
    }
  }

  try {
    const ua = UA_LIST[Math.floor(Math.random() * UA_LIST.length)];

    const resp = await safeFetch(url, {
      headers: {
        'User-Agent': ua,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9,ar;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
      },
    }, {
      timeoutMs: options.timeout || 10000,
      maxBytes: options.maxBytes || 5 * 1024 * 1024,
      maxRedirects: 3,
      allowedContentTypes: ['text/html', 'application/xhtml', 'application/xml'],
    });

    if (!resp.ok) return { error: `HTTP ${resp.status}`, products: [] };

    const contentType = resp.headers.get('content-type') || '';
    if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
      return { error: 'Not HTML', products: [] };
    }

    const html = await resp.text();
    const products = extractFromHtml(html, url);

    // The history key depends only on the URL, so compute it once.
    const urlHash = crypto.createHash('sha256').update(url).digest('hex').slice(0, 16);

    // Store results
    for (const p of products) {
      stmts.insertScraped.run(
        crypto.randomUUID(), url, domain,
        p.name || null, p.price || null, p.currency || 'USD',
        p.originalPrice || null, p.availability || null,
        p.rating || null, p.reviewCount || null,
        p.seller || p.brand || null, p.category || null,
        p.method || 'unknown', JSON.stringify(p)
      );

      // Record price history
      stmts.insertHistory.run(
        crypto.randomUUID(), urlHash, domain,
        p.name || null, p.price || null, p.currency || 'USD'
      );
    }

    return { products, domain, url, fetchedAt: new Date().toISOString() };
  } catch (err) {
    return { error: err.message, products: [] };
  }
}
|
|
396
|
+
|
|
397
|
+
// ─── Browser-side extraction (for content script / WAB Browser) ──────
|
|
398
|
+
// This generates a script that can be injected into any page via
|
|
399
|
+
// webview.executeJavaScript() or chrome content script
|
|
400
|
+
|
|
401
|
+
/**
 * Build the source of a self-invoking script that extracts product data from
 * the page it is injected into (via webview.executeJavaScript() or a content
 * script).
 *
 * When evaluated, the script:
 *  - exposes the extractor object as `window.__wab_universal`;
 *  - posts a `wab-extract` message if `window.__wab_extension_ready` is set;
 *  - evaluates to the result of `extract()`:
 *    `{ url, domain, title, jsonLd, meta, prices, cards, timestamp,
 *       hasWabBridge, wabBridgeType }`.
 *
 * Note: backslashes inside the template literal are doubled so that the
 * emitted script contains single-backslash regex escapes.
 *
 * @returns {string} JavaScript source text.
 */
function getBrowserExtractionScript() {
  return `
(function() {
  'use strict';
  const WAB_EXTRACT = {
    // Extract JSON-LD products
    getJsonLd() {
      const products = [];
      document.querySelectorAll('script[type="application/ld+json"]').forEach(el => {
        try {
          let data = JSON.parse(el.textContent);
          if (Array.isArray(data)) data.forEach(d => this._processLd(d, products));
          else this._processLd(data, products);
        } catch(_) {}
      });
      return products;
    },

    _processLd(data, products) {
      if (!data || typeof data !== 'object') return;
      if (Array.isArray(data['@graph'])) { data['@graph'].forEach(i => this._processLd(i, products)); return; }
      // JSON-LD allows @type to be a single string or an array of strings
      const rawType = data['@type'];
      const types = (Array.isArray(rawType) ? rawType : [rawType])
        .filter(t => typeof t === 'string')
        .map(t => t.toLowerCase());
      if (types.some(t => ['product','hotel','hotelroom','lodgingbusiness','offer'].includes(t))) {
        const offers = data.offers || {};
        const offer = Array.isArray(offers) ? offers[0] : offers;
        products.push({
          name: data.name || null,
          price: parseFloat(offer?.price || offer?.lowPrice || data.price) || null,
          originalPrice: parseFloat(offer?.highPrice) || null,
          currency: offer?.priceCurrency || 'USD',
          availability: (offer?.availability || '').replace(/https?:\\/\\/schema\\.org\\//,''),
          rating: parseFloat(data.aggregateRating?.ratingValue) || null,
          reviewCount: parseInt(data.aggregateRating?.reviewCount, 10) || null,
          image: typeof data.image === 'string' ? data.image : data.image?.url || null,
          brand: data.brand?.name || data.brand || null,
          method: 'json-ld'
        });
      }
      for (const key of Object.keys(data)) {
        if (typeof data[key] === 'object' && data[key] !== null && key !== '@context') {
          if (Array.isArray(data[key])) data[key].forEach(i => { if (typeof i === 'object') this._processLd(i, products); });
          else this._processLd(data[key], products);
        }
      }
    },

    // Extract Open Graph meta
    getMeta() {
      const m = {};
      document.querySelectorAll('meta[property], meta[name]').forEach(el => {
        const key = (el.getAttribute('property') || el.getAttribute('name') || '').toLowerCase();
        const val = el.getAttribute('content');
        if (key && val) m[key] = val;
      });
      const p = {};
      if (m['og:title']) p.name = m['og:title'];
      if (m['product:price:amount']) p.price = parseFloat(m['product:price:amount']);
      if (m['product:price:currency']) p.currency = m['product:price:currency'];
      if (m['og:description']) p.description = m['og:description'];
      if (m['og:image']) p.image = m['og:image'];
      if (p.name || p.price) { p.method = 'meta-tags'; return p; }
      return null;
    },

    // Extract prices from visible text
    getPrices() {
      const patterns = [
        /(?:\\$|USD|US\\$)\\s*([\\d,]+\\.?\\d*)/g,
        /(?:€|EUR)\\s*([\\d,]+\\.?\\d*)/g,
        /(?:£|GBP)\\s*([\\d,]+\\.?\\d*)/g,
        /([\\d,]+\\.?\\d*)\\s*(?:SAR|ريال|AED|درهم|TND|دينار|EGP)/g,
        /(?:[₺₹¥₩])\\s*([\\d,]+\\.?\\d*)/g,
      ];
      const prices = [];
      const seen = new Set();

      // Target price-like containers first
      const priceEls = document.querySelectorAll(
        '[class*="price"], [class*="cost"], [class*="amount"], [class*="rate"], ' +
        '[data-price], [data-product-price], [itemprop="price"], ' +
        '[class*="tarif"], [class*="سعر"]'
      );
      priceEls.forEach(el => {
        const text = el.textContent || el.getAttribute('data-price') || '';
        for (const pat of patterns) {
          pat.lastIndex = 0;
          let m;
          while ((m = pat.exec(text)) !== null) {
            const num = parseFloat((m[1] || m[0].replace(/[^\\d.,]/g,'')).replace(/,/g,''));
            if (num > 0 && num < 100000 && !seen.has(num)) {
              seen.add(num);
              prices.push({ price: num, raw: m[0].trim(), el: el.className });
            }
          }
        }
      });

      return prices.sort((a,b) => a.price - b.price);
    },

    // Extract product cards (hotels, flights, items)
    getProductCards() {
      const cards = [];
      const selectors = [
        '[class*="product-card"]', '[class*="hotel-card"]', '[class*="listing-card"]',
        '[class*="search-result"]', '[class*="offer-card"]', '[class*="deal-card"]',
        '[class*="property-card"]', '[class*="sr_item"]', '[class*="result-item"]',
        '[data-testid*="property"]', '[data-testid*="product"]', '[data-testid*="listing"]',
      ];
      const allCards = document.querySelectorAll(selectors.join(','));
      allCards.forEach((el, i) => {
        if (i >= 20) return; // limit
        const title = el.querySelector('[class*="title"], [class*="name"], h2, h3, h4')?.textContent?.trim()?.slice(0, 200);
        const priceEl = el.querySelector('[class*="price"], [data-price], [itemprop="price"]');
        const priceText = priceEl?.textContent?.trim() || priceEl?.getAttribute('data-price') || '';
        const ratingEl = el.querySelector('[class*="rating"], [class*="score"], [aria-label*="rating"], [aria-label*="score"]');
        const rating = ratingEl?.textContent?.trim() || ratingEl?.getAttribute('aria-label') || '';
        const link = el.querySelector('a[href]')?.href || '';
        const img = el.querySelector('img')?.src || '';

        if (title || priceText) {
          cards.push({ title, price: priceText, rating, link, image: img, index: i });
        }
      });
      return cards;
    },

    // Full extraction — called by WAB Browser or extension
    extract() {
      const hasWabBridge = typeof window.AICommands !== 'undefined' || typeof window.__wab_bidi !== 'undefined';
      return {
        url: location.href,
        domain: location.hostname,
        title: document.title,
        jsonLd: this.getJsonLd(),
        meta: this.getMeta(),
        prices: this.getPrices(),
        cards: this.getProductCards(),
        timestamp: Date.now(),
        hasWabBridge,
        wabBridgeType: hasWabBridge
          ? (typeof window.__wab_bidi !== 'undefined' ? 'bidi' : 'standard')
          : null,
      };
    }
  };

  // Expose for WAB Browser / extension
  window.__wab_universal = WAB_EXTRACT;

  // Auto-report if WAB extension is present
  if (window.__wab_extension_ready) {
    window.postMessage({ type: 'wab-extract', data: WAB_EXTRACT.extract() }, '*');
  }

  return WAB_EXTRACT.extract();
})();
`;
}
|
|
560
|
+
|
|
561
|
+
// ─── Helpers ─────────────────────────────────────────────────────────
|
|
562
|
+
|
|
563
|
+
/**
 * Return the hostname of `url` with a leading "www." removed.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Bare hostname, or '' when `url` cannot be parsed.
 */
function _extractDomain(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch (_) {
    return '';
  }
  return hostname.replace(/^www\./, '');
}
|
|
566
|
+
|
|
567
|
+
/**
 * Look up stored price-history rows for a URL.
 *
 * Rows are keyed by the first 16 hex characters of the URL's SHA-256 digest.
 *
 * @param {string} url - URL whose history is requested.
 * @param {number} [limit=30] - Passed through to the `getHistory` statement.
 * @returns {*} Whatever `stmts.getHistory.all()` returns.
 */
function getPriceHistory(url, limit = 30) {
  const hasher = crypto.createHash('sha256');
  hasher.update(url);
  const historyKey = hasher.digest('hex').slice(0, 16);
  return stmts.getHistory.all(historyKey, limit);
}
|
|
571
|
+
|
|
572
|
+
/**
 * Look up stored scrape rows for a domain.
 *
 * A leading "www." is removed before the lookup.
 *
 * @param {string} domain - Hostname, with or without "www.".
 * @param {number} [limit=50] - Passed through to the `getByDomain` statement.
 * @returns {*} Whatever `stmts.getByDomain.all()` returns.
 */
function getScrapedByDomain(domain, limit = 50) {
  const bareDomain = domain.replace(/^www\./, '');
  return stmts.getByDomain.all(bareDomain, limit);
}
|
|
575
|
+
|
|
576
|
+
// ─── Process browser extraction data ─────────────────────────────────
|
|
577
|
+
// Data sent from WAB Browser webview or Chrome extension content script
|
|
578
|
+
|
|
579
|
+
/**
 * Normalise and persist extraction data reported by the in-page script
 * (WAB Browser webview or extension content script).
 *
 * The payload originates in a web page, so its shape is validated before use:
 * non-array `jsonLd` / `cards` / `prices` and non-object entries are ignored
 * rather than being spread into the result or crashing the call.
 *
 * Sources are merged in this order: JSON-LD products, the meta-tag product
 * (only if it has a price), DOM product cards. Raw DOM prices are used only
 * as a fallback when nothing else produced a product.
 *
 * @param {object} data - Payload with `url` and optionally `title`, `jsonLd`,
 *   `meta`, `cards`, `prices`, `hasWabBridge`.
 * @returns {{products: object[], domain: string, url: string, hasWabBridge: *}|{error: string}}
 */
function processBrowserExtraction(data) {
  if (!data || !data.url) return { error: 'No URL' };

  const isRecord = (value) => value !== null && typeof value === 'object';

  const domain = _extractDomain(data.url);
  const products = [];

  // Process JSON-LD products
  if (Array.isArray(data.jsonLd)) {
    products.push(...data.jsonLd.filter(isRecord));
  }

  // Process meta tags
  if (isRecord(data.meta) && data.meta.price) {
    products.push(data.meta);
  }

  // Process product cards from DOM
  if (Array.isArray(data.cards)) {
    for (const card of data.cards) {
      if (!isRecord(card)) continue;
      const prices = extractPrices(card.price || '');
      if (prices.length > 0 || card.title) {
        products.push({
          name: card.title || 'Unknown',
          price: prices[0]?.price || null,
          currency: prices[0]?.currency || 'USD',
          rating: parseFloat(card.rating) || null,
          url: card.link || data.url,
          image: card.image || null,
          method: 'dom-cards',
        });
      }
    }
  }

  // Fallback: use raw prices
  const rawPrices = Array.isArray(data.prices) ? data.prices.filter(isRecord) : [];
  if (products.length === 0 && rawPrices.length > 0) {
    products.push({
      name: data.title || domain,
      price: rawPrices[0].price,
      currency: 'USD',
      method: 'dom-prices',
      allPrices: rawPrices,
    });
  }

  // The history key depends only on the URL, so compute it once.
  const urlHash = crypto.createHash('sha256').update(data.url).digest('hex').slice(0, 16);

  // Store in database
  for (const p of products) {
    try {
      stmts.insertScraped.run(
        crypto.randomUUID(), data.url, domain,
        p.name || null, p.price || null, p.currency || 'USD',
        p.originalPrice || null, p.availability || null,
        p.rating || null, p.reviewCount || null,
        p.seller || p.brand || null, p.category || null,
        p.method || 'browser', JSON.stringify(p)
      );

      stmts.insertHistory.run(
        crypto.randomUUID(), urlHash, domain,
        p.name || null, p.price || null, p.currency || 'USD'
      );
    } catch (_) {
      // Persistence is best-effort: a row that fails to store is skipped and
      // the product is still returned to the caller.
    }
  }

  return { products, domain, url: data.url, hasWabBridge: data.hasWabBridge };
}
|
|
647
|
+
|
|
648
|
+
// ─── Exports ─────────────────────────────────────────────────────────
|
|
649
|
+
|
|
650
|
+
module.exports = {
|
|
651
|
+
extractPrices,
|
|
652
|
+
extractJsonLd,
|
|
653
|
+
extractMetaTags,
|
|
654
|
+
extractFromHtml,
|
|
655
|
+
fetchAndExtract,
|
|
656
|
+
getBrowserExtractionScript,
|
|
657
|
+
processBrowserExtraction,
|
|
658
|
+
getPriceHistory,
|
|
659
|
+
getScrapedByDomain,
|
|
660
|
+
normalizeCurrency,
|
|
661
|
+
toUSD,
|
|
662
|
+
};
|