web-agent-bridge 3.2.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (256) hide show
  1. package/LICENSE +84 -72
  2. package/README.ar.md +1304 -1152
  3. package/README.md +298 -1635
  4. package/bin/agent-runner.js +474 -474
  5. package/bin/cli.js +237 -138
  6. package/bin/wab-init.js +223 -0
  7. package/bin/wab.js +80 -80
  8. package/examples/azure-dns-wab.js +83 -0
  9. package/examples/bidi-agent.js +119 -119
  10. package/examples/cloudflare-wab-dns.js +121 -0
  11. package/examples/cpanel-wab-dns.js +114 -0
  12. package/examples/cross-site-agent.js +91 -91
  13. package/examples/dns-discovery-agent.js +166 -0
  14. package/examples/gcp-dns-wab.js +76 -0
  15. package/examples/governance-agent.js +169 -0
  16. package/examples/mcp-agent.js +94 -94
  17. package/examples/next-app-router/README.md +44 -44
  18. package/examples/plesk-wab-dns.js +103 -0
  19. package/examples/puppeteer-agent.js +108 -108
  20. package/examples/route53-wab-dns.js +144 -0
  21. package/examples/saas-dashboard/README.md +55 -55
  22. package/examples/safe-mode-agent.js +96 -0
  23. package/examples/shopify-hydrogen/README.md +74 -74
  24. package/examples/vision-agent.js +171 -171
  25. package/examples/wab-sign.js +74 -0
  26. package/examples/wab-verify.js +60 -0
  27. package/examples/wordpress-elementor/README.md +77 -77
  28. package/package.json +19 -6
  29. package/public/.well-known/agent-tools.json +180 -180
  30. package/public/.well-known/ai-assets.json +59 -59
  31. package/public/.well-known/security.txt +8 -0
  32. package/public/.well-known/wab.json +28 -0
  33. package/public/activate.html +368 -0
  34. package/public/adoption-metrics.html +188 -0
  35. package/public/agent-workspace.html +349 -349
  36. package/public/ai.html +198 -198
  37. package/public/api.html +413 -412
  38. package/public/azure-dns-integration.html +289 -0
  39. package/public/browser.html +486 -486
  40. package/public/cloudflare-integration.html +380 -0
  41. package/public/commander-dashboard.html +243 -243
  42. package/public/cookies.html +210 -210
  43. package/public/cpanel-integration.html +398 -0
  44. package/public/css/agent-workspace.css +1713 -1713
  45. package/public/css/premium.css +317 -317
  46. package/public/css/styles.css +1263 -1235
  47. package/public/dashboard.html +707 -706
  48. package/public/dns.html +436 -0
  49. package/public/docs.html +588 -587
  50. package/public/feed.xml +89 -89
  51. package/public/gcp-dns-integration.html +318 -0
  52. package/public/growth.html +465 -463
  53. package/public/index.html +1266 -982
  54. package/public/integrations.html +556 -0
  55. package/public/js/activate.js +145 -0
  56. package/public/js/agent-workspace.js +1740 -1740
  57. package/public/js/auth-nav.js +65 -31
  58. package/public/js/auth-redirect.js +12 -12
  59. package/public/js/cookie-consent.js +56 -56
  60. package/public/js/dns.js +438 -0
  61. package/public/js/wab-demo-page.js +721 -721
  62. package/public/js/ws-client.js +74 -74
  63. package/public/llms-full.txt +360 -360
  64. package/public/llms.txt +125 -125
  65. package/public/login.html +85 -85
  66. package/public/mesh-dashboard.html +328 -328
  67. package/public/openapi.json +669 -580
  68. package/public/phone-shield.html +281 -0
  69. package/public/plesk-integration.html +375 -0
  70. package/public/premium-dashboard.html +2489 -2489
  71. package/public/premium.html +793 -793
  72. package/public/privacy.html +297 -297
  73. package/public/provider-onboarding.html +172 -0
  74. package/public/provider-sandbox.html +134 -0
  75. package/public/providers.html +359 -0
  76. package/public/register.html +105 -105
  77. package/public/registrar-integrations.html +141 -0
  78. package/public/robots.txt +99 -87
  79. package/public/route53-integration.html +531 -0
  80. package/public/script/wab-consent.d.ts +36 -36
  81. package/public/script/wab-consent.js +104 -104
  82. package/public/script/wab-schema.js +131 -131
  83. package/public/script/wab.d.ts +108 -108
  84. package/public/script/wab.min.js +580 -580
  85. package/public/security.txt +8 -0
  86. package/public/shieldqr.html +231 -0
  87. package/public/sitemap.xml +6 -0
  88. package/public/terms.html +256 -256
  89. package/public/wab-trust.html +200 -0
  90. package/public/wab-vs-protocols.html +210 -0
  91. package/public/whitepaper.html +449 -0
  92. package/script/ai-agent-bridge.js +1754 -1754
  93. package/sdk/README.md +99 -99
  94. package/sdk/agent-mesh.js +449 -449
  95. package/sdk/auto-discovery.js +288 -0
  96. package/sdk/commander.js +262 -262
  97. package/sdk/governance.js +262 -0
  98. package/sdk/index.d.ts +464 -464
  99. package/sdk/index.js +25 -1
  100. package/sdk/multi-agent.js +318 -318
  101. package/sdk/package.json +2 -2
  102. package/sdk/safe-mode.js +221 -0
  103. package/sdk/safety-shield.js +219 -0
  104. package/sdk/schema-discovery.js +83 -83
  105. package/server/adapters/index.js +520 -520
  106. package/server/config/plans.js +367 -367
  107. package/server/config/secrets.js +102 -102
  108. package/server/control-plane/index.js +301 -301
  109. package/server/data-plane/index.js +354 -354
  110. package/server/index.js +670 -427
  111. package/server/llm/index.js +404 -404
  112. package/server/middleware/adminAuth.js +35 -35
  113. package/server/middleware/auth.js +50 -50
  114. package/server/middleware/featureGate.js +88 -88
  115. package/server/middleware/rateLimits.js +100 -100
  116. package/server/middleware/sensitiveAction.js +157 -0
  117. package/server/migrations/001_add_analytics_indexes.sql +7 -7
  118. package/server/migrations/002_premium_features.sql +418 -418
  119. package/server/migrations/003_ads_integer_cents.sql +33 -33
  120. package/server/migrations/004_agent_os.sql +158 -158
  121. package/server/migrations/005_marketplace_metering.sql +126 -126
  122. package/server/migrations/007_governance.sql +106 -0
  123. package/server/migrations/008_plans.sql +144 -0
  124. package/server/migrations/009_shieldqr.sql +30 -0
  125. package/server/migrations/010_extended_trust.sql +33 -0
  126. package/server/models/adapters/index.js +33 -33
  127. package/server/models/adapters/mysql.js +183 -183
  128. package/server/models/adapters/postgresql.js +172 -172
  129. package/server/models/adapters/sqlite.js +7 -7
  130. package/server/models/db.js +740 -681
  131. package/server/observability/failure-analysis.js +337 -337
  132. package/server/observability/index.js +394 -394
  133. package/server/protocol/capabilities.js +223 -223
  134. package/server/protocol/index.js +243 -243
  135. package/server/protocol/schema.js +584 -584
  136. package/server/registry/certification.js +271 -271
  137. package/server/registry/index.js +326 -326
  138. package/server/routes/admin-plans.js +76 -0
  139. package/server/routes/admin-premium.js +673 -671
  140. package/server/routes/admin-shieldqr.js +90 -0
  141. package/server/routes/admin-trust-monitor.js +83 -0
  142. package/server/routes/admin.js +549 -261
  143. package/server/routes/ads.js +130 -130
  144. package/server/routes/agent-workspace.js +540 -540
  145. package/server/routes/api.js +150 -150
  146. package/server/routes/auth.js +71 -71
  147. package/server/routes/billing.js +57 -45
  148. package/server/routes/commander.js +316 -316
  149. package/server/routes/demo-showcase.js +332 -332
  150. package/server/routes/demo-store.js +154 -0
  151. package/server/routes/discovery.js +2348 -417
  152. package/server/routes/gateway.js +173 -157
  153. package/server/routes/governance.js +208 -0
  154. package/server/routes/license.js +251 -240
  155. package/server/routes/mesh.js +469 -469
  156. package/server/routes/noscript.js +543 -543
  157. package/server/routes/plans.js +33 -0
  158. package/server/routes/premium-v2.js +686 -686
  159. package/server/routes/premium.js +724 -724
  160. package/server/routes/providers.js +650 -0
  161. package/server/routes/runtime.js +2148 -2147
  162. package/server/routes/shieldqr.js +88 -0
  163. package/server/routes/sovereign.js +465 -385
  164. package/server/routes/universal.js +200 -185
  165. package/server/routes/wab-api.js +850 -501
  166. package/server/runtime/container-worker.js +111 -111
  167. package/server/runtime/container.js +448 -448
  168. package/server/runtime/distributed-worker.js +362 -362
  169. package/server/runtime/event-bus.js +210 -210
  170. package/server/runtime/index.js +253 -253
  171. package/server/runtime/queue.js +599 -599
  172. package/server/runtime/replay.js +666 -666
  173. package/server/runtime/sandbox.js +266 -266
  174. package/server/runtime/scheduler.js +534 -534
  175. package/server/runtime/session-engine.js +293 -293
  176. package/server/runtime/state-manager.js +188 -188
  177. package/server/security/cross-site-redactor.js +196 -0
  178. package/server/security/dry-run.js +180 -0
  179. package/server/security/human-gate-rate-limit.js +147 -0
  180. package/server/security/human-gate-transports.js +178 -0
  181. package/server/security/human-gate.js +281 -0
  182. package/server/security/index.js +368 -368
  183. package/server/security/intent-engine.js +245 -0
  184. package/server/security/reward-guard.js +171 -0
  185. package/server/security/rollback-store.js +239 -0
  186. package/server/security/token-scope.js +404 -0
  187. package/server/security/url-policy.js +139 -0
  188. package/server/services/agent-chat.js +506 -506
  189. package/server/services/agent-learning.js +601 -575
  190. package/server/services/agent-memory.js +625 -625
  191. package/server/services/agent-mesh.js +555 -539
  192. package/server/services/agent-symphony.js +717 -717
  193. package/server/services/agent-tasks.js +1807 -1807
  194. package/server/services/api-key-engine.js +292 -261
  195. package/server/services/cluster.js +894 -894
  196. package/server/services/commander.js +738 -738
  197. package/server/services/edge-compute.js +440 -440
  198. package/server/services/email.js +233 -204
  199. package/server/services/governance.js +466 -0
  200. package/server/services/hosted-runtime.js +205 -205
  201. package/server/services/lfd.js +635 -635
  202. package/server/services/local-ai.js +389 -389
  203. package/server/services/marketplace.js +270 -270
  204. package/server/services/metering.js +182 -182
  205. package/server/services/modules/affiliate-intelligence.js +93 -93
  206. package/server/services/modules/agent-firewall.js +90 -90
  207. package/server/services/modules/bounty.js +89 -89
  208. package/server/services/modules/collective-bargaining.js +92 -92
  209. package/server/services/modules/dark-pattern.js +66 -66
  210. package/server/services/modules/gov-intelligence.js +45 -45
  211. package/server/services/modules/neural.js +55 -55
  212. package/server/services/modules/notary.js +49 -49
  213. package/server/services/modules/price-time-machine.js +86 -86
  214. package/server/services/modules/protocol.js +104 -104
  215. package/server/services/negotiation.js +439 -439
  216. package/server/services/plans.js +214 -0
  217. package/server/services/plugins.js +771 -771
  218. package/server/services/premium.js +1 -1
  219. package/server/services/price-intelligence.js +566 -566
  220. package/server/services/price-shield.js +1137 -1137
  221. package/server/services/provider-clients.js +740 -0
  222. package/server/services/reputation.js +465 -465
  223. package/server/services/search-engine.js +357 -357
  224. package/server/services/security.js +513 -513
  225. package/server/services/self-healing.js +843 -843
  226. package/server/services/shieldqr.js +322 -0
  227. package/server/services/sovereign-shield.js +542 -0
  228. package/server/services/ssl-inspector.js +42 -0
  229. package/server/services/ssl-monitor.js +167 -0
  230. package/server/services/stripe.js +205 -192
  231. package/server/services/swarm.js +788 -788
  232. package/server/services/universal-scraper.js +662 -661
  233. package/server/services/verification.js +481 -481
  234. package/server/services/vision.js +1163 -1163
  235. package/server/services/wab-crypto.js +178 -0
  236. package/server/utils/cache.js +125 -125
  237. package/server/utils/migrate.js +81 -81
  238. package/server/utils/safe-fetch.js +228 -0
  239. package/server/utils/secureFields.js +50 -50
  240. package/server/ws.js +161 -161
  241. package/templates/artisan-marketplace.yaml +104 -104
  242. package/templates/book-price-scout.yaml +98 -98
  243. package/templates/electronics-price-tracker.yaml +108 -108
  244. package/templates/flight-deal-hunter.yaml +113 -113
  245. package/templates/freelancer-direct.yaml +116 -116
  246. package/templates/grocery-price-compare.yaml +93 -93
  247. package/templates/hotel-direct-booking.yaml +113 -113
  248. package/templates/local-services.yaml +98 -98
  249. package/templates/olive-oil-tunisia.yaml +88 -88
  250. package/templates/organic-farm-fresh.yaml +101 -101
  251. package/templates/restaurant-direct.yaml +97 -97
  252. package/public/score.html +0 -263
  253. package/server/migrations/006_growth_suite.sql +0 -138
  254. package/server/routes/growth.js +0 -962
  255. package/server/services/fairness-engine.js +0 -409
  256. package/server/services/fairness.js +0 -420
@@ -1,661 +1,662 @@
1
- /**
2
- * WAB Universal Scraper Engine
3
- * ═══════════════════════════════════════════════════════════════════
4
- * Works with ANY website — no script installation required.
5
- * Extracts prices, products, availability from raw HTML/DOM.
6
- *
7
- * Three extraction modes:
8
- * 1. Schema.org JSON-LD (structured, most reliable)
9
- * 2. Open Graph / Meta tags (semi-structured)
10
- * 3. DOM pattern matching (heuristic, any site)
11
- *
12
- * Used by: WAB Browser (webview), Chrome Extension (content script),
13
- * Server-side fetch (Node.js)
14
- */
15
-
16
- const crypto = require('crypto');
17
- const { db } = require('../models/db');
18
-
19
- // ─── Schema ──────────────────────────────────────────────────────────
20
-
21
- db.exec(`
22
- CREATE TABLE IF NOT EXISTS scraped_prices (
23
- id TEXT PRIMARY KEY,
24
- url TEXT NOT NULL,
25
- domain TEXT NOT NULL,
26
- product_name TEXT,
27
- price REAL,
28
- currency TEXT DEFAULT 'USD',
29
- original_price REAL,
30
- availability TEXT,
31
- rating REAL,
32
- review_count INTEGER,
33
- seller TEXT,
34
- category TEXT,
35
- extraction_method TEXT,
36
- raw_data TEXT DEFAULT '{}',
37
- scraped_at TEXT DEFAULT (datetime('now'))
38
- );
39
-
40
- CREATE TABLE IF NOT EXISTS universal_price_history (
41
- id TEXT PRIMARY KEY,
42
- url_hash TEXT NOT NULL,
43
- domain TEXT NOT NULL,
44
- product_name TEXT,
45
- price REAL,
46
- currency TEXT DEFAULT 'USD',
47
- recorded_at TEXT DEFAULT (datetime('now'))
48
- );
49
-
50
- CREATE INDEX IF NOT EXISTS idx_scraped_domain ON scraped_prices(domain);
51
- CREATE INDEX IF NOT EXISTS idx_scraped_url ON scraped_prices(url);
52
- CREATE INDEX IF NOT EXISTS idx_uph_hash ON universal_price_history(url_hash);
53
- `);
54
-
55
- const stmts = {
56
- insertScraped: db.prepare(`INSERT OR REPLACE INTO scraped_prices
57
- (id, url, domain, product_name, price, currency, original_price,
58
- availability, rating, review_count, seller, category, extraction_method, raw_data)
59
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`),
60
- getByUrl: db.prepare('SELECT * FROM scraped_prices WHERE url = ? ORDER BY scraped_at DESC LIMIT 1'),
61
- getByDomain: db.prepare('SELECT * FROM scraped_prices WHERE domain = ? ORDER BY scraped_at DESC LIMIT ?'),
62
- insertHistory: db.prepare(`INSERT INTO universal_price_history
63
- (id, url_hash, domain, product_name, price, currency) VALUES (?, ?, ?, ?, ?, ?)`),
64
- getHistory: db.prepare('SELECT * FROM universal_price_history WHERE url_hash = ? ORDER BY recorded_at DESC LIMIT ?'),
65
- };
66
-
67
- // ─── Currency Normalization ──────────────────────────────────────────
68
-
69
- const CURRENCY_MAP = {
70
- '$': 'USD', 'USD': 'USD', 'US$': 'USD',
71
- '€': 'EUR', 'EUR': 'EUR',
72
- '£': 'GBP', 'GBP': 'GBP',
73
- 'SAR': 'SAR', 'ريال': 'SAR', 'ر.س': 'SAR',
74
- 'AED': 'AED', 'درهم': 'AED', 'د.إ': 'AED',
75
- 'TND': 'TND', 'دينار': 'TND', 'د.ت': 'TND',
76
- 'EGP': 'EGP', 'ج.م': 'EGP',
77
- 'MAD': 'MAD', 'د.م': 'MAD',
78
- 'TRY': 'TRY', '₺': 'TRY',
79
- 'JPY': 'JPY', '¥': 'JPY',
80
- 'INR': 'INR', '₹': 'INR',
81
- 'KRW': 'KRW', '₩': 'KRW',
82
- };
83
-
84
- // Approximate USD rates for comparison
85
- const TO_USD = {
86
- USD: 1, EUR: 1.08, GBP: 1.27, SAR: 0.27, AED: 0.27,
87
- TND: 0.32, EGP: 0.032, MAD: 0.10, TRY: 0.031,
88
- JPY: 0.0067, INR: 0.012, KRW: 0.00074,
89
- };
90
-
91
- function normalizeCurrency(symbol) {
92
- if (!symbol) return 'USD';
93
- const s = symbol.trim().toUpperCase();
94
- return CURRENCY_MAP[s] || CURRENCY_MAP[symbol.trim()] || 'USD';
95
- }
96
-
97
- function toUSD(price, currency) {
98
- const rate = TO_USD[currency] || 1;
99
- return Math.round(price * rate * 100) / 100;
100
- }
101
-
102
- // ─── Price Extraction ────────────────────────────────────────────────
103
-
104
- const PRICE_PATTERNS = [
105
- // $123.45 or $ 123.45
106
- /(?<currency>\$|USD|US\$)\s*(?<price>[\d,]+\.?\d*)/gi,
107
- // €123.45
108
- /(?<currency>€|EUR)\s*(?<price>[\d,]+\.?\d*)/gi,
109
- // £123.45
110
- /(?<currency>£|GBP)\s*(?<price>[\d,]+\.?\d*)/gi,
111
- // 123.45 SAR / ريال
112
- /(?<price>[\d,]+\.?\d*)\s*(?<currency>SAR|ريال|ر\.س|AED|درهم|د\.إ|TND|دينار|د\.ت|EGP|ج\.م|MAD|د\.م)/gi,
113
- // ₺ ₹ ¥ ₩ prefixed
114
- /(?<currency>[₺₹¥₩])\s*(?<price>[\d,]+\.?\d*)/gi,
115
- // 123.45$ (suffix dollar)
116
- /(?<price>[\d,]+\.?\d*)\s*(?<currency>\$|€|£)/gi,
117
- ];
118
-
119
- function extractPrices(text) {
120
- const prices = [];
121
- const seen = new Set();
122
-
123
- for (const pattern of PRICE_PATTERNS) {
124
- pattern.lastIndex = 0;
125
- let m;
126
- while ((m = pattern.exec(text)) !== null) {
127
- const raw = m.groups?.price || m[2] || m[1];
128
- const currSymbol = m.groups?.currency || m[1] || m[2];
129
- if (!raw) continue;
130
-
131
- const num = parseFloat(raw.replace(/,/g, ''));
132
- if (isNaN(num) || num <= 0 || num > 1000000) continue;
133
-
134
- const currency = normalizeCurrency(currSymbol);
135
- const key = `${num}-${currency}`;
136
- if (seen.has(key)) continue;
137
- seen.add(key);
138
-
139
- prices.push({ price: num, currency, usd: toUSD(num, currency), raw: m[0].trim() });
140
- }
141
- }
142
-
143
- return prices.sort((a, b) => a.usd - b.usd);
144
- }
145
-
146
- // ─── Schema.org JSON-LD Extraction ───────────────────────────────────
147
-
148
- function extractJsonLd(html) {
149
- const products = [];
150
- const regex = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
151
- let m;
152
-
153
- while ((m = regex.exec(html)) !== null) {
154
- try {
155
- let data = JSON.parse(m[1].trim());
156
- if (Array.isArray(data)) data.forEach(d => processJsonLd(d, products));
157
- else processJsonLd(data, products);
158
- } catch (_) {}
159
- }
160
-
161
- return products;
162
- }
163
-
164
- function processJsonLd(data, products) {
165
- if (!data || typeof data !== 'object') return;
166
-
167
- // Handle @graph arrays
168
- if (data['@graph'] && Array.isArray(data['@graph'])) {
169
- data['@graph'].forEach(item => processJsonLd(item, products));
170
- return;
171
- }
172
-
173
- const type = (data['@type'] || '').toLowerCase();
174
-
175
- if (type === 'product' || type === 'hotel' || type === 'hotelroom' ||
176
- type === 'lodgingbusiness' || type === 'offer') {
177
- const offers = data.offers || data.priceSpecification || {};
178
- const offer = Array.isArray(offers) ? offers[0] : offers;
179
-
180
- products.push({
181
- name: data.name || data.headline || null,
182
- price: parseFloat(offer?.price || offer?.lowPrice || data.price) || null,
183
- originalPrice: parseFloat(offer?.highPrice) || null,
184
- currency: offer?.priceCurrency || 'USD',
185
- availability: offer?.availability?.replace('https://schema.org/', '').replace('http://schema.org/', '') || null,
186
- rating: parseFloat(data.aggregateRating?.ratingValue) || null,
187
- reviewCount: parseInt(data.aggregateRating?.reviewCount || data.aggregateRating?.ratingCount) || null,
188
- image: data.image?.url || (typeof data.image === 'string' ? data.image : null),
189
- description: (data.description || '').slice(0, 500),
190
- brand: data.brand?.name || data.brand || null,
191
- sku: data.sku || null,
192
- url: data.url || null,
193
- method: 'json-ld',
194
- });
195
- }
196
-
197
- // Recurse into nested objects
198
- for (const key of Object.keys(data)) {
199
- if (typeof data[key] === 'object' && data[key] !== null && key !== '@context') {
200
- if (Array.isArray(data[key])) {
201
- data[key].forEach(item => {
202
- if (typeof item === 'object') processJsonLd(item, products);
203
- });
204
- } else {
205
- processJsonLd(data[key], products);
206
- }
207
- }
208
- }
209
- }
210
-
211
- // ─── Open Graph / Meta Tag Extraction ────────────────────────────────
212
-
213
- function extractMetaTags(html) {
214
- const meta = {};
215
- const metaRegex = /<meta\s+(?:[^>]*?(?:property|name)=["']([^"']+)["'][^>]*?content=["']([^"']*?)["']|[^>]*?content=["']([^"']*?)["'][^>]*?(?:property|name)=["']([^"']+)["'])[^>]*\/?>/gi;
216
- let m;
217
-
218
- while ((m = metaRegex.exec(html)) !== null) {
219
- const key = (m[1] || m[4] || '').toLowerCase();
220
- const value = m[2] || m[3] || '';
221
- if (key && value) meta[key] = value;
222
- }
223
-
224
- const product = {};
225
- if (meta['og:title']) product.name = meta['og:title'];
226
- if (meta['product:price:amount']) product.price = parseFloat(meta['product:price:amount']);
227
- if (meta['product:price:currency']) product.currency = meta['product:price:currency'];
228
- if (meta['og:description']) product.description = meta['og:description'].slice(0, 500);
229
- if (meta['og:image']) product.image = meta['og:image'];
230
- if (meta['product:availability']) product.availability = meta['product:availability'];
231
- if (meta['product:brand']) product.brand = meta['product:brand'];
232
- if (meta['og:type']) product.type = meta['og:type'];
233
- if (meta['og:url']) product.url = meta['og:url'];
234
-
235
- if (product.name || product.price) {
236
- product.method = 'meta-tags';
237
- return product;
238
- }
239
- return null;
240
- }
241
-
242
- // ─── DOM Heuristic Extraction ────────────────────────────────────────
243
- // This runs either on server (from fetched HTML) or in browser (content script)
244
-
245
- function extractFromHtml(html, url) {
246
- const results = [];
247
- const domain = _extractDomain(url);
248
-
249
- // 1. Try JSON-LD first (most reliable)
250
- const jsonLdProducts = extractJsonLd(html);
251
- if (jsonLdProducts.length > 0) {
252
- results.push(...jsonLdProducts);
253
- }
254
-
255
- // 2. Try Open Graph meta tags
256
- const metaProduct = extractMetaTags(html);
257
- if (metaProduct && metaProduct.price) {
258
- results.push(metaProduct);
259
- }
260
-
261
- // 3. Heuristic price extraction from common patterns
262
- const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
263
- const pageTitle = titleMatch ? titleMatch[1].replace(/\s+/g, ' ').trim() : '';
264
-
265
- // Extract potential product containers
266
- const priceContainerPatterns = [
267
- // Common e-commerce price selectors reflected as class names
268
- /class="[^"]*(?:price|cost|amount|tarif|سعر|ثمن)[^"]*"[^>]*>([^<]{1,100})</g,
269
- /class="[^"]*(?:product-price|item-price|sale-price|offer-price|current-price)[^"]*"[^>]*>([^<]{1,100})</g,
270
- /class="[^"]*(?:room-price|rate-price|nightly-rate|total-price)[^"]*"[^>]*>([^<]{1,100})</g,
271
- // data-price attributes
272
- /data-price=["']([^"']+)["']/gi,
273
- /data-product-price=["']([^"']+)["']/gi,
274
- ];
275
-
276
- const rawPrices = [];
277
- for (const pattern of priceContainerPatterns) {
278
- let m;
279
- while ((m = pattern.exec(html)) !== null) {
280
- const extracted = extractPrices(m[1]);
281
- rawPrices.push(...extracted);
282
- }
283
- }
284
-
285
- // If no structured prices found, do a broad sweep
286
- if (results.length === 0 && rawPrices.length === 0) {
287
- // Extract from visible text areas (skip scripts/styles)
288
- const cleaned = html
289
- .replace(/<script[\s\S]*?<\/script>/gi, '')
290
- .replace(/<style[\s\S]*?<\/style>/gi, '')
291
- .replace(/<[^>]+>/g, ' ')
292
- .replace(/\s+/g, ' ');
293
- const broadPrices = extractPrices(cleaned);
294
- rawPrices.push(...broadPrices);
295
- }
296
-
297
- // Deduplicate and create heuristic results
298
- if (rawPrices.length > 0 && results.length === 0) {
299
- // Filter: likely product prices (not phone numbers, years, etc.)
300
- const validPrices = rawPrices.filter(p =>
301
- p.usd >= 1 && p.usd <= 50000 &&
302
- !`${p.price}`.match(/^(19|20)\d{2}$/) // not a year
303
- );
304
-
305
- if (validPrices.length > 0) {
306
- const sorted = validPrices.sort((a, b) => a.usd - b.usd);
307
- results.push({
308
- name: pageTitle || domain,
309
- price: sorted[0].price,
310
- currency: sorted[0].currency,
311
- originalPrice: sorted.length > 1 ? sorted[sorted.length - 1].price : null,
312
- method: 'heuristic',
313
- allPrices: sorted.slice(0, 10),
314
- });
315
- }
316
- }
317
-
318
- return results;
319
- }
320
-
321
- // ─── Server-side Fetch & Extract ─────────────────────────────────────
322
-
323
- const UA_LIST = [
324
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
325
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
326
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
327
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15',
328
- ];
329
-
330
- async function fetchAndExtract(url, options = {}) {
331
- const domain = _extractDomain(url);
332
-
333
- // Check cache (< 1 hour old)
334
- const cached = stmts.getByUrl.get(url);
335
- if (cached && !options.force) {
336
- const age = Date.now() - new Date(cached.scraped_at).getTime();
337
- if (age < 3600000) return { cached: true, ...JSON.parse(cached.raw_data), products: [cached] };
338
- }
339
-
340
- try {
341
- const controller = new AbortController();
342
- const timeout = setTimeout(() => controller.abort(), options.timeout || 10000);
343
- const ua = UA_LIST[Math.floor(Math.random() * UA_LIST.length)];
344
-
345
- const resp = await fetch(url, {
346
- headers: {
347
- 'User-Agent': ua,
348
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
349
- 'Accept-Language': 'en-US,en;q=0.9,ar;q=0.8',
350
- 'Accept-Encoding': 'gzip, deflate, br',
351
- 'Sec-Fetch-Dest': 'document',
352
- 'Sec-Fetch-Mode': 'navigate',
353
- 'Sec-Fetch-Site': 'none',
354
- },
355
- signal: controller.signal,
356
- redirect: 'follow',
357
- });
358
- clearTimeout(timeout);
359
-
360
- if (!resp.ok) return { error: `HTTP ${resp.status}`, products: [] };
361
-
362
- const contentType = resp.headers.get('content-type') || '';
363
- if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
364
- return { error: 'Not HTML', products: [] };
365
- }
366
-
367
- const html = await resp.text();
368
- const products = extractFromHtml(html, url);
369
-
370
- // Store results
371
- for (const p of products) {
372
- const id = crypto.randomUUID();
373
- stmts.insertScraped.run(
374
- id, url, domain,
375
- p.name || null, p.price || null, p.currency || 'USD',
376
- p.originalPrice || null, p.availability || null,
377
- p.rating || null, p.reviewCount || null,
378
- p.seller || p.brand || null, p.category || null,
379
- p.method || 'unknown', JSON.stringify(p)
380
- );
381
-
382
- // Record price history
383
- const urlHash = crypto.createHash('sha256').update(url).digest('hex').slice(0, 16);
384
- stmts.insertHistory.run(
385
- crypto.randomUUID(), urlHash, domain,
386
- p.name || null, p.price || null, p.currency || 'USD'
387
- );
388
- }
389
-
390
- return { products, domain, url, fetchedAt: new Date().toISOString() };
391
- } catch (err) {
392
- return { error: err.message, products: [] };
393
- }
394
- }
395
-
396
- // ─── Browser-side extraction (for content script / WAB Browser) ──────
397
- // This generates a script that can be injected into any page via
398
- // webview.executeJavaScript() or chrome content script
399
-
400
- function getBrowserExtractionScript() {
401
- return `
402
- (function() {
403
- 'use strict';
404
- const WAB_EXTRACT = {
405
- // Extract JSON-LD products
406
- getJsonLd() {
407
- const products = [];
408
- document.querySelectorAll('script[type="application/ld+json"]').forEach(el => {
409
- try {
410
- let data = JSON.parse(el.textContent);
411
- if (Array.isArray(data)) data.forEach(d => this._processLd(d, products));
412
- else this._processLd(data, products);
413
- } catch(_) {}
414
- });
415
- return products;
416
- },
417
-
418
- _processLd(data, products) {
419
- if (!data || typeof data !== 'object') return;
420
- if (data['@graph']) { data['@graph'].forEach(i => this._processLd(i, products)); return; }
421
- const type = (data['@type'] || '').toLowerCase();
422
- if (['product','hotel','hotelroom','lodgingbusiness','offer'].includes(type)) {
423
- const offers = data.offers || {};
424
- const offer = Array.isArray(offers) ? offers[0] : offers;
425
- products.push({
426
- name: data.name || null,
427
- price: parseFloat(offer?.price || offer?.lowPrice || data.price) || null,
428
- originalPrice: parseFloat(offer?.highPrice) || null,
429
- currency: offer?.priceCurrency || 'USD',
430
- availability: (offer?.availability || '').replace(/https?:\\/\\/schema\\.org\\//,''),
431
- rating: parseFloat(data.aggregateRating?.ratingValue) || null,
432
- reviewCount: parseInt(data.aggregateRating?.reviewCount) || null,
433
- image: typeof data.image === 'string' ? data.image : data.image?.url || null,
434
- brand: data.brand?.name || data.brand || null,
435
- method: 'json-ld'
436
- });
437
- }
438
- for (const key of Object.keys(data)) {
439
- if (typeof data[key] === 'object' && data[key] !== null && key !== '@context') {
440
- if (Array.isArray(data[key])) data[key].forEach(i => { if (typeof i === 'object') this._processLd(i, products); });
441
- else this._processLd(data[key], products);
442
- }
443
- }
444
- },
445
-
446
- // Extract Open Graph meta
447
- getMeta() {
448
- const m = {};
449
- document.querySelectorAll('meta[property], meta[name]').forEach(el => {
450
- const key = (el.getAttribute('property') || el.getAttribute('name') || '').toLowerCase();
451
- const val = el.getAttribute('content');
452
- if (key && val) m[key] = val;
453
- });
454
- const p = {};
455
- if (m['og:title']) p.name = m['og:title'];
456
- if (m['product:price:amount']) p.price = parseFloat(m['product:price:amount']);
457
- if (m['product:price:currency']) p.currency = m['product:price:currency'];
458
- if (m['og:description']) p.description = m['og:description'];
459
- if (m['og:image']) p.image = m['og:image'];
460
- if (p.name || p.price) { p.method = 'meta-tags'; return p; }
461
- return null;
462
- },
463
-
464
- // Extract prices from visible text
465
- getPrices() {
466
- const patterns = [
467
- /(?:\\$|USD|US\\$)\\s*([\\d,]+\\.?\\d*)/g,
468
- /(?:€|EUR)\\s*([\\d,]+\\.?\\d*)/g,
469
- /(?:£|GBP)\\s*([\\d,]+\\.?\\d*)/g,
470
- /([\\d,]+\\.?\\d*)\\s*(?:SAR|ريال|AED|درهم|TND|دينار|EGP)/g,
471
- /(?:[₺₹¥₩])\\s*([\\d,]+\\.?\\d*)/g,
472
- ];
473
- const prices = [];
474
- const seen = new Set();
475
-
476
- // Target price-like containers first
477
- const priceEls = document.querySelectorAll(
478
- '[class*="price"], [class*="cost"], [class*="amount"], [class*="rate"], ' +
479
- '[data-price], [data-product-price], [itemprop="price"], ' +
480
- '[class*="tarif"], [class*="سعر"]'
481
- );
482
- priceEls.forEach(el => {
483
- const text = el.textContent || el.getAttribute('data-price') || '';
484
- for (const pat of patterns) {
485
- pat.lastIndex = 0;
486
- let m;
487
- while ((m = pat.exec(text)) !== null) {
488
- const num = parseFloat((m[1] || m[0].replace(/[^\\d.,]/g,'')).replace(/,/g,''));
489
- if (num > 0 && num < 100000 && !seen.has(num)) {
490
- seen.add(num);
491
- prices.push({ price: num, raw: m[0].trim(), el: el.className });
492
- }
493
- }
494
- }
495
- });
496
-
497
- return prices.sort((a,b) => a.price - b.price);
498
- },
499
-
500
- // Extract product cards (hotels, flights, items)
501
- getProductCards() {
502
- const cards = [];
503
- const selectors = [
504
- '[class*="product-card"]', '[class*="hotel-card"]', '[class*="listing-card"]',
505
- '[class*="search-result"]', '[class*="offer-card"]', '[class*="deal-card"]',
506
- '[class*="property-card"]', '[class*="sr_item"]', '[class*="result-item"]',
507
- '[data-testid*="property"]', '[data-testid*="product"]', '[data-testid*="listing"]',
508
- ];
509
- const allCards = document.querySelectorAll(selectors.join(','));
510
- allCards.forEach((el, i) => {
511
- if (i >= 20) return; // limit
512
- const title = el.querySelector('[class*="title"], [class*="name"], h2, h3, h4')?.textContent?.trim()?.slice(0, 200);
513
- const priceEl = el.querySelector('[class*="price"], [data-price], [itemprop="price"]');
514
- const priceText = priceEl?.textContent?.trim() || priceEl?.getAttribute('data-price') || '';
515
- const ratingEl = el.querySelector('[class*="rating"], [class*="score"], [aria-label*="rating"], [aria-label*="score"]');
516
- const rating = ratingEl?.textContent?.trim() || ratingEl?.getAttribute('aria-label') || '';
517
- const link = el.querySelector('a[href]')?.href || '';
518
- const img = el.querySelector('img')?.src || '';
519
-
520
- if (title || priceText) {
521
- cards.push({ title, price: priceText, rating, link, image: img, index: i });
522
- }
523
- });
524
- return cards;
525
- },
526
-
527
- // Full extraction — called by WAB Browser or extension
528
- extract() {
529
- const hasWabBridge = typeof window.AICommands !== 'undefined' || typeof window.__wab_bidi !== 'undefined';
530
- return {
531
- url: location.href,
532
- domain: location.hostname,
533
- title: document.title,
534
- jsonLd: this.getJsonLd(),
535
- meta: this.getMeta(),
536
- prices: this.getPrices(),
537
- cards: this.getProductCards(),
538
- timestamp: Date.now(),
539
- hasWabBridge,
540
- wabBridgeType: hasWabBridge
541
- ? (typeof window.__wab_bidi !== 'undefined' ? 'bidi' : 'standard')
542
- : null,
543
- };
544
- }
545
- };
546
-
547
- // Expose for WAB Browser / extension
548
- window.__wab_universal = WAB_EXTRACT;
549
-
550
- // Auto-report if WAB extension is present
551
- if (window.__wab_extension_ready) {
552
- window.postMessage({ type: 'wab-extract', data: WAB_EXTRACT.extract() }, '*');
553
- }
554
-
555
- return WAB_EXTRACT.extract();
556
- })();
557
- `;
558
- }
559
-
560
- // ─── Helpers ─────────────────────────────────────────────────────────
561
-
562
/**
 * Returns the hostname of `url` with a leading "www." removed.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {string} Bare hostname, or '' when `url` cannot be parsed.
 */
function _extractDomain(url) {
  let host;
  try {
    host = new URL(url).hostname;
  } catch (_) {
    // Unparseable input is reported as "no domain" rather than an error.
    return '';
  }
  return host.replace(/^www\./, '');
}
565
-
566
/**
 * Looks up recorded price points for a URL.
 *
 * History rows are keyed by the first 16 hex characters of the URL's SHA-256
 * digest, so the same derivation is applied here before querying.
 *
 * @param {string} url - URL whose history is requested.
 * @param {number} [limit=30] - Maximum number of rows to return.
 * @returns {Array<object>} Rows from `stmts.getHistory`, newest first.
 */
function getPriceHistory(url, limit = 30) {
  const hasher = crypto.createHash('sha256');
  hasher.update(url);
  const historyKey = hasher.digest('hex').slice(0, 16);
  return stmts.getHistory.all(historyKey, limit);
}
570
-
571
/**
 * Returns the most recently scraped rows for a domain.
 *
 * @param {string} domain - Hostname; a leading "www." is ignored.
 * @param {number} [limit=50] - Maximum number of rows to return.
 * @returns {Array<object>} Rows from `stmts.getByDomain`, newest first.
 */
function getScrapedByDomain(domain, limit = 50) {
  const bareDomain = domain.replace(/^www\./, '');
  return stmts.getByDomain.all(bareDomain, limit);
}
574
-
575
- // ─── Process browser extraction data ─────────────────────────────────
576
- // Data sent from WAB Browser webview or Chrome extension content script
577
-
578
/**
 * Normalizes an extraction payload produced in the browser, stores every
 * resulting product, and returns the normalized list.
 *
 * The payload comes from a page context, so its shape is not trusted:
 * `jsonLd` and `cards` are only used when they are real arrays, and entries
 * that are not plain objects are skipped instead of being spread or
 * dereferenced blindly.
 *
 * @param {object} data - Payload with `url` and optional `jsonLd`, `meta`,
 *   `cards`, `prices`, `title`, `hasWabBridge`.
 * @returns {{products: object[], domain: string, url: string, hasWabBridge: *}|{error: string}}
 *   `{ error: 'No URL' }` when the payload has no URL.
 */
function processBrowserExtraction(data) {
  if (!data || !data.url) return { error: 'No URL' };

  const isRecord = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  const domain = _extractDomain(data.url);
  const products = [];

  // Process JSON-LD products
  if (Array.isArray(data.jsonLd)) {
    for (const item of data.jsonLd) {
      if (isRecord(item)) products.push(item);
    }
  }

  // Process meta tags (only useful when they carry a price)
  if (isRecord(data.meta) && data.meta.price) {
    products.push(data.meta);
  }

  // Process product cards from DOM
  if (Array.isArray(data.cards)) {
    for (const card of data.cards) {
      if (!isRecord(card)) continue;
      const prices = extractPrices(card.price || '');
      if (prices.length > 0 || card.title) {
        products.push({
          name: card.title || 'Unknown',
          price: prices[0]?.price || null,
          currency: prices[0]?.currency || 'USD',
          rating: parseFloat(card.rating) || null,
          url: card.link || data.url,
          image: card.image || null,
          method: 'dom-cards',
        });
      }
    }
  }

  // Fallback: use raw prices when nothing structured was found
  if (products.length === 0 && Array.isArray(data.prices) && data.prices.length > 0) {
    products.push({
      name: data.title || domain,
      price: data.prices[0]?.price ?? null,
      currency: 'USD',
      method: 'dom-prices',
      allPrices: data.prices,
    });
  }

  // The history key depends only on the page URL, so derive it once.
  const urlHash = crypto.createHash('sha256').update(String(data.url)).digest('hex').slice(0, 16);

  // Store in database
  for (const p of products) {
    try {
      stmts.insertScraped.run(
        crypto.randomUUID(), data.url, domain,
        p.name || null, p.price || null, p.currency || 'USD',
        p.originalPrice || null, p.availability || null,
        p.rating || null, p.reviewCount || null,
        p.seller || p.brand || null, p.category || null,
        p.method || 'browser', JSON.stringify(p)
      );

      stmts.insertHistory.run(
        crypto.randomUUID(), urlHash, domain,
        p.name || null, p.price || null, p.currency || 'USD'
      );
    } catch (_) {
      // Persistence is best-effort: a row that cannot be stored must not
      // prevent the extracted products from being returned to the caller.
    }
  }

  return { products, domain, url: data.url, hasWabBridge: data.hasWabBridge };
}
646
-
647
- // ─── Exports ─────────────────────────────────────────────────────────
648
-
649
// Public API of the scraper module: the individual extractors, the
// fetch/browser entry points, the stored-data accessors and the currency helpers.
module.exports = {
  extractPrices,
  extractJsonLd,
  extractMetaTags,
  extractFromHtml,
  fetchAndExtract,
  getBrowserExtractionScript,
  processBrowserExtraction,
  getPriceHistory,
  getScrapedByDomain,
  normalizeCurrency,
  toUSD,
};
1
+ /**
2
+ * WAB Universal Scraper Engine
3
+ * ═══════════════════════════════════════════════════════════════════
4
+ * Works with ANY website — no script installation required.
5
+ * Extracts prices, products, availability from raw HTML/DOM.
6
+ *
7
+ * Three extraction modes:
8
+ * 1. Schema.org JSON-LD (structured, most reliable)
9
+ * 2. Open Graph / Meta tags (semi-structured)
10
+ * 3. DOM pattern matching (heuristic, any site)
11
+ *
12
+ * Used by: WAB Browser (webview), Chrome Extension (content script),
13
+ * Server-side fetch (Node.js)
14
+ */
15
+
16
+ const crypto = require('crypto');
17
+ const { db } = require('../models/db');
18
+ const { safeFetch } = require('../utils/safe-fetch');
19
+
20
+ // ─── Schema ──────────────────────────────────────────────────────────
21
+
22
+ db.exec(`
23
+ CREATE TABLE IF NOT EXISTS scraped_prices (
24
+ id TEXT PRIMARY KEY,
25
+ url TEXT NOT NULL,
26
+ domain TEXT NOT NULL,
27
+ product_name TEXT,
28
+ price REAL,
29
+ currency TEXT DEFAULT 'USD',
30
+ original_price REAL,
31
+ availability TEXT,
32
+ rating REAL,
33
+ review_count INTEGER,
34
+ seller TEXT,
35
+ category TEXT,
36
+ extraction_method TEXT,
37
+ raw_data TEXT DEFAULT '{}',
38
+ scraped_at TEXT DEFAULT (datetime('now'))
39
+ );
40
+
41
+ CREATE TABLE IF NOT EXISTS universal_price_history (
42
+ id TEXT PRIMARY KEY,
43
+ url_hash TEXT NOT NULL,
44
+ domain TEXT NOT NULL,
45
+ product_name TEXT,
46
+ price REAL,
47
+ currency TEXT DEFAULT 'USD',
48
+ recorded_at TEXT DEFAULT (datetime('now'))
49
+ );
50
+
51
+ CREATE INDEX IF NOT EXISTS idx_scraped_domain ON scraped_prices(domain);
52
+ CREATE INDEX IF NOT EXISTS idx_scraped_url ON scraped_prices(url);
53
+ CREATE INDEX IF NOT EXISTS idx_uph_hash ON universal_price_history(url_hash);
54
+ `);
55
+
56
// Prepared statements shared by the functions in this module.
const stmts = {
  // Insert (or replace, on id conflict) one scraped product; 14 positional values.
  insertScraped: db.prepare(`INSERT OR REPLACE INTO scraped_prices
    (id, url, domain, product_name, price, currency, original_price,
    availability, rating, review_count, seller, category, extraction_method, raw_data)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`),
  // Most recent row for an exact URL.
  getByUrl: db.prepare('SELECT * FROM scraped_prices WHERE url = ? ORDER BY scraped_at DESC LIMIT 1'),
  // Most recent rows for a domain; second parameter is the row limit.
  getByDomain: db.prepare('SELECT * FROM scraped_prices WHERE domain = ? ORDER BY scraped_at DESC LIMIT ?'),
  // Append one price point to the history table; 6 positional values.
  insertHistory: db.prepare(`INSERT INTO universal_price_history
    (id, url_hash, domain, product_name, price, currency) VALUES (?, ?, ?, ?, ?, ?)`),
  // Most recent price points for a url_hash; second parameter is the row limit.
  getHistory: db.prepare('SELECT * FROM universal_price_history WHERE url_hash = ? ORDER BY recorded_at DESC LIMIT ?'),
};
67
+
68
+ // ─── Currency Normalization ──────────────────────────────────────────
69
+
70
/**
 * Maps currency symbols, ISO codes and Arabic currency names/abbreviations
 * to ISO 4217 codes.
 *
 * The euro, lira and rupee symbols must be present as real characters: as
 * empty-string keys they collapse into a single '' entry, which makes those
 * symbols fall back to USD and makes blank input resolve to the last duplicate.
 */
const CURRENCY_MAP = {
  '$': 'USD', 'USD': 'USD', 'US$': 'USD',
  '€': 'EUR', 'EUR': 'EUR',
  '£': 'GBP', 'GBP': 'GBP',
  'SAR': 'SAR', 'ريال': 'SAR', 'ر.س': 'SAR',
  'AED': 'AED', 'درهم': 'AED', 'د.إ': 'AED',
  'TND': 'TND', 'دينار': 'TND', 'د.ت': 'TND',
  'EGP': 'EGP', 'ج.م': 'EGP',
  'MAD': 'MAD', 'د.م': 'MAD',
  'TRY': 'TRY', '₺': 'TRY',
  'JPY': 'JPY', '¥': 'JPY',
  'INR': 'INR', '₹': 'INR',
  'KRW': 'KRW', '₩': 'KRW',
};

// Approximate USD rates for comparison
const TO_USD = {
  USD: 1, EUR: 1.08, GBP: 1.27, SAR: 0.27, AED: 0.27,
  TND: 0.32, EGP: 0.032, MAD: 0.10, TRY: 0.031,
  JPY: 0.0067, INR: 0.012, KRW: 0.00074,
};

/**
 * Resolves a currency symbol, code or name to an ISO 4217 code.
 *
 * The upper-cased form is tried first (so "usd" matches), then the trimmed
 * input as given. Only the map's own keys are consulted, so inherited
 * property names such as "constructor" are never returned.
 *
 * @param {string} symbol - Symbol, code or name as found in page text.
 * @returns {string} ISO code, or 'USD' when the input is empty or unknown.
 */
function normalizeCurrency(symbol) {
  if (!symbol) return 'USD';
  const trimmed = String(symbol).trim();
  for (const candidate of [trimmed.toUpperCase(), trimmed]) {
    if (Object.prototype.hasOwnProperty.call(CURRENCY_MAP, candidate)) {
      return CURRENCY_MAP[candidate];
    }
  }
  return 'USD';
}

/**
 * Converts an amount to approximate US dollars, rounded to two decimals.
 *
 * @param {number} price - Amount in `currency`.
 * @param {string} currency - ISO code; unknown codes use a rate of 1.
 * @returns {number} Approximate USD value.
 */
function toUSD(price, currency) {
  const known = Object.prototype.hasOwnProperty.call(TO_USD, currency);
  const rate = (known && TO_USD[currency]) || 1;
  return Math.round(price * rate * 100) / 100;
}
102
+
103
+ // ─── Price Extraction ────────────────────────────────────────────────
104
+
105
// Price patterns tried in order by extractPrices. Each one exposes the named
// groups `currency` and `price`. All carry the `g` flag, so they keep
// `lastIndex` state between exec() calls; extractPrices resets it before use.
const PRICE_PATTERNS = [
  // Prefix dollar: $123.45, $ 123.45, USD 123.45, US$123.45
  /(?<currency>\$|USD|US\$)\s*(?<price>[\d,]+\.?\d*)/gi,
  // Prefix euro: €123.45 or EUR 123.45
  /(?<currency>€|EUR)\s*(?<price>[\d,]+\.?\d*)/gi,
  // Prefix pound: £123.45 or GBP 123.45
  /(?<currency>£|GBP)\s*(?<price>[\d,]+\.?\d*)/gi,
  // Suffix codes and Arabic names/abbreviations: 123.45 SAR / ريال
  /(?<price>[\d,]+\.?\d*)\s*(?<currency>SAR|ريال|ر\.س|AED|درهم|د\.إ|TND|دينار|د\.ت|EGP|ج\.م|MAD|د\.م)/gi,
  // ₺ ₹ ¥ ₩ prefixed
  /(?<currency>[₺₹¥₩])\s*(?<price>[\d,]+\.?\d*)/gi,
  // Suffix symbol: 123.45$, 123.45€, 123.45£
  /(?<price>[\d,]+\.?\d*)\s*(?<currency>\$|€|£)/gi,
];
119
+
120
/**
 * Finds currency-tagged amounts in free text.
 *
 * Every pattern in PRICE_PATTERNS is scanned over the whole text. Amounts
 * that are not positive or exceed 1,000,000 are ignored, and each
 * (amount, currency) pair is reported once.
 *
 * @param {string} text - Text to scan.
 * @returns {Array<{price: number, currency: string, usd: number, raw: string}>}
 *   Matches ordered by ascending approximate USD value.
 */
function extractPrices(text) {
  const found = [];
  const alreadyReported = new Set();

  PRICE_PATTERNS.forEach((pattern) => {
    // The patterns are shared and global, so always start from the beginning.
    pattern.lastIndex = 0;

    for (let match = pattern.exec(text); match !== null; match = pattern.exec(text)) {
      const rawAmount = match.groups?.price || match[2] || match[1];
      const symbol = match.groups?.currency || match[1] || match[2];
      if (!rawAmount) continue;

      const amount = parseFloat(rawAmount.replace(/,/g, ''));
      const inRange = !isNaN(amount) && amount > 0 && amount <= 1000000;
      if (!inRange) continue;

      const currency = normalizeCurrency(symbol);
      const dedupeKey = `${amount}-${currency}`;
      if (alreadyReported.has(dedupeKey)) continue;
      alreadyReported.add(dedupeKey);

      found.push({
        price: amount,
        currency,
        usd: toUSD(amount, currency),
        raw: match[0].trim(),
      });
    }
  });

  return found.sort((left, right) => left.usd - right.usd);
}
146
+
147
+ // ─── Schema.org JSON-LD Extraction ───────────────────────────────────
148
+
149
/**
 * Collects product-like entries from every JSON-LD script tag in an HTML string.
 *
 * @param {string} html - Raw page markup.
 * @returns {object[]} Products gathered by processJsonLd, in document order.
 */
function extractJsonLd(html) {
  const collected = [];
  const scriptTag = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

  for (let hit = scriptTag.exec(html); hit !== null; hit = scriptTag.exec(html)) {
    try {
      const payload = JSON.parse(hit[1].trim());
      const roots = Array.isArray(payload) ? payload : [payload];
      roots.forEach((root) => processJsonLd(root, collected));
    } catch (_) {
      // A script that fails to parse or process is skipped; the others still count.
    }
  }

  return collected;
}
164
+
165
/**
 * Walks a parsed JSON-LD node and appends a normalized product for every
 * Product / Hotel / HotelRoom / LodgingBusiness / Offer node found.
 *
 * `@type` may be a single string or an array of strings; a node qualifies
 * when any of its types matches. Fields whose value has an unexpected shape
 * (non-string availability or description) are reported as null/'' instead
 * of throwing, so one odd node does not discard the rest of the document.
 *
 * @param {*} data - Parsed JSON-LD node; non-objects are ignored.
 * @param {object[]} products - Output array, appended to in place.
 * @returns {void}
 */
function processJsonLd(data, products) {
  if (!data || typeof data !== 'object') return;

  // Handle @graph arrays
  if (data['@graph'] && Array.isArray(data['@graph'])) {
    data['@graph'].forEach(item => processJsonLd(item, products));
    return;
  }

  const pricedTypes = ['product', 'hotel', 'hotelroom', 'lodgingbusiness', 'offer'];
  const declared = Array.isArray(data['@type']) ? data['@type'] : [data['@type']];
  const isPriced = declared.some(
    t => typeof t === 'string' && pricedTypes.includes(t.toLowerCase())
  );

  if (isPriced) {
    const offers = data.offers || data.priceSpecification || {};
    const offer = Array.isArray(offers) ? offers[0] : offers;

    const availability = typeof offer?.availability === 'string'
      ? offer.availability.replace('https://schema.org/', '').replace('http://schema.org/', '')
      : '';
    const description = typeof data.description === 'string' ? data.description : '';
    const rating = data.aggregateRating;

    products.push({
      name: data.name || data.headline || null,
      price: parseFloat(offer?.price || offer?.lowPrice || data.price) || null,
      originalPrice: parseFloat(offer?.highPrice) || null,
      currency: offer?.priceCurrency || 'USD',
      availability: availability || null,
      rating: parseFloat(rating?.ratingValue) || null,
      reviewCount: parseInt(rating?.reviewCount || rating?.ratingCount, 10) || null,
      image: data.image?.url || (typeof data.image === 'string' ? data.image : null),
      description: description.slice(0, 500),
      brand: data.brand?.name || data.brand || null,
      sku: data.sku || null,
      url: data.url || null,
      method: 'json-ld',
    });
  }

  // Recurse into nested objects (everything except the @context definition)
  for (const key of Object.keys(data)) {
    const value = data[key];
    if (typeof value !== 'object' || value === null || key === '@context') continue;
    if (Array.isArray(value)) {
      value.forEach(item => {
        if (typeof item === 'object') processJsonLd(item, products);
      });
    } else {
      processJsonLd(value, products);
    }
  }
}
211
+
212
+ // ─── Open Graph / Meta Tag Extraction ────────────────────────────────
213
+
214
/**
 * Builds a product description from Open Graph / product meta tags.
 *
 * Each <meta> tag is split into its attributes, so a value may contain the
 * other kind of quote (content="Bob's") and attributes whose names merely
 * end in "name" (data-name) are not mistaken for the key. When a tag carries
 * both `property` and `name`, `property` wins, as in the browser-side getMeta.
 *
 * @param {string} html - Raw page markup.
 * @returns {object|null} Product with `method: 'meta-tags'`, or null when
 *   neither a title nor a price was found.
 */
function extractMetaTags(html) {
  const meta = Object.create(null);
  const tagRegex = /<meta\b[^>]*>/gi;
  // name = "double" | 'single' | bare
  const attrRegex = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;

  for (const [tag] of String(html ?? '').matchAll(tagRegex)) {
    const attrs = Object.create(null);
    for (const [, rawName, doubleQuoted, singleQuoted, bare] of tag.matchAll(attrRegex)) {
      const attrName = rawName.toLowerCase();
      // As in HTML parsing, the first occurrence of a repeated attribute wins.
      if (!(attrName in attrs)) attrs[attrName] = doubleQuoted ?? singleQuoted ?? bare ?? '';
    }

    const key = (attrs.property || attrs.name || '').toLowerCase();
    const value = attrs.content || '';
    if (key && value) meta[key] = value;
  }

  const product = {};
  if (meta['og:title']) product.name = meta['og:title'];
  if (meta['product:price:amount']) product.price = parseFloat(meta['product:price:amount']);
  if (meta['product:price:currency']) product.currency = meta['product:price:currency'];
  if (meta['og:description']) product.description = meta['og:description'].slice(0, 500);
  if (meta['og:image']) product.image = meta['og:image'];
  if (meta['product:availability']) product.availability = meta['product:availability'];
  if (meta['product:brand']) product.brand = meta['product:brand'];
  if (meta['og:type']) product.type = meta['og:type'];
  if (meta['og:url']) product.url = meta['og:url'];

  if (product.name || product.price) {
    product.method = 'meta-tags';
    return product;
  }
  return null;
}
242
+
243
+ // ─── DOM Heuristic Extraction ────────────────────────────────────────
244
+ // This runs either on server (from fetched HTML) or in browser (content script)
245
+
246
/**
 * Extracts products from raw HTML, preferring structured data.
 *
 * Order of preference: JSON-LD products, then a priced Open Graph product,
 * then (only when neither produced anything) a single heuristic product built
 * from prices found in price-like markup or, failing that, in the visible text.
 *
 * @param {string} html - Raw page markup.
 * @param {string} url - Page URL, used for the fallback product name.
 * @returns {object[]} Extracted products; may be empty.
 */
function extractFromHtml(html, url) {
  const domain = _extractDomain(url);

  // 1. JSON-LD (most reliable)
  const results = [...extractJsonLd(html)];

  // 2. Open Graph meta tags, only when they carry a price
  const metaProduct = extractMetaTags(html);
  if (metaProduct && metaProduct.price) {
    results.push(metaProduct);
  }

  // 3. Heuristics
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  let pageTitle = '';
  if (titleMatch) {
    pageTitle = titleMatch[1].replace(/\s+/g, ' ').trim();
  }

  const priceContainerPatterns = [
    // Text of elements whose class name looks price-related
    /class="[^"]*(?:price|cost|amount|tarif|سعر|ثمن)[^"]*"[^>]*>([^<]{1,100})</g,
    /class="[^"]*(?:product-price|item-price|sale-price|offer-price|current-price)[^"]*"[^>]*>([^<]{1,100})</g,
    /class="[^"]*(?:room-price|rate-price|nightly-rate|total-price)[^"]*"[^>]*>([^<]{1,100})</g,
    // data-price attributes
    /data-price=["']([^"']+)["']/gi,
    /data-product-price=["']([^"']+)["']/gi,
  ];

  const rawPrices = priceContainerPatterns.flatMap((pattern) =>
    [...html.matchAll(pattern)].flatMap((hit) => extractPrices(hit[1]))
  );

  const nothingStructured = results.length === 0;

  // Broad sweep over visible text when nothing at all was found so far
  if (nothingStructured && rawPrices.length === 0) {
    const visibleText = html
      .replace(/<script[\s\S]*?<\/script>/gi, '')
      .replace(/<style[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ');
    rawPrices.push(...extractPrices(visibleText));
  }

  if (nothingStructured && rawPrices.length > 0) {
    // Keep plausible product prices: within range and not a bare year
    const looksLikeYear = /^(19|20)\d{2}$/;
    const validPrices = rawPrices.filter((candidate) =>
      candidate.usd >= 1 &&
      candidate.usd <= 50000 &&
      !looksLikeYear.test(String(candidate.price))
    );

    if (validPrices.length > 0) {
      validPrices.sort((left, right) => left.usd - right.usd);
      const cheapest = validPrices[0];
      const dearest = validPrices[validPrices.length - 1];
      results.push({
        name: pageTitle || domain,
        price: cheapest.price,
        currency: cheapest.currency,
        originalPrice: validPrices.length > 1 ? dearest.price : null,
        method: 'heuristic',
        allPrices: validPrices.slice(0, 10),
      });
    }
  }

  return results;
}
321
+
322
+ // ─── Server-side Fetch & Extract ─────────────────────────────────────
323
+
324
// Desktop browser User-Agent strings; fetchAndExtract picks one at random
// for each request.
const UA_LIST = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15',
];
330
+
331
/**
 * Converts a stored timestamp to epoch milliseconds.
 *
 * SQLite's datetime('now') yields "YYYY-MM-DD HH:MM:SS" in UTC with no zone
 * marker; parsed as-is, JavaScript would read it as local time. Values in
 * that exact shape are therefore parsed as UTC; anything else is handed to
 * Date.parse unchanged.
 *
 * @param {*} value - Stored timestamp.
 * @returns {number} Epoch milliseconds, or NaN when unparseable.
 */
function _sqliteUtcToMs(value) {
  const text = String(value);
  const isBareSqliteUtc = /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/.test(text);
  return Date.parse(isBareSqliteUtc ? `${text.replace(' ', 'T')}Z` : text);
}

/**
 * Fetches a page, extracts its products, stores them and returns the result.
 *
 * A stored result younger than one hour is returned instead of fetching,
 * unless `options.force` is set. Fetch, extraction and storage failures are
 * reported as `{ error, products: [] }` rather than thrown.
 *
 * @param {string} url - Page to scrape.
 * @param {object} [options]
 * @param {boolean} [options.force] - Skip the cache check.
 * @param {number} [options.timeout] - Request timeout in ms (default 10000).
 * @param {number} [options.maxBytes] - Response size cap (default 5 MiB).
 * @returns {Promise<object>} `{ products, domain, url, fetchedAt }`, a cached
 *   result flagged with `cached: true`, or `{ error, products: [] }`.
 */
async function fetchAndExtract(url, options = {}) {
  const domain = _extractDomain(url);
  const cacheTtlMs = 3600000;

  // Check cache (< 1 hour old)
  const cached = stmts.getByUrl.get(url);
  if (cached && !options.force) {
    // An unparseable timestamp gives NaN, which fails the comparison and
    // falls through to a fresh fetch.
    const age = Date.now() - _sqliteUtcToMs(cached.scraped_at);
    if (age < cacheTtlMs) {
      try {
        return { cached: true, ...JSON.parse(cached.raw_data), products: [cached] };
      } catch (_) {
        // Corrupt cached payload: ignore it and fetch the page again.
      }
    }
  }

  try {
    const ua = UA_LIST[Math.floor(Math.random() * UA_LIST.length)];

    const resp = await safeFetch(url, {
      headers: {
        'User-Agent': ua,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9,ar;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
      },
    }, {
      timeoutMs: options.timeout || 10000,
      maxBytes: options.maxBytes || 5 * 1024 * 1024,
      maxRedirects: 3,
      allowedContentTypes: ['text/html', 'application/xhtml', 'application/xml'],
    });

    if (!resp.ok) return { error: `HTTP ${resp.status}`, products: [] };

    const contentType = resp.headers.get('content-type') || '';
    if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
      return { error: 'Not HTML', products: [] };
    }

    const html = await resp.text();
    const products = extractFromHtml(html, url);

    // The history key depends only on the URL, so derive it once.
    const urlHash = crypto.createHash('sha256').update(url).digest('hex').slice(0, 16);

    // Store results
    for (const p of products) {
      stmts.insertScraped.run(
        crypto.randomUUID(), url, domain,
        p.name || null, p.price || null, p.currency || 'USD',
        p.originalPrice || null, p.availability || null,
        p.rating || null, p.reviewCount || null,
        p.seller || p.brand || null, p.category || null,
        p.method || 'unknown', JSON.stringify(p)
      );

      // Record price history
      stmts.insertHistory.run(
        crypto.randomUUID(), urlHash, domain,
        p.name || null, p.price || null, p.currency || 'USD'
      );
    }

    return { products, domain, url, fetchedAt: new Date().toISOString() };
  } catch (err) {
    return { error: err.message, products: [] };
  }
}
396
+
397
+ // ─── Browser-side extraction (for content script / WAB Browser) ──────
398
+ // This generates a script that can be injected into any page via
399
+ // webview.executeJavaScript() or chrome content script
400
+
401
/**
 * Returns the source text of a self-invoking script meant to be evaluated
 * inside a web page.
 *
 * When run, the script builds a WAB_EXTRACT object with JSON-LD, meta-tag,
 * price and product-card extractors, exposes it as `window.__wab_universal`,
 * posts the extraction result via `window.postMessage` when
 * `window.__wab_extension_ready` is set, and evaluates to the result of
 * `WAB_EXTRACT.extract()`.
 *
 * The text below is a template literal, so backslashes meant for the embedded
 * regular expressions are doubled.
 *
 * NOTE(review): the auto-report uses '*' as the postMessage target origin, so
 * any listener in the page can read the payload — confirm this is intended.
 *
 * @returns {string} JavaScript source to inject.
 */
function getBrowserExtractionScript() {
  return `
    (function() {
      'use strict';
      const WAB_EXTRACT = {
        // Extract JSON-LD products
        getJsonLd() {
          const products = [];
          document.querySelectorAll('script[type="application/ld+json"]').forEach(el => {
            try {
              let data = JSON.parse(el.textContent);
              if (Array.isArray(data)) data.forEach(d => this._processLd(d, products));
              else this._processLd(data, products);
            } catch(_) {}
          });
          return products;
        },

        _processLd(data, products) {
          if (!data || typeof data !== 'object') return;
          if (data['@graph']) { data['@graph'].forEach(i => this._processLd(i, products)); return; }
          const type = (data['@type'] || '').toLowerCase();
          if (['product','hotel','hotelroom','lodgingbusiness','offer'].includes(type)) {
            const offers = data.offers || {};
            const offer = Array.isArray(offers) ? offers[0] : offers;
            products.push({
              name: data.name || null,
              price: parseFloat(offer?.price || offer?.lowPrice || data.price) || null,
              originalPrice: parseFloat(offer?.highPrice) || null,
              currency: offer?.priceCurrency || 'USD',
              availability: (offer?.availability || '').replace(/https?:\\/\\/schema\\.org\\//,''),
              rating: parseFloat(data.aggregateRating?.ratingValue) || null,
              reviewCount: parseInt(data.aggregateRating?.reviewCount) || null,
              image: typeof data.image === 'string' ? data.image : data.image?.url || null,
              brand: data.brand?.name || data.brand || null,
              method: 'json-ld'
            });
          }
          for (const key of Object.keys(data)) {
            if (typeof data[key] === 'object' && data[key] !== null && key !== '@context') {
              if (Array.isArray(data[key])) data[key].forEach(i => { if (typeof i === 'object') this._processLd(i, products); });
              else this._processLd(data[key], products);
            }
          }
        },

        // Extract Open Graph meta
        getMeta() {
          const m = {};
          document.querySelectorAll('meta[property], meta[name]').forEach(el => {
            const key = (el.getAttribute('property') || el.getAttribute('name') || '').toLowerCase();
            const val = el.getAttribute('content');
            if (key && val) m[key] = val;
          });
          const p = {};
          if (m['og:title']) p.name = m['og:title'];
          if (m['product:price:amount']) p.price = parseFloat(m['product:price:amount']);
          if (m['product:price:currency']) p.currency = m['product:price:currency'];
          if (m['og:description']) p.description = m['og:description'];
          if (m['og:image']) p.image = m['og:image'];
          if (p.name || p.price) { p.method = 'meta-tags'; return p; }
          return null;
        },

        // Extract prices from visible text
        getPrices() {
          const patterns = [
            /(?:\\$|USD|US\\$)\\s*([\\d,]+\\.?\\d*)/g,
            /(?:€|EUR)\\s*([\\d,]+\\.?\\d*)/g,
            /(?:£|GBP)\\s*([\\d,]+\\.?\\d*)/g,
            /([\\d,]+\\.?\\d*)\\s*(?:SAR|ريال|AED|درهم|TND|دينار|EGP)/g,
            /(?:[₺₹¥₩])\\s*([\\d,]+\\.?\\d*)/g,
          ];
          const prices = [];
          const seen = new Set();

          // Target price-like containers first
          const priceEls = document.querySelectorAll(
            '[class*="price"], [class*="cost"], [class*="amount"], [class*="rate"], ' +
            '[data-price], [data-product-price], [itemprop="price"], ' +
            '[class*="tarif"], [class*="سعر"]'
          );
          priceEls.forEach(el => {
            const text = el.textContent || el.getAttribute('data-price') || '';
            for (const pat of patterns) {
              pat.lastIndex = 0;
              let m;
              while ((m = pat.exec(text)) !== null) {
                const num = parseFloat((m[1] || m[0].replace(/[^\\d.,]/g,'')).replace(/,/g,''));
                if (num > 0 && num < 100000 && !seen.has(num)) {
                  seen.add(num);
                  prices.push({ price: num, raw: m[0].trim(), el: el.className });
                }
              }
            }
          });

          return prices.sort((a,b) => a.price - b.price);
        },

        // Extract product cards (hotels, flights, items)
        getProductCards() {
          const cards = [];
          const selectors = [
            '[class*="product-card"]', '[class*="hotel-card"]', '[class*="listing-card"]',
            '[class*="search-result"]', '[class*="offer-card"]', '[class*="deal-card"]',
            '[class*="property-card"]', '[class*="sr_item"]', '[class*="result-item"]',
            '[data-testid*="property"]', '[data-testid*="product"]', '[data-testid*="listing"]',
          ];
          const allCards = document.querySelectorAll(selectors.join(','));
          allCards.forEach((el, i) => {
            if (i >= 20) return; // limit
            const title = el.querySelector('[class*="title"], [class*="name"], h2, h3, h4')?.textContent?.trim()?.slice(0, 200);
            const priceEl = el.querySelector('[class*="price"], [data-price], [itemprop="price"]');
            const priceText = priceEl?.textContent?.trim() || priceEl?.getAttribute('data-price') || '';
            const ratingEl = el.querySelector('[class*="rating"], [class*="score"], [aria-label*="rating"], [aria-label*="score"]');
            const rating = ratingEl?.textContent?.trim() || ratingEl?.getAttribute('aria-label') || '';
            const link = el.querySelector('a[href]')?.href || '';
            const img = el.querySelector('img')?.src || '';

            if (title || priceText) {
              cards.push({ title, price: priceText, rating, link, image: img, index: i });
            }
          });
          return cards;
        },

        // Full extraction — called by WAB Browser or extension
        extract() {
          const hasWabBridge = typeof window.AICommands !== 'undefined' || typeof window.__wab_bidi !== 'undefined';
          return {
            url: location.href,
            domain: location.hostname,
            title: document.title,
            jsonLd: this.getJsonLd(),
            meta: this.getMeta(),
            prices: this.getPrices(),
            cards: this.getProductCards(),
            timestamp: Date.now(),
            hasWabBridge,
            wabBridgeType: hasWabBridge
              ? (typeof window.__wab_bidi !== 'undefined' ? 'bidi' : 'standard')
              : null,
          };
        }
      };

      // Expose for WAB Browser / extension
      window.__wab_universal = WAB_EXTRACT;

      // Auto-report if WAB extension is present
      if (window.__wab_extension_ready) {
        window.postMessage({ type: 'wab-extract', data: WAB_EXTRACT.extract() }, '*');
      }

      return WAB_EXTRACT.extract();
    })();
  `;
}
560
+
561
+ // ─── Helpers ─────────────────────────────────────────────────────────
562
+
563
/**
 * Returns the hostname of `url` with a leading "www." removed.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {string} Bare hostname, or '' when `url` cannot be parsed.
 */
function _extractDomain(url) {
  let host;
  try {
    host = new URL(url).hostname;
  } catch (_) {
    // Unparseable input is reported as "no domain" rather than an error.
    return '';
  }
  return host.replace(/^www\./, '');
}
566
+
567
/**
 * Looks up recorded price points for a URL.
 *
 * History rows are keyed by the first 16 hex characters of the URL's SHA-256
 * digest, so the same derivation is applied here before querying.
 *
 * @param {string} url - URL whose history is requested.
 * @param {number} [limit=30] - Maximum number of rows to return.
 * @returns {Array<object>} Rows from `stmts.getHistory`, newest first.
 */
function getPriceHistory(url, limit = 30) {
  const hasher = crypto.createHash('sha256');
  hasher.update(url);
  const historyKey = hasher.digest('hex').slice(0, 16);
  return stmts.getHistory.all(historyKey, limit);
}
571
+
572
/**
 * Returns the most recently scraped rows for a domain.
 *
 * @param {string} domain - Hostname; a leading "www." is ignored.
 * @param {number} [limit=50] - Maximum number of rows to return.
 * @returns {Array<object>} Rows from `stmts.getByDomain`, newest first.
 */
function getScrapedByDomain(domain, limit = 50) {
  const bareDomain = domain.replace(/^www\./, '');
  return stmts.getByDomain.all(bareDomain, limit);
}
575
+
576
+ // ─── Process browser extraction data ─────────────────────────────────
577
+ // Data sent from WAB Browser webview or Chrome extension content script
578
+
579
/**
 * Normalizes an extraction payload produced in the browser, stores every
 * resulting product, and returns the normalized list.
 *
 * The payload comes from a page context, so its shape is not trusted:
 * `jsonLd` and `cards` are only used when they are real arrays, and entries
 * that are not plain objects are skipped instead of being spread or
 * dereferenced blindly.
 *
 * @param {object} data - Payload with `url` and optional `jsonLd`, `meta`,
 *   `cards`, `prices`, `title`, `hasWabBridge`.
 * @returns {{products: object[], domain: string, url: string, hasWabBridge: *}|{error: string}}
 *   `{ error: 'No URL' }` when the payload has no URL.
 */
function processBrowserExtraction(data) {
  if (!data || !data.url) return { error: 'No URL' };

  const isRecord = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  const domain = _extractDomain(data.url);
  const products = [];

  // Process JSON-LD products
  if (Array.isArray(data.jsonLd)) {
    for (const item of data.jsonLd) {
      if (isRecord(item)) products.push(item);
    }
  }

  // Process meta tags (only useful when they carry a price)
  if (isRecord(data.meta) && data.meta.price) {
    products.push(data.meta);
  }

  // Process product cards from DOM
  if (Array.isArray(data.cards)) {
    for (const card of data.cards) {
      if (!isRecord(card)) continue;
      const prices = extractPrices(card.price || '');
      if (prices.length > 0 || card.title) {
        products.push({
          name: card.title || 'Unknown',
          price: prices[0]?.price || null,
          currency: prices[0]?.currency || 'USD',
          rating: parseFloat(card.rating) || null,
          url: card.link || data.url,
          image: card.image || null,
          method: 'dom-cards',
        });
      }
    }
  }

  // Fallback: use raw prices when nothing structured was found
  if (products.length === 0 && Array.isArray(data.prices) && data.prices.length > 0) {
    products.push({
      name: data.title || domain,
      price: data.prices[0]?.price ?? null,
      currency: 'USD',
      method: 'dom-prices',
      allPrices: data.prices,
    });
  }

  // The history key depends only on the page URL, so derive it once.
  const urlHash = crypto.createHash('sha256').update(String(data.url)).digest('hex').slice(0, 16);

  // Store in database
  for (const p of products) {
    try {
      stmts.insertScraped.run(
        crypto.randomUUID(), data.url, domain,
        p.name || null, p.price || null, p.currency || 'USD',
        p.originalPrice || null, p.availability || null,
        p.rating || null, p.reviewCount || null,
        p.seller || p.brand || null, p.category || null,
        p.method || 'browser', JSON.stringify(p)
      );

      stmts.insertHistory.run(
        crypto.randomUUID(), urlHash, domain,
        p.name || null, p.price || null, p.currency || 'USD'
      );
    } catch (_) {
      // Persistence is best-effort: a row that cannot be stored must not
      // prevent the extracted products from being returned to the caller.
    }
  }

  return { products, domain, url: data.url, hasWabBridge: data.hasWabBridge };
}
647
+
648
+ // ─── Exports ─────────────────────────────────────────────────────────
649
+
650
// Public API of the scraper module: the individual extractors, the
// fetch/browser entry points, the stored-data accessors and the currency helpers.
module.exports = {
  extractPrices,
  extractJsonLd,
  extractMetaTags,
  extractFromHtml,
  fetchAndExtract,
  getBrowserExtractionScript,
  processBrowserExtraction,
  getPriceHistory,
  getScrapedByDomain,
  normalizeCurrency,
  toUSD,
};