@doppelgangerdev/doppelganger 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/.dockerignore +9 -0
  2. package/.github/workflows/docker-publish.yml +59 -0
  3. package/CODE_OF_CONDUCT.md +28 -0
  4. package/CONTRIBUTING.md +42 -0
  5. package/Dockerfile +44 -0
  6. package/LICENSE +163 -0
  7. package/README.md +133 -0
  8. package/TERMS.md +16 -0
  9. package/THIRD_PARTY_LICENSES.md +3502 -0
  10. package/agent.js +1240 -0
  11. package/headful.js +171 -0
  12. package/index.html +21 -0
  13. package/n8n-nodes-doppelganger/LICENSE +201 -0
  14. package/n8n-nodes-doppelganger/README.md +42 -0
  15. package/n8n-nodes-doppelganger/package-lock.json +6128 -0
  16. package/n8n-nodes-doppelganger/package.json +36 -0
  17. package/n8n-nodes-doppelganger/src/credentials/DoppelgangerApi.credentials.ts +35 -0
  18. package/n8n-nodes-doppelganger/src/index.ts +4 -0
  19. package/n8n-nodes-doppelganger/src/nodes/Doppelganger/Doppelganger.node.ts +147 -0
  20. package/n8n-nodes-doppelganger/src/nodes/Doppelganger/icon.png +0 -0
  21. package/n8n-nodes-doppelganger/tsconfig.json +14 -0
  22. package/package.json +45 -0
  23. package/postcss.config.js +6 -0
  24. package/public/icon.png +0 -0
  25. package/public/novnc.html +151 -0
  26. package/public/styles.css +86 -0
  27. package/scrape.js +389 -0
  28. package/server.js +875 -0
  29. package/src/App.tsx +722 -0
  30. package/src/components/AuthScreen.tsx +95 -0
  31. package/src/components/CodeEditor.tsx +70 -0
  32. package/src/components/DashboardScreen.tsx +133 -0
  33. package/src/components/EditorScreen.tsx +1519 -0
  34. package/src/components/ExecutionDetailScreen.tsx +115 -0
  35. package/src/components/ExecutionsScreen.tsx +156 -0
  36. package/src/components/LoadingScreen.tsx +26 -0
  37. package/src/components/NotFoundScreen.tsx +34 -0
  38. package/src/components/RichInput.tsx +68 -0
  39. package/src/components/SettingsScreen.tsx +228 -0
  40. package/src/components/Sidebar.tsx +61 -0
  41. package/src/components/app/CenterAlert.tsx +44 -0
  42. package/src/components/app/CenterConfirm.tsx +33 -0
  43. package/src/components/app/EditorLoader.tsx +89 -0
  44. package/src/components/editor/ActionPalette.tsx +79 -0
  45. package/src/components/editor/JsonEditorPane.tsx +71 -0
  46. package/src/components/editor/ResultsPane.tsx +641 -0
  47. package/src/components/editor/actionCatalog.ts +23 -0
  48. package/src/components/settings/AgentAiPanel.tsx +105 -0
  49. package/src/components/settings/ApiKeyPanel.tsx +68 -0
  50. package/src/components/settings/CookiesPanel.tsx +154 -0
  51. package/src/components/settings/LayoutPanel.tsx +46 -0
  52. package/src/components/settings/ScreenshotsPanel.tsx +64 -0
  53. package/src/components/settings/SettingsHeader.tsx +28 -0
  54. package/src/components/settings/StoragePanel.tsx +35 -0
  55. package/src/index.css +287 -0
  56. package/src/main.tsx +13 -0
  57. package/src/types.ts +114 -0
  58. package/src/utils/syntaxHighlight.ts +140 -0
  59. package/start-vnc.sh +52 -0
  60. package/tailwind.config.js +22 -0
  61. package/tsconfig.json +39 -0
  62. package/tsconfig.node.json +12 -0
  63. package/vite.config.mts +27 -0
package/scrape.js ADDED
@@ -0,0 +1,389 @@
1
+ const { chromium } = require('playwright');
2
+ const { JSDOM } = require('jsdom');
3
+ const fs = require('fs');
4
+ const path = require('path');
5
+
6
// Default location of the persisted browser storage state, next to this module.
const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');

/**
 * Work out which file the storage state is read from and written to.
 *
 * When `basePath` exists and is a directory, the state file lives inside it
 * as `storage_state.json`; in every other case (missing path, regular file,
 * or a failing stat call) `basePath` itself is used.
 *
 * @param {string} basePath - Candidate storage-state path.
 * @returns {string} Path of the storage-state file to use.
 */
function resolveStorageStateFile(basePath) {
  let pointsAtDirectory = false;
  try {
    pointsAtDirectory = fs.existsSync(basePath) && fs.statSync(basePath).isDirectory();
  } catch {
    // Best effort: any filesystem error falls back to the plain path.
  }
  return pointsAtDirectory ? path.join(basePath, 'storage_state.json') : basePath;
}

const STORAGE_STATE_FILE = resolveStorageStateFile(STORAGE_STATE_PATH);

// User-agent strings offered to the browser context; index 0 is the default
// when rotation is disabled.
const userAgents = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
];
24
+
25
/**
 * Render one value as a single CSV field.
 *
 * - `null` / `undefined` become an empty field.
 * - Objects and arrays are JSON-encoded, because `String()` would collapse a
 *   plain object to the useless text "[object Object]". Dates keep their
 *   `String()` form. If JSON encoding fails (e.g. circular references) the
 *   `String()` form is used instead.
 * - The field is wrapped in double quotes (with embedded quotes doubled) when
 *   it contains a quote, comma, CR or LF, or starts/ends with whitespace.
 *
 * @param {*} value - Cell value of any type.
 * @returns {string} The escaped field text.
 */
const csvEscape = (value) => {
  if (value === undefined || value === null) return '';

  let text;
  if (typeof value === 'object' && !(value instanceof Date)) {
    try {
      // JSON.stringify may return undefined (e.g. toJSON() returning undefined).
      text = JSON.stringify(value) ?? String(value);
    } catch {
      // Circular structures and BigInt members cannot be JSON-encoded.
      text = String(value);
    }
  } else {
    text = String(value);
  }

  const needsQuoting = /[",\n\r]/.test(text) || /^\s|\s$/.test(text);
  return needsQuoting ? `"${text.replace(/"/g, '""')}"` : text;
};
32
+
33
/**
 * Convert an extraction result into CSV text.
 *
 * - `null` / `undefined` and empty arrays produce an empty string.
 * - A string that looks like JSON (starts with `{` or `[` after trimming) is
 *   parsed and converted; any other string, or one that fails to parse, is
 *   returned unchanged.
 * - A non-array value is treated as a single row.
 * - If at least one row is a plain (non-array) object, the union of all such
 *   rows' own keys, in first-seen order, becomes the header line. Rows that
 *   are not plain objects (primitives, arrays) yield an all-empty line.
 * - Otherwise each row is one line: arrays are joined cell by cell, anything
 *   else is written as a single field.
 *
 * Fields are escaped with `csvEscape`; lines are joined with "\n".
 *
 * @param {*} raw - Extraction result (string, object, array, or primitive).
 * @returns {string} CSV text.
 */
const toCsvString = (raw) => {
  if (raw === undefined || raw === null) return '';

  if (typeof raw === 'string') {
    const trimmed = raw.trim();
    const looksLikeJson = trimmed.startsWith('{') || trimmed.startsWith('[');
    if (!looksLikeJson) return raw;
    try {
      return toCsvString(JSON.parse(trimmed));
    } catch {
      return raw;
    }
  }

  const rows = Array.isArray(raw) ? raw : [raw];
  if (rows.length === 0) return '';

  const isRecord = (row) => Boolean(row) && typeof row === 'object' && !Array.isArray(row);

  // A Set keeps first-seen order while making de-duplication O(1) per key.
  const columnSet = new Set();
  for (const row of rows) {
    if (!isRecord(row)) continue;
    for (const key of Object.keys(row)) columnSet.add(key);
  }

  if (columnSet.size === 0) {
    return rows
      .map((row) => (Array.isArray(row) ? row.map((cell) => csvEscape(cell)).join(',') : csvEscape(row)))
      .join('\n');
  }

  const columns = [...columnSet];
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  const lines = rows.map((row) => {
    // Only plain-object rows contribute cells. Reading own properties only
    // keeps array `length`/indices and inherited members such as `toString`
    // out of the output.
    const record = isRecord(row) ? row : {};
    return columns.map((key) => csvEscape(hasOwn(record, key) ? record[key] : undefined)).join(',');
  });

  const headerLine = columns.map((key) => csvEscape(key)).join(',');
  return [headerLine, ...lines].join('\n');
};
73
+
74
/** Settle delay (ms) applied after auto-scroll when the request gives no usable `wait`. */
const DEFAULT_WAIT_MS = 2000;

/**
 * Auto-scroll parameters: pixels per tick, tick interval, and a hard cap on
 * the number of ticks so pages that keep growing cannot stall the request.
 */
const AUTO_SCROLL_SETTINGS = { distance: 400, intervalMs: 100, maxSteps: 300 };

/** True when a request flag is the boolean `false` or the text "false" (any case). */
function isExplicitlyFalse(value) {
  return value === false || String(value).toLowerCase() === 'false';
}

/**
 * Convert the request's `wait` value (seconds) into milliseconds.
 * Missing/falsy input and unparsable input fall back to DEFAULT_WAIT_MS;
 * negative values are clamped to zero.
 */
function parseWaitMs(waitInput) {
  if (!waitInput) return DEFAULT_WAIT_MS;
  const seconds = parseFloat(waitInput);
  if (!Number.isFinite(seconds)) return DEFAULT_WAIT_MS;
  return Math.max(0, seconds) * 1000;
}

/**
 * Runs INSIDE the page (passed to `page.evaluate`, so it must not reference
 * anything from this module). Scrolls down in fixed steps until the scrolled
 * distance reaches the body height or `maxSteps` ticks have elapsed, then
 * returns to the top.
 */
async function autoScrollInPage({ distance, intervalMs, maxSteps }) {
  await new Promise((resolve) => {
    let totalHeight = 0;
    let steps = 0;
    const timer = setInterval(() => {
      const scrollHeight = document.body ? document.body.scrollHeight : 0;
      window.scrollBy(0, distance);
      totalHeight += distance;
      steps += 1;
      if (totalHeight >= scrollHeight || steps >= maxSteps) {
        clearInterval(timer);
        resolve();
      }
    }, intervalMs);
  });
  window.scrollTo(0, 0);
}

/**
 * Runs INSIDE the page (passed to `page.evaluate`, so it must not reference
 * anything from this module). Serialises either every element matching
 * `selector` (outer HTML, joined with "\n") or, when `selector` is falsy,
 * the inner HTML of `document.body`.
 *
 * Work is done on clones so the live page is left untouched. `script`,
 * `style`, `svg`, `link` and `noscript` elements are removed from the clone.
 * With `withShadow`, each element that has a `shadowRoot` gets a
 * `<template data-shadowroot="open">` child holding that root's `innerHTML`.
 */
function extractHtmlInPage({ selector, withShadow }) {
  const stripUseless = (root) => {
    root.querySelectorAll('script, style, svg, link, noscript').forEach((node) => node.remove());
  };

  const cloneForExport = (root) => {
    const clone = root.cloneNode(true);
    if (withShadow) {
      // Pair originals with their copies BEFORE mutating the clone, and
      // include the root itself; appending templates while walking both trees
      // in lock-step would shift the pairing after the first shadow host.
      const originals = [root, ...root.querySelectorAll('*')];
      const copies = [clone, ...clone.querySelectorAll('*')];
      originals.forEach((orig, index) => {
        const copy = copies[index];
        if (!orig.shadowRoot || !copy) return;
        const template = document.createElement('template');
        template.setAttribute('data-shadowroot', 'open');
        template.innerHTML = orig.shadowRoot.innerHTML;
        copy.appendChild(template);
      });
    }
    stripUseless(clone);
    return clone;
  };

  if (selector) {
    return Array.from(document.querySelectorAll(selector))
      .map((el) => cloneForExport(el).outerHTML)
      .join('\n');
  }
  return cloneForExport(document.body).innerHTML;
}

/**
 * Run a caller-supplied extraction script against `html` parsed with JSDOM.
 *
 * The script body is wrapped in an async function and receives `$$data`,
 * `window`, `document`, `DOMParser` and a `console` whose log/warn/error
 * calls are captured as text lines.
 *
 * @param {string} script - Script source; non-string/empty input is a no-op.
 * @param {string} html - Markup to parse.
 * @param {string} pageUrl - URL exposed to the script through `$$data.url()`.
 * @param {boolean} includeShadowDom - Whether `$$data` exposes the shadow helpers.
 * @returns {Promise<{result: *, logs: string[]}>} The script's return value
 *   (or an "Extraction script error: ..." string on failure) and captured logs.
 */
async function runExtractionScript(script, html, pageUrl, includeShadowDom) {
  if (!script || typeof script !== 'string') return { result: undefined, logs: [] };

  const logBuffer = [];
  let dom;
  try {
    dom = new JSDOM(html || '');
    const { window } = dom;

    const record = (...args) => logBuffer.push(args.join(' '));
    const consoleProxy = { log: record, warn: record, error: record };

    // Elements matching `selector` under `root`, descending into the
    // <template data-shadowroot> snapshots produced by the page-side export.
    const shadowQueryAll = (selector, root = window.document) => {
      const results = [];
      const walk = (node) => {
        if (!node) return;
        if (node.nodeType === 1) {
          if (selector && node.matches && node.matches(selector)) results.push(node);
          if (node.tagName === 'TEMPLATE' && node.hasAttribute('data-shadowroot')) {
            walk(node.content);
          }
        }
        if (node.childNodes) {
          node.childNodes.forEach((child) => walk(child));
        }
      };
      walk(root);
      return results;
    };

    // Trimmed, non-empty text nodes under `root`, including shadow snapshots.
    const shadowText = (root = window.document) => {
      const texts = [];
      const walk = (node) => {
        if (!node) return;
        if (node.nodeType === 3) {
          const text = node.nodeValue ? node.nodeValue.trim() : '';
          if (text) texts.push(text);
          return;
        }
        if (node.nodeType === 1 && node.tagName === 'TEMPLATE' && node.hasAttribute('data-shadowroot')) {
          walk(node.content);
        }
        if (node.childNodes) {
          node.childNodes.forEach((child) => walk(child));
        }
      };
      walk(root);
      return texts;
    };

    // NOTE(security): `script` comes straight from the request and is executed
    // with this process's privileges. The shadowed `window`/`document`
    // parameters are NOT a sandbox. Only expose this endpoint to trusted callers.
    const executor = new Function(
      '$$data',
      'window',
      'document',
      'DOMParser',
      'console',
      `"use strict"; return (async () => { ${script}\n})();`
    );
    const $$data = {
      html: () => html || '',
      url: () => pageUrl || '',
      window,
      document: window.document,
      shadowQueryAll: includeShadowDom ? shadowQueryAll : undefined,
      shadowText: includeShadowDom ? shadowText : undefined
    };
    const result = await executor($$data, window, window.document, window.DOMParser, consoleProxy);
    return { result, logs: logBuffer };
  } catch (e) {
    return { result: `Extraction script error: ${e.message}`, logs: logBuffer };
  } finally {
    // Release the JSDOM window (stops timers the script may have started).
    if (dom) dom.window.close();
  }
}

/** Naive pretty-printer: puts each tag on its own line, indented by nesting depth. */
function formatHTML(html) {
  const voidTags = ['img', 'br', 'hr', 'input', 'link', 'meta'];
  let indent = 0;
  return html.replace(/<(\/?)([a-z0-9]+)([^>]*?)(\/?)>/gi, (match, slash, tag, attrs, selfClose) => {
    if (slash) indent--;
    const line = ' '.repeat(Math.max(0, indent)) + match;
    if (!slash && !selfClose && !voidTags.includes(tag.toLowerCase())) indent++;
    return '\n' + line;
  }).trim();
}

/** Close the browser if one was launched; a failing close is logged, never thrown. */
async function closeBrowserQuietly(browser) {
  if (!browser) return;
  try {
    await browser.close();
  } catch (e) {
    console.error('Browser close failed:', e.message);
  }
}

/**
 * Drive one scrape on an already-launched browser and build the response payload.
 * Any failure propagates to the caller, which owns the browser's lifetime.
 */
async function scrapeWithBrowser(browser, options) {
  const {
    url,
    customHeaders,
    userSelector,
    waitTime,
    selectedUA,
    includeShadowDom,
    extractionScript,
    extractionFormat
  } = options;

  const contextOptions = {
    userAgent: selectedUA,
    extraHTTPHeaders: customHeaders,
    viewport: { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) },
    deviceScaleFactor: 1,
    locale: 'en-US',
    timezoneId: 'America/New_York',
    colorScheme: 'dark',
    permissions: ['geolocation']
  };

  // Reuse the stored session state when one has been saved.
  if (fs.existsSync(STORAGE_STATE_FILE)) {
    contextOptions.storageState = STORAGE_STATE_FILE;
  }

  const context = await browser.newContext(contextOptions);

  // Manual WebDriver patch
  await context.addInitScript(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
  });
  if (includeShadowDom) {
    // Force every shadow root to be created in open mode; the export step
    // reads `element.shadowRoot`.
    await context.addInitScript(() => {
      if (!Element.prototype.attachShadow) return;
      const original = Element.prototype.attachShadow;
      Element.prototype.attachShadow = function (init) {
        const options = init ? { ...init, mode: 'open' } : { mode: 'open' };
        return original.call(this, options);
      };
    });
  }

  const page = await context.newPage();
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });

  await page.evaluate(autoScrollInPage, AUTO_SCROLL_SETTINGS);
  await page.waitForTimeout(waitTime);

  let productHtml = '';
  if (userSelector) {
    productHtml = await page.evaluate(extractHtmlInPage, { selector: userSelector, withShadow: includeShadowDom });
  }
  // No selector, or a selector that produced nothing: fall back to the whole body.
  const usedFallback = !productHtml || productHtml.trim() === '';
  if (usedFallback) {
    productHtml = await page.evaluate(extractHtmlInPage, { selector: null, withShadow: includeShadowDom });
  }

  const extraction = await runExtractionScript(extractionScript, productHtml, page.url(), includeShadowDom);

  // Ensure the public/screenshots directory exists
  const screenshotsDir = path.join(__dirname, 'public', 'screenshots');
  if (!fs.existsSync(screenshotsDir)) {
    fs.mkdirSync(screenshotsDir, { recursive: true });
  }

  const screenshotName = `scrape_${Date.now()}.png`;
  const screenshotPath = path.join(screenshotsDir, screenshotName);
  try {
    await page.screenshot({ path: screenshotPath, fullPage: false });
  } catch (e) {
    // A failed screenshot does not fail the scrape.
    console.error('Screenshot failed:', e.message);
  }

  // Prefer the script's return value; otherwise surface its captured logs.
  const rawExtraction = extraction.result !== undefined
    ? extraction.result
    : (extraction.logs.length ? extraction.logs.join('\n') : undefined);
  const formattedExtraction = extractionFormat === 'csv' ? toCsvString(rawExtraction) : rawExtraction;

  const data = {
    title: await page.title(),
    url: page.url(),
    html: formatHTML(productHtml),
    data: formattedExtraction,
    is_partial: !usedFallback,
    selector_used: usedFallback ? (userSelector ? `${userSelector} (not found, using body)` : 'body (default)') : userSelector,
    links: await page.$$eval('a[href]', (elements) => {
      // SVG <a> elements expose `href` as an object, not a string; skip them.
      return elements.map((el) => el.href).filter((href) => typeof href === 'string' && href.startsWith('http'));
    }),
    screenshot_url: `/screenshots/${screenshotName}`
  };

  // Save session state
  await context.storageState({ path: STORAGE_STATE_FILE });

  return data;
}

/**
 * HTTP handler: load a URL in headless Chrome, capture its (optionally
 * selector-scoped) HTML, optionally run an extraction script over it, take a
 * screenshot, and respond with JSON.
 *
 * Inputs are read from `req.body` first, then `req.query`:
 * `url` (required), `selector`, `wait` (seconds), `rotateUserAgents`,
 * `includeShadowDom` (defaults to true), `extractionScript`,
 * `extractionFormat` ("csv" or anything else for JSON); `headers` is read
 * from the body only.
 *
 * Responses: 400 `{ error }` when `url` is missing; 500
 * `{ error, details }` when the scrape fails; otherwise the scrape payload.
 *
 * @param {object} req - Request with optional `body` and `query` objects.
 * @param {object} res - Response exposing `status()` and `json()`.
 */
async function handleScrape(req, res) {
  // Either container may be absent (e.g. a GET request without a body parser).
  const body = req.body || {};
  const query = req.query || {};

  const url = body.url || query.url;
  const customHeaders = body.headers || {};
  const userSelector = body.selector || query.selector;
  const waitTime = parseWaitMs(body.wait || query.wait);
  const rotateRaw = body.rotateUserAgents || query.rotateUserAgents || false;
  // Query strings deliver "false" as text, which is truthy; treat it as off.
  const rotateUserAgents = Boolean(rotateRaw) && !isExplicitlyFalse(rotateRaw);
  const includeShadowDomRaw = body.includeShadowDom ?? query.includeShadowDom;
  const includeShadowDom = includeShadowDomRaw === undefined
    ? true
    : !isExplicitlyFalse(includeShadowDomRaw);
  const extractionScript = body.extractionScript || query.extractionScript;
  const extractionFormat = (body.extractionFormat || query.extractionFormat) === 'csv' ? 'csv' : 'json';

  if (!url) {
    return res.status(400).json({ error: 'URL is required.' });
  }

  console.log(`Scraping: ${url}`);

  // Pick a random UA if rotation is enabled, otherwise use the first one
  const selectedUA = rotateUserAgents
    ? userAgents[Math.floor(Math.random() * userAgents.length)]
    : userAgents[0];

  let browser;
  let data;
  try {
    // Use 'chrome' channel to use a real installed browser instead of default Chromium
    browser = await chromium.launch({
      headless: true,
      channel: 'chrome',
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-blink-features=AutomationControlled',
        '--hide-scrollbars',
        '--mute-audio'
      ]
    });

    data = await scrapeWithBrowser(browser, {
      url,
      customHeaders,
      userSelector,
      waitTime,
      selectedUA,
      includeShadowDom,
      extractionScript,
      extractionFormat
    });
  } catch (error) {
    console.error('Scrape Error:', error);
    await closeBrowserQuietly(browser);
    return res.status(500).json({ error: 'Failed to scrape', details: error.message });
  }

  await closeBrowserQuietly(browser);
  res.json(data);
}
388
+
389
+ module.exports = { handleScrape };