mcp-camoufox 0.3.3 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +312 -1
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -61,7 +61,7 @@ const SNAPSHOT_JS = `(() => {
61
61
  name: el.getAttribute('name') || '',
62
62
  placeholder: el.getAttribute('placeholder') || '',
63
63
  aria: el.getAttribute('aria-label') || '',
64
- href: el.tagName === 'A' ? (el.href || '').slice(0, 120) : '',
64
+ href: el.tagName === 'A' ? (el.href || '').slice(0, 500) : '',
65
65
  checked: el.checked || false,
66
66
  disabled: el.disabled || false,
67
67
  };
@@ -917,6 +917,317 @@ server.tool("export_har", "Export network traffic as HAR file.", {
917
917
  writeFileSync(target, JSON.stringify(har, null, 2));
918
918
  return { content: [{ type: "text", text: `HAR exported: ${target} (${entries.length} entries)` }] };
919
919
  });
920
+ // ── Tools: Scraping / Extraction ───────────────────────────────────────────
921
// Tool: detect_content_pattern
// Scans common container elements for groups of sibling children that share a
// tag and leading class names, and reports each group as a candidate CSS
// selector together with suggested fields for extract_structured.
//
// Input:  min_items - minimum number of same-shaped siblings to count as a pattern.
// Output: a text report of up to 10 patterns, ordered by item count (descending).
server.tool("detect_content_pattern", "Auto-detect repeated content patterns (cards, listings, rows) and suggest CSS selectors. Run this BEFORE extract_structured to find the right selectors.", {
    min_items: z.number().default(3).describe("Minimum repeated items to detect as pattern"),
}, async ({ min_items }) => {
    const page = getPage();
    const patterns = await page.evaluate(`(() => {
        var minItems = ${min_items};

        // Read classes through the attribute rather than el.className: on SVG
        // elements className is an object, not a string, and has no split().
        function classTokens(el, max) {
            var raw = el.getAttribute('class') || '';
            return raw.split(/\\s+/).filter(function (c) {
                return c.length > 0 && c.length < 40;
            }).slice(0, max);
        }

        // Build "tag.class1.class2" with each class CSS-escaped, so utility
        // classes such as "md:flex" or "w-1/2" still yield a valid selector.
        function selectorFor(el, maxClasses) {
            var sel = el.tagName.toLowerCase();
            var tokens = classTokens(el, maxClasses);
            for (var i = 0; i < tokens.length; i++) sel += '.' + CSS.escape(tokens[i]);
            return sel;
        }

        // Inspect one representative item and propose fields to extract from it.
        function fieldHints(item) {
            var hints = [];
            var links = item.querySelectorAll('a[href]');
            if (links.length > 0) {
                var href = links[0].href;
                // SVG <a> exposes href as an object; fall back to the raw attribute.
                if (typeof href !== 'string') href = links[0].getAttribute('href') || '';
                hints.push({ name: 'url', selector: 'a', attribute: 'href', sample: href.slice(0, 100) });
            }
            var headings = item.querySelectorAll('h1,h2,h3,h4,h5,h6');
            if (headings.length > 0) {
                hints.push({
                    name: 'title',
                    selector: headings[0].tagName.toLowerCase(),
                    attribute: '',
                    sample: (headings[0].innerText || '').trim().slice(0, 60)
                });
            }
            var imgs = item.querySelectorAll('img[src]');
            if (imgs.length > 0) {
                hints.push({ name: 'image', selector: 'img', attribute: 'src', sample: imgs[0].src.slice(0, 80) });
            }
            // Up to three short leaf text nodes become generic field_N hints.
            var texts = item.querySelectorAll('span, p, div');
            var added = 0;
            for (var t = 0; t < texts.length && added < 3; t++) {
                var txt = (texts[t].innerText || '').trim();
                if (txt.length > 5 && txt.length < 100 && texts[t].children.length === 0) {
                    hints.push({
                        name: 'field_' + added,
                        selector: selectorFor(texts[t], 1),
                        attribute: '',
                        sample: txt.slice(0, 60)
                    });
                    added++;
                }
            }
            return hints;
        }

        var candidates = [];
        var parents = document.querySelectorAll('main, [role="main"], section, div, ul, ol, tbody');
        for (var p = 0; p < parents.length; p++) {
            // Prototype-less dictionary: keys can never collide with Object.prototype members.
            var groups = Object.create(null);
            var kids = parents[p].children;
            for (var c = 0; c < kids.length; c++) {
                var child = kids[c];
                // The grouping key doubles as the reported selector.
                var key = selectorFor(child, 2);
                // Keep the first direct child as the representative item instead of
                // re-querying the parent, which could return a nested descendant.
                if (!groups[key]) groups[key] = { count: 0, first: child, sample: '' };
                groups[key].count++;
                if (!groups[key].sample) groups[key].sample = (child.innerText || '').trim().slice(0, 150);
            }
            var keys = Object.keys(groups);
            for (var k = 0; k < keys.length; k++) {
                var info = groups[keys[k]];
                if (info.count >= minItems) {
                    candidates.push({
                        selector: keys[k],
                        count: info.count,
                        sample_text: info.sample.slice(0, 100),
                        suggested_fields: fieldHints(info.first)
                    });
                }
            }
        }

        // Largest groups first; keep only the first occurrence of each selector.
        candidates.sort(function (a, b) { return b.count - a.count; });
        var seen = Object.create(null);
        var unique = [];
        for (var u = 0; u < candidates.length; u++) {
            if (!seen[candidates[u].selector]) {
                seen[candidates[u].selector] = true;
                unique.push(candidates[u]);
            }
        }
        return unique.slice(0, 10);
    })()`);
    if (patterns.length === 0) {
        return { content: [{ type: "text", text: "No repeated content patterns detected. Try scrolling down to load more content." }] };
    }
    let text = `Detected ${patterns.length} content pattern(s):\n\n`;
    for (const p of patterns) {
        text += `--- ${p.count} items: ${p.selector} ---\n`;
        text += `Sample: "${p.sample_text}"\n`;
        if (p.suggested_fields?.length) {
            text += `Suggested extract_structured call:\n`;
            text += ` container_selector: "${p.selector}"\n`;
            text += ` fields:\n`;
            for (const f of p.suggested_fields) {
                text += ` - {name: "${f.name}", selector: "${f.selector}"${f.attribute ? `, attribute: "${f.attribute}"` : ''}} → "${f.sample}"\n`;
            }
        }
        text += `\n`;
    }
    return { content: [{ type: "text", text }] };
});
1016
// Tool: extract_structured
// For every top-level element matching container_selector, reads the requested
// fields (an attribute, the element's direct text, or its full innerText),
// drops items whose fields are all empty, de-duplicates, and returns JSON.
//
// Inputs:
//   container_selector - CSS selector for each repeated item.
//   fields             - [{ name, selector, attribute }] evaluated inside each item.
//   limit              - max items returned; at most limit * 2 containers are scanned.
//   deduplicate_by     - field whose value is the dedup key ("" = first non-empty field).
//   direct_text_only   - when true, text fields use only the element's own text nodes.
// Output: { total_on_page, top_level, unique_extracted, items } as pretty-printed JSON.
server.tool("extract_structured", "Extract structured data from repeated elements (cards, rows, listings). Auto-deduplicates, filters empty items, extracts direct text only. Use detect_content_pattern first to find correct selectors.", {
    container_selector: z.string().describe("CSS selector for each repeated item. Use detect_content_pattern to find this."),
    fields: z.array(z.object({
        name: z.string().describe("Field name in output"),
        selector: z.string().describe("CSS selector within each item"),
        attribute: z.string().default("").describe("Attribute to extract (empty = direct text only)"),
    })).describe("Fields to extract from each item"),
    limit: z.number().default(50).describe("Max items to extract"),
    deduplicate_by: z.string().default("").describe("Field name to deduplicate by (empty = auto)"),
    direct_text_only: z.boolean().default(true).describe("Extract only direct text of matched element, not children text (prevents field mixing)"),
}, async ({ container_selector, fields, limit, deduplicate_by, direct_text_only }) => {
    const page = getPage();
    // Caller-supplied strings are embedded as JSON literals. Escaping only the
    // double quote corrupts selectors containing backslash escapes or newlines,
    // and lets a quote in deduplicate_by terminate the string early.
    const results = await page.evaluate(`(() => {
        var containerSelector = ${JSON.stringify(container_selector)};
        var fields = ${JSON.stringify(fields)};
        var directOnly = ${direct_text_only};
        var dedup = ${JSON.stringify(deduplicate_by)};
        var maxScan = ${limit * 2};
        var maxOut = ${limit};

        // Text of the element's own text nodes only, so sibling fields nested in
        // the same element do not bleed into each other. Falls back to the first
        // line of innerText when the element has no direct text.
        function directText(el) {
            var text = '';
            for (var n = 0; n < el.childNodes.length; n++) {
                if (el.childNodes[n].nodeType === 3) text += el.childNodes[n].textContent;
            }
            text = text.trim();
            if (!text) {
                var lines = (el.innerText || '').trim().split('\\n');
                text = lines[0] || '';
            }
            return text.trim();
        }

        // Keep only containers that are not nested inside another match.
        var allContainers = document.querySelectorAll(containerSelector);
        var containers = [];
        for (var c = 0; c < allContainers.length; c++) {
            var outer = allContainers[c].parentElement;
            if (!(outer && outer.closest(containerSelector))) containers.push(allContainers[c]);
        }

        var out = [];
        // Prototype-less map: a value such as "constructor" or "toString" must not
        // look like an already-seen key.
        var seenKeys = Object.create(null);

        for (var i = 0; i < Math.min(containers.length, maxScan); i++) {
            var item = {};
            var nonEmptyCount = 0;

            for (var j = 0; j < fields.length; j++) {
                var f = fields[j];
                var el = containers[i].querySelector(f.selector);
                var val = '';
                if (el) {
                    if (f.attribute) {
                        val = el.getAttribute(f.attribute) || '';
                    } else if (directOnly) {
                        val = directText(el);
                    } else {
                        val = (el.innerText || '').trim();
                    }
                }
                item[f.name] = val;
                if (val) nonEmptyCount++;
            }

            // Skip items where every field came back empty.
            if (nonEmptyCount === 0) continue;

            // Dedup key: the requested field if it has a value, otherwise the
            // first non-empty field in declaration order.
            var dedupKey = '';
            if (dedup && item[dedup]) {
                dedupKey = item[dedup];
            } else {
                for (var d = 0; d < fields.length; d++) {
                    if (item[fields[d].name]) { dedupKey = item[fields[d].name]; break; }
                }
            }
            if (dedupKey && seenKeys[dedupKey]) continue;
            if (dedupKey) seenKeys[dedupKey] = true;

            out.push(item);
            if (out.length >= maxOut) break;
        }

        return {
            total_on_page: allContainers.length,
            top_level: containers.length,
            unique_extracted: out.length,
            items: out
        };
    })()`);
    return { content: [{ type: "text", text: JSON.stringify(results, null, 2) }] };
});
1118
// Tool: extract_table
// Reads the first element matching `selector` as a table. The first row supplies
// the column names and every following row becomes one object keyed by them.
//
// Inputs:
//   selector - CSS selector for the table (default "table").
//   limit    - max data rows returned.
// Output: { headers, total_rows, extracted, rows } as pretty-printed JSON, or
//         { error: 'Table not found' } when nothing matches.
server.tool("extract_table", "Extract data from an HTML table as JSON array.", {
    selector: z.string().default("table").describe("CSS selector for the table"),
    limit: z.number().default(100).describe("Max rows"),
}, async ({ selector, limit }) => {
    const page = getPage();
    // The selector is embedded as a JSON literal so backslashes, quotes and
    // newlines survive intact inside the evaluated script.
    const results = await page.evaluate(`(() => {
        var table = document.querySelector(${JSON.stringify(selector)});
        if (!table) return { error: 'Table not found' };

        // Collect this table's rows in document order. When the match is a real
        // <table>, rows that belong to tables nested in its cells are ignored.
        var isTable = table.tagName === 'TABLE';
        var allRows = table.querySelectorAll('tr');
        var rows = [];
        for (var a = 0; a < allRows.length; a++) {
            if (!isTable || allRows[a].closest('table') === table) rows.push(allRows[a]);
        }

        // Headers come from the first row only. The previous 'tr:first-child'
        // selector also matched the first row of <tbody>, which appended the
        // first data row's cells to the header list.
        var headers = [];
        if (rows.length > 0) {
            var headCells = rows[0].cells;
            for (var i = 0; i < headCells.length; i++) headers.push((headCells[i].innerText || '').trim());
        }

        var out = [];
        var start = headers.length > 0 ? 1 : 0;
        for (var r = start; r < Math.min(rows.length, ${limit} + start); r++) {
            var cells = rows[r].cells;
            var row = {};
            for (var c = 0; c < cells.length; c++) {
                var key = headers[c] || ('col_' + c);
                // Repeated header names would overwrite an earlier column; disambiguate by index.
                if (Object.prototype.hasOwnProperty.call(row, key)) key = key + '_' + c;
                var link = cells[c].querySelector('a');
                row[key] = (cells[c].innerText || '').trim();
                if (link) row[key + '_url'] = link.href;
            }
            out.push(row);
        }
        return { headers: headers, total_rows: rows.length - start, extracted: out.length, rows: out };
    })()`);
    return { content: [{ type: "text", text: JSON.stringify(results, null, 2) }] };
});
1147
// Tool: scrape_page
// Returns the page title, URL, optional <meta> tags, the (optionally main-content
// only) text, up to 20 h1-h3 headings and, optionally, links found among the
// first 50 anchors.
//
// Inputs:
//   include_links     - add a `links` array of { text, href }.
//   include_meta      - add a `meta` map of name/property -> content.
//   max_text_length   - max characters of text; a cut is moved back to the last
//                       newline when that newline lies in the final 20% of the window.
//   only_main_content - restrict to the first main-content region found and drop
//                       nav/header/footer/sidebar elements inside it.
// Output: the collected object as pretty-printed JSON.
server.tool("scrape_page", "Smart page scraper — auto-detect and extract main content, links, metadata. Strips nav/footer noise.", {
    include_links: z.boolean().default(true),
    include_meta: z.boolean().default(true),
    max_text_length: z.number().default(8000).describe("Max text chars (truncates at paragraph boundary)"),
    only_main_content: z.boolean().default(true).describe("Strip nav, header, footer, sidebar — extract only main content area"),
}, async ({ include_links, include_meta, max_text_length, only_main_content }) => {
    const page = getPage();
    const data = await page.evaluate(`(() => {
        var NOISE = 'nav, header, footer, aside, [role="navigation"], [role="banner"], [role="contentinfo"], .sidebar, .nav, .footer, .header';
        var maxLen = ${max_text_length};
        var result = {};
        result.title = document.title;
        result.url = location.href;

        // Meta tags keyed by their name (or property) attribute.
        if (${include_meta}) {
            var metas = {};
            var metaEls = document.querySelectorAll('meta[name], meta[property]');
            for (var i = 0; i < metaEls.length; i++) {
                var key = metaEls[i].getAttribute('name') || metaEls[i].getAttribute('property');
                metas[key] = metaEls[i].getAttribute('content') || '';
            }
            result.meta = metas;
        }

        // Locate the main content region; fall back to <body> when none is found
        // or when only_main_content is off.
        var mainRegion = null;
        if (${only_main_content}) {
            mainRegion = document.querySelector('main, [role="main"], #main-content, .main-content, #content, .content');
        }
        var textSource = mainRegion || document.body;

        var fullText;
        if (mainRegion) {
            // Strip noise from a detached copy so the live page is left untouched.
            // Per the HTML spec, innerText of an element that is not being rendered
            // falls back to textContent, which includes <script>/<style>/<noscript>
            // text, so those are removed from the copy as well.
            var clone = mainRegion.cloneNode(true);
            var junk = clone.querySelectorAll(NOISE + ', script, style, noscript');
            for (var n = 0; n < junk.length; n++) junk[n].remove();
            fullText = clone.innerText.trim();
        } else {
            fullText = textSource.innerText.trim();
        }

        // True when el sits inside a noise element that was stripped from the text.
        // Links and headings are read from the live DOM and filtered with this, so
        // they stay consistent with the text.
        function inStrippedNoise(el) {
            if (!mainRegion) return false;
            var hit = el.closest(NOISE);
            return !!hit && hit !== mainRegion && mainRegion.contains(hit);
        }

        // Truncate at a newline when one is close enough to the limit, otherwise hard-cut.
        var totalLen = fullText.length;
        if (totalLen > maxLen) {
            var cutText = fullText.slice(0, maxLen);
            var lastNewline = cutText.lastIndexOf('\\n');
            if (lastNewline > maxLen * 0.8) {
                cutText = cutText.slice(0, lastNewline);
            }
            result.text = cutText;
            result.truncated = true;
        } else {
            result.text = fullText;
            result.truncated = false;
        }
        result.total_text_length = totalLen;

        // Links: only the first 50 anchors are scanned; empty-text and noise links are skipped.
        if (${include_links}) {
            var links = textSource.querySelectorAll('a[href]');
            var linkList = [];
            for (var j = 0; j < Math.min(links.length, 50); j++) {
                if (inStrippedNoise(links[j])) continue;
                var text = (links[j].innerText || '').trim().slice(0, 80);
                if (text) linkList.push({ text: text, href: links[j].href });
            }
            result.links = linkList;
        }

        // Headings: the first 20 h1-h3 elements are scanned, skipping those inside noise.
        var headings = [];
        var hs = textSource.querySelectorAll('h1, h2, h3');
        for (var k = 0; k < Math.min(hs.length, 20); k++) {
            if (inStrippedNoise(hs[k])) continue;
            headings.push({ level: hs[k].tagName, text: hs[k].innerText.trim().slice(0, 100) });
        }
        result.headings = headings;

        return result;
    })()`);
    return { content: [{ type: "text", text: JSON.stringify(data, null, 2) }] };
});
920
1231
  // ── Start Server ───────────────────────────────────────────────────────────
921
1232
  async function main() {
922
1233
  const transport = new StdioServerTransport();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mcp-camoufox",
3
- "version": "0.3.3",
3
+ "version": "0.4.1",
4
4
  "description": "MCP server for stealth browser automation via Camoufox — 39 tools, Chrome DevTools MCP-level power with anti-bot stealth",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",