mcp-camoufox 0.3.3 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +312 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -61,7 +61,7 @@ const SNAPSHOT_JS = `(() => {
|
|
|
61
61
|
name: el.getAttribute('name') || '',
|
|
62
62
|
placeholder: el.getAttribute('placeholder') || '',
|
|
63
63
|
aria: el.getAttribute('aria-label') || '',
|
|
64
|
-
href: el.tagName === 'A' ? (el.href || '').slice(0,
|
|
64
|
+
href: el.tagName === 'A' ? (el.href || '').slice(0, 500) : '',
|
|
65
65
|
checked: el.checked || false,
|
|
66
66
|
disabled: el.disabled || false,
|
|
67
67
|
};
|
|
@@ -917,6 +917,317 @@ server.tool("export_har", "Export network traffic as HAR file.", {
|
|
|
917
917
|
writeFileSync(target, JSON.stringify(har, null, 2));
|
|
918
918
|
return { content: [{ type: "text", text: `HAR exported: ${target} (${entries.length} entries)` }] };
|
|
919
919
|
});
|
|
920
|
+
// ── Tools: Scraping / Extraction ───────────────────────────────────────────
|
|
921
|
+
/**
 * Tool: detect_content_pattern
 *
 * Scans common container elements in the current page for groups of sibling
 * children that share a tag and leading class names, and reports up to ten of
 * the largest groups together with suggested field selectors that can be fed
 * to extract_structured.
 *
 * @param {number} min_items - minimum number of matching siblings for a group
 *   to be reported.
 * @returns MCP text content describing each detected pattern, or a hint to
 *   scroll when nothing repeats often enough.
 */
server.tool("detect_content_pattern", "Auto-detect repeated content patterns (cards, listings, rows) and suggest CSS selectors. Run this BEFORE extract_structured to find the right selectors.", {
    min_items: z.number().default(3).describe("Minimum repeated items to detect as pattern"),
}, async ({ min_items }) => {
    const page = getPage();
    const patterns = await page.evaluate(`(() => {
        // Class tokens of an element. The class attribute is read directly because
        // className is an SVGAnimatedString (no split method) on SVG elements.
        function classTokens(el) {
            var raw = el.getAttribute('class') || '';
            return raw.split(/\\s+/).filter(function (c) { return c.length > 0; });
        }

        // tag plus up to maxClasses short class names, each CSS-escaped so that
        // names such as "md:flex" or "w-1/2" still form a valid selector.
        function buildSelector(tag, tokens, maxClasses) {
            var sel = tag;
            var usable = tokens.filter(function (c) { return c.length < 40; }).slice(0, maxClasses);
            for (var i = 0; i < usable.length; i++) sel += '.' + CSS.escape(usable[i]);
            return sel;
        }

        function visibleText(el) {
            return (el.innerText || '').trim();
        }

        // Suggest url / title / image / short-text fields from one sample item.
        function suggestFields(item) {
            var hints = [];
            var link = item.querySelector('a[href]');
            if (link) {
                // SVG anchors expose href as an object, so fall back to the raw attribute.
                var href = typeof link.href === 'string' ? link.href : (link.getAttribute('href') || '');
                hints.push({ name: 'url', selector: 'a', attribute: 'href', sample: href.slice(0, 100) });
            }
            var heading = item.querySelector('h1,h2,h3,h4,h5,h6');
            if (heading) {
                hints.push({ name: 'title', selector: heading.tagName.toLowerCase(), attribute: '', sample: visibleText(heading).slice(0, 60) });
            }
            var img = item.querySelector('img[src]');
            if (img) {
                hints.push({ name: 'image', selector: 'img', attribute: 'src', sample: String(img.src).slice(0, 80) });
            }
            // Leaf span/p/div elements holding a short piece of text.
            var leaves = item.querySelectorAll('span, p, div');
            var added = 0;
            for (var t = 0; t < leaves.length && added < 3; t++) {
                var txt = visibleText(leaves[t]);
                if (txt.length > 5 && txt.length < 100 && leaves[t].children.length === 0) {
                    hints.push({
                        name: 'field_' + added,
                        selector: buildSelector(leaves[t].tagName.toLowerCase(), classTokens(leaves[t]), 1),
                        attribute: '',
                        sample: txt.slice(0, 60)
                    });
                    added++;
                }
            }
            return hints;
        }

        var candidates = [];
        var parents = document.querySelectorAll('main, [role="main"], section, div, ul, ol, tbody');
        for (var p = 0; p < parents.length; p++) {
            // Group direct children by tag + first two class names.
            var groups = Object.create(null);
            var children = parents[p].children;
            for (var c = 0; c < children.length; c++) {
                var child = children[c];
                var tokens = classTokens(child);
                var key = child.tagName;
                if (tokens.length > 0) key += '.' + tokens.slice(0, 2).join('.');
                if (!groups[key]) {
                    groups[key] = { count: 0, tag: child.tagName.toLowerCase(), tokens: tokens, first: child, sample: '' };
                }
                groups[key].count++;
                if (!groups[key].sample) groups[key].sample = visibleText(child).slice(0, 150);
            }
            var keys = Object.keys(groups);
            for (var k = 0; k < keys.length; k++) {
                var info = groups[keys[k]];
                if (info.count < ${min_items}) continue;
                candidates.push({
                    selector: buildSelector(info.tag, info.tokens, 2),
                    count: info.count,
                    sample_text: info.sample.slice(0, 100),
                    // Hints come from the first child that was actually counted,
                    // not from whichever descendant happens to match the selector.
                    suggested_fields: suggestFields(info.first)
                });
            }
        }

        // Largest groups first; keep one entry per selector; cap at ten.
        candidates.sort(function (a, b) { return b.count - a.count; });
        var seen = Object.create(null);
        var unique = [];
        for (var u = 0; u < candidates.length; u++) {
            if (!seen[candidates[u].selector]) {
                seen[candidates[u].selector] = true;
                unique.push(candidates[u]);
            }
        }
        return unique.slice(0, 10);
    })()`);
    if (patterns.length === 0) {
        return { content: [{ type: "text", text: "No repeated content patterns detected. Try scrolling down to load more content." }] };
    }
    let text = `Detected ${patterns.length} content pattern(s):\n\n`;
    for (const p of patterns) {
        text += `--- ${p.count} items: ${p.selector} ---\n`;
        text += `Sample: "${p.sample_text}"\n`;
        if (p.suggested_fields?.length) {
            text += `Suggested extract_structured call:\n`;
            text += ` container_selector: "${p.selector}"\n`;
            text += ` fields:\n`;
            for (const f of p.suggested_fields) {
                text += ` - {name: "${f.name}", selector: "${f.selector}"${f.attribute ? `, attribute: "${f.attribute}"` : ''}} → "${f.sample}"\n`;
            }
        }
        text += `\n`;
    }
    return { content: [{ type: "text", text }] };
});
|
|
1016
|
+
/**
 * Tool: extract_structured
 *
 * Extracts one record per top-level element matching container_selector.
 * For every requested field the first matching descendant is read (attribute
 * value, direct text, or full innerText). Records whose fields are all empty
 * are skipped, and records are de-duplicated by the chosen field (or by the
 * first non-empty field when none is given).
 *
 * @param {string} container_selector - CSS selector for each repeated item.
 * @param {Array<{name: string, selector: string, attribute: string}>} fields
 * @param {number} limit - maximum number of records returned; at most
 *   limit * 2 containers are inspected.
 * @param {string} deduplicate_by - field name used as the uniqueness key.
 * @param {boolean} direct_text_only - read only the element's own text nodes.
 * @returns MCP text content holding a JSON report with total_on_page,
 *   top_level, unique_extracted and items.
 */
server.tool("extract_structured", "Extract structured data from repeated elements (cards, rows, listings). Auto-deduplicates, filters empty items, extracts direct text only. Use detect_content_pattern first to find correct selectors.", {
    container_selector: z.string().describe("CSS selector for each repeated item. Use detect_content_pattern to find this."),
    fields: z.array(z.object({
        name: z.string().describe("Field name in output"),
        selector: z.string().describe("CSS selector within each item"),
        attribute: z.string().default("").describe("Attribute to extract (empty = direct text only)"),
    })).describe("Fields to extract from each item"),
    limit: z.number().default(50).describe("Max items to extract"),
    deduplicate_by: z.string().default("").describe("Field name to deduplicate by (empty = auto)"),
    direct_text_only: z.boolean().default(true).describe("Extract only direct text of matched element, not children text (prevents field mixing)"),
}, async ({ container_selector, fields, limit, deduplicate_by, direct_text_only }) => {
    const page = getPage();
    // Caller-supplied strings are embedded as JSON literals so that quotes,
    // backslashes (e.g. CSS-escaped class names) and newlines survive intact
    // and cannot terminate the string inside the page script.
    const selectorLiteral = JSON.stringify(container_selector);
    const fieldsLiteral = JSON.stringify(fields);
    const dedupLiteral = JSON.stringify(deduplicate_by);
    const results = await page.evaluate(`(() => {
        var containerSelector = ${selectorLiteral};
        var fields = ${fieldsLiteral};
        var dedup = ${dedupLiteral};
        var directOnly = ${direct_text_only};

        // Text of the element's own text nodes only (prevents field mixing);
        // falls back to the first line of innerText when there is none.
        function directText(el) {
            var text = '';
            for (var n = 0; n < el.childNodes.length; n++) {
                if (el.childNodes[n].nodeType === 3) text += el.childNodes[n].textContent;
            }
            text = text.trim();
            if (!text) {
                var lines = (el.innerText || '').trim().split('\\n');
                text = lines[0] || '';
            }
            return text.trim();
        }

        function readField(container, field) {
            var el = container.querySelector(field.selector);
            if (!el) return '';
            if (field.attribute) return el.getAttribute(field.attribute) || '';
            if (directOnly) return directText(el);
            return (el.innerText || '').trim();
        }

        // Keep only matches that are not nested inside another match.
        var allContainers = document.querySelectorAll(containerSelector);
        var containers = [];
        for (var c = 0; c < allContainers.length; c++) {
            var parent = allContainers[c].parentElement;
            if (!(parent && parent.closest(containerSelector))) containers.push(allContainers[c]);
        }

        var out = [];
        // Prototype-less map: extracted values such as "constructor" or
        // "toString" must not look like already-seen keys.
        var seenKeys = Object.create(null);

        for (var i = 0; i < Math.min(containers.length, ${limit * 2}); i++) {
            var item = {};
            var nonEmptyCount = 0;
            for (var j = 0; j < fields.length; j++) {
                var val = readField(containers[i], fields[j]);
                item[fields[j].name] = val;
                if (val) nonEmptyCount++;
            }

            // Skip items where every field came back empty.
            if (nonEmptyCount === 0) continue;

            // Uniqueness key: the requested field, else the first non-empty field.
            var dedupKey = '';
            if (dedup && item[dedup]) {
                dedupKey = item[dedup];
            } else {
                for (var d = 0; d < fields.length; d++) {
                    if (item[fields[d].name]) { dedupKey = item[fields[d].name]; break; }
                }
            }
            if (dedupKey && seenKeys[dedupKey]) continue;
            if (dedupKey) seenKeys[dedupKey] = true;

            out.push(item);
            if (out.length >= ${limit}) break;
        }

        return {
            total_on_page: allContainers.length,
            top_level: containers.length,
            unique_extracted: out.length,
            items: out
        };
    })()`);
    return { content: [{ type: "text", text: JSON.stringify(results, null, 2) }] };
});
|
|
1118
|
+
/**
 * Tool: extract_table
 *
 * Reads the first element matching selector as a table. The first row
 * supplies the column names; each following row becomes an object keyed by
 * those names ("col_N" where a name is missing). When a cell contains a link
 * with an href, its resolved URL is added under "<key>_url".
 *
 * @param {string} selector - CSS selector for the table.
 * @param {number} limit - maximum number of data rows returned.
 * @returns MCP text content holding a JSON report with headers, total_rows,
 *   extracted and rows, or { error } when no element matches.
 */
server.tool("extract_table", "Extract data from an HTML table as JSON array.", {
    selector: z.string().default("table").describe("CSS selector for the table"),
    limit: z.number().default(100).describe("Max rows"),
}, async ({ selector, limit }) => {
    const page = getPage();
    // Embedded as a JSON literal so quotes, backslashes and newlines in the
    // selector cannot break the page script.
    const selectorLiteral = JSON.stringify(selector);
    const results = await page.evaluate(`(() => {
        var table = document.querySelector(${selectorLiteral});
        if (!table) return { error: 'Table not found' };

        // table.rows lists this table's own rows (thead first, then body, then
        // tfoot) and leaves out rows of nested tables. Fall back to a
        // descendant query when the match is not a real table element.
        var rows = table.rows || table.querySelectorAll('tr');

        // Column names come from the first row only. Matching tr:first-child
        // in every section would also pick up the first body row.
        var headers = [];
        if (rows.length > 0) {
            var headCells = rows[0].cells;
            for (var i = 0; i < headCells.length; i++) headers.push((headCells[i].innerText || '').trim());
        }

        var out = [];
        var start = headers.length > 0 ? 1 : 0;
        for (var r = start; r < Math.min(rows.length, ${limit} + start); r++) {
            var cells = rows[r].cells;
            var row = {};
            for (var c = 0; c < cells.length; c++) {
                var key = headers[c] || ('col_' + c);
                row[key] = (cells[c].innerText || '').trim();
                var link = cells[c].querySelector('a[href]');
                if (link) row[key + '_url'] = link.href;
            }
            out.push(row);
        }
        return { headers: headers, total_rows: rows.length - start, extracted: out.length, rows: out };
    })()`);
    return { content: [{ type: "text", text: JSON.stringify(results, null, 2) }] };
});
|
|
1147
|
+
/**
 * Tool: scrape_page
 *
 * Collects the page title, URL, optional meta tags, text, optional links
 * (first 50 anchors, those with visible text) and the first 20 h1-h3
 * headings. With only_main_content the text, links and headings all come
 * from a copy of the main content element with navigation/header/footer/
 * sidebar elements removed; otherwise, or when no main element is found,
 * they come from document.body.
 *
 * @param {boolean} include_links
 * @param {boolean} include_meta
 * @param {number} max_text_length - text is cut to this many characters, and
 *   further back to the last newline when that lies in the final 20%.
 * @param {boolean} only_main_content
 * @returns MCP text content holding the JSON-serialised result.
 */
server.tool("scrape_page", "Smart page scraper — auto-detect and extract main content, links, metadata. Strips nav/footer noise.", {
    include_links: z.boolean().default(true),
    include_meta: z.boolean().default(true),
    max_text_length: z.number().default(8000).describe("Max text chars (truncates at paragraph boundary)"),
    only_main_content: z.boolean().default(true).describe("Strip nav, header, footer, sidebar — extract only main content area"),
}, async ({ include_links, include_meta, max_text_length, only_main_content }) => {
    const page = getPage();
    const data = await page.evaluate(`(() => {
        var result = {};
        result.title = document.title;
        result.url = location.href;

        // Meta
        if (${include_meta}) {
            var metas = {};
            var metaEls = document.querySelectorAll('meta[name], meta[property]');
            for (var i = 0; i < metaEls.length; i++) {
                var key = metaEls[i].getAttribute('name') || metaEls[i].getAttribute('property');
                metas[key] = metaEls[i].getAttribute('content') || '';
            }
            result.meta = metas;
        }

        // Pick the element that text, links and headings are read from.
        var source = document.body;
        if (${only_main_content}) {
            var mainEl = document.querySelector('main, [role="main"], #main-content, .main-content, #content, .content');
            if (mainEl) {
                // Work on a copy so the live page is never modified.
                var clone = mainEl.cloneNode(true);
                var noise = clone.querySelectorAll('nav, header, footer, aside, [role="navigation"], [role="banner"], [role="contentinfo"], .sidebar, .nav, .footer, .header');
                for (var n = 0; n < noise.length; n++) noise[n].remove();
                // The copy is detached, so innerText behaves like textContent and
                // would include script/style source; drop those elements first.
                // NOTE(review): for the same reason the text carries no
                // layout-derived line breaks - confirm this is acceptable.
                var hidden = clone.querySelectorAll('script, style, noscript, template');
                for (var h = 0; h < hidden.length; h++) hidden[h].remove();
                source = clone;
            }
        }
        var fullText = source ? (source.innerText || '').trim() : '';

        // Truncate at a newline boundary when one is close to the limit.
        var totalLen = fullText.length;
        if (fullText.length > ${max_text_length}) {
            var cutText = fullText.slice(0, ${max_text_length});
            var lastNewline = cutText.lastIndexOf('\\n');
            if (lastNewline > ${max_text_length} * 0.8) {
                cutText = cutText.slice(0, lastNewline);
            }
            result.text = cutText;
            result.truncated = true;
        } else {
            result.text = fullText;
            result.truncated = false;
        }
        result.total_text_length = totalLen;

        // Links, read from the same source as the text so removed noise stays out.
        if (${include_links}) {
            var linkList = [];
            var links = source ? source.querySelectorAll('a[href]') : [];
            for (var j = 0; j < Math.min(links.length, 50); j++) {
                var label = (links[j].innerText || '').trim().slice(0, 80);
                if (label) linkList.push({ text: label, href: links[j].href });
            }
            result.links = linkList;
        }

        // Headings
        var headings = [];
        var hs = source ? source.querySelectorAll('h1, h2, h3') : [];
        for (var k = 0; k < Math.min(hs.length, 20); k++) {
            headings.push({ level: hs[k].tagName, text: (hs[k].innerText || '').trim().slice(0, 100) });
        }
        result.headings = headings;

        return result;
    })()`);
    return { content: [{ type: "text", text: JSON.stringify(data, null, 2) }] };
});
|
|
920
1231
|
// ── Start Server ───────────────────────────────────────────────────────────
|
|
921
1232
|
async function main() {
|
|
922
1233
|
const transport = new StdioServerTransport();
|
package/package.json
CHANGED