mcp-scraper 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/bin/api-server.cjs +957 -243
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +540 -158
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +2 -1
- package/dist/bin/mcp-stdio-server.js.map +1 -1
- package/dist/bin/paa-harvest.cjs +36 -5
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +5 -3
- package/dist/bin/paa-harvest.js.map +1 -1
- package/dist/{chunk-6TWZS2FQ.js → chunk-RE6HCRYC.js} +543 -159
- package/dist/chunk-RE6HCRYC.js.map +1 -0
- package/dist/{chunk-W4P2U5VF.js → chunk-TM22BLWP.js} +46 -34
- package/dist/chunk-TM22BLWP.js.map +1 -0
- package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
- package/dist/chunk-ZK456YXN.js.map +1 -0
- package/dist/chunk-ZMOWIBMK.js +36 -0
- package/dist/chunk-ZMOWIBMK.js.map +1 -0
- package/dist/index.cjs +34 -3
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/dist/{server-2Y27U4TO.js → server-QXVVTKJP.js} +311 -48
- package/dist/server-QXVVTKJP.js.map +1 -0
- package/dist/{worker-UT4ZQU2T.js → worker-AUCXFHEL.js} +6 -4
- package/dist/worker-AUCXFHEL.js.map +1 -0
- package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
- package/docs/adr/README.md +11 -0
- package/docs/mcp-tool-quality-spec.md +238 -0
- package/package.json +5 -4
- package/dist/chunk-6TWZS2FQ.js.map +0 -1
- package/dist/chunk-7HB7NDOY.js.map +0 -1
- package/dist/chunk-W4P2U5VF.js.map +0 -1
- package/dist/server-2Y27U4TO.js.map +0 -1
- package/dist/worker-UT4ZQU2T.js.map +0 -1
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
// src/errors.ts
|
|
2
|
+
// User-facing remediation text for the case where Google serves a CAPTCHA:
// it tells the operator to re-run with --headless=false and retry.
// NOTE(review): presumably passed to CaptchaError as its `instructions` — confirm at call sites.
var RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
|
|
3
|
+
/**
 * Removes the browser vendor's name ("kernel" / "kernel.sh") from a message.
 *
 * Rewrites, applied in order and case-insensitively:
 *   - "kernel.sh sessions" / "kernel sessions" -> "sessions"
 *   - "kernel.sh session"  / "kernel session"  -> "this session"
 *   - any remaining "kernel.sh" or standalone word "kernel" -> "the service"
 * Runs of spaces are then collapsed to one and the result is trimmed.
 *
 * @param message Text that may mention the vendor.
 * @returns The message with vendor mentions replaced.
 */
function sanitizeVendorName(message) {
  // A single pass with a captured optional "s" keeps singular and plural
  // apart. Chaining a `sessions?` pattern before a `session` pattern would
  // make the singular rule unreachable and turn "kernel session" into
  // "sessions".
  const sessionPhrase = (_match, pluralSuffix) => pluralSuffix ? "sessions" : "this session";
  return message
    .replace(/kernel\.sh\s+session(s?)/gi, sessionPhrase)
    .replace(/kernel\.sh/gi, "the service")
    .replace(/kernel\s+session(s?)/gi, sessionPhrase)
    .replace(/\bkernel\b/gi, "the service")
    .replace(/ +/g, " ")
    .trim();
}
|
|
6
|
+
/**
 * Error whose message is "CAPTCHA detected. " followed by the supplied
 * instructions; the instructions are also kept on the instance.
 */
var CaptchaError = class extends Error {
  /** Instruction text passed to the constructor. */
  instructions;
  /** Replaces the inherited "Error" name on each instance. */
  name = "CaptchaError";

  /**
   * @param instructions Text appended to the "CAPTCHA detected." message.
   */
  constructor(instructions) {
    super(`CAPTCHA detected. ${instructions}`);
    this.instructions = instructions;
  }
};
|
|
14
|
+
/**
 * Error carrying a message plus an optional underlying cause, stored on the
 * instance as `cause`.
 */
var ExtractionError = class extends Error {
  /** Value given as the second constructor argument (may be undefined). */
  cause;
  /** Replaces the inherited "Error" name on each instance. */
  name = "ExtractionError";

  /**
   * @param message Error message.
   * @param cause Optional underlying value to attach.
   */
  constructor(message, cause) {
    super(message);
    this.cause = cause;
  }
};
|
|
22
|
+
/**
 * Error for an aborted request. When no message is supplied it defaults to
 * "Request aborted before harvest completed".
 */
var RequestAbortedError = class extends Error {
  /**
   * @param message Optional override for the default message.
   */
  constructor(message = "Request aborted before harvest completed") {
    super(message);
  }

  /** Replaces the inherited "Error" name on each instance. */
  name = "RequestAbortedError";
};
|
|
28
|
+
|
|
29
|
+
export {
|
|
30
|
+
RECAPTCHA_INSTRUCTIONS,
|
|
31
|
+
sanitizeVendorName,
|
|
32
|
+
CaptchaError,
|
|
33
|
+
ExtractionError,
|
|
34
|
+
RequestAbortedError
|
|
35
|
+
};
|
|
36
|
+
//# sourceMappingURL=chunk-ZMOWIBMK.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/errors.ts"],"sourcesContent":["export const RECAPTCHA_INSTRUCTIONS = 'Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.'\n\nexport function sanitizeVendorName(message: string): string {\n return message\n .replace(/kernel\\.sh\\s+sessions?/gi, 'sessions')\n .replace(/kernel\\.sh\\s+session/gi, 'this session')\n .replace(/kernel\\.sh/gi, 'the service')\n .replace(/kernel\\s+sessions?/gi, 'sessions')\n .replace(/kernel\\s+session/gi, 'this session')\n .replace(/\\bkernel\\b/gi, 'the service')\n .replace(/ +/g, ' ')\n .trim()\n}\n\nexport class CaptchaError extends Error {\n readonly name = 'CaptchaError'\n constructor(public readonly instructions: string) {\n super(`CAPTCHA detected. ${instructions}`)\n }\n}\n\nexport class ExtractionError extends Error {\n readonly name = 'ExtractionError'\n constructor(message: string, public readonly cause?: unknown) {\n super(message)\n }\n}\n\nexport class RequestAbortedError extends Error {\n readonly name = 'RequestAbortedError'\n constructor(message = 'Request aborted before harvest completed') {\n super(message)\n }\n}\n"],"mappings":";AAAO,IAAM,yBAAyB;AAE/B,SAAS,mBAAmB,SAAyB;AAC1D,SAAO,QACJ,QAAQ,4BAA4B,UAAU,EAC9C,QAAQ,0BAA0B,cAAc,EAChD,QAAQ,gBAAgB,aAAa,EACrC,QAAQ,wBAAwB,UAAU,EAC1C,QAAQ,sBAAsB,cAAc,EAC5C,QAAQ,gBAAgB,aAAa,EACrC,QAAQ,QAAQ,GAAG,EACnB,KAAK;AACV;AAEO,IAAM,eAAN,cAA2B,MAAM;AAAA,EAEtC,YAA4B,cAAsB;AAChD,UAAM,qBAAqB,YAAY,EAAE;AADf;AAAA,EAE5B;AAAA,EAF4B;AAAA,EADnB,OAAO;AAIlB;AAEO,IAAM,kBAAN,cAA8B,MAAM;AAAA,EAEzC,YAAY,SAAiC,OAAiB;AAC5D,UAAM,OAAO;AAD8B;AAAA,EAE7C;AAAA,EAF6C;AAAA,EADpC,OAAO;AAIlB;AAEO,IAAM,sBAAN,cAAkC,MAAM;AAAA,EACpC,OAAO;AAAA,EAChB,YAAY,UAAU,4CAA4C;AAChE,UAAM,OAAO;AAAA,EACf;AACF;","names":[]}
|
package/dist/index.cjs
CHANGED
|
@@ -71,6 +71,16 @@ var MapsPlaceOptionsSchema = import_zod.z.object({
|
|
|
71
71
|
kernelProxyId: import_zod.z.string().optional(),
|
|
72
72
|
headless: import_zod.z.boolean().default(true)
|
|
73
73
|
});
|
|
74
|
+
// Zod schema for the options object of a maps search:
//   query         - required, non-empty string
//   location      - optional string
//   gl            - exactly 2 characters, defaults to "us"
//   hl            - exactly 2 characters, defaults to "en"
//   maxResults    - integer from 1 to 50, defaults to 10
//   kernelApiKey  - optional string
//   kernelProxyId - optional string
//   headless      - boolean, defaults to true
// NOTE(review): gl/hl look like Google's country/language parameters — confirm.
var MapsSearchOptionsSchema = import_zod.z.object({
|
|
75
|
+
query: import_zod.z.string().min(1),
|
|
76
|
+
location: import_zod.z.string().optional(),
|
|
77
|
+
gl: import_zod.z.string().length(2).default("us"),
|
|
78
|
+
hl: import_zod.z.string().length(2).default("en"),
|
|
79
|
+
maxResults: import_zod.z.number().int().min(1).max(50).default(10),
|
|
80
|
+
kernelApiKey: import_zod.z.string().optional(),
|
|
81
|
+
kernelProxyId: import_zod.z.string().optional(),
|
|
82
|
+
headless: import_zod.z.boolean().default(true)
|
|
83
|
+
});
|
|
74
84
|
var RawPAAItemSchema = import_zod.z.object({
|
|
75
85
|
question: import_zod.z.string().min(1),
|
|
76
86
|
answer: import_zod.z.string().optional(),
|
|
@@ -118,6 +128,16 @@ var RawMapsAboutAttributeSchema = import_zod.z.object({
|
|
|
118
128
|
attribute: import_zod.z.string()
|
|
119
129
|
});
|
|
120
130
|
|
|
131
|
+
// src/lib/browser-service-env.ts
|
|
132
|
+
/**
 * Resolves the browser-service API key from the environment.
 *
 * Checks BROWSER_SERVICE_API_KEY first, then KERNEL_API_KEY. A variable that
 * is unset, empty, or whitespace-only is treated as absent, so a blank
 * BROWSER_SERVICE_API_KEY no longer hides a usable KERNEL_API_KEY.
 *
 * @returns The trimmed key, or undefined when neither variable has content.
 */
function browserServiceApiKey() {
  for (const envName of ["BROWSER_SERVICE_API_KEY", "KERNEL_API_KEY"]) {
    const candidate = process.env[envName]?.trim();
    if (candidate) {
      return candidate;
    }
  }
  return void 0;
}
|
|
136
|
+
/**
 * Resolves the browser-service proxy id from the environment.
 *
 * Checks BROWSER_SERVICE_PROXY_ID first, then KERNEL_PROXY_ID. A variable that
 * is unset, empty, or whitespace-only is treated as absent, so a blank
 * BROWSER_SERVICE_PROXY_ID no longer hides a usable KERNEL_PROXY_ID.
 *
 * @returns The trimmed proxy id, or undefined when neither variable has content.
 */
function browserServiceProxyId() {
  for (const envName of ["BROWSER_SERVICE_PROXY_ID", "KERNEL_PROXY_ID"]) {
    const candidate = process.env[envName]?.trim();
    if (candidate) {
      return candidate;
    }
  }
  return void 0;
}
|
|
140
|
+
|
|
121
141
|
// src/driver/BrowserDriver.ts
|
|
122
142
|
var import_playwright_extra = require("playwright-extra");
|
|
123
143
|
var import_puppeteer_extra_plugin_stealth = __toESM(require("puppeteer-extra-plugin-stealth"), 1);
|
|
@@ -934,8 +954,19 @@ function addCandidate(candidates, city, region, example) {
|
|
|
934
954
|
}
|
|
935
955
|
candidates.set(key, { city: normalizedCity, regionCode, count: 1, examples: [example] });
|
|
936
956
|
}
|
|
957
|
+
/**
 * Percent-decodes text on a best-effort basis.
 *
 * Tries the text as-is first. If that fails, retries after escaping every "%"
 * that is not followed by two hex digits (as "%25"). If both attempts throw,
 * the input is returned unchanged.
 *
 * @param text Possibly percent-encoded text.
 * @returns The decoded text, or the original text when it cannot be decoded.
 */
function decodeSerpText(text) {
  const variants = [
    () => text,
    () => text.replace(/%(?![0-9a-fA-F]{2})/g, "%25")
  ];
  for (const buildInput of variants) {
    try {
      return decodeURIComponent(buildInput());
    } catch {
      // This variant could not be decoded; fall through to the next one.
    }
  }
  return text;
}
|
|
937
968
|
function scanText(candidates, text) {
|
|
938
|
-
const normalized =
|
|
969
|
+
const normalized = decodeSerpText(text).replace(/[+/|_-]+/g, " ");
|
|
939
970
|
for (const match of normalized.matchAll(CITY_STATE_RE)) {
|
|
940
971
|
addCandidate(candidates, match[1] ?? "", match[2] ?? "", normalized.slice(0, 180));
|
|
941
972
|
}
|
|
@@ -2603,8 +2634,8 @@ async function harvest(rawOptions) {
|
|
|
2603
2634
|
const onAttemptEvent = getAttemptLogSink(rawOptions);
|
|
2604
2635
|
const requestedProxyMode = raw.proxyMode;
|
|
2605
2636
|
const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
|
|
2606
|
-
const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() :
|
|
2607
|
-
const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() :
|
|
2637
|
+
const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : browserServiceApiKey();
|
|
2638
|
+
const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : browserServiceProxyId();
|
|
2608
2639
|
const proxyOpts = {
|
|
2609
2640
|
kernelApiKey,
|
|
2610
2641
|
proxyMode,
|