mcp-scraper 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +13 -2
  2. package/dist/bin/api-server.cjs +957 -243
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +2 -2
  5. package/dist/bin/mcp-stdio-server.cjs +540 -158
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +2 -1
  8. package/dist/bin/mcp-stdio-server.js.map +1 -1
  9. package/dist/bin/paa-harvest.cjs +36 -5
  10. package/dist/bin/paa-harvest.cjs.map +1 -1
  11. package/dist/bin/paa-harvest.js +5 -3
  12. package/dist/bin/paa-harvest.js.map +1 -1
  13. package/dist/{chunk-6TWZS2FQ.js → chunk-RE6HCRYC.js} +543 -159
  14. package/dist/chunk-RE6HCRYC.js.map +1 -0
  15. package/dist/{chunk-W4P2U5VF.js → chunk-TM22BLWP.js} +46 -34
  16. package/dist/chunk-TM22BLWP.js.map +1 -0
  17. package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
  18. package/dist/chunk-ZK456YXN.js.map +1 -0
  19. package/dist/chunk-ZMOWIBMK.js +36 -0
  20. package/dist/chunk-ZMOWIBMK.js.map +1 -0
  21. package/dist/index.cjs +34 -3
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.js +2 -1
  24. package/dist/index.js.map +1 -1
  25. package/dist/{server-2Y27U4TO.js → server-QXVVTKJP.js} +311 -48
  26. package/dist/server-QXVVTKJP.js.map +1 -0
  27. package/dist/{worker-UT4ZQU2T.js → worker-AUCXFHEL.js} +6 -4
  28. package/dist/worker-AUCXFHEL.js.map +1 -0
  29. package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
  30. package/docs/adr/README.md +11 -0
  31. package/docs/mcp-tool-quality-spec.md +238 -0
  32. package/package.json +5 -4
  33. package/dist/chunk-6TWZS2FQ.js.map +0 -1
  34. package/dist/chunk-7HB7NDOY.js.map +0 -1
  35. package/dist/chunk-W4P2U5VF.js.map +0 -1
  36. package/dist/server-2Y27U4TO.js.map +0 -1
  37. package/dist/worker-UT4ZQU2T.js.map +0 -1
@@ -0,0 +1,36 @@
1
+ // src/errors.ts
2
// User-facing remediation text for a Google CAPTCHA: tells the operator to rerun
// with --headless=false to re-warm the browser profile and then retry.
+ var RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
3
/**
 * Removes references to the "kernel" / "kernel.sh" vendor from a message.
 *
 * Rewrites, case-insensitively and in this order:
 *   - "kernel.sh sessions" / "kernel sessions" -> "sessions"
 *   - "kernel.sh session"  / "kernel session"  -> "this session"
 *   - any remaining "kernel.sh" or standalone "kernel" -> "the service"
 * Runs of spaces are then collapsed to one space and the result is trimmed.
 *
 * @param message Text that may mention the vendor.
 * @returns The message with vendor references replaced.
 */
function sanitizeVendorName(message) {
  // The plural patterns must require the trailing "s". With an optional "s"
  // they also consume the singular form, which leaves the singular
  // "this session" rules unreachable.
  return message
    .replace(/kernel\.sh\s+sessions/gi, "sessions")
    .replace(/kernel\.sh\s+session/gi, "this session")
    .replace(/kernel\.sh/gi, "the service")
    .replace(/kernel\s+sessions/gi, "sessions")
    .replace(/kernel\s+session/gi, "this session")
    .replace(/\bkernel\b/gi, "the service")
    .replace(/ +/g, " ")
    .trim();
}
6
// Error raised when a CAPTCHA is detected.
// The message is "CAPTCHA detected. " followed by the supplied instructions; the
// raw instructions string is also kept on the instance as `instructions`.
+ var CaptchaError = class extends Error {
7
+ constructor(instructions) {
8
+ super(`CAPTCHA detected. ${instructions}`);
9
+ this.instructions = instructions;
10
+ }
11
// Remediation text passed to the constructor.
+ instructions;
12
// Instance-level `name` so the error reports as "CaptchaError" rather than "Error".
+ name = "CaptchaError";
13
+ };
14
// Error carrying a message plus an optional underlying `cause`.
// The cause is stored as an instance field; it is not forwarded to the Error
// constructor's options argument.
+ var ExtractionError = class extends Error {
15
+ constructor(message, cause) {
16
+ super(message);
17
+ this.cause = cause;
18
+ }
19
// Underlying failure, if one was supplied (undefined otherwise).
+ cause;
20
// Instance-level `name` so the error reports as "ExtractionError".
+ name = "ExtractionError";
21
+ };
22
// Error signalling that a request was aborted.
// When no message is given it defaults to
// "Request aborted before harvest completed".
+ var RequestAbortedError = class extends Error {
23
// Instance-level `name` so the error reports as "RequestAbortedError".
+ name = "RequestAbortedError";
24
+ constructor(message = "Request aborted before harvest completed") {
25
+ super(message);
26
+ }
27
+ };
28
+
29
+ export {
30
+ RECAPTCHA_INSTRUCTIONS,
31
+ sanitizeVendorName,
32
+ CaptchaError,
33
+ ExtractionError,
34
+ RequestAbortedError
35
+ };
36
+ //# sourceMappingURL=chunk-ZMOWIBMK.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/errors.ts"],"sourcesContent":["export const RECAPTCHA_INSTRUCTIONS = 'Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.'\n\nexport function sanitizeVendorName(message: string): string {\n return message\n .replace(/kernel\\.sh\\s+sessions?/gi, 'sessions')\n .replace(/kernel\\.sh\\s+session/gi, 'this session')\n .replace(/kernel\\.sh/gi, 'the service')\n .replace(/kernel\\s+sessions?/gi, 'sessions')\n .replace(/kernel\\s+session/gi, 'this session')\n .replace(/\\bkernel\\b/gi, 'the service')\n .replace(/ +/g, ' ')\n .trim()\n}\n\nexport class CaptchaError extends Error {\n readonly name = 'CaptchaError'\n constructor(public readonly instructions: string) {\n super(`CAPTCHA detected. ${instructions}`)\n }\n}\n\nexport class ExtractionError extends Error {\n readonly name = 'ExtractionError'\n constructor(message: string, public readonly cause?: unknown) {\n super(message)\n }\n}\n\nexport class RequestAbortedError extends Error {\n readonly name = 'RequestAbortedError'\n constructor(message = 'Request aborted before harvest completed') {\n super(message)\n }\n}\n"],"mappings":";AAAO,IAAM,yBAAyB;AAE/B,SAAS,mBAAmB,SAAyB;AAC1D,SAAO,QACJ,QAAQ,4BAA4B,UAAU,EAC9C,QAAQ,0BAA0B,cAAc,EAChD,QAAQ,gBAAgB,aAAa,EACrC,QAAQ,wBAAwB,UAAU,EAC1C,QAAQ,sBAAsB,cAAc,EAC5C,QAAQ,gBAAgB,aAAa,EACrC,QAAQ,QAAQ,GAAG,EACnB,KAAK;AACV;AAEO,IAAM,eAAN,cAA2B,MAAM;AAAA,EAEtC,YAA4B,cAAsB;AAChD,UAAM,qBAAqB,YAAY,EAAE;AADf;AAAA,EAE5B;AAAA,EAF4B;AAAA,EADnB,OAAO;AAIlB;AAEO,IAAM,kBAAN,cAA8B,MAAM;AAAA,EAEzC,YAAY,SAAiC,OAAiB;AAC5D,UAAM,OAAO;AAD8B;AAAA,EAE7C;AAAA,EAF6C;AAAA,EADpC,OAAO;AAIlB;AAEO,IAAM,sBAAN,cAAkC,MAAM;AAAA,EACpC,OAAO;AAAA,EAChB,YAAY,UAAU,4CAA4C;AAChE,UAAM,OAAO;AAAA,EACf;AACF;","names":[]}
package/dist/index.cjs CHANGED
@@ -71,6 +71,16 @@ var MapsPlaceOptionsSchema = import_zod.z.object({
71
71
  kernelProxyId: import_zod.z.string().optional(),
72
72
  headless: import_zod.z.boolean().default(true)
73
73
  });
74
// Zod schema for the options of a maps search request.
// Only `query` is required; the remaining fields are optional or carry defaults.
+ var MapsSearchOptionsSchema = import_zod.z.object({
75
// Required, non-empty search string.
+ query: import_zod.z.string().min(1),
76
+ location: import_zod.z.string().optional(),
77
// Exactly two characters, default "us" (presumably a country code; confirm against callers).
+ gl: import_zod.z.string().length(2).default("us"),
78
// Exactly two characters, default "en" (presumably a language code; confirm against callers).
+ hl: import_zod.z.string().length(2).default("en"),
79
// Integer from 1 to 50 inclusive, default 10.
+ maxResults: import_zod.z.number().int().min(1).max(50).default(10),
80
+ kernelApiKey: import_zod.z.string().optional(),
81
+ kernelProxyId: import_zod.z.string().optional(),
82
// Defaults to true when omitted.
+ headless: import_zod.z.boolean().default(true)
83
+ });
74
84
  var RawPAAItemSchema = import_zod.z.object({
75
85
  question: import_zod.z.string().min(1),
76
86
  answer: import_zod.z.string().optional(),
@@ -118,6 +128,16 @@ var RawMapsAboutAttributeSchema = import_zod.z.object({
118
128
  attribute: import_zod.z.string()
119
129
  });
120
130
 
131
+ // src/lib/browser-service-env.ts
132
/**
 * Resolves the browser-service API key from the environment.
 *
 * Reads BROWSER_SERVICE_API_KEY first and falls back to the legacy
 * KERNEL_API_KEY. Values are trimmed, and a blank or whitespace-only value
 * counts as unset.
 *
 * @returns The trimmed key, or undefined when neither variable holds a
 *   non-blank value.
 */
function browserServiceApiKey() {
  const primary = process.env.BROWSER_SERVICE_API_KEY?.trim();
  const legacy = process.env.KERNEL_API_KEY?.trim();
  // `||` rather than `??`: a primary variable that is set but blank must not
  // mask a usable legacy value.
  return primary || legacy || void 0;
}
136
/**
 * Resolves the browser-service proxy id from the environment.
 *
 * Reads BROWSER_SERVICE_PROXY_ID first and falls back to the legacy
 * KERNEL_PROXY_ID. Values are trimmed, and a blank or whitespace-only value
 * counts as unset.
 *
 * @returns The trimmed proxy id, or undefined when neither variable holds a
 *   non-blank value.
 */
function browserServiceProxyId() {
  const primary = process.env.BROWSER_SERVICE_PROXY_ID?.trim();
  const legacy = process.env.KERNEL_PROXY_ID?.trim();
  // `||` rather than `??`: a primary variable that is set but blank must not
  // mask a usable legacy value.
  return primary || legacy || void 0;
}
140
+
121
141
  // src/driver/BrowserDriver.ts
122
142
  var import_playwright_extra = require("playwright-extra");
123
143
  var import_puppeteer_extra_plugin_stealth = __toESM(require("puppeteer-extra-plugin-stealth"), 1);
@@ -934,8 +954,19 @@ function addCandidate(candidates, city, region, example) {
934
954
  }
935
955
  candidates.set(key, { city: normalizedCity, regionCode, count: 1, examples: [example] });
936
956
  }
957
// Best-effort percent-decoding that never throws on malformed input.
// Tries three things in order: a plain decodeURIComponent; a second decode
// after escaping stray "%" characters; and finally the input returned unchanged.
+ function decodeSerpText(text) {
958
+ try {
959
+ return decodeURIComponent(text);
960
+ } catch {
961
// Deliberately ignored: fall through to the lenient retry below.
+ }
962
+ try {
963
// Rewrite every "%" not followed by two hex digits as "%25" so it decodes to a literal "%".
+ return decodeURIComponent(text.replace(/%(?![0-9a-fA-F]{2})/g, "%25"));
964
+ } catch {
965
// Still undecodable after the repair: hand back the original text.
+ return text;
966
+ }
967
+ }
937
968
  function scanText(candidates, text) {
938
- const normalized = decodeURIComponent(text).replace(/[+/|_-]+/g, " ");
969
+ const normalized = decodeSerpText(text).replace(/[+/|_-]+/g, " ");
939
970
  for (const match of normalized.matchAll(CITY_STATE_RE)) {
940
971
  addCandidate(candidates, match[1] ?? "", match[2] ?? "", normalized.slice(0, 180));
941
972
  }
@@ -2603,8 +2634,8 @@ async function harvest(rawOptions) {
2603
2634
  const onAttemptEvent = getAttemptLogSink(rawOptions);
2604
2635
  const requestedProxyMode = raw.proxyMode;
2605
2636
  const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
2606
- const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : process.env.KERNEL_API_KEY?.trim();
2607
- const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : process.env.KERNEL_PROXY_ID?.trim();
2637
+ const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : browserServiceApiKey();
2638
+ const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : browserServiceProxyId();
2608
2639
  const proxyOpts = {
2609
2640
  kernelApiKey,
2610
2641
  proxyMode,