mcp-scraper 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/README.md +13 -2
  2. package/dist/bin/api-server.cjs +573 -172
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +2 -2
  5. package/dist/bin/mcp-stdio-server.cjs +300 -150
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +2 -1
  8. package/dist/bin/mcp-stdio-server.js.map +1 -1
  9. package/dist/bin/paa-harvest.cjs +22 -1
  10. package/dist/bin/paa-harvest.cjs.map +1 -1
  11. package/dist/bin/paa-harvest.js +2 -1
  12. package/dist/bin/paa-harvest.js.map +1 -1
  13. package/dist/{chunk-4OHPDEZM.js → chunk-3OIRNUF5.js} +303 -151
  14. package/dist/chunk-3OIRNUF5.js.map +1 -0
  15. package/dist/{chunk-W4P2U5VF.js → chunk-LUBDFS67.js} +32 -32
  16. package/dist/chunk-LUBDFS67.js.map +1 -0
  17. package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
  18. package/dist/chunk-ZK456YXN.js.map +1 -0
  19. package/dist/chunk-ZMOWIBMK.js +36 -0
  20. package/dist/chunk-ZMOWIBMK.js.map +1 -0
  21. package/dist/index.cjs +22 -1
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.js +2 -1
  24. package/dist/index.js.map +1 -1
  25. package/dist/{server-V5XMVRYE.js → server-YNJHP5PU.js} +235 -22
  26. package/dist/server-YNJHP5PU.js.map +1 -0
  27. package/dist/{worker-UT4ZQU2T.js → worker-PBG6LGET.js} +4 -3
  28. package/dist/{worker-UT4ZQU2T.js.map → worker-PBG6LGET.js.map} +1 -1
  29. package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
  30. package/docs/adr/README.md +11 -0
  31. package/docs/mcp-tool-quality-spec.md +238 -0
  32. package/package.json +5 -4
  33. package/dist/chunk-4OHPDEZM.js.map +0 -1
  34. package/dist/chunk-7HB7NDOY.js.map +0 -1
  35. package/dist/chunk-W4P2U5VF.js.map +0 -1
  36. package/dist/server-V5XMVRYE.js.map +0 -1
@@ -0,0 +1,36 @@
1
+ // src/errors.ts
2
+ var RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
3
/**
 * Scrubs vendor-specific wording ("kernel" / "kernel.sh") out of a message.
 *
 * Replacements are applied in order, most specific first (case-insensitive):
 *   - "kernel.sh sessions" / "kernel sessions"  -> "sessions"
 *   - "kernel.sh session"  / "kernel session"   -> "this session"
 *   - any remaining "kernel.sh" or standalone "kernel" -> "the service"
 * Afterwards runs of spaces are collapsed to a single space and the result
 * is trimmed.
 *
 * @param {string} message Text to sanitize.
 * @returns {string} The message with the vendor wording replaced.
 */
function sanitizeVendorName(message) {
  return message
    // Plural rules end in `sessions\b` (not `sessions?`) so they no longer
    // swallow the singular form; otherwise the "this session" rules that
    // follow them could never match.
    .replace(/kernel\.sh\s+sessions\b/gi, "sessions")
    .replace(/kernel\.sh\s+session\b/gi, "this session")
    .replace(/kernel\.sh/gi, "the service")
    // Leading \b keeps these in line with the standalone-word rule below, so
    // words that merely end in "kernel" are left alone.
    .replace(/\bkernel\s+sessions\b/gi, "sessions")
    .replace(/\bkernel\s+session\b/gi, "this session")
    .replace(/\bkernel\b/gi, "the service")
    // Replacements can leave doubled spaces behind; normalize them.
    .replace(/ +/g, " ")
    .trim();
}
6
/**
 * Error raised when a CAPTCHA is detected.
 *
 * The message is "CAPTCHA detected. " followed by the supplied instructions;
 * the raw instructions are also kept on the instance.
 */
var CaptchaError = class extends Error {
  /** The instructions text passed to the constructor. */
  instructions;
  /** Overrides the default Error name. */
  name = "CaptchaError";

  /**
   * @param {string} instructions Text appended to the error message and
   *   stored on `this.instructions`.
   */
  constructor(instructions) {
    super(`CAPTCHA detected. ${instructions}`);
    this.instructions = instructions;
  }
};
14
/**
 * Error carrying a message plus an optional underlying cause.
 */
var ExtractionError = class extends Error {
  /** Whatever was passed as the second constructor argument (may be undefined). */
  cause;
  /** Overrides the default Error name. */
  name = "ExtractionError";

  /**
   * @param {string} message Error message.
   * @param {unknown} [cause] Value stored on `this.cause`.
   */
  constructor(message, cause) {
    super(message);
    this.cause = cause;
  }
};
22
/**
 * Error signalling that a request was aborted.
 *
 * When no message is given, a default message is used.
 */
var RequestAbortedError = class extends Error {
  /**
   * @param {string} [message] Error message; defaults to
   *   "Request aborted before harvest completed".
   */
  constructor(message = "Request aborted before harvest completed") {
    super(message);
  }

  /** Overrides the default Error name. */
  name = "RequestAbortedError";
};
28
+
29
+ export {
30
+ RECAPTCHA_INSTRUCTIONS,
31
+ sanitizeVendorName,
32
+ CaptchaError,
33
+ ExtractionError,
34
+ RequestAbortedError
35
+ };
36
+ //# sourceMappingURL=chunk-ZMOWIBMK.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/errors.ts"],"sourcesContent":["export const RECAPTCHA_INSTRUCTIONS = 'Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.'\n\nexport function sanitizeVendorName(message: string): string {\n return message\n .replace(/kernel\\.sh\\s+sessions?/gi, 'sessions')\n .replace(/kernel\\.sh\\s+session/gi, 'this session')\n .replace(/kernel\\.sh/gi, 'the service')\n .replace(/kernel\\s+sessions?/gi, 'sessions')\n .replace(/kernel\\s+session/gi, 'this session')\n .replace(/\\bkernel\\b/gi, 'the service')\n .replace(/ +/g, ' ')\n .trim()\n}\n\nexport class CaptchaError extends Error {\n readonly name = 'CaptchaError'\n constructor(public readonly instructions: string) {\n super(`CAPTCHA detected. ${instructions}`)\n }\n}\n\nexport class ExtractionError extends Error {\n readonly name = 'ExtractionError'\n constructor(message: string, public readonly cause?: unknown) {\n super(message)\n }\n}\n\nexport class RequestAbortedError extends Error {\n readonly name = 'RequestAbortedError'\n constructor(message = 'Request aborted before harvest completed') {\n super(message)\n }\n}\n"],"mappings":";AAAO,IAAM,yBAAyB;AAE/B,SAAS,mBAAmB,SAAyB;AAC1D,SAAO,QACJ,QAAQ,4BAA4B,UAAU,EAC9C,QAAQ,0BAA0B,cAAc,EAChD,QAAQ,gBAAgB,aAAa,EACrC,QAAQ,wBAAwB,UAAU,EAC1C,QAAQ,sBAAsB,cAAc,EAC5C,QAAQ,gBAAgB,aAAa,EACrC,QAAQ,QAAQ,GAAG,EACnB,KAAK;AACV;AAEO,IAAM,eAAN,cAA2B,MAAM;AAAA,EAEtC,YAA4B,cAAsB;AAChD,UAAM,qBAAqB,YAAY,EAAE;AADf;AAAA,EAE5B;AAAA,EAF4B;AAAA,EADnB,OAAO;AAIlB;AAEO,IAAM,kBAAN,cAA8B,MAAM;AAAA,EAEzC,YAAY,SAAiC,OAAiB;AAC5D,UAAM,OAAO;AAD8B;AAAA,EAE7C;AAAA,EAF6C;AAAA,EADpC,OAAO;AAIlB;AAEO,IAAM,sBAAN,cAAkC,MAAM;AAAA,EACpC,OAAO;AAAA,EAChB,YAAY,UAAU,4CAA4C;AAChE,UAAM,OAAO;AAAA,EACf;AACF;","names":[]}
package/dist/index.cjs CHANGED
@@ -71,6 +71,16 @@ var MapsPlaceOptionsSchema = import_zod.z.object({
71
71
  kernelProxyId: import_zod.z.string().optional(),
72
72
  headless: import_zod.z.boolean().default(true)
73
73
  });
74
/**
 * Zod schema for maps-search options.
 *
 * Fields and their validation rules:
 *   - query:         non-empty string (required)
 *   - location:      optional string
 *   - gl:            exactly 2 characters, defaults to "us"
 *   - hl:            exactly 2 characters, defaults to "en"
 *   - maxResults:    integer in [1, 50], defaults to 10
 *   - kernelApiKey:  optional string
 *   - kernelProxyId: optional string
 *   - headless:      boolean, defaults to true
 *
 * NOTE(review): gl/hl look like country/language codes — confirm against callers.
 */
var MapsSearchOptionsSchema = (() => {
  const { z } = import_zod;
  // Small local builders for the field shapes that repeat below.
  const optionalString = () => z.string().optional();
  const twoCharCode = (fallback) => z.string().length(2).default(fallback);

  return z.object({
    query: z.string().min(1),
    location: optionalString(),
    gl: twoCharCode("us"),
    hl: twoCharCode("en"),
    maxResults: z.number().int().min(1).max(50).default(10),
    kernelApiKey: optionalString(),
    kernelProxyId: optionalString(),
    headless: z.boolean().default(true)
  });
})();
74
84
  var RawPAAItemSchema = import_zod.z.object({
75
85
  question: import_zod.z.string().min(1),
76
86
  answer: import_zod.z.string().optional(),
@@ -934,8 +944,19 @@ function addCandidate(candidates, city, region, example) {
934
944
  }
935
945
  candidates.set(key, { city: normalizedCity, regionCode, count: 1, examples: [example] });
936
946
  }
947
/**
 * Best-effort percent-decoding that never throws.
 *
 * Tries, in order:
 *   1. decoding the text as-is;
 *   2. decoding after escaping every "%" that is not followed by two hex
 *      digits (rewritten to "%25");
 * and falls back to returning the input unchanged if both attempts fail.
 *
 * @param {string} text Possibly percent-encoded text.
 * @returns {string} The decoded text, or the original text when it cannot be decoded.
 */
function decodeSerpText(text) {
  const candidates = [
    () => text,
    () => text.replace(/%(?![0-9a-fA-F]{2})/g, "%25")
  ];
  for (const prepare of candidates) {
    try {
      return decodeURIComponent(prepare());
    } catch {
      // Malformed input for this strategy; move on to the next one.
    }
  }
  return text;
}
937
958
  function scanText(candidates, text) {
938
- const normalized = decodeURIComponent(text).replace(/[+/|_-]+/g, " ");
959
+ const normalized = decodeSerpText(text).replace(/[+/|_-]+/g, " ");
939
960
  for (const match of normalized.matchAll(CITY_STATE_RE)) {
940
961
  addCandidate(candidates, match[1] ?? "", match[2] ?? "", normalized.slice(0, 180));
941
962
  }