@galihvsx/gmr-scraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +51 -0
- package/LICENSE +21 -0
- package/README.md +335 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +681 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +724 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +150 -0
- package/dist/index.d.ts +150 -0
- package/dist/index.js +707 -0
- package/dist/index.js.map +1 -0
- package/package.json +74 -0
package/dist/cli.js
ADDED
|
@@ -0,0 +1,681 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import chalk from 'chalk';
|
|
3
|
+
import { Command } from 'commander';
|
|
4
|
+
import ora from 'ora';
|
|
5
|
+
|
|
6
|
+
// src/cache.ts
|
|
7
|
+
var Cache = class {
  // Insertion-ordered store; Map iteration order gives us FIFO eviction.
  cache = /* @__PURE__ */ new Map();
  ttl;      // entry lifetime in milliseconds
  maxSize;  // max number of entries before FIFO eviction kicks in
  /**
   * Simple TTL + bounded-size in-memory cache.
   * @param {{ttl?: number, maxSize?: number}} [options]
   *   ttl defaults to 300000 ms (5 min), maxSize to 100 entries.
   */
  constructor(options = {}) {
    this.ttl = options.ttl ?? 3e5;
    this.maxSize = options.maxSize ?? 100;
  }
  /**
   * Stores a value under key with a fresh expiry.
   * Fix: only evict the oldest entry when inserting a genuinely NEW key at
   * capacity. The original evicted unconditionally, so overwriting an
   * existing key at capacity needlessly dropped a live entry. Also compare
   * against undefined so falsy-but-valid keys ("" or 0) still get evicted.
   */
  set(key, value) {
    if (!this.cache.has(key) && this.cache.size >= this.maxSize) {
      const oldestKey = this.cache.keys().next().value;
      if (oldestKey !== undefined) {
        this.cache.delete(oldestKey);
      }
    }
    this.cache.set(key, {
      value,
      expiresAt: Date.now() + this.ttl
    });
  }
  /**
   * Returns the cached value, or undefined when absent or expired.
   * Expired entries are deleted lazily on read.
   */
  get(key) {
    const entry = this.cache.get(key);
    if (!entry) {
      return void 0;
    }
    if (Date.now() > entry.expiresAt) {
      this.cache.delete(key);
      return void 0;
    }
    return entry.value;
  }
  // true when key is present AND not expired (delegates expiry to get()).
  has(key) {
    return this.get(key) !== void 0;
  }
  delete(key) {
    this.cache.delete(key);
  }
  clear() {
    this.cache.clear();
  }
  // Number of live entries; sweeps expired ones first so the count is exact.
  size() {
    this.cleanup();
    return this.cache.size;
  }
  // Removes all expired entries.
  cleanup() {
    const now = Date.now();
    for (const [key, entry] of this.cache.entries()) {
      if (now > entry.expiresAt) {
        this.cache.delete(key);
      }
    }
  }
};
|
|
60
|
+
/**
 * Builds the cache key for one review-page request.
 * All five request parameters participate, so distinct pages/sorts/languages
 * never collide.
 * @returns {string} colon-joined composite key
 */
function createCacheKey(placeId, sort, page, query, lang) {
  return placeId + ":" + sort + ":" + page + ":" + query + ":" + lang;
}
|
|
63
|
+
|
|
64
|
+
// src/rate-limiter.ts
|
|
65
|
+
var RateLimiter = class {
  tokens;            // tokens currently available
  lastRefill;        // timestamp (ms) of the last refill
  requestsPerSecond; // steady-state throughput
  burstSize;         // bucket capacity (max tokens)
  refillRate;        // ms between single-token refills
  /**
   * Token-bucket rate limiter.
   * @param {{requestsPerSecond?: number, burstSize?: number}} [options]
   *   Defaults: 2 req/s with a burst of 5.
   */
  constructor(options = {}) {
    this.requestsPerSecond = options.requestsPerSecond ?? 2;
    this.burstSize = options.burstSize ?? 5;
    this.tokens = this.burstSize;
    this.lastRefill = Date.now();
    this.refillRate = 1e3 / this.requestsPerSecond;
  }
  // Tops up the bucket based on elapsed wall-clock time, capped at burstSize.
  refill() {
    const now = Date.now();
    const timePassed = now - this.lastRefill;
    const tokensToAdd = Math.floor(timePassed / this.refillRate);
    if (tokensToAdd > 0) {
      this.tokens = Math.min(this.burstSize, this.tokens + tokensToAdd);
      this.lastRefill = now;
    }
  }
  /**
   * Resolves once a token has been consumed.
   * Fix: the original returned WITHOUT consuming a token whenever the
   * computed wait was <= 0 (no branch handled that case), letting callers
   * bypass the limiter entirely. Loop until a token is actually taken and
   * always wait at least 1 ms so the loop cannot spin hot.
   */
  async acquire() {
    for (;;) {
      this.refill();
      if (this.tokens > 0) {
        this.tokens--;
        return;
      }
      const waitTime = this.refillRate - (Date.now() - this.lastRefill);
      await new Promise((resolve) => setTimeout(resolve, Math.max(waitTime, 1)));
    }
  }
  // Runs fn only after a token has been acquired; returns fn's result.
  async execute(fn) {
    await this.acquire();
    return fn();
  }
};
|
|
104
|
+
|
|
105
|
+
// src/types.ts
|
|
106
|
+
// src/types.ts
/**
 * Review sort orders accepted by the listentitiesreviews endpoint.
 * Shaped like a compiled TypeScript numeric enum: both name -> number and
 * number -> name lookups work, and Object.keys order matches the IIFE form
 * (numeric keys first, then names in declaration order).
 */
var SortEnum = {
  1: "relevant",
  2: "newest",
  3: "highest_rating",
  4: "lowest_rating",
  relevant: 1,
  newest: 2,
  highest_rating: 3,
  lowest_rating: 4
};
|
|
113
|
+
|
|
114
|
+
// src/errors.ts
|
|
115
|
+
var ScraperError = class extends Error {
  /**
   * Base class for all scraper errors.
   * @param {string} message human-readable description
   * @param {string} code machine-readable code (e.g. "FETCH_ERROR")
   */
  constructor(message, code) {
    super(message);
    this.code = code;
    this.name = "ScraperError";
    // captureStackTrace is a V8 extension; guard so these classes also
    // work on non-V8 runtimes (SpiderMonkey, JavaScriptCore).
    if (typeof Error.captureStackTrace === "function") {
      Error.captureStackTrace(this, this.constructor);
    }
  }
};
// Malformed/unsupported input URL. Marked non-retryable by the retry
// logic (retryCondition checks for this name).
var InvalidUrlError = class extends ScraperError {
  constructor(message) {
    super(message, "INVALID_URL");
    this.name = "InvalidUrlError";
  }
};
var FetchError = class extends ScraperError {
  /**
   * Network/HTTP failure while calling the reviews endpoint.
   * @param {string} message
   * @param {number} [statusCode] HTTP status when available
   * @param {unknown} [response]  raw response payload when available
   */
  constructor(message, statusCode, response) {
    super(message, "FETCH_ERROR");
    this.statusCode = statusCode;
    this.response = response;
    this.name = "FetchError";
  }
};
// Raised by withTimeout when an attempt exceeds its deadline.
var TimeoutError = class extends ScraperError {
  constructor(message = "Request timeout") {
    super(message, "TIMEOUT");
    this.name = "TimeoutError";
  }
};
|
|
143
|
+
|
|
144
|
+
// src/extractors.ts
|
|
145
|
+
/**
 * Follows Google short links to their final Maps URL; any other URL is
 * returned untouched. (Matching "goo.gl" also covers "maps.app.goo.gl",
 * which the original tested redundantly.)
 * @param {string} url
 * @returns {Promise<string>} the fully resolved URL
 */
async function resolveUrl(url) {
  if (!url.includes("goo.gl")) {
    return url;
  }
  // HEAD keeps the redirect-following request cheap; fetch exposes the
  // post-redirect location on response.url.
  const response = await fetch(url, { redirect: "follow", method: "HEAD" });
  return response.url;
}
|
|
152
|
+
/**
 * Pulls the place id out of a resolved Google Maps URL.
 * Prefers the hex pair form ("!1s0x…:0x…"), falling back to the generic
 * "!1s<token>!" segment some URLs carry.
 * @param {string} url fully resolved (non-shortened) Maps place URL
 * @returns {string} the extracted place id
 * @throws {InvalidUrlError} when no id is present. Fix: the original threw
 *   a plain Error, which the retry logic (retryCondition keys on
 *   error.name === "InvalidUrlError") would pointlessly retry.
 */
function extractPlaceId(url) {
  const hexPair = url.match(/!1s(0x[0-9a-fA-F]+:0x[0-9a-fA-F]+)/);
  if (hexPair && hexPair[1]) {
    return hexPair[1];
  }
  const generic = url.match(/!1s([a-zA-Z0-9_:]+)!/);
  if (generic && generic[1]) {
    return generic[1];
  }
  throw new InvalidUrlError(
    "Could not extract Place ID from URL. Please ensure it is a valid Google Maps Place URL."
  );
}
|
|
165
|
+
/**
 * Converts a 0x-prefixed hex string to its decimal string representation.
 * BigInt is required: place-id halves routinely exceed Number's 2^53 range.
 * @param {string} hex e.g. "0x4fd7c8a9"
 * @returns {string} decimal digits
 */
function hexToDec(hex) {
  return String(BigInt(hex));
}
|
|
168
|
+
/**
 * Builds the internal listentitiesreviews RPC URL for one page of reviews.
 * @param {string} placeId "hi:lo" hex pair (e.g. "0x…:0x…")
 * @param {number} sort SortEnum value (1-4), emitted as the !3e segment
 * @param {string} [nextPageToken] continuation token; "" requests page one
 * @param {string} [searchQuery] accepted for signature parity; not placed
 *   into the URL by this implementation
 * @param {string} [lang] hl= language code
 * @returns {string} the full RPC URL
 * @throws {Error} when placeId is not a two-part "hi:lo" pair
 */
function buildRpcUrl(placeId, sort, nextPageToken = "", searchQuery = "", lang = "en") {
  const halves = placeId.split(":");
  if (halves.length !== 2) {
    throw new Error(
      `Invalid Place ID format for listentitiesreviews: ${placeId}`
    );
  }
  // Both hex halves become decimal strings; BigInt avoids 2^53 overflow.
  const [hi, lo] = halves.map((half) => BigInt(half).toString());
  const constantToken = "dzvaXrvAMImImAXHsLPICA";
  // 10 reviews per page; the !3s segment carries the continuation token.
  const paginationBlock = nextPageToken ? `!2m2!2i10!3s${nextPageToken}` : `!2m1!2i10`;
  return `https://www.google.com/maps/preview/review/listentitiesreviews?authuser=0&hl=${lang}&gl=in&pb=!1m2!1y${hi}!2y${lo}${paginationBlock}!3e${sort}!4m5!3b1!4b1!5b1!6b1!7b1!5m2!1s${constantToken}!7e81`;
}
|
|
181
|
+
|
|
182
|
+
// src/parser.ts
|
|
183
|
+
/**
 * Maps raw positional review rows from the RPC payload to clean objects.
 * Index meanings (as used below): 0 = author tuple [profileUrl, name],
 * 1 = relative publish date, 3 = review text, 4 = rating, 6 = author id,
 * 10 = review id, 32 = language.
 * @param {unknown[][]} reviews raw rows
 * @returns {object[]} clean review objects
 */
function parseReviews(reviews) {
  return reviews.map((review) => {
    const authorInfo = review[0] || [];
    // Fix: parse once. The original called parseText(review[3]) twice per
    // review, re-running the regex split on identical input.
    const { text, original_text } = parseText(review[3]);
    // Image and owner-response extraction are not implemented in this build.
    const images = null;
    const response = null;
    return {
      review_id: review[10],
      time: {
        published: review[1],
        last_edited: null
      },
      author: {
        name: authorInfo[1],
        profile_url: authorInfo[0],
        url: authorInfo[0],
        id: review[6]
      },
      review: {
        rating: review[4],
        text,
        original_text,
        language: review[32]
      },
      images,
      source: review[10],
      response
    };
  });
}
/**
 * Splits a Google-translated review blob into translated and original parts.
 * Expected shape: "(Translated by Google) <text>\n\n(Original)\n<original>".
 * Untranslated text passes through with original_text = null.
 * @param {string | null | undefined} text
 * @returns {{ text: string | null, original_text: string | null }}
 */
function parseText(text) {
  if (!text) return { text: null, original_text: null };
  const splitPattern = /\n\n\((.+?)\)\n/g;
  const match = splitPattern.exec(text);
  if (match) {
    const splitIndex = match.index;
    const originalHeaderLength = match[0].length;
    const firstPart = text.substring(0, splitIndex);
    const secondPart = text.substring(splitIndex + originalHeaderLength);
    // Strip the leading "(Translated by Google) "-style marker when present.
    const translationParamMatch = firstPart.match(/^\((.+?)\) /);
    let translatedText = firstPart;
    if (translationParamMatch) {
      translatedText = firstPart.substring(translationParamMatch[0].length);
    }
    return {
      text: translatedText,
      original_text: secondPart
    };
  }
  return { text, original_text: null };
}
|
|
233
|
+
|
|
234
|
+
// src/retry.ts
|
|
235
|
+
/**
 * Baseline retry policy: 3 attempts, exponential backoff from 1 s capped at
 * 10 s, 30 s per-attempt timeout. Invalid-URL and parse failures are never
 * retried — retrying cannot fix bad input.
 */
var DEFAULT_RETRY_OPTIONS = {
  maxAttempts: 3,
  initialDelay: 1e3,
  maxDelay: 1e4,
  backoffMultiplier: 2,
  timeout: 3e4,
  retryCondition: (error) =>
    error.name !== "InvalidUrlError" && error.name !== "ParseError"
};
|
|
247
|
+
/**
 * Resolves after roughly ms milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
async function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
250
|
+
/**
 * Exponential backoff: initialDelay * multiplier^(attempt - 1), capped.
 * @param {number} attempt 1-based attempt number
 * @param {number} initialDelay base delay in ms
 * @param {number} maxDelay upper bound in ms
 * @param {number} backoffMultiplier growth factor per attempt
 * @returns {number} delay in ms
 */
function calculateDelay(attempt, initialDelay, maxDelay, backoffMultiplier) {
  const uncapped = initialDelay * backoffMultiplier ** (attempt - 1);
  return Math.min(uncapped, maxDelay);
}
|
|
254
|
+
/**
 * Runs fn with retry + exponential backoff.
 * Merges options over DEFAULT_RETRY_OPTIONS; when opts.timeout is set,
 * each individual attempt is raced against that deadline via withTimeout.
 * Non-retryable errors (per opts.retryCondition) are rethrown immediately.
 * @param {() => Promise<any>} fn the operation to attempt
 * @param {object} [options] partial retry options
 * @returns {Promise<any>} fn's eventual result
 * @throws the last error once attempts are exhausted or retryCondition says stop
 */
async function withRetry(fn, options = {}) {
  const opts = { ...DEFAULT_RETRY_OPTIONS, ...options };
  let lastError;
  for (let attempt = 1; attempt <= opts.maxAttempts; attempt++) {
    try {
      if (opts.timeout) {
        // Race this attempt against the per-attempt timeout.
        return await withTimeout(fn(), opts.timeout);
      }
      return await fn();
    } catch (error) {
      // Normalize non-Error throwables so .name/.message are safe to read.
      lastError = error instanceof Error ? error : new Error(String(error));
      // Give up on the final attempt, or as soon as the error is non-retryable.
      if (attempt === opts.maxAttempts || !opts.retryCondition(lastError)) {
        throw lastError;
      }
      const delay = calculateDelay(
        attempt,
        opts.initialDelay,
        opts.maxDelay,
        opts.backoffMultiplier
      );
      console.warn(
        `Attempt ${attempt}/${opts.maxAttempts} failed: ${lastError.message}. Retrying in ${delay}ms...`
      );
      await sleep(delay);
    }
  }
  // Normally unreachable (the final attempt rethrows inside the loop);
  // only reached if maxAttempts < 1, in which case lastError is undefined.
  throw lastError;
}
|
|
282
|
+
/**
 * Races a promise against a deadline.
 * Fix: clear the timer once the race settles — the original leaked a
 * pending setTimeout per call, keeping the Node event loop alive for up to
 * timeoutMs after the work had already finished.
 * @param {Promise<any>} promise the work being awaited
 * @param {number} timeoutMs deadline in milliseconds
 * @returns {Promise<any>} the promise's result
 * @throws {TimeoutError} when the deadline elapses first
 */
async function withTimeout(promise, timeoutMs) {
  let timer;
  const timeoutPromise = new Promise((_, reject) => {
    timer = setTimeout(() => {
      reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });
  try {
    return await Promise.race([promise, timeoutPromise]);
  } finally {
    clearTimeout(timer);
  }
}
|
|
290
|
+
|
|
291
|
+
// src/utils.ts
|
|
292
|
+
/**
 * Validates scraper() arguments before any network work happens.
 * @param {string} url required place URL
 * @param {string} [sortType] must be a SortEnum key when provided
 * @param {number | "max"} [pages] "max" or a positive number
 * @param {boolean} [clean] accepted for signature compatibility; unused here
 * @throws {InvalidUrlError} on any invalid argument
 */
function validateParams(url, sortType, pages, clean) {
  if (!url) {
    throw new InvalidUrlError("URL is required");
  }
  if (sortType && !SortEnum[sortType]) {
    // List only the name keys of the bidirectional enum in the message.
    const validNames = Object.keys(SortEnum)
      .filter((k) => isNaN(Number(k)))
      .join(", ");
    throw new InvalidUrlError(
      `Invalid sort type: ${sortType}. Valid options: ${validNames}`
    );
  }
  const pagesIsValid =
    pages === void 0 ||
    pages === "max" ||
    (typeof pages === "number" && pages >= 1);
  if (!pagesIsValid) {
    throw new InvalidUrlError("Pages must be 'max' or a positive number");
  }
}
|
|
305
|
+
/**
 * Fetches a single page of raw reviews for a place, with optional caching,
 * rate limiting, and retry.
 * @param {string} placeId "0x…:0x…" hex id pair
 * @param {number} sort SortEnum value
 * @param {string} [nextPageToken] continuation token ("" = first page)
 * @param {string} [searchQuery] forwarded to buildRpcUrl (which currently
 *   does not place it in the URL)
 * @param {string} [lang] hl= language code
 * @param {Cache} [cache] optional response cache
 * @param {RateLimiter} [rateLimiter] optional token-bucket limiter
 * @param {object} [retryOptions] withRetry options; no retries when omitted
 * @returns {Promise<{data: unknown[], nextPageToken: string | null}>}
 */
async function fetchReviews(placeId, sort, nextPageToken = "", searchQuery = "", lang = "en", cache, rateLimiter, retryOptions) {
  // A cache hit short-circuits the network entirely.
  const cacheKey = cache ? createCacheKey(placeId, sort, nextPageToken, searchQuery, lang) : "";
  if (cache && cacheKey) {
    const cached = cache.get(cacheKey);
    if (cached) {
      return cached;
    }
  }
  const fetchFn = async () => {
    const apiUrl = buildRpcUrl(placeId, sort, nextPageToken, searchQuery, lang);
    const actualFetch = async () => {
      // Present a desktop-browser User-Agent on the request.
      const response = await fetch(apiUrl, {
        headers: {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
      });
      if (!response.ok) {
        throw new FetchError(
          `Failed to fetch reviews: ${response.statusText}`,
          response.status
        );
      }
      const textData = await response.text();
      // Strip the ")]}'"-style anti-JSON-hijacking prefix before parsing.
      const cleanJson = textData.replace(/^\)\]\}'/, "");
      let parsedData;
      try {
        parsedData = JSON.parse(cleanJson);
      } catch (e) {
        // Log a short prefix for diagnosis, then surface a typed error.
        console.error("Failed to parse JSON:", cleanJson.substring(0, 100));
        throw new FetchError("Failed to parse API response");
      }
      // Non-array payloads are treated as an empty page, not an error.
      if (!parsedData || !Array.isArray(parsedData)) {
        console.warn("Unexpected response structure:", parsedData);
        return { data: [], nextPageToken: null };
      }
      // Payload layout: index 2 = review rows, index 1 = continuation token.
      const reviews = parsedData[2] || [];
      const nextToken = parsedData[1] || null;
      return { data: reviews, nextPageToken: nextToken };
    };
    // Route the request through the rate limiter when one is configured.
    if (rateLimiter) {
      return rateLimiter.execute(actualFetch);
    }
    return actualFetch();
  };
  const result = retryOptions ? await withRetry(fetchFn, retryOptions) : await fetchFn();
  // Populate the cache only after a successful fetch.
  if (cache && cacheKey) {
    cache.set(cacheKey, result);
  }
  return result;
}
|
|
355
|
+
/**
 * Resolves the place URL, then walks review pages until `pages` is reached
 * or the API stops returning a continuation token.
 * @param {string} url place URL (short links are resolved first)
 * @param {number} sort SortEnum value
 * @param {number | "max"} pages page budget; "max" walks until exhausted
 * @param {string} searchQuery forwarded to fetchReviews
 * @param {boolean} clean when true, rows are run through parseReviews
 * @param {string} [lang] language code
 * @param {Cache} [cache] optional response cache
 * @param {RateLimiter} [rateLimiter] optional limiter; when absent a fixed
 *   1 s delay is inserted between pages instead
 * @param {object} [retryOptions] withRetry options
 * @param {(page: number, total: number | "max") => void} [onProgress]
 * @returns {Promise<unknown[]>} raw rows, or clean objects when clean=true
 */
async function paginateReviews(url, sort, pages, searchQuery, clean, lang = "en", cache, rateLimiter, retryOptions, onProgress) {
  const resolvedUrl = await resolveUrl(url);
  const placeId = extractPlaceId(resolvedUrl);
  const initial = await fetchReviews(
    placeId,
    sort,
    "",
    searchQuery,
    lang,
    cache,
    rateLimiter,
    retryOptions
  );
  let allReviews = [...initial.data];
  let nextPageToken = initial.nextPageToken;
  if (onProgress) {
    onProgress(1, pages);
  }
  // Done early: single-page request, or no continuation token was returned.
  if (pages === 1 || !nextPageToken) {
    return clean ? parseReviews(allReviews) : allReviews;
  }
  let currentPage = 2;
  const maxPages = pages === "max" ? Infinity : pages;
  while (nextPageToken && currentPage <= maxPages) {
    // Politeness delay between pages when no rate limiter is configured.
    if (!rateLimiter) {
      await new Promise((resolve) => setTimeout(resolve, 1e3));
    }
    try {
      const nextBatch = await fetchReviews(
        placeId,
        sort,
        nextPageToken,
        searchQuery,
        lang,
        cache,
        rateLimiter,
        retryOptions
      );
      allReviews = [...allReviews, ...nextBatch.data];
      nextPageToken = nextBatch.nextPageToken;
      if (onProgress) {
        onProgress(currentPage, pages);
      }
      // Empty page with no token means the feed is exhausted.
      if (!nextBatch.data.length && !nextPageToken) {
        break;
      }
    } catch (e) {
      // Best-effort: keep what was scraped so far instead of failing the run.
      console.error(`Error scraping page ${currentPage}:`, e);
      break;
    }
    currentPage++;
  }
  return clean ? parseReviews(allReviews) : allReviews;
}
|
|
409
|
+
|
|
410
|
+
// src/batch.ts
|
|
411
|
+
/**
 * Scrapes several place URLs concurrently with a simple worker pool.
 * Per-URL failures never reject the batch: they become
 * { url, reviews: [], error } entries in the result list. Result order
 * follows completion order, not input order.
 * @param {string[]} urls place URLs to scrape
 * @param {object} [options] concurrency, onProgress, includeAnalytics,
 *   plus any scraper() options (forwarded unchanged)
 * @returns {Promise<object[]>} one result object per input URL
 */
async function batchScraper(urls, options = {}) {
  const {
    concurrency = 3,
    onProgress,
    includeAnalytics = false,
    ...scraperOptions
  } = options;
  const results = [];
  const queue = [...urls];
  let completed = 0;
  // Scrape one URL; catches everything so workers never die mid-queue.
  const processBatch = async (url) => {
    try {
      // NOTE(review): resolveUrl/extractPlaceId run here only to report the
      // placeId on the result; scraper() resolves the same URL again
      // internally, so short links are fetched twice.
      const resolvedUrl = await resolveUrl(url);
      const placeId = extractPlaceId(resolvedUrl);
      const reviews = await scraper(url, scraperOptions);
      const result = {
        url,
        placeId,
        reviews
      };
      // Analytics require the clean (parsed) review shape.
      if (includeAnalytics && scraperOptions.clean) {
        result.analytics = calculateAnalytics(reviews);
      }
      completed++;
      if (onProgress) {
        onProgress(completed, urls.length, url);
      }
      return result;
    } catch (error) {
      completed++;
      if (onProgress) {
        onProgress(completed, urls.length, url);
      }
      return {
        url,
        reviews: [],
        error: error instanceof Error ? error : new Error(String(error))
      };
    }
  };
  // Worker pool: each worker pulls from the shared queue until it drains.
  // Safe without locks — JS is single-threaded, so shift() is atomic
  // between awaits.
  const workers = [];
  for (let i = 0; i < concurrency; i++) {
    workers.push(
      (async () => {
        while (queue.length > 0) {
          const url = queue.shift();
          if (url) {
            const result = await processBatch(url);
            results.push(result);
          }
        }
      })()
    );
  }
  await Promise.all(workers);
  return results;
}
|
|
468
|
+
/**
 * Aggregates rating statistics over clean (parsed) reviews.
 * @param {object[]} reviews output of parseReviews (clean: true)
 * @returns {{
 *   totalReviews: number,
 *   averageRating: number,
 *   ratingDistribution: Record<number, number>,
 *   reviewsWithText: number,
 *   reviewsWithImages: number,
 *   reviewsWithResponse: number
 * }}
 */
function calculateAnalytics(reviews) {
  const ratingDistribution = { 1: 0, 2: 0, 3: 0, 4: 0, 5: 0 };
  let ratingSum = 0;
  let withText = 0;
  let withImages = 0;
  let withResponse = 0;
  for (const entry of reviews) {
    const rating = entry.review.rating;
    ratingSum += rating;
    // Out-of-range ratings still get counted rather than dropped.
    ratingDistribution[rating] = (ratingDistribution[rating] || 0) + 1;
    if (entry.review.text) withText++;
    if (entry.images && entry.images.length > 0) withImages++;
    if (entry.response) withResponse++;
  }
  const totalReviews = reviews.length;
  return {
    totalReviews,
    averageRating: totalReviews > 0 ? ratingSum / totalReviews : 0,
    ratingDistribution,
    reviewsWithText: withText,
    reviewsWithImages: withImages,
    reviewsWithResponse: withResponse
  };
}
|
|
498
|
+
|
|
499
|
+
// src/index.ts
|
|
500
|
+
/**
 * Scrapes Google Maps reviews for a place URL (library entry point).
 * @param {string} url Maps place URL (goo.gl short links supported)
 * @param {object} [options]
 *   sort_type: "relevant" | "newest" | "highest_rating" | "lowest_rating";
 *   search_query: filter string (forwarded; not used in the URL today);
 *   pages: number | "max"; clean: parse rows into objects; lang: hl code;
 *   retry / cache / rateLimit: subsystem options; timeout: per-attempt ms;
 *   onProgress: per-page callback.
 * @returns {Promise<unknown[]>} raw rows, or clean objects when clean=true
 * @throws rethrows any scraping error after logging it
 */
async function scraper(url, options = {}) {
  const {
    sort_type = "relevant",
    search_query = "",
    pages = "max",
    clean = false,
    lang = "en",
    retry,
    cache: cacheOptions,
    rateLimit,
    timeout,
    onProgress
  } = options;
  // Fail fast on bad arguments before touching the network.
  validateParams(url, sort_type, pages);
  const sort = SortEnum[sort_type];
  // Optional subsystems are instantiated only when explicitly requested.
  const cache = cacheOptions?.enabled ? new Cache(cacheOptions) : void 0;
  const rateLimiter = rateLimit ? new RateLimiter(rateLimit) : void 0;
  // The top-level timeout is folded into retry options; explicit retry
  // settings win when the same key appears in both.
  const retryOptions = {
    timeout,
    ...retry
  };
  try {
    return await paginateReviews(
      url,
      sort,
      pages,
      search_query,
      clean,
      lang,
      cache,
      rateLimiter,
      retryOptions,
      onProgress
    );
  } catch (e) {
    // Log for CLI visibility, then rethrow so library callers can handle it.
    console.error("Scraper Error:", e);
    throw e;
  }
}
|
|
539
|
+
|
|
540
|
+
// src/cli/formatters.ts
|
|
541
|
+
/**
 * Pretty-prints any JSON-serializable value with 2-space indentation.
 * @param {unknown} data
 * @returns {string}
 */
function formatJSON(data) {
  const indentWidth = 2;
  return JSON.stringify(data, null, indentWidth);
}
|
|
544
|
+
/**
 * Renders up to 20 clean reviews as a fixed-width text table.
 * Raw (non-clean) rows cannot be tabulated, so a usage hint is returned.
 * @param {object[]} reviews
 * @returns {string}
 */
function formatTable(reviews) {
  if (!reviews || reviews.length === 0) {
    return "No reviews found.";
  }
  const isClean = "review_id" in reviews[0];
  if (!isClean) {
    return "Table format only available with --clean flag";
  }
  const cleanReviews = reviews;
  const header = [
    "Author".padEnd(20),
    "Rating".padEnd(8),
    "Date".padEnd(12),
    "Text".padEnd(50)
  ].join(" | ");
  const separator = "-".repeat(header.length);
  const rows = cleanReviews.slice(0, 20).map((review) => {
    // Fix: fields can legitimately be null/undefined in parsed data (e.g.
    // the author tuple missing its name slot), which made the original
    // crash on .substring/.repeat. Coerce with safe fallbacks first.
    const author = String(review.author.name ?? "Unknown").substring(0, 18).padEnd(20);
    const rating = "\u2B50".repeat(review.review.rating || 0).padEnd(8);
    const date = String(review.time.published ?? "").substring(0, 10).padEnd(12);
    const text = (review.review.text || "No text").substring(0, 48).replace(/\n/g, " ").padEnd(50);
    return [author, rating, date, text].join(" | ");
  });
  return [header, separator, ...rows].join("\n");
}
|
|
569
|
+
/**
 * Serializes clean reviews to CSV. Author/text fields are quoted with
 * embedded quotes doubled; newlines in text are flattened to spaces.
 * Raw (non-clean) rows cannot be exported, so a usage hint is returned.
 * @param {object[]} reviews
 * @returns {string}
 */
function formatCSV(reviews) {
  if (!reviews || reviews.length === 0) {
    return "No reviews found.";
  }
  const isClean = "review_id" in reviews[0];
  if (!isClean) {
    return "CSV format only available with --clean flag";
  }
  const cleanReviews = reviews;
  const header = [
    "review_id",
    "author_name",
    "rating",
    "published",
    "text",
    "has_images",
    "has_response"
  ].join(",");
  const rows = cleanReviews.map((review) => {
    const text = (review.review.text || "").replace(/"/g, '""').replace(/\n/g, " ");
    // Fix: author.name can be missing in parsed data; fall back to "" so
    // .replace is never called on undefined (original crashed here).
    const author = String(review.author.name ?? "").replace(/"/g, '""');
    return [
      review.review_id,
      `"${author}"`,
      review.review.rating,
      review.time.published,
      `"${text}"`,
      review.images && review.images.length > 0 ? "yes" : "no",
      review.response ? "yes" : "no"
    ].join(",");
  });
  return [header, ...rows].join("\n");
}
|
|
601
|
+
|
|
602
|
+
// src/cli/index.ts
|
|
603
|
+
// CLI wiring (commander). Two subcommands: `scrape <url>` for a single
// place and `batch <urls...>` for many. Numeric option values arrive as
// strings from commander and are parsed with parseInt below.
var program = new Command();
program.name("gmr-scraper").description("Modern Google Maps review scraper with TypeScript").version("1.0.0");
// `scrape` — single-URL scrape with full option surface.
program.command("scrape").description("Scrape reviews from a Google Maps URL").argument("<url>", "Google Maps place URL").option(
  "-s, --sort <type>",
  "Sort type (relevant, newest, highest_rating, lowest_rating)",
  "relevant"
).option("-p, --pages <number>", "Number of pages to scrape (or 'max')", "max").option("-q, --query <text>", "Search query to filter reviews", "").option("-l, --lang <code>", "Language code (e.g., en, id, es)", "en").option("-c, --clean", "Return parsed objects instead of raw data", false).option("-o, --output <format>", "Output format (json, table, csv)", "json").option("--cache", "Enable caching", false).option("--retry <attempts>", "Max retry attempts", "3").option("--timeout <ms>", "Request timeout in milliseconds", "30000").option("--rate-limit <rps>", "Requests per second", "2").action(async (url, options) => {
  const spinner = ora("Scraping reviews...").start();
  try {
    // "max" passes through as-is; anything else is a page count.
    const pages = options.pages === "max" ? "max" : parseInt(options.pages);
    const reviews = await scraper(url, {
      sort_type: options.sort,
      pages,
      search_query: options.query,
      lang: options.lang,
      clean: options.clean,
      cache: {
        enabled: options.cache
      },
      retry: {
        maxAttempts: parseInt(options.retry)
      },
      timeout: parseInt(options.timeout),
      rateLimit: {
        requestsPerSecond: parseInt(options.rateLimit)
      },
      // Show "page N/total" when a numeric budget was given, else "page N".
      onProgress: (current, total) => {
        spinner.text = `Scraping page ${current}${total !== "max" ? `/${total}` : ""}...`;
      }
    });
    spinner.succeed(chalk.green(`Scraped ${reviews.length} reviews`));
    // Emit in the requested format; json is the default.
    switch (options.output) {
      case "table":
        console.log(formatTable(reviews));
        break;
      case "csv":
        console.log(formatCSV(reviews));
        break;
      case "json":
      default:
        console.log(formatJSON(reviews));
    }
  } catch (error) {
    spinner.fail(chalk.red("Failed to scrape reviews"));
    console.error(
      chalk.red(error instanceof Error ? error.message : String(error))
    );
    process.exit(1);
  }
});
// `batch` — multi-URL scrape via batchScraper's worker pool; always JSON out.
program.command("batch").description("Scrape reviews from multiple URLs").argument("<urls...>", "Google Maps place URLs").option("-s, --sort <type>", "Sort type", "relevant").option("-p, --pages <number>", "Number of pages to scrape", "max").option("-c, --clean", "Return parsed objects", false).option("--concurrency <number>", "Number of concurrent requests", "3").option("--analytics", "Include analytics", false).action(async (urls, options) => {
  const spinner = ora(`Scraping ${urls.length} locations...`).start();
  try {
    const pages = options.pages === "max" ? "max" : parseInt(options.pages);
    const results = await batchScraper(urls, {
      sort_type: options.sort,
      pages,
      clean: options.clean,
      concurrency: parseInt(options.concurrency),
      includeAnalytics: options.analytics,
      onProgress: (completed, total, url) => {
        spinner.text = `Progress: ${completed}/${total} (${url})`;
      }
    });
    spinner.succeed(
      chalk.green(`Completed scraping ${urls.length} locations`)
    );
    console.log(formatJSON(results));
  } catch (error) {
    spinner.fail(chalk.red("Batch scraping failed"));
    console.error(
      chalk.red(error instanceof Error ? error.message : String(error))
    );
    process.exit(1);
  }
});
program.parse();
//# sourceMappingURL=cli.js.map
|