gscdump 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +48 -1
- package/dist/index.mjs +74 -1
- package/dist/sitemap.d.mts +14 -1
- package/dist/sitemap.mjs +38 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -1,6 +1,17 @@
|
|
|
1
1
|
import { $Fetch, FetchOptions } from "ofetch";
|
|
2
2
|
import { indexing_v3 } from "@googleapis/indexing/build/v3";
|
|
3
3
|
import { searchconsole_v1 } from "@googleapis/searchconsole/build/v1";
|
|
4
|
+
/**
|
|
5
|
+
* Batch runner with optional concurrency, inter-call delay, and progress.
|
|
6
|
+
* Used by batchRequestIndexing / batchInspectUrls. Defaults to sequential
|
|
7
|
+
* (concurrency = 1) because the underlying APIs rate-limit aggressively;
|
|
8
|
+
* callers that know their quota headroom can opt into parallelism.
*
* @param items - Inputs to process; each one is passed to `operation` together with its index.
* @param operation - Async worker called as `operation(item, index)`; its resolved value is the per-item result.
* @param options - Optional tuning: `delayMs` (the inter-call delay, in milliseconds going by the name),
*   `concurrency` (defaults to 1, i.e. sequential) and `onProgress(result, index, total)`.
* @returns A promise for the collected `R` results.
*   NOTE(review): whether results keep input order when `concurrency > 1`, and how a rejecting
*   `operation` is surfaced, cannot be told from this declaration - confirm against the implementation.
|
|
9
|
+
*/
|
|
10
|
+
declare function runSequentialBatch<I, R>(items: I[], operation: (item: I, index: number) => Promise<R>, options?: {
|
|
11
|
+
delayMs?: number;
|
|
12
|
+
concurrency?: number;
|
|
13
|
+
onProgress?: (result: R, index: number, total: number) => void;
|
|
14
|
+
}): Promise<R[]>;
|
|
4
15
|
type ApiSite = searchconsole_v1.Schema$WmxSite;
|
|
5
16
|
type ApiSitemap = searchconsole_v1.Schema$WmxSitemap;
|
|
6
17
|
type ApiSitemapContent = searchconsole_v1.Schema$WmxSitemapContent;
|
|
@@ -375,6 +386,17 @@ declare function verifySite(client: GoogleSearchConsoleClient, siteUrl: string,
|
|
|
375
386
|
* List all verified WebResources for the authed user.
|
|
376
387
|
*/
|
|
377
388
|
declare function listVerifiedSites(client: GoogleSearchConsoleClient): Promise<VerificationWebResource[]>;
|
|
389
|
+
/**
|
|
390
|
+
* Fetch a single verified WebResource by id.
*
* @param client - Search Console client whose `verification.get` is called.
* @param id - Identifier of the WebResource to look up.
* @returns A promise for the matching `VerificationWebResource`.
|
|
391
|
+
*/
|
|
392
|
+
declare function getVerifiedSite(client: GoogleSearchConsoleClient, id: string): Promise<VerificationWebResource>;
|
|
393
|
+
/**
|
|
394
|
+
* Drop the calling user's verified ownership of a WebResource. The placed
|
|
395
|
+
* verification token (meta tag / file / DNS record) MUST be removed first,
|
|
396
|
+
* otherwise Google may auto-re-verify and the call will fail. Other owners
|
|
397
|
+
* on the property are unaffected.
*
* @param client - Search Console client whose `verification.delete` is called.
* @param id - Identifier of the WebResource to un-verify.
* @returns A promise that resolves with no value once the call completes.
|
|
398
|
+
*/
|
|
399
|
+
declare function unverifySite(client: GoogleSearchConsoleClient, id: string): Promise<void>;
|
|
378
400
|
interface GscdumpApiOptions {
|
|
379
401
|
/** API key (gsd_user_xxx or gsd_prod_xxx) */
|
|
380
402
|
apiKey: string;
|
|
@@ -539,4 +561,29 @@ declare const INDEXING_EFFECTIVE_LIMIT = 1800;
|
|
|
539
561
|
declare function hasGscReadScope(scopes: string | null | undefined): boolean;
|
|
540
562
|
declare function hasGscWriteScope(scopes: string | null | undefined): boolean;
|
|
541
563
|
declare function hasIndexingScope(scopes: string | null | undefined): boolean;
|
|
542
|
-
|
|
564
|
+
/** Request options accepted by `discoverSitemap` and inherited by `FetchSitemapUrlsOptions`. */
interface DiscoverSitemapOptions {
|
|
565
|
+
/** User-Agent sent on the discovery requests. Defaults to "gscdump sitemap fetcher". */
|
|
566
|
+
userAgent?: string;
|
|
567
|
+
/** AbortSignal threaded through fetches; defaults to a 10s timeout per call. */
|
|
568
|
+
signal?: AbortSignal;
|
|
569
|
+
}
|
|
570
|
+
/**
|
|
571
|
+
* Try to discover a sitemap for `domain` by checking robots.txt for a
|
|
572
|
+
* `Sitemap:` directive, then a small set of common paths. Returns the first
|
|
573
|
+
* URL that responds with a 2xx, or `null`.
*
* Candidates are probed with HEAD requests; the common paths tried are
* `/sitemap.xml` and `/sitemap_index.xml`. A request that fails at the
* network level is treated as a miss rather than thrown.
*
* @param domain - Host to probe; it is interpolated as `https://${domain}`, so pass it without a scheme.
* @param options - Optional User-Agent and AbortSignal overrides.
* @returns The first candidate URL that answered 2xx, or `null` when none did.
|
|
574
|
+
*/
|
|
575
|
+
declare function discoverSitemap(domain: string, options?: DiscoverSitemapOptions): Promise<string | null>;
|
|
576
|
+
/** Options for `fetchSitemapUrls`; adds traversal limits to the inherited `userAgent` / `signal`. */
interface FetchSitemapUrlsOptions extends DiscoverSitemapOptions {
|
|
577
|
+
/** Maximum nested sitemap-index depth to follow. Default 3. */
|
|
578
|
+
maxDepth?: number;
|
|
579
|
+
/** Stop after this many URLs (across all nested sitemaps). Default unlimited. */
|
|
580
|
+
limit?: number;
|
|
581
|
+
}
|
|
582
|
+
/**
|
|
583
|
+
* Fetch a sitemap (or sitemap index) and return the list of `<loc>` URLs.
|
|
584
|
+
* Sitemap-index files are followed up to `maxDepth` levels. Duplicates are
|
|
585
|
+
* de-duplicated. The XML parser is regex-based — it handles the common
|
|
586
|
+
* `<loc>https://...</loc>` shape but doesn't validate the schema.
*
* @param sitemapUrl - URL of the sitemap or sitemap index to start from.
* @param options - Optional User-Agent, AbortSignal, `maxDepth` (default 3) and `limit`.
* @returns The unique `<loc>` values in the order they were encountered, capped at `limit` when given.
* @throws Error when any fetched sitemap answers with a non-2xx status; network failures from `fetch` also propagate.
|
|
587
|
+
*/
|
|
588
|
+
declare function fetchSitemapUrls(sitemapUrl: string, options?: FetchSitemapUrlsOptions): Promise<string[]>;
|
|
589
|
+
export { ApiSite, ApiSitemap, ApiSitemapContent, Auth, AuthClient, AuthOptions, BackfillProgress, CallOptions, DAYS_PER_RANGE, DataRow, DimensionFilter, DimensionFilterGroup, DiscoverSitemapOptions, FetchSitemapUrlsOptions, GSC_FINALIZED_LAG_DAYS, GSC_FRESHEST_LAG_DAYS, GSC_QUOTAS, GSC_RETENTION_MONTHS, GoogleSearchConsoleClient, GoogleSearchConsoleClientOptions, GscError, GscErrorKind, GscdumpApiOptions, INDEXING_DAILY_LIMIT, INDEXING_EFFECTIVE_LIMIT, INDEXING_ISSUE_FILTERS, INDEXING_ISSUE_LABELS, INDEXING_ISSUE_SEVERITY, IndexStatusResult, IndexingIssueType, IndexingMetadata, IndexingNotificationType, IndexingResult, InspectUrlIndexResponse, InspectUrlResult, MS_PER_DAY, MobileUsabilityResult, Period, PublishUrlNotificationResponse, RequiredNonNullable, ResolvedAnalyticsRange, RichResultsResult, SearchAnalyticsQuery, SearchAnalyticsResponse, Site, SiteAnalytics, UrlInspectionResult, UrlNotificationMetadata, VerificationMethod, VerificationSite, VerificationSiteType, VerificationToken, VerificationWebResource, addDays, addSite, batchInspectUrls, batchRequestIndexing, classifyError, countDays, createAuth, createFetch, daysAgo, deleteSite, deleteSitemap, discoverSitemap, fetchSitemap, fetchSitemapUrls, fetchSitemaps, fetchSites, fetchSitesWithSitemaps, formatErrorForCli, generateGscDateRange, getBackfillProgress, getDateRange, getFreshestGscDate, getIndexingMetadata, getLatestGscDate, getNextDate, getOldestGscDate, getPendingDates, getPreviousDate, getPstDate, getVerificationToken, getVerifiedSite, googleSearchConsole, groupIntoRanges, gscdumpApi, hasGscReadScope, hasGscWriteScope, hasIndexingScope, inspectUrl, isPermissionDeniedError, isValidGscDate, listVerifiedSites, progressBar, requestIndexing, rowWithMetricDefaults, runSequentialBatch, siteUrlToVerificationSite, storageError, submitSitemap, toIsoDate, unverifySite, verificationMethodsFor, verifySite };
|
package/dist/index.mjs
CHANGED
|
@@ -143,6 +143,12 @@ async function verifySite(client, siteUrl, method) {
|
|
|
143
143
|
/** List verified WebResources by forwarding to `client.verification.list()` and returning its result unchanged. */
async function listVerifiedSites(client) {
|
|
144
144
|
return client.verification.list();
|
|
145
145
|
}
|
|
146
|
+
/** Fetch one verified WebResource by forwarding `id` to `client.verification.get` and returning its result unchanged. */
async function getVerifiedSite(client, id) {
|
|
147
|
+
return client.verification.get(id);
|
|
148
|
+
}
|
|
149
|
+
/** Drop verified ownership of a WebResource by forwarding `id` to `client.verification.delete`. */
async function unverifySite(client, id) {
|
|
150
|
+
return client.verification.delete(id);
|
|
151
|
+
}
|
|
146
152
|
const MS_PER_DAY = 864e5;
|
|
147
153
|
function toIsoDate(d) {
|
|
148
154
|
return d.toISOString().slice(0, 10);
|
|
@@ -923,4 +929,71 @@ function hasIndexingScope(scopes) {
|
|
|
923
929
|
if (!scopes) return false;
|
|
924
930
|
return scopes.includes("googleapis.com/auth/indexing");
|
|
925
931
|
}
|
|
926
|
-
|
|
932
|
+
/** Per-request timeout in milliseconds, applied when the caller passes no `signal`. */
const FETCH_TIMEOUT_MS = 1e4;
/** Conventional sitemap locations probed when robots.txt yields no usable sitemap. */
const COMMON_PATHS = ["/sitemap.xml", "/sitemap_index.xml"];
/** Captures the URL of the first `Sitemap:` line in a robots.txt body (case-insensitive). */
const SITEMAP_DIRECTIVE_RE = /^Sitemap:\s*(\S+)/im;
/**
 * Try to discover a sitemap for `domain`.
 *
 * Lookup order:
 *   1. the first `Sitemap:` directive in `https://<domain>/robots.txt`;
 *   2. each entry of `COMMON_PATHS` on the same host.
 * A candidate is accepted when a HEAD request to it answers 2xx. The whole
 * lookup is best-effort: network errors, timeouts and an unreadable
 * robots.txt body count as a miss instead of throwing.
 *
 * @param domain - Host to probe, without a scheme (it is interpolated as `https://${domain}`).
 * @param options - Optional `userAgent` (default "gscdump sitemap fetcher") and `signal`.
 *   Without a `signal`, every request gets its own `FETCH_TIMEOUT_MS` timeout.
 * @returns The first candidate URL that answered 2xx, or `null` when none did.
 */
async function discoverSitemap(domain, options = {}) {
  const userAgent = options.userAgent ?? "gscdump sitemap fetcher";
  const baseUrl = `https://${domain}`;
  // One init builder for every request, so the User-Agent and the abort signal
  // are applied uniformly (the robots.txt-declared probe used to drop the UA).
  const requestInit = (extra) => ({
    ...extra,
    headers: { "User-Agent": userAgent },
    signal: options.signal ?? AbortSignal.timeout(FETCH_TIMEOUT_MS)
  });
  // HEAD-probe a candidate; any network failure is reported as "not found".
  const responds = async (url) => {
    const res = await fetch(url, requestInit({ method: "HEAD" })).catch(() => null);
    return Boolean(res?.ok);
  };
  const robotsRes = await fetch(`${baseUrl}/robots.txt`, requestInit()).catch(() => null);
  if (robotsRes?.ok) {
    // Reading the body can still reject (abort / timeout mid-stream); treat that
    // like a robots.txt without a directive so the common paths are still tried.
    const robotsBody = await robotsRes.text().catch(() => "");
    const declared = robotsBody.match(SITEMAP_DIRECTIVE_RE)?.[1];
    if (declared && await responds(declared)) return declared;
  }
  for (const path of COMMON_PATHS) {
    const candidate = `${baseUrl}${path}`;
    if (await responds(candidate)) return candidate;
  }
  return null;
}
|
|
962
|
+
/**
 * Matches one `<loc>` element. Group 1 is the element's raw text: either plain
 * character data or one complete `<![CDATA[...]]>` section.
 */
const LOC_RE = /<loc>\s*(<!\[CDATA\[[\s\S]*?\]\]>|[^<]+?)\s*<\/loc>/gi;
/** Detects a sitemap-index document (root element `<sitemapindex>`). */
const SITEMAPINDEX_RE = /<sitemapindex\b/i;
/** Unwraps a `<loc>` value that consists of a single CDATA section. */
const LOC_CDATA_RE = /^<!\[CDATA\[([\s\S]*)\]\]>$/i;
/** The five predefined XML entities plus decimal / hex character references. */
const XML_ENTITY_RE = /&(amp|lt|gt|quot|apos|#\d+|#[xX][0-9a-fA-F]+);/g;
const XML_NAMED_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: "\"", apos: "'" };
/**
 * Turn the raw text of a `<loc>` element into the URL it denotes: CDATA content
 * is taken literally, anything else has its XML entities decoded (sitemap values
 * are entity-escaped, e.g. `&amp;` for `&`). The result is trimmed.
 */
function decodeSitemapLoc(raw) {
  const cdata = raw.match(LOC_CDATA_RE);
  if (cdata) return cdata[1].trim();
  return raw.replace(XML_ENTITY_RE, (whole, name) => {
    if (name[0] !== "#") return XML_NAMED_ENTITIES[name];
    const isHex = name[1] === "x" || name[1] === "X";
    const codePoint = isHex ? Number.parseInt(name.slice(2), 16) : Number.parseInt(name.slice(1), 10);
    // Leave out-of-range references untouched rather than throwing a RangeError.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
  }).trim();
}
/**
 * Fetch a sitemap (or sitemap index) and return the list of `<loc>` URLs.
 *
 * Sitemap-index files are followed depth-first up to `maxDepth` levels; each
 * sitemap URL is fetched at most once, so an index that lists itself or repeats
 * a child costs no extra requests. Page URLs are de-duplicated and returned in
 * the order they were first seen. Parsing is regex-based: it understands plain
 * and CDATA-wrapped `<loc>` values but does not validate the schema.
 *
 * @param sitemapUrl - URL of the sitemap or sitemap index to start from.
 * @param options - Optional `userAgent` (default "gscdump sitemap fetcher"), `signal`
 *   (otherwise each request gets a `FETCH_TIMEOUT_MS` timeout), `maxDepth` (default 3)
 *   and `limit` (maximum number of URLs to return; unlimited when omitted).
 * @returns The unique URLs found, capped at `limit` when given.
 * @throws Error when a fetched sitemap answers with a non-2xx status; failures
 *   raised by `fetch` itself (network error, abort) propagate as well.
 */
async function fetchSitemapUrls(sitemapUrl, options = {}) {
  const userAgent = options.userAgent ?? "gscdump sitemap fetcher";
  const maxDepth = options.maxDepth ?? 3;
  const limit = options.limit;
  const signalFor = () => options.signal ?? AbortSignal.timeout(FETCH_TIMEOUT_MS);
  const out = [];
  const seen = /* @__PURE__ */ new Set();
  // Sitemap documents already fetched; guards against cycles and repeated children.
  const visitedSitemaps = /* @__PURE__ */ new Set();
  const limitReached = () => limit != null && out.length >= limit;
  const visit = async (url, depth) => {
    if (limitReached() || depth > maxDepth) return;
    if (visitedSitemaps.has(url)) return;
    visitedSitemaps.add(url);
    const res = await fetch(url, {
      headers: { "User-Agent": userAgent },
      signal: signalFor()
    });
    if (!res.ok) throw new Error(`Fetch ${url} failed: ${res.status}`);
    const text = await res.text();
    const locs = Array.from(text.matchAll(LOC_RE), (m) => decodeSitemapLoc(m[1])).filter(Boolean);
    if (SITEMAPINDEX_RE.test(text)) {
      // In an index every <loc> points at another sitemap document.
      for (const child of locs) {
        if (limitReached()) return;
        await visit(child, depth + 1);
      }
      return;
    }
    for (const loc of locs) {
      if (seen.has(loc)) continue;
      seen.add(loc);
      out.push(loc);
      if (limitReached()) return;
    }
  };
  await visit(sitemapUrl, 0);
  return out;
}
|
|
999
|
+
export { DAYS_PER_RANGE, GSC_FINALIZED_LAG_DAYS, GSC_FRESHEST_LAG_DAYS, GSC_QUOTAS, GSC_RETENTION_MONTHS, INDEXING_DAILY_LIMIT, INDEXING_EFFECTIVE_LIMIT, INDEXING_ISSUE_FILTERS, INDEXING_ISSUE_LABELS, INDEXING_ISSUE_SEVERITY, MS_PER_DAY, addDays, addSite, batchInspectUrls, batchRequestIndexing, classifyError, countDays, createAuth, createFetch, daysAgo, deleteSite, deleteSitemap, discoverSitemap, fetchSitemap, fetchSitemapUrls, fetchSitemaps, fetchSites, fetchSitesWithSitemaps, formatErrorForCli, generateGscDateRange, getBackfillProgress, getDateRange, getFreshestGscDate, getIndexingMetadata, getLatestGscDate, getNextDate, getOldestGscDate, getPendingDates, getPreviousDate, getPstDate, getVerificationToken, getVerifiedSite, googleSearchConsole, groupIntoRanges, gscdumpApi, hasGscReadScope, hasGscWriteScope, hasIndexingScope, inspectUrl, isPermissionDeniedError, isValidGscDate, listVerifiedSites, progressBar, requestIndexing, rowWithMetricDefaults, runSequentialBatch, siteUrlToVerificationSite, storageError, submitSitemap, toIsoDate, unverifySite, verificationMethodsFor, verifySite };
|
package/dist/sitemap.d.mts
CHANGED
|
@@ -10,4 +10,17 @@ interface DiscoverSitemapOptions {
|
|
|
10
10
|
* URL that responds with a 2xx, or `null`.
|
|
11
11
|
*/
|
|
12
12
|
declare function discoverSitemap(domain: string, options?: DiscoverSitemapOptions): Promise<string | null>;
|
|
13
|
-
|
|
13
|
+
/** Options for `fetchSitemapUrls`; adds traversal limits to the inherited `userAgent` / `signal`. */
interface FetchSitemapUrlsOptions extends DiscoverSitemapOptions {
|
|
14
|
+
/** Maximum nested sitemap-index depth to follow. Default 3. */
|
|
15
|
+
maxDepth?: number;
|
|
16
|
+
/** Stop after this many URLs (across all nested sitemaps). Default unlimited. */
|
|
17
|
+
limit?: number;
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* Fetch a sitemap (or sitemap index) and return the list of `<loc>` URLs.
|
|
21
|
+
* Sitemap-index files are followed up to `maxDepth` levels. Duplicates are
|
|
22
|
+
* de-duplicated. The XML parser is regex-based — it handles the common
|
|
23
|
+
* `<loc>https://...</loc>` shape but doesn't validate the schema.
*
* @param sitemapUrl - URL of the sitemap or sitemap index to start from.
* @param options - Optional User-Agent, AbortSignal, `maxDepth` (default 3) and `limit`.
* @returns The unique `<loc>` values in the order they were encountered, capped at `limit` when given.
* @throws Error when any fetched sitemap answers with a non-2xx status; network failures from `fetch` also propagate.
|
|
24
|
+
*/
|
|
25
|
+
declare function fetchSitemapUrls(sitemapUrl: string, options?: FetchSitemapUrlsOptions): Promise<string[]>;
|
|
26
|
+
export { DiscoverSitemapOptions, FetchSitemapUrlsOptions, discoverSitemap, fetchSitemapUrls };
|
package/dist/sitemap.mjs
CHANGED
|
@@ -28,4 +28,41 @@ async function discoverSitemap(domain, options = {}) {
|
|
|
28
28
|
}
|
|
29
29
|
return null;
|
|
30
30
|
}
|
|
31
|
-
|
|
31
|
+
/**
 * Matches one `<loc>` element. Group 1 is the element's raw text: either plain
 * character data or one complete `<![CDATA[...]]>` section.
 */
const LOC_RE = /<loc>\s*(<!\[CDATA\[[\s\S]*?\]\]>|[^<]+?)\s*<\/loc>/gi;
/** Detects a sitemap-index document (root element `<sitemapindex>`). */
const SITEMAPINDEX_RE = /<sitemapindex\b/i;
/** Unwraps a `<loc>` value that consists of a single CDATA section. */
const LOC_CDATA_RE = /^<!\[CDATA\[([\s\S]*)\]\]>$/i;
/** The five predefined XML entities plus decimal / hex character references. */
const XML_ENTITY_RE = /&(amp|lt|gt|quot|apos|#\d+|#[xX][0-9a-fA-F]+);/g;
const XML_NAMED_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: "\"", apos: "'" };
/**
 * Turn the raw text of a `<loc>` element into the URL it denotes: CDATA content
 * is taken literally, anything else has its XML entities decoded (sitemap values
 * are entity-escaped, e.g. `&amp;` for `&`). The result is trimmed.
 */
function decodeSitemapLoc(raw) {
  const cdata = raw.match(LOC_CDATA_RE);
  if (cdata) return cdata[1].trim();
  return raw.replace(XML_ENTITY_RE, (whole, name) => {
    if (name[0] !== "#") return XML_NAMED_ENTITIES[name];
    const isHex = name[1] === "x" || name[1] === "X";
    const codePoint = isHex ? Number.parseInt(name.slice(2), 16) : Number.parseInt(name.slice(1), 10);
    // Leave out-of-range references untouched rather than throwing a RangeError.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
  }).trim();
}
/**
 * Fetch a sitemap (or sitemap index) and return the list of `<loc>` URLs.
 *
 * Sitemap-index files are followed depth-first up to `maxDepth` levels; each
 * sitemap URL is fetched at most once, so an index that lists itself or repeats
 * a child costs no extra requests. Page URLs are de-duplicated and returned in
 * the order they were first seen. Parsing is regex-based: it understands plain
 * and CDATA-wrapped `<loc>` values but does not validate the schema.
 *
 * @param sitemapUrl - URL of the sitemap or sitemap index to start from.
 * @param options - Optional `userAgent` (default "gscdump sitemap fetcher"), `signal`
 *   (otherwise each request gets a `FETCH_TIMEOUT_MS` timeout), `maxDepth` (default 3)
 *   and `limit` (maximum number of URLs to return; unlimited when omitted).
 * @returns The unique URLs found, capped at `limit` when given.
 * @throws Error when a fetched sitemap answers with a non-2xx status; failures
 *   raised by `fetch` itself (network error, abort) propagate as well.
 */
async function fetchSitemapUrls(sitemapUrl, options = {}) {
  const userAgent = options.userAgent ?? "gscdump sitemap fetcher";
  const maxDepth = options.maxDepth ?? 3;
  const limit = options.limit;
  const signalFor = () => options.signal ?? AbortSignal.timeout(FETCH_TIMEOUT_MS);
  const out = [];
  const seen = /* @__PURE__ */ new Set();
  // Sitemap documents already fetched; guards against cycles and repeated children.
  const visitedSitemaps = /* @__PURE__ */ new Set();
  const limitReached = () => limit != null && out.length >= limit;
  const visit = async (url, depth) => {
    if (limitReached() || depth > maxDepth) return;
    if (visitedSitemaps.has(url)) return;
    visitedSitemaps.add(url);
    const res = await fetch(url, {
      headers: { "User-Agent": userAgent },
      signal: signalFor()
    });
    if (!res.ok) throw new Error(`Fetch ${url} failed: ${res.status}`);
    const text = await res.text();
    const locs = Array.from(text.matchAll(LOC_RE), (m) => decodeSitemapLoc(m[1])).filter(Boolean);
    if (SITEMAPINDEX_RE.test(text)) {
      // In an index every <loc> points at another sitemap document.
      for (const child of locs) {
        if (limitReached()) return;
        await visit(child, depth + 1);
      }
      return;
    }
    for (const loc of locs) {
      if (seen.has(loc)) continue;
      seen.add(loc);
      out.push(loc);
      if (limitReached()) return;
    }
  };
  await visit(sitemapUrl, 0);
  return out;
}
|
|
68
|
+
export { discoverSitemap, fetchSitemapUrls };
|
package/package.json
CHANGED