@blamejs/blamejs-shop 0.0.65 → 0.0.70
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/lib/assembly-instructions.js +777 -0
- package/lib/auto-replenish.js +933 -0
- package/lib/business-hours.js +980 -0
- package/lib/click-and-collect.js +711 -0
- package/lib/clickstream.js +713 -0
- package/lib/cost-layers.js +774 -0
- package/lib/credit-limits.js +752 -0
- package/lib/currency-rounding.js +525 -0
- package/lib/customer-activity.js +862 -0
- package/lib/customer-notes.js +712 -0
- package/lib/customer-risk-profile.js +593 -0
- package/lib/customer-surveys.js +1012 -0
- package/lib/damage-photos.js +473 -0
- package/lib/discount-allocation.js +557 -0
- package/lib/dropship-forwarding.js +645 -0
- package/lib/email-templates.js +817 -0
- package/lib/index.js +45 -0
- package/lib/inventory-allocations.js +559 -0
- package/lib/inventory-writeoffs.js +636 -0
- package/lib/knowledge-base.js +1104 -0
- package/lib/locale-router.js +1077 -0
- package/lib/operator-roles.js +768 -0
- package/lib/order-escalation.js +951 -0
- package/lib/order-ratings.js +495 -0
- package/lib/order-tags.js +944 -0
- package/lib/packing-slips.js +810 -0
- package/lib/payment-retries.js +816 -0
- package/lib/pick-lists.js +639 -0
- package/lib/pixel-events.js +995 -0
- package/lib/preorder.js +595 -0
- package/lib/print-queue.js +681 -0
- package/lib/product-qa.js +749 -0
- package/lib/promo-bundles.js +835 -0
- package/lib/push-notifications.js +937 -0
- package/lib/refund-automation.js +853 -0
- package/lib/reorder-reminders.js +798 -0
- package/lib/robots-config.js +753 -0
- package/lib/seller-signup.js +1052 -0
- package/lib/site-redirects.js +690 -0
- package/lib/sitemap-generator.js +717 -0
- package/lib/subscription-gifts.js +710 -0
- package/lib/tax-cert-renewals.js +632 -0
- package/lib/theme-assets.js +711 -0
- package/lib/tier-benefits.js +776 -0
- package/lib/vendor/MANIFEST.json +2 -2
- package/lib/vendor/blamejs/CHANGELOG.md +2 -0
- package/lib/vendor/blamejs/api-snapshot.json +2 -2
- package/lib/vendor/blamejs/lib/metrics.js +68 -4
- package/lib/vendor/blamejs/package.json +1 -1
- package/lib/vendor/blamejs/release-notes/v0.12.5.json +40 -0
- package/lib/wishlist-alerts.js +842 -0
- package/lib/wishlist-sharing.js +718 -0
- package/package.json +1 -1
|
@@ -0,0 +1,753 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* @module shop.robotsConfig
|
|
4
|
+
* @title Robots config — operator-editable robots.txt rules,
|
|
5
|
+
* AI-crawler opt-outs, sitemap declarations, optional
|
|
6
|
+
* canonical-host hint.
|
|
7
|
+
*
|
|
8
|
+
* @intro
|
|
9
|
+
* The bytes a crawler fetches at `/robots.txt`. Distinct from the
|
|
10
|
+
* worker's static fallback (which ships an allow-all default with
|
|
11
|
+
* one Sitemap declaration so a fresh deploy is still crawlable) —
|
|
12
|
+
* this primitive is the operator's editable surface:
|
|
13
|
+
*
|
|
14
|
+
* - Per-bot Allow / Disallow stanzas. Each `defineRule` call
|
|
15
|
+
* persists one (user_agent, allow[], disallow[], crawl_delay?)
|
|
16
|
+
* tuple. `render()` joins every active rule into the canonical
|
|
17
|
+
* robots.txt format, stable-sorted by `priority` ASC then
|
|
18
|
+
* `user_agent` ASC.
|
|
19
|
+
* - Sitemap declarations. `addSitemap(url)` records an absolute
|
|
20
|
+
* https:// URL the crawler should fetch the sitemap index from;
|
|
21
|
+
* `render()` emits each one on its own `Sitemap: <url>` line at
|
|
22
|
+
* the bottom of the file (the convention every search engine
|
|
23
|
+
* follows). Duplicate adds are idempotent (the URL is the PK).
|
|
24
|
+
* - Host directive. `setHostDirective(host)` upserts the optional
|
|
25
|
+
* `Host: <canonical-host>` hint. The directive is non-standard
|
|
26
|
+
* but widely-honored — Yandex's primary canonical signal, a soft
|
|
27
|
+
* hint to Bing, ignored by Google. The line is omitted entirely
|
|
28
|
+
* when no host has been set.
|
|
29
|
+
* - Predefined templates. `predefinedTemplates()` returns the
|
|
30
|
+
* four common bot-block payloads operators reach for first:
|
|
31
|
+
* `block_ai_crawlers` (GPTBot / ClaudeBot / CCBot / anthropic-ai /
|
|
32
|
+
* Google-Extended / Bytespider / Amazonbot / FacebookBot — every
|
|
33
|
+
* AI-training scraper currently in the wild), `block_all`
|
|
34
|
+
* (one `User-agent: *` / `Disallow: /` stanza), `open_all`
|
|
35
|
+
* (allow-everything, suitable for a fresh launch), and
|
|
36
|
+
* `standard_with_admin_disallow` (open allow plus a
|
|
37
|
+
* `Disallow: /admin/` to keep the admin surface out of search
|
|
38
|
+
* indices). `applyTemplate({ template_slug })` archives every
|
|
39
|
+
* active rule and writes the template's stanzas as the new
|
|
40
|
+
* baseline — the prior rules remain in the table for audit but
|
|
41
|
+
* no longer affect `render()`.
|
|
42
|
+
*
|
|
43
|
+
* Render contract. The emitted bytes are the canonical robots.txt
|
|
44
|
+
* format: one stanza per rule, each stanza is
|
|
45
|
+
* User-agent: <ua>
|
|
46
|
+
* [Crawl-delay: <n>]
|
|
47
|
+
* Allow: <path> (one line per allow entry)
|
|
48
|
+
* Disallow: <path> (one line per disallow entry)
|
|
49
|
+
* followed by a blank line. Stanzas with the same user_agent are
|
|
50
|
+
* emitted as separate blocks (operators sometimes want two
|
|
51
|
+
* priorities against the same bot — e.g. priority=10 for the
|
|
52
|
+
* permissive "allow /search" line and priority=20 for the broader
|
|
53
|
+
* "disallow /admin" line). The `Host:` line (when set) and every
|
|
54
|
+
* `Sitemap:` line follow at the bottom of the file.
|
|
55
|
+
*
|
|
56
|
+
* Validation discipline.
|
|
57
|
+
* - Sitemap URLs run through `b.safeUrl.parse` with the
|
|
58
|
+
* `{ allowedProtocols: ["https:"] }` allowlist. Cleartext is
|
|
59
|
+
* refused; an MITM rewriting the sitemap location would let an
|
|
60
|
+
* attacker swap the crawler's view of the site.
|
|
61
|
+
* - The host directive accepts a hostname or `host:port` shape —
|
|
62
|
+
* no scheme, no path. The host is just the canonical hostname
|
|
63
|
+
* the operator wants crawlers to attribute results to.
|
|
64
|
+
* - User-agent strings are length-capped and refuse control bytes
|
|
65
|
+
* — robots.txt is a newline-delimited format and a CR/LF in the
|
|
66
|
+
* UA field would break the stanza boundary.
|
|
67
|
+
* - Allow / Disallow path entries are `/`-rooted absolute paths
|
|
68
|
+
* (relative paths are meaningless in robots.txt). Wildcards
|
|
69
|
+
* (`*` and `$`) are permitted — every major crawler honors the
|
|
70
|
+
* extended-syntax wildcards even though they're not in the
|
|
71
|
+
* original 1994 spec.
|
|
72
|
+
*
|
|
73
|
+
* Composes ONLY blamejs:
|
|
74
|
+
* - `b.framework.safeUrl.parse` — sitemap URL validation
|
|
75
|
+
* (https-only).
|
|
76
|
+
* - `b.framework.uuid.v7` — rule-row id; lexicographically
|
|
77
|
+
* sortable so a `created_at` tiebreak is rarely needed.
|
|
78
|
+
*
|
|
79
|
+
* Storage (migration `0147_robots_config.sql`):
|
|
80
|
+
* - `robots_rules` — one row per operator-defined stanza
|
|
81
|
+
* - `robots_sitemaps` — one row per declared sitemap URL
|
|
82
|
+
* - `robots_host_directive` — singleton row (id=1) holding the
|
|
83
|
+
* canonical-host hint
|
|
84
|
+
*
|
|
85
|
+
* @primitive robotsConfig
|
|
86
|
+
* @related shop.sitemapGenerator, b.safeUrl.parse, b.uuid.v7
|
|
87
|
+
*/
|
|
88
|
+
|
|
89
|
+
// ---- validation limits --------------------------------------------------
//
// Upper bounds enforced by the validators in this module. String limits
// are compared against `String.prototype.length`.
var MAX_USER_AGENT_LEN  = 200;
var MAX_PATH_LEN        = 512;
var MAX_PATHS_PER_RULE  = 200;
var MAX_SITEMAP_URL_LEN = 2048;
var MAX_HOST_LEN        = 255;
var MAX_CRAWL_DELAY     = 24 * 60 * 60; // 24 h in seconds — anything larger is a typo
var MAX_PRIORITY        = 1e6;
var MAX_RULE_ID_LEN     = 80;

// Matches any C0 control character (U+0000..U+001F) or DEL (U+007F).
var CONTROL_BYTE_RE = /[\u0000-\u001f\u007f]/;
|
|
99
|
+
|
|
100
|
+
// Cached `./index` module; stays undefined until `_b()` is first called.
// Requiring lazily matches the rest of the shop primitives.
var bShop;

/**
 * Resolve the framework handle.
 *
 * Requires `./index` on first use (or whenever the cache is falsy) and
 * returns its `framework` property.
 */
function _b() {
  bShop = bShop || require("./index");
  return bShop.framework;
}
|
|
106
|
+
|
|
107
|
+
// ---- predefined templates ---------------------------------------------
|
|
108
|
+
//
|
|
109
|
+
// The frozen catalog of canned payloads `applyTemplate` writes into the
|
|
110
|
+
// `robots_rules` table. Each entry is the stanza set + the sitemap +
|
|
111
|
+
// host fields a sensible default would carry; `applyTemplate` does the
|
|
112
|
+
// archive-then-write step. The catalog is exported through
|
|
113
|
+
// `predefinedTemplates()` so the operator-facing dashboard can render
|
|
114
|
+
// the choices without re-reading this file.
|
|
115
|
+
|
|
116
|
+
/**
 * Frozen catalog of canned robots.txt payloads, keyed by template slug.
 *
 * Every level is frozen — the catalog, each template, each `rules`
 * array, each stanza, and each stanza's `allow` / `disallow` arrays —
 * so no consumer holding a reference can mutate the shared catalog.
 * (`Object.freeze` is shallow, so the inner arrays must be frozen
 * explicitly.)
 *
 * Each template carries `slug`, `label`, `description`, and `rules`;
 * each rule carries `user_agent`, `allow`, `disallow`, `priority`.
 */
var PREDEFINED_TEMPLATES = (function () {
  // Build one fully-frozen stanza. Every predefined stanza has an empty
  // `allow` list, so only the disallow list and priority vary.
  function stanza(userAgent, disallow, priority) {
    return Object.freeze({
      user_agent: userAgent,
      allow: Object.freeze([]),
      disallow: Object.freeze(disallow),
      priority: priority,
    });
  }

  // Build one frozen template entry around an array of stanzas.
  function template(slug, label, description, rules) {
    return Object.freeze({
      slug: slug,
      label: label,
      description: description,
      rules: Object.freeze(rules),
    });
  }

  // User-agent tokens refused outright by `block_ai_crawlers`.
  var aiCrawlers = [
    "GPTBot",
    "ClaudeBot",
    "anthropic-ai",
    "CCBot",
    "Google-Extended",
    "Bytespider",
    "Amazonbot",
    "FacebookBot",
  ];

  var aiRules = aiCrawlers.map(function (ua) {
    return stanza(ua, ["/"], 10);
  });
  // The wildcard stanza stays permissive so ordinary indexing continues.
  aiRules.push(stanza("*", [], 100));

  return Object.freeze({
    block_ai_crawlers: template(
      "block_ai_crawlers",
      "Block AI training crawlers",
      "Refuse every AI-training scraper currently in the wild — " +
        "GPTBot (OpenAI), ClaudeBot + anthropic-ai (Anthropic), " +
        "CCBot (Common Crawl, the corpus most LLMs train on), " +
        "Google-Extended (Google's training opt-out token), " +
        "Bytespider (ByteDance), Amazonbot, FacebookBot. Leaves " +
        "the wildcard `*` stanza permissive so search-engine indexing " +
        "continues to work.",
      aiRules
    ),
    block_all: template(
      "block_all",
      "Block every crawler",
      "Refuse every crawler at every path. Suitable for a staging " +
        "environment or a pre-launch site that must not leak into " +
        "search indices.",
      [stanza("*", ["/"], 100)]
    ),
    open_all: template(
      "open_all",
      "Open to every crawler",
      "Permit every crawler to fetch every path. The default shape " +
        "for a freshly-launched storefront that wants maximum search " +
        "visibility.",
      [stanza("*", [], 100)]
    ),
    standard_with_admin_disallow: template(
      "standard_with_admin_disallow",
      "Open with /admin/ disallowed",
      "Permit every crawler except inside `/admin/`. Keeps the " +
        "operator-only surface out of search indices while leaving " +
        "the storefront fully crawlable.",
      [stanza("*", ["/admin/"], 100)]
    ),
  });
})();

// Template slugs in catalog declaration order.
var TEMPLATE_SLUGS = Object.freeze(Object.keys(PREDEFINED_TEMPLATES));
|
|
181
|
+
|
|
182
|
+
// ---- validators -------------------------------------------------------
|
|
183
|
+
|
|
184
|
+
/**
 * Validate a robots.txt User-agent value.
 *
 * @param {*} s - candidate value
 * @returns {string} `s` unchanged when valid
 * @throws {TypeError} when `s` is not a string of 1..MAX_USER_AGENT_LEN
 *   characters, contains a control byte, or contains a colon
 */
function _userAgent(s) {
  const lengthOk =
    typeof s === "string" && s.length >= 1 && s.length <= MAX_USER_AGENT_LEN;
  if (!lengthOk) {
    throw new TypeError(
      `robotsConfig: user_agent must be a string 1..${MAX_USER_AGENT_LEN} chars`
    );
  }
  // robots.txt is newline-delimited — a CR/LF here would break the stanza.
  if (CONTROL_BYTE_RE.test(s)) {
    throw new TypeError("robotsConfig: user_agent must not contain control bytes");
  }
  // The format is `key: value`; a colon inside the value would shift the
  // parser onto a different key.
  if (s.includes(":")) {
    throw new TypeError("robotsConfig: user_agent must not contain ':'");
  }
  return s;
}
|
|
200
|
+
|
|
201
|
+
/**
 * Validate a single Allow / Disallow path entry.
 *
 * @param {*} s - candidate path
 * @param {string} label - field name used in error messages
 *   ("allow" / "disallow")
 * @returns {string} `s` unchanged when valid
 * @throws {TypeError} when `s` is not a string of 1..MAX_PATH_LEN
 *   characters, contains a control byte, or does not start with "/"
 */
function _pathEntry(s, label) {
  const lengthOk =
    typeof s === "string" && s.length >= 1 && s.length <= MAX_PATH_LEN;
  if (!lengthOk) {
    throw new TypeError(
      `robotsConfig: ${label} entry must be a string 1..${MAX_PATH_LEN} chars`
    );
  }
  if (CONTROL_BYTE_RE.test(s)) {
    throw new TypeError(`robotsConfig: ${label} entry must not contain control bytes`);
  }
  // The leading "/" is the only structural rule; `*` and `$` wildcards
  // pass through untouched.
  if (!s.startsWith("/")) {
    throw new TypeError(
      `robotsConfig: ${label} entry must be a /-rooted absolute path; got ${JSON.stringify(s)}`
    );
  }
  return s;
}
|
|
221
|
+
|
|
222
|
+
/**
 * Validate an allow[] / disallow[] list.
 *
 * @param {*} arr - candidate list; null / undefined means "no entries"
 * @param {string} label - field name used in error messages
 * @returns {string[]} a new array holding the validated entries
 * @throws {TypeError} when `arr` is not an array, has more than
 *   MAX_PATHS_PER_RULE entries, or any entry fails `_pathEntry`
 */
function _pathArray(arr, label) {
  if (arr === null || arr === undefined) {
    return [];
  }
  if (!Array.isArray(arr)) {
    throw new TypeError(`robotsConfig: ${label} must be an array of paths`);
  }
  if (arr.length > MAX_PATHS_PER_RULE) {
    throw new TypeError(
      `robotsConfig: ${label} must have <= ${MAX_PATHS_PER_RULE} entries`
    );
  }
  // Array.from visits every index (holes included), so each slot is
  // validated and the caller's array is never returned directly.
  return Array.from(arr, function (entry) {
    return _pathEntry(entry, label);
  });
}
|
|
238
|
+
|
|
239
|
+
/**
 * Validate a Crawl-delay value.
 *
 * @param {*} n - candidate delay in seconds; null / undefined means unset
 * @returns {number|null} `n` when it is an integer in
 *   0..MAX_CRAWL_DELAY, or null when unset
 * @throws {TypeError} for any other value
 */
function _crawlDelay(n) {
  if (n === null || n === undefined) {
    return null;
  }
  if (Number.isInteger(n) && n >= 0 && n <= MAX_CRAWL_DELAY) {
    return n;
  }
  throw new TypeError(
    `robotsConfig: crawl_delay must be an integer 0..${MAX_CRAWL_DELAY} seconds, or null`
  );
}
|
|
248
|
+
|
|
249
|
+
/**
 * Validate a rule priority.
 *
 * @param {*} n - candidate priority; null / undefined selects the default
 * @returns {number} `n` when it is an integer in 0..MAX_PRIORITY,
 *   or 100 when unset
 * @throws {TypeError} for any other value
 */
function _priority(n) {
  if (n === null || n === undefined) {
    return 100;
  }
  if (Number.isInteger(n) && n >= 0 && n <= MAX_PRIORITY) {
    return n;
  }
  throw new TypeError(`robotsConfig: priority must be an integer 0..${MAX_PRIORITY}`);
}
|
|
258
|
+
|
|
259
|
+
/**
 * Validate a rule id before it is used as a query parameter.
 *
 * @param {*} s - candidate id
 * @returns {string} `s` unchanged when valid
 * @throws {TypeError} when `s` is not a string of 1..MAX_RULE_ID_LEN
 *   characters or contains a control byte
 */
function _ruleId(s) {
  const lengthOk =
    typeof s === "string" && s.length >= 1 && s.length <= MAX_RULE_ID_LEN;
  if (!lengthOk) {
    throw new TypeError(
      `robotsConfig: rule_id must be a string 1..${MAX_RULE_ID_LEN} chars`
    );
  }
  if (CONTROL_BYTE_RE.test(s)) {
    throw new TypeError("robotsConfig: rule_id must not contain control bytes");
  }
  return s;
}
|
|
270
|
+
|
|
271
|
+
/**
 * Validate a sitemap URL destined for a `Sitemap:` line.
 *
 * @param {*} u - candidate URL
 * @returns {string} `u` unchanged when valid
 * @throws {TypeError} when `u` is not a string of
 *   1..MAX_SITEMAP_URL_LEN characters, contains a control byte, or is
 *   rejected by the framework's `safeUrl.parse` under an https-only
 *   protocol allowlist
 */
function _sitemapUrl(u) {
  const lengthOk =
    typeof u === "string" && u.length >= 1 && u.length <= MAX_SITEMAP_URL_LEN;
  if (!lengthOk) {
    throw new TypeError(
      `robotsConfig: sitemap url must be a string 1..${MAX_SITEMAP_URL_LEN} chars`
    );
  }
  if (CONTROL_BYTE_RE.test(u)) {
    throw new TypeError("robotsConfig: sitemap url must not contain control bytes");
  }
  // Crawlers treat the Sitemap: value as an absolute URL, and https-only
  // guards against an MITM rewriting the sitemap location. The parse
  // result is discarded; only success / failure matters.
  const parseOptions = { allowedProtocols: ["https:"] };
  try {
    _b().safeUrl.parse(u, parseOptions);
  } catch (e) {
    const reason = (e && e.message) || "must be a valid https:// URL";
    throw new TypeError("robotsConfig: sitemap url — " + reason);
  }
  return u;
}
|
|
293
|
+
|
|
294
|
+
/**
 * Validate the value for the optional `Host:` directive.
 *
 * Accepts a bare hostname, optionally followed by `:port` — no scheme,
 * no path.
 *
 * @param {*} s - candidate host
 * @returns {string} `s` unchanged when valid
 * @throws {TypeError} when `s` is not a string of 1..MAX_HOST_LEN
 *   characters, contains a control byte, looks like a `scheme://` URL,
 *   contains a "/" or more than one ":", or contains characters outside
 *   letters / digits / "." / "-" (plus the optional numeric port)
 */
function _hostDirective(s) {
  if (typeof s !== "string" || s.length < 1 || s.length > MAX_HOST_LEN) {
    throw new TypeError(
      "robotsConfig: host must be a string 1.." + MAX_HOST_LEN + " chars"
    );
  }
  if (CONTROL_BYTE_RE.test(s)) {
    throw new TypeError("robotsConfig: host must not contain control bytes");
  }
  // Check for a pasted origin URL FIRST. Any string containing "://"
  // also contains "/", so running the generic slash check ahead of this
  // one would make this branch unreachable and hide the more specific
  // message from the operator.
  if (s.indexOf("://") !== -1) {
    throw new TypeError(
      "robotsConfig: host must be a bare hostname, not a scheme://host URL"
    );
  }
  // No path component, and at most one colon (the port separator).
  if (s.indexOf("/") !== -1 || s.indexOf(":") !== s.lastIndexOf(":")) {
    throw new TypeError(
      "robotsConfig: host must be a bare hostname (optionally host:port), no scheme / path"
    );
  }
  // Hostname character class — letters, digits, dot, hyphen — with an
  // optional all-digit :port suffix.
  if (!/^[A-Za-z0-9.\-]+(?::[0-9]+)?$/.test(s)) {
    throw new TypeError(
      "robotsConfig: host must match hostname (optionally host:port) syntax"
    );
  }
  return s;
}
|
|
325
|
+
|
|
326
|
+
/**
 * Validate a template slug against the predefined catalog.
 *
 * @param {*} s - candidate slug
 * @returns {string} `s` unchanged when it names a known template
 * @throws {TypeError} when `s` is not a non-empty string or is not
 *   listed in TEMPLATE_SLUGS
 */
function _templateSlug(s) {
  if (typeof s !== "string" || s.length === 0) {
    throw new TypeError("robotsConfig: template_slug must be a non-empty string");
  }
  if (TEMPLATE_SLUGS.includes(s)) {
    return s;
  }
  const known = TEMPLATE_SLUGS.join(", ");
  throw new TypeError(
    `robotsConfig: template_slug must be one of ${known}; got ${JSON.stringify(s)}`
  );
}
|
|
338
|
+
|
|
339
|
+
// ---- row hydration ----------------------------------------------------
|
|
340
|
+
|
|
341
|
+
/**
 * Convert a `robots_rules` row into the rule object callers receive.
 *
 * `allow_json` / `disallow_json` are decoded from JSON; text that fails
 * to parse, or parses to something other than an array, becomes an
 * empty list. Numeric columns go through `Number()`; the nullable
 * `crawl_delay` and `archived_at` columns stay null when unset.
 *
 * @param {object|null|undefined} row - raw row, or a falsy value
 * @returns {object|null} the hydrated rule, or null when `row` is falsy
 */
function _hydrateRule(row) {
  if (!row) {
    return null;
  }

  // Decode one JSON list column, falling back to [] on any problem.
  function decodeList(text) {
    let parsed;
    try {
      parsed = JSON.parse(text || "[]");
    } catch (_err) {
      return [];
    }
    return Array.isArray(parsed) ? parsed : [];
  }

  // Number() for a nullable column, keeping null / undefined as null.
  function nullableNumber(value) {
    return value == null ? null : Number(value);
  }

  return {
    id: row.id,
    user_agent: row.user_agent,
    allow: decodeList(row.allow_json),
    disallow: decodeList(row.disallow_json),
    crawl_delay: nullableNumber(row.crawl_delay),
    priority: Number(row.priority),
    archived_at: nullableNumber(row.archived_at),
    created_at: Number(row.created_at),
    updated_at: Number(row.updated_at),
  };
}
|
|
359
|
+
|
|
360
|
+
// ---- render -----------------------------------------------------------
|
|
361
|
+
|
|
362
|
+
/**
 * Render one rule as a robots.txt stanza.
 *
 * Line order: `User-agent`, then `Crawl-delay` (only when non-null),
 * then one `Allow` line per allow entry, then one `Disallow` line per
 * disallow entry. Lines are joined with "\n"; no trailing newline or
 * blank line is appended here.
 *
 * @param {object} rule - hydrated rule (`user_agent`, `crawl_delay`,
 *   `allow`, `disallow`)
 * @returns {string} the stanza text
 */
function _renderRule(rule) {
  // Format one `Key: value` directive line.
  function directive(key, value) {
    return key + ": " + value;
  }

  const stanza = [directive("User-agent", rule.user_agent)];
  if (rule.crawl_delay !== null && rule.crawl_delay !== undefined) {
    stanza.push(directive("Crawl-delay", rule.crawl_delay));
  }
  for (const allowed of rule.allow) {
    stanza.push(directive("Allow", allowed));
  }
  for (const blocked of rule.disallow) {
    stanza.push(directive("Disallow", blocked));
  }
  return stanza.join("\n");
}
|
|
379
|
+
|
|
380
|
+
// ---- factory ----------------------------------------------------------
|
|
381
|
+
|
|
382
|
+
function create(opts) {
|
|
383
|
+
opts = opts || {};
|
|
384
|
+
var query = opts.query;
|
|
385
|
+
if (!query) {
|
|
386
|
+
query = function (sql, params) { return _b().externalDb.query(sql, params); };
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
// Per-factory monotonic clock — guarantees `updated_at` across a
|
|
390
|
+
// defineRule + updateRule + archiveRule chain is strictly
|
|
391
|
+
// increasing even when the wall clock has 1 ms resolution and the
|
|
392
|
+
// caller threads the calls inside one tick.
|
|
393
|
+
// Last timestamp handed out by `_monotonicTs`.
var _lastTs = 0;

/**
 * Return a strictly increasing millisecond timestamp.
 *
 * Uses `Date.now()` when it is ahead of the previous value; otherwise
 * steps one past the previous value, so two calls inside the same
 * millisecond (or after the wall clock moves backwards) never return
 * the same or a smaller number.
 *
 * @returns {number} the new timestamp
 */
function _monotonicTs() {
  const now = Date.now();
  _lastTs = now > _lastTs ? now : _lastTs + 1;
  return _lastTs;
}
|
|
400
|
+
|
|
401
|
+
/**
 * Load one rule by id, archived or not.
 *
 * @param {string} ruleId - id to look up
 * @returns {Promise<object|null>} the hydrated rule, or null when the
 *   query returns no row
 */
async function _getRule(ruleId) {
  const result = await query("SELECT * FROM robots_rules WHERE id = ?1 LIMIT 1", [ruleId]);
  const firstRow = result.rows[0];
  return _hydrateRule(firstRow ? firstRow : null);
}
|
|
408
|
+
|
|
409
|
+
// ---- defineRule -----------------------------------------------------
|
|
410
|
+
|
|
411
|
+
/**
 * Persist a new rule (one robots.txt stanza).
 *
 * Validates every field, inserts a row with a fresh uuid v7 id and
 * identical `created_at` / `updated_at` timestamps, then reads the row
 * back.
 *
 * @param {object} input - `{ user_agent, allow?, disallow?,
 *   crawl_delay?, priority? }`
 * @returns {Promise<object|null>} the stored rule as returned by
 *   `_getRule`
 * @throws {TypeError} when `input` is not an object or a field fails
 *   validation
 */
async function defineRule(input) {
  if (!input || typeof input !== "object") {
    throw new TypeError("robotsConfig.defineRule: input object required");
  }

  // Validation order is fixed: the first failing field decides the error.
  const record = {
    userAgent: _userAgent(input.user_agent),
    allow: _pathArray(input.allow, "allow"),
    disallow: _pathArray(input.disallow, "disallow"),
    crawlDelay: _crawlDelay(input.crawl_delay ?? null),
    priority: _priority(input.priority ?? null),
  };

  const id = _b().uuid.v7();
  const ts = _monotonicTs();

  const insertSql =
    "INSERT INTO robots_rules " +
    "(id, user_agent, allow_json, disallow_json, crawl_delay, priority, " +
    " archived_at, created_at, updated_at) " +
    "VALUES (?1, ?2, ?3, ?4, ?5, ?6, NULL, ?7, ?7)";
  const insertParams = [
    id,
    record.userAgent,
    JSON.stringify(record.allow),
    JSON.stringify(record.disallow),
    record.crawlDelay,
    record.priority,
    ts,
  ];
  await query(insertSql, insertParams);

  return await _getRule(id);
}
|
|
439
|
+
|
|
440
|
+
// ---- listRules ------------------------------------------------------
|
|
441
|
+
|
|
442
|
+
/**
 * List active (non-archived) rules, ordered by priority, then
 * user_agent, then created_at, all ascending.
 *
 * @param {object} [input] - optional `{ user_agent }` filter; when
 *   `user_agent` is non-null only rules for that exact agent are
 *   returned
 * @returns {Promise<object[]>} hydrated rules
 * @throws {TypeError} when the `user_agent` filter fails validation
 */
async function listRules(input) {
  const filter = input || {};
  const selectActive = "SELECT * FROM robots_rules WHERE archived_at IS NULL ";
  const ordering = "ORDER BY priority ASC, user_agent ASC, created_at ASC";

  let sql = selectActive + ordering;
  let params = [];
  if (filter.user_agent != null) {
    params = [_userAgent(filter.user_agent)];
    sql = selectActive + "AND user_agent = ?1 " + ordering;
  }

  const result = await query(sql, params);
  return Array.from(result.rows, function (row) {
    return _hydrateRule(row);
  });
}
|
|
460
|
+
|
|
461
|
+
// ---- archiveRule ----------------------------------------------------
|
|
462
|
+
|
|
463
|
+
/**
 * Archive a rule by stamping `archived_at` (and `updated_at`).
 *
 * Archiving an already-archived rule returns the existing row without
 * issuing an UPDATE.
 *
 * @param {string} ruleId - id of the rule to archive
 * @returns {Promise<object|null>} the rule as read back after the
 *   update, or the unchanged rule when it was already archived
 * @throws {TypeError} when `ruleId` fails validation or no such rule
 *   exists
 */
async function archiveRule(ruleId) {
  _ruleId(ruleId);

  const existing = await _getRule(ruleId);
  if (!existing) {
    throw new TypeError(
      `robotsConfig.archiveRule: rule_id ${JSON.stringify(ruleId)} not found`
    );
  }
  if (existing.archived_at !== null && existing.archived_at !== undefined) {
    return existing;
  }

  const stamp = _monotonicTs();
  await query(
    "UPDATE robots_rules SET archived_at = ?1, updated_at = ?1 WHERE id = ?2",
    [stamp, ruleId]
  );
  return await _getRule(ruleId);
}
|
|
479
|
+
|
|
480
|
+
// ---- updateRule -----------------------------------------------------
|
|
481
|
+
|
|
482
|
+
/**
 * Apply a partial update to an active rule.
 *
 * Fields absent from `patch` (or null, for everything except
 * `crawl_delay`) keep their stored values. `crawl_delay` is the one
 * nullable field: when `patch` has its own `crawl_delay` property the
 * value is validated and written even if it is null, which clears the
 * column.
 *
 * @param {string} ruleId - id of the rule to update
 * @param {object} patch - any of `{ user_agent, allow, disallow,
 *   crawl_delay, priority }`
 * @returns {Promise<object|null>} the rule as read back after the update
 * @throws {TypeError} when `ruleId` / `patch` / a patched field is
 *   invalid, the rule does not exist, or the rule is archived
 */
async function updateRule(ruleId, patch) {
  _ruleId(ruleId);
  if (!patch || typeof patch !== "object") {
    throw new TypeError("robotsConfig.updateRule: patch object required");
  }

  const existing = await _getRule(ruleId);
  const quotedId = JSON.stringify(ruleId);
  if (!existing) {
    throw new TypeError(`robotsConfig.updateRule: rule_id ${quotedId} not found`);
  }
  if (existing.archived_at != null) {
    throw new TypeError(
      `robotsConfig.updateRule: rule_id ${quotedId} is archived; defineRule a fresh rule instead`
    );
  }

  // Start from the stored values, then overlay each patched field in a
  // fixed order so the first invalid field decides the error.
  const merged = {
    user_agent: existing.user_agent,
    allow: existing.allow,
    disallow: existing.disallow,
    crawl_delay: existing.crawl_delay,
    priority: existing.priority,
  };
  if (patch.user_agent != null) {
    merged.user_agent = _userAgent(patch.user_agent);
  }
  if (patch.allow != null) {
    merged.allow = _pathArray(patch.allow, "allow");
  }
  if (patch.disallow != null) {
    merged.disallow = _pathArray(patch.disallow, "disallow");
  }
  const touchesCrawlDelay = Object.prototype.hasOwnProperty.call(patch, "crawl_delay");
  if (touchesCrawlDelay) {
    merged.crawl_delay = _crawlDelay(patch.crawl_delay);
  }
  if (patch.priority != null) {
    merged.priority = _priority(patch.priority);
  }

  const stamp = _monotonicTs();
  const updateSql =
    "UPDATE robots_rules SET " +
    "user_agent = ?1, allow_json = ?2, disallow_json = ?3, " +
    "crawl_delay = ?4, priority = ?5, updated_at = ?6 " +
    "WHERE id = ?7";
  await query(updateSql, [
    merged.user_agent,
    JSON.stringify(merged.allow),
    JSON.stringify(merged.disallow),
    merged.crawl_delay,
    merged.priority,
    stamp,
    ruleId,
  ]);
  return await _getRule(ruleId);
}
|
|
535
|
+
|
|
536
|
+
// ---- sitemaps -------------------------------------------------------
|
|
537
|
+
|
|
538
|
+
/**
 * Declare a sitemap URL.
 *
 * Uses `INSERT OR IGNORE`, so re-adding an existing URL is a no-op and
 * the originally recorded `added_at` is what gets returned.
 *
 * @param {string} url - absolute https:// sitemap URL
 * @returns {Promise<{url: string, added_at: number}>} the stored row
 * @throws {TypeError} when `url` fails `_sitemapUrl` validation
 */
async function addSitemap(url) {
  const sitemapUrl = _sitemapUrl(url);
  const stamp = _monotonicTs();

  await query(
    "INSERT OR IGNORE INTO robots_sitemaps (url, added_at) VALUES (?1, ?2)",
    [sitemapUrl, stamp]
  );

  // Read back rather than echoing `stamp`: on a duplicate add the stored
  // timestamp is the earlier one.
  const lookup = await query(
    "SELECT url, added_at FROM robots_sitemaps WHERE url = ?1 LIMIT 1",
    [sitemapUrl]
  );
  const stored = lookup.rows[0];
  return { url: stored.url, added_at: Number(stored.added_at) };
}
|
|
556
|
+
|
|
557
|
+
/**
 * Remove a declared sitemap URL.
 *
 * @param {string} url - sitemap URL to delete
 * @returns {Promise<{removed: boolean}>} `removed` is true when the
 *   query result reports a positive `rowCount`
 * @throws {TypeError} when `url` fails `_sitemapUrl` validation
 */
async function removeSitemap(url) {
  const sitemapUrl = _sitemapUrl(url);
  const outcome = await query("DELETE FROM robots_sitemaps WHERE url = ?1", [sitemapUrl]);
  const deletedCount = Number(outcome.rowCount || 0);
  return { removed: deletedCount > 0 };
}
|
|
565
|
+
|
|
566
|
+
/**
 * List every declared sitemap, oldest first (ties broken by URL).
 *
 * @returns {Promise<Array<{url: string, added_at: number}>>}
 */
async function listSitemaps() {
  const result = await query(
    "SELECT url, added_at FROM robots_sitemaps ORDER BY added_at ASC, url ASC",
    []
  );
  const sitemaps = [];
  for (const row of result.rows) {
    sitemaps.push({ url: row.url, added_at: Number(row.added_at) });
  }
  return sitemaps;
}
|
|
577
|
+
|
|
578
|
+
// ---- host directive -------------------------------------------------
|
|
579
|
+
|
|
580
|
+
/**
 * Set (insert or overwrite) the canonical-host hint.
 *
 * Upserts the row with id = 1, so repeated calls replace the stored
 * host and timestamp.
 *
 * @param {string} host - bare hostname, optionally `host:port`
 * @returns {Promise<{host: string, updated_at: number}>} the values
 *   that were written
 * @throws {TypeError} when `host` fails `_hostDirective` validation
 */
async function setHostDirective(host) {
  const validHost = _hostDirective(host);
  const stamp = _monotonicTs();
  const upsertSql =
    "INSERT INTO robots_host_directive (id, host, updated_at) VALUES (1, ?1, ?2) " +
    "ON CONFLICT(id) DO UPDATE SET host = excluded.host, updated_at = excluded.updated_at";
  await query(upsertSql, [validHost, stamp]);
  return { host: validHost, updated_at: stamp };
}
|
|
593
|
+
|
|
594
|
+
/**
 * Read the canonical-host hint.
 *
 * @returns {Promise<{host: string, updated_at: number}|null>} the
 *   stored directive, or null when the id = 1 row does not exist
 */
async function getHostDirective() {
  const result = await query(
    "SELECT host, updated_at FROM robots_host_directive WHERE id = 1 LIMIT 1",
    []
  );
  const stored = result.rows[0];
  return stored
    ? { host: stored.host, updated_at: Number(stored.updated_at) }
    : null;
}
|
|
603
|
+
|
|
604
|
+
// ---- render ---------------------------------------------------------
|
|
605
|
+
|
|
606
|
+
/**
 * Render the complete robots.txt body.
 *
 * Layout: one stanza per active rule (or a single allow-all stanza when
 * there are no active rules), then the `Host:` line when a host
 * directive is stored, then all `Sitemap:` lines as one section.
 * Sections are separated by one blank line and the output ends with a
 * single "\n".
 *
 * `origin_url` is validated (https-only) but is not embedded in the
 * output.
 *
 * @param {object} input - `{ origin_url }`
 * @returns {Promise<string>} the robots.txt text
 * @throws {TypeError} when `input` is not an object or `origin_url` is
 *   not a non-empty string accepted by `safeUrl.parse`
 */
async function render(input) {
  if (!input || typeof input !== "object") {
    throw new TypeError("robotsConfig.render: input object required");
  }
  const origin = input.origin_url;
  if (typeof origin !== "string" || origin.length === 0) {
    throw new TypeError("robotsConfig.render: origin_url must be a non-empty string");
  }
  try {
    _b().safeUrl.parse(origin, { allowedProtocols: ["https:"] });
  } catch (e) {
    const reason = (e && e.message) || "must be a valid https:// URL";
    throw new TypeError("robotsConfig.render: origin_url — " + reason);
  }

  const activeRules = await listRules({});
  const declaredSitemaps = await listSitemaps();
  const hostDirective = await getHostDirective();

  const parts = [];

  // "Table empty" fallback: still serve a well-formed, open robots.txt.
  if (activeRules.length === 0) {
    parts.push("User-agent: *\nAllow: /");
  }
  for (const rule of activeRules) {
    parts.push(_renderRule(rule));
  }

  if (hostDirective) {
    parts.push("Host: " + hostDirective.host);
  }

  // Sitemap lines stay together as one section (single newlines between
  // them), placed at the bottom of the file.
  const sitemapLines = [];
  for (const sitemap of declaredSitemaps) {
    sitemapLines.push("Sitemap: " + sitemap.url);
  }
  if (sitemapLines.length > 0) {
    parts.push(sitemapLines.join("\n"));
  }

  return parts.join("\n\n") + "\n";
}
|
|
663
|
+
|
|
664
|
+
// ---- templates ------------------------------------------------------
|
|
665
|
+
|
|
666
|
+
/**
 * Produce a caller-owned copy of the predefined template catalog.
 *
 * Iterates TEMPLATE_SLUGS in order and, for every slug, copies the
 * matching PREDEFINED_TEMPLATES entry into a new plain object: `slug`,
 * `label`, `description`, and a new `rules` array. Each rule is a new
 * object whose `allow` / `disallow` lists are fresh arrays, so edits to
 * the returned value never reach the shared catalog (which the module
 * describes as frozen at load time).
 *
 * @returns {Object} map of slug -> { slug, label, description, rules }
 */
function predefinedTemplates() {
  // One rule stanza -> independent copy (new object, new path arrays).
  function copyStanza(stanza) {
    return {
      user_agent: stanza.user_agent,
      allow: stanza.allow.slice(),
      disallow: stanza.disallow.slice(),
      priority: stanza.priority,
    };
  }

  return TEMPLATE_SLUGS.reduce(function (snapshot, key) {
    var source = PREDEFINED_TEMPLATES[key];
    snapshot[key] = {
      slug: source.slug,
      label: source.label,
      description: source.description,
      rules: source.rules.map(copyStanza),
    };
    return snapshot;
  }, {});
}
|
|
694
|
+
|
|
695
|
+
/**
 * Swap the currently active rule set for the stanzas of a predefined
 * template.
 *
 * Steps, strictly in this order (each call is awaited before the next):
 *   1. resolve `input.template_slug` through `_templateSlug` and look the
 *      template up in PREDEFINED_TEMPLATES;
 *   2. archive every rule returned by `listRules({})`;
 *   3. create one rule per template stanza via `defineRule`, forwarding
 *      user_agent / allow / disallow / priority unchanged.
 *
 * NOTE(review): steps 2 and 3 are not wrapped in any rollback inside this
 * function — if a `defineRule` call rejects part-way through, the rules
 * archived in step 2 stay archived. Confirm whether callers rely on an
 * outer transaction.
 *
 * @param {Object} input
 * @param {*} input.template_slug - template identifier, checked by `_templateSlug`
 * @returns {Promise<{template_slug: *, rules: Array}>} the resolved slug and
 *   the values returned by each `defineRule` call, in template order
 * @throws {TypeError} when `input` is missing or not an object
 */
async function applyTemplate(input) {
  var hasInput = Boolean(input) && typeof input === "object";
  if (!hasInput) {
    throw new TypeError("robotsConfig.applyTemplate: input object required");
  }

  var chosenSlug = _templateSlug(input.template_slug);
  var template = PREDEFINED_TEMPLATES[chosenSlug];

  // Retire whatever is live right now, one rule at a time, so the
  // template's stanzas become the only active entries afterwards.
  var current = await listRules({});
  for (var existing of current) {
    await archiveRule(existing.id);
  }

  // Persist the template stanzas sequentially; priority is passed through
  // as declared on the template entry.
  var created = [];
  for (var entry of template.rules) {
    var saved = await defineRule({
      user_agent: entry.user_agent,
      allow: entry.allow,
      disallow: entry.disallow,
      priority: entry.priority,
    });
    created.push(saved);
  }

  return { template_slug: chosenSlug, rules: created };
}
|
|
726
|
+
|
|
727
|
+
return {
|
|
728
|
+
defineRule: defineRule,
|
|
729
|
+
listRules: listRules,
|
|
730
|
+
archiveRule: archiveRule,
|
|
731
|
+
updateRule: updateRule,
|
|
732
|
+
addSitemap: addSitemap,
|
|
733
|
+
removeSitemap: removeSitemap,
|
|
734
|
+
listSitemaps: listSitemaps,
|
|
735
|
+
setHostDirective: setHostDirective,
|
|
736
|
+
getHostDirective: getHostDirective,
|
|
737
|
+
render: render,
|
|
738
|
+
predefinedTemplates: predefinedTemplates,
|
|
739
|
+
applyTemplate: applyTemplate,
|
|
740
|
+
TEMPLATE_SLUGS: TEMPLATE_SLUGS,
|
|
741
|
+
};
|
|
742
|
+
}
|
|
743
|
+
|
|
744
|
+
module.exports = {
|
|
745
|
+
create: create,
|
|
746
|
+
TEMPLATE_SLUGS: TEMPLATE_SLUGS,
|
|
747
|
+
MAX_USER_AGENT_LEN: MAX_USER_AGENT_LEN,
|
|
748
|
+
MAX_PATH_LEN: MAX_PATH_LEN,
|
|
749
|
+
MAX_PATHS_PER_RULE: MAX_PATHS_PER_RULE,
|
|
750
|
+
MAX_SITEMAP_URL_LEN: MAX_SITEMAP_URL_LEN,
|
|
751
|
+
MAX_HOST_LEN: MAX_HOST_LEN,
|
|
752
|
+
MAX_CRAWL_DELAY: MAX_CRAWL_DELAY,
|
|
753
|
+
};
|