@adobe/helix-html-pipeline 6.9.0 → 6.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ # [6.10.0](https://github.com/adobe/helix-html-pipeline/compare/v6.9.0...v6.10.0) (2024-04-24)
2
+
3
+
4
+ ### Features
5
+
6
+ * support dynamic and configurable robots.txt ([#591](https://github.com/adobe/helix-html-pipeline/issues/591)) ([9796e35](https://github.com/adobe/helix-html-pipeline/commit/9796e35903556ba211010fbabd86b45c90b409e6))
7
+
1
8
  # [6.9.0](https://github.com/adobe/helix-html-pipeline/compare/v6.8.1...v6.9.0) (2024-04-23)
2
9
 
3
10
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/helix-html-pipeline",
3
- "version": "6.9.0",
3
+ "version": "6.10.0",
4
4
  "description": "Helix HTML Pipeline",
5
5
  "main": "src/index.js",
6
6
  "types": "src/index.d.ts",
package/src/index.js CHANGED
@@ -13,6 +13,7 @@ export * from './html-pipe.js';
13
13
  export * from './json-pipe.js';
14
14
  export * from './auth-pipe.js';
15
15
  export * from './options-pipe.js';
16
+ export * from './robots-pipe.js';
16
17
  export * from './sitemap-pipe.js';
17
18
  export * from './PipelineContent.js';
18
19
  export * from './PipelineRequest.js';
@@ -0,0 +1,214 @@
1
+ /*
2
+ * Copyright 2024 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+ import { cleanupHeaderValue, computeSurrogateKey } from '@adobe/helix-shared-utils';
13
+ import fetchContent from './steps/fetch-content.js';
14
+ import renderCode from './steps/render-code.js';
15
+ import setCustomResponseHeaders from './steps/set-custom-response-headers.js';
16
+ import { PipelineStatusError } from './PipelineStatusError.js';
17
+ import { PipelineResponse } from './PipelineResponse.js';
18
+ import initConfig from './steps/init-config.js';
19
+
20
+ /**
21
+ * Default robots.txt contents returned on inner/outer CDN: an explanatory FAQ (as robots.txt comments) followed by rules that disallow all user agents.
22
+ */
23
+ const DEFAULT_ROBOTS = `# Franklin robots.txt FAQ
24
+ #
25
+ # Q: This looks like a default robots.txt, how can I provide my own?
26
+ # A: Put a file named robots.txt into the root of your GitHub
27
+ # repo, Franklin will serve it from there.
28
+ #
29
+ # Q: Why am I'm seeing this robots.txt instead of the one I
30
+ # configured?
31
+ # A: You are visiting from *.aem.page or *.aem.live - in order
32
+ # to prevent these sites from showing up in search engines and
33
+ # giving you a duplicate content penalty on your real site we
34
+ # exclude all robots
35
+ #
36
+ # Q: What do you mean with "real site"?
37
+ # A: If you add a custom domain to this site (e.g.
38
+ # example.com), then Franklin detects that you are ready for
39
+ # production and serves your own robots.txt - but only on
40
+ # example.com
41
+ #
42
+ # Q: This does not answer my questions at all. What can I do?
43
+ # A: head over to #franklin-chat on Slack or
44
+ # github.com/adobe/helix-home/issues and ask your question
45
+ # there.
46
+ User-agent: *
47
+ Disallow: /
48
+ `;
49
+
50
/**
 * Internal domain suffixes, either inner or outer CDN. Every host that
 * ends with one of these is considered internal.
 *
 * Frozen so that this shared, module-level lookup table cannot be modified
 * by accident at runtime.
 */
const INTERNAL_DOMAINS = Object.freeze([
  '.aem.page',
  '.aem-fastly.page',
  '.aem-cloudflare.page',
  '.aem.live',
  '.aem-fastly.live',
  '.aem-cloudflare.live',
  '.hlx.page',
  '.hlx-fastly.page',
  '.hlx-cloudflare.page',
  '.hlx.live',
  '.hlx-fastly.live',
  '.hlx-cloudflare.live',
]);

/**
 * Hosts that should not be treated as internal, even though they end with
 * one of the internal domain suffixes. Frozen for the same reason as above.
 */
const EXCLUDED_HOSTS = Object.freeze([
  'www.aem.live',
]);
75
+
76
/**
 * Builds the generated robots.txt response: all user agents are allowed and
 * the sitemap is advertised on the production host taken from the state.
 *
 * @param {import('./PipelineState.js').PipelineState} state state (`prodHost` is read)
 * @returns {import('./PipelineResponse.js').PipelineResponse} plain text response with status 200
 */
function generateRobots(state) {
  const sitemapUrl = `https://${state.prodHost}/sitemap.xml`;
  const body = `User-Agent: *\nAllow: /\n\nSitemap: ${sitemapUrl}`;
  return new PipelineResponse(body, {
    status: 200,
    headers: { 'content-type': 'text/plain; charset=utf-8' },
  });
}
99
+
100
/**
 * Return the array of hosts in the `X-Forwarded-Host` request header.
 *
 * The header value is split on commas; each entry is trimmed and empty
 * entries are dropped. Host names are case-insensitive, so every entry is
 * lower-cased to make suffix and equality checks against lower-case host
 * tables reliable.
 *
 * @param {import('./PipelineRequest.js').PipelineRequest} req request
 * @returns {Array<String>} array of lower-cased hosts, empty if the header is missing or blank
 */
function getForwardedHosts(req) {
  const xfh = req.headers.get('x-forwarded-host');
  if (!xfh) {
    return [];
  }
  return xfh
    .split(',')
    .map((host) => host.trim().toLowerCase())
    .filter(Boolean);
}
113
+
114
/**
 * Computes the surrogate keys to send along with a `robots.txt` response:
 * the hashed key of the site config, the path key with slashes replaced by
 * underscores, and the hashed path key - in that order.
 *
 * @param {import('./PipelineState.js').PipelineState} state state
 * @returns {Promise<Array<String>>} resolves to the surrogate keys
 */
async function computeSurrogateKeys(state) {
  const {
    ref, repo, owner, site, org, info,
  } = state;
  const pathKey = `${ref}--${repo}--${owner}${info.path}`;

  const configKey = await computeSurrogateKey(`${site}--${org}_config.json`);
  const plainPathKey = pathKey.replace(/\//g, '_'); // TODO: remove
  const hashedPathKey = await computeSurrogateKey(pathKey);

  return [configKey, plainPathKey, hashedPathKey];
}
129
+
130
/**
 * Serves or renders the robots.txt.
 *
 * - Any path other than `/robots.txt` yields an empty 500 response.
 * - On the `preview` partition, or when every forwarded host is an internal
 *   (non-excluded) host, the default robots.txt is returned without
 *   surrogate keys.
 * - Otherwise the robots.txt is fetched with the `code` source bus; on a 404
 *   the configured `robots.txt` text or a generated one is used instead.
 *
 * Errors raised along the way are turned into an error response carrying an
 * `x-error` header rather than being thrown.
 *
 * @param {PipelineState} state
 * @param {PipelineRequest} req
 * @returns {Promise<PipelineResponse>}
 */
export async function robotsPipe(state, req) {
  const { log } = state;
  state.type = 'robots';

  // a different path means that the caller used the wrong route, which should not happen.
  // respond with a 500 to indicate that something is wrong.
  if (state.info?.path !== '/robots.txt') {
    return new PipelineResponse('', {
      status: 500,
      headers: { 'x-error': 'invalid route' },
    });
  }

  /** @type PipelineResponse */
  const res = new PipelineResponse('', {
    headers: { 'content-type': 'text/plain; charset=utf-8' },
  });

  const { partition } = state;
  const forwardedHosts = getForwardedHosts(req);
  const isInternalHost = (host) => !EXCLUDED_HOSTS.includes(host)
    && INTERNAL_DOMAINS.some((suffix) => host.endsWith(suffix));

  if (partition === 'preview' || forwardedHosts.every(isInternalHost)) {
    // default robots.txt: vary on the forwarded host, but no surrogate key
    res.body = DEFAULT_ROBOTS;
    res.headers.set('vary', 'x-forwarded-host');
    return res;
  }

  try {
    await initConfig(state, req, res);

    // fetch robots.txt
    state.timer?.update('content-fetch');
    state.content.sourceBus = 'code';
    await fetchContent(state, req, res);

    if (res.status === 404) {
      // nothing stored: fall back to the configured text, or generate one
      state.content.data = state.config?.robots?.txt || generateRobots(state).body;
      res.headers.set('content-type', 'text/plain; charset=utf-8');
      res.status = 200;
      delete res.error;
    }
    if (res.error) {
      // if content loading produced an error, we're done.
      throw new PipelineStatusError(res.status, res.error);
    }

    state.timer?.update('serialize');
    await renderCode(state, req, res);

    // set surrogate keys
    const surrogateKeys = await computeSurrogateKeys(state);
    res.headers.set('x-surrogate-key', surrogateKeys.join(' '));
    res.headers.set('vary', 'x-forwarded-host');

    await setCustomResponseHeaders(state, req, res);
  } catch (e) {
    res.error = e.message;
    res.status = e.code || 500;

    log.error(`pipeline status: ${res.status} ${res.error}`);
    res.headers.set('x-error', cleanupHeaderValue(res.error));
  }
  return res;
}
@@ -66,9 +66,6 @@ async function generateSitemap(state) {
66
66
  * Serves or renders the sitemap xml. The sitemap is always served from the preview content-bus
67
67
  * partition.
68
68
  *
69
- * todo: currently only serves an existing sitemap.xml from the contentbus.
70
- * generate sitemap on the fly based on the sitemap.json
71
- *
72
69
  * @param {PipelineState} state
73
70
  * @param {PipelineRequest} req
74
71
  * @returns {PipelineResponse}