@adobe/helix-html-pipeline 6.9.0 → 6.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
1
+ ## [6.10.1](https://github.com/adobe/helix-html-pipeline/compare/v6.10.0...v6.10.1) (2024-04-25)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * don't fetch robots.txt from code-bus ([#595](https://github.com/adobe/helix-html-pipeline/issues/595)) ([5000a9c](https://github.com/adobe/helix-html-pipeline/commit/5000a9cdc13fbd12cd479740c60a613c9fcd14e8))
7
+
8
+ # [6.10.0](https://github.com/adobe/helix-html-pipeline/compare/v6.9.0...v6.10.0) (2024-04-24)
9
+
10
+
11
+ ### Features
12
+
13
+ * support dynamic and configurable robots.txt ([#591](https://github.com/adobe/helix-html-pipeline/issues/591)) ([9796e35](https://github.com/adobe/helix-html-pipeline/commit/9796e35903556ba211010fbabd86b45c90b409e6))
14
+
1
15
  # [6.9.0](https://github.com/adobe/helix-html-pipeline/compare/v6.8.1...v6.9.0) (2024-04-23)
2
16
 
3
17
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/helix-html-pipeline",
3
- "version": "6.9.0",
3
+ "version": "6.10.1",
4
4
  "description": "Helix HTML Pipeline",
5
5
  "main": "src/index.js",
6
6
  "types": "src/index.d.ts",
package/src/index.js CHANGED
@@ -13,6 +13,7 @@ export * from './html-pipe.js';
13
13
  export * from './json-pipe.js';
14
14
  export * from './auth-pipe.js';
15
15
  export * from './options-pipe.js';
16
+ export * from './robots-pipe.js';
16
17
  export * from './sitemap-pipe.js';
17
18
  export * from './PipelineContent.js';
18
19
  export * from './PipelineRequest.js';
@@ -0,0 +1,200 @@
1
+ /*
2
+ * Copyright 2024 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+ import { cleanupHeaderValue, computeSurrogateKey } from '@adobe/helix-shared-utils';
13
+ import renderCode from './steps/render-code.js';
14
+ import setCustomResponseHeaders from './steps/set-custom-response-headers.js';
15
+ import { PipelineResponse } from './PipelineResponse.js';
16
+ import initConfig from './steps/init-config.js';
17
+
18
+ /**
19
+ * Default robots.txt contents returned on inner/outer CDN.
20
+ */
21
+ const DEFAULT_ROBOTS = `# Franklin robots.txt FAQ
22
+ #
23
+ # Q: This looks like a default robots.txt, how can I provide my own?
24
+ # A: Put a file named robots.txt into the root of your GitHub
25
+ # repo, Franklin will serve it from there.
26
+ #
27
+ # Q: Why am I'm seeing this robots.txt instead of the one I
28
+ # configured?
29
+ # A: You are visiting from *.aem.page or *.aem.live - in order
30
+ # to prevent these sites from showing up in search engines and
31
+ # giving you a duplicate content penalty on your real site we
32
+ # exclude all robots
33
+ #
34
+ # Q: What do you mean with "real site"?
35
+ # A: If you add a custom domain to this site (e.g.
36
+ # example.com), then Franklin detects that you are ready for
37
+ # production and serves your own robots.txt - but only on
38
+ # example.com
39
+ #
40
+ # Q: This does not answer my questions at all. What can I do?
41
+ # A: head over to #franklin-chat on Slack or
42
+ # github.com/adobe/helix-home/issues and ask your question
43
+ # there.
44
+ User-agent: *
45
+ Disallow: /
46
+ `;
47
+
48
+ /**
49
+ * Internal domain suffixes, either inner or outer CDN. Every host that
50
+ * ends with one of those is considered internal.
51
+ */
52
+ const INTERNAL_DOMAINS = [
53
+ '.aem.page',
54
+ '.aem-fastly.page',
55
+ '.aem-cloudflare.page',
56
+ '.aem.live',
57
+ '.aem-fastly.live',
58
+ '.aem-cloudflare.live',
59
+ '.hlx.page',
60
+ '.hlx-fastly.page',
61
+ '.hlx-cloudflare.page',
62
+ '.hlx.live',
63
+ '.hlx-fastly.live',
64
+ '.hlx-cloudflare.live',
65
+ ];
66
+
67
+ /**
68
+ * Hosts that should not be treated as internal.
69
+ */
70
+ const EXCLUDED_HOSTS = [
71
+ 'www.aem.live',
72
+ ];
73
+
74
/**
 * Build the fallback robots.txt response: all user agents are allowed on every
 * path and the sitemap location is advertised on the production host.
 *
 * @param {import('./PipelineState.js').PipelineState} state pipeline state; only
 *   `prodHost` is read
 * @returns {import('./PipelineResponse.js').PipelineResponse} plain-text response
 *   with status 200
 */
function generateRobots(state) {
  const sitemapUrl = `https://${state.prodHost}/sitemap.xml`;
  // the blank line separates the user-agent group from the sitemap directive
  const body = `User-Agent: *\nAllow: /\n\nSitemap: ${sitemapUrl}`;
  const headers = { 'content-type': 'text/plain; charset=utf-8' };
  return new PipelineResponse(body, { status: 200, headers });
}
97
+
98
/**
 * Return the hosts listed in the `X-Forwarded-Host` request header.
 *
 * The header value is split on commas; each entry is trimmed and lower-cased,
 * and empty entries are dropped. Host names are case-insensitive, so the
 * entries are normalized here to let callers compare them against lower-case
 * host lists and domain suffixes with plain string operations.
 *
 * @param {import('./PipelineRequest.js').PipelineRequest} req request
 * @returns {Array<String>} lower-cased hosts in header order; empty if the
 *   header is missing or blank
 */
function getForwardedHosts(req) {
  const xfh = req.headers.get('x-forwarded-host');
  if (!xfh) {
    return [];
  }
  return xfh
    .split(',')
    .map((host) => host.trim().toLowerCase())
    .filter(Boolean);
}
111
+
112
/**
 * Compute the surrogate keys to send along with a `robots.txt` response.
 *
 * Three keys are returned, in this order:
 *  1. the computed key of `<site>--<org>_config.json`
 *  2. the plain `<ref>--<repo>--<owner><path>` string with every `/` replaced
 *     by `_` (marked for removal in the original code)
 *  3. the computed key of `<ref>--<repo>--<owner><path>`
 *
 * @param {import('./PipelineState.js').PipelineState} state state
 * @returns {Promise<Array<String>>} resolves with the surrogate keys
 */
async function computeSurrogateKeys(state) {
  const pathKey = `${state.ref}--${state.repo}--${state.owner}${state.info.path}`;

  // the two computed keys do not depend on each other, so compute them concurrently
  const [configKey, hashedPathKey] = await Promise.all([
    computeSurrogateKey(`${state.site}--${state.org}_config.json`),
    computeSurrogateKey(pathKey),
  ]);

  return [
    configKey,
    pathKey.replace(/\//g, '_'), // TODO: remove
    hashedPathKey,
  ];
}
127
+
128
/**
 * Serves or renders the robots.txt.
 *
 * Flow:
 *  - any path other than `/robots.txt` yields a 500 with `x-error: invalid route`
 *  - for the `preview` partition, or when every forwarded host is an internal
 *    domain (and not explicitly excluded), the default robots.txt is returned
 *    with a `vary` header and no surrogate key. An empty forwarded-host list
 *    also takes this branch, because `every` is true for an empty array.
 *  - otherwise the config is initialized and the configured `robots.txt` (or a
 *    generated one) is placed into `state.content.data`, then passed on to the
 *    `renderCode` step; surrogate keys and custom response headers are added.
 *  - errors thrown along the way are logged and reported via status and `x-error`
 *
 * @param {PipelineState} state
 * @param {PipelineRequest} req
 * @returns {PipelineResponse}
 */
export async function robotsPipe(state, req) {
  const { log } = state;
  state.type = 'robots';

  // this should not happen as it would mean that the caller used the wrong route.
  // respond with a 500 to indicate that something is wrong.
  if (state.info?.path !== '/robots.txt') {
    return new PipelineResponse('', {
      status: 500,
      headers: { 'x-error': 'invalid route' },
    });
  }

  /** @type PipelineResponse */
  const res = new PipelineResponse('', {
    headers: { 'content-type': 'text/plain; charset=utf-8' },
  });

  // a host is internal if it ends with an internal domain suffix and is not excluded
  const isInternalHost = (host) => {
    if (EXCLUDED_HOSTS.includes(host)) {
      return false;
    }
    return INTERNAL_DOMAINS.some((suffix) => host.endsWith(suffix));
  };

  const hosts = getForwardedHosts(req);
  const isPreview = state.partition === 'preview';

  if (isPreview || hosts.every(isInternalHost)) {
    // return default robots.txt, vary and no surrogate key
    res.body = DEFAULT_ROBOTS;
    res.headers.set('vary', 'x-forwarded-host');
    return res;
  }

  try {
    await initConfig(state, req, res);

    // a robots.txt from the config takes precedence over the generated one
    const configured = state.config?.robots?.txt;
    state.content.data = configured || generateRobots(state).body;

    res.headers.set('content-type', 'text/plain; charset=utf-8');
    res.status = 200;

    state.timer?.update('serialize');
    await renderCode(state, req, res);

    // set surrogate keys
    const surrogateKeys = await computeSurrogateKeys(state);
    res.headers.set('x-surrogate-key', surrogateKeys.join(' '));
    res.headers.set('vary', 'x-forwarded-host');

    await setCustomResponseHeaders(state, req, res);
  } catch (e) {
    res.error = e.message;
    res.status = e.code || 500;

    log.error(`pipeline status: ${res.status} ${res.error}`);
    res.headers.set('x-error', cleanupHeaderValue(res.error));
  }
  return res;
}
@@ -66,9 +66,6 @@ async function generateSitemap(state) {
66
66
  * Serves or renders the sitemap xml. The sitemap is always served from the preview content-bus
67
67
  * partition.
68
68
  *
69
- * todo: currently only serves an existing sitemap.xml from the contentbus.
70
- * generate sitemap on the fly based on the sitemap.json
71
- *
72
69
  * @param {PipelineState} state
73
70
  * @param {PipelineRequest} req
74
71
  * @returns {PipelineResponse}