@adobe/helix-html-pipeline 6.8.1 → 6.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/package.json +1 -1
- package/src/index.js +1 -0
- package/src/robots-pipe.js +214 -0
- package/src/sitemap-pipe.js +0 -3
- package/src/steps/utils.js +7 -1
package/CHANGELOG.md
CHANGED
@@ -1,3 +1,17 @@
+# [6.10.0](https://github.com/adobe/helix-html-pipeline/compare/v6.9.0...v6.10.0) (2024-04-24)
+
+
+### Features
+
+* support dynamic and configurable robots.txt ([#591](https://github.com/adobe/helix-html-pipeline/issues/591)) ([9796e35](https://github.com/adobe/helix-html-pipeline/commit/9796e35903556ba211010fbabd86b45c90b409e6))
+
+# [6.9.0](https://github.com/adobe/helix-html-pipeline/compare/v6.8.1...v6.9.0) (2024-04-23)
+
+
+### Features
+
+* improve xfh parsing ([#594](https://github.com/adobe/helix-html-pipeline/issues/594)) ([d90a285](https://github.com/adobe/helix-html-pipeline/commit/d90a285a575fa36527e0045271b5f856ce66a932)), closes [#592](https://github.com/adobe/helix-html-pipeline/issues/592)
+
 ## [6.8.1](https://github.com/adobe/helix-html-pipeline/compare/v6.8.0...v6.8.1) (2024-04-18)
 
 
package/package.json
CHANGED
package/src/index.js
CHANGED
@@ -13,6 +13,7 @@ export * from './html-pipe.js';
 export * from './json-pipe.js';
 export * from './auth-pipe.js';
 export * from './options-pipe.js';
+export * from './robots-pipe.js';
 export * from './sitemap-pipe.js';
 export * from './PipelineContent.js';
 export * from './PipelineRequest.js';
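The one-line change above makes the new pipe part of the package's public API. A minimal consumer-side sketch (hypothetical wiring: the URL and host header are made up, and a real service builds the `PipelineState` from its own request context):

```js
import { robotsPipe, PipelineRequest } from '@adobe/helix-html-pipeline';

// Hypothetical dispatch: route GET /robots.txt to the new pipe. `state` is a
// fully initialized PipelineState whose info.path is '/robots.txt'; any other
// path makes robotsPipe answer 500 'invalid route' (see robots-pipe.js below).
async function serveRobots(state) {
  const req = new PipelineRequest(new URL('https://example.com/robots.txt'), {
    headers: { 'x-forwarded-host': 'www.example.com' },
  });
  return robotsPipe(state, req);
}
```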
package/src/robots-pipe.js
ADDED

@@ -0,0 +1,214 @@
+/*
+ * Copyright 2024 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+import { cleanupHeaderValue, computeSurrogateKey } from '@adobe/helix-shared-utils';
+import fetchContent from './steps/fetch-content.js';
+import renderCode from './steps/render-code.js';
+import setCustomResponseHeaders from './steps/set-custom-response-headers.js';
+import { PipelineStatusError } from './PipelineStatusError.js';
+import { PipelineResponse } from './PipelineResponse.js';
+import initConfig from './steps/init-config.js';
+
+/**
+ * Default robots.txt contents returned on inner/outer CDN.
+ */
+const DEFAULT_ROBOTS = `# Franklin robots.txt FAQ
+#
+# Q: This looks like a default robots.txt, how can I provide my own?
+# A: Put a file named robots.txt into the root of your GitHub
+#    repo, Franklin will serve it from there.
+#
+# Q: Why am I seeing this robots.txt instead of the one I
+#    configured?
+# A: You are visiting from *.aem.page or *.aem.live - in order
+#    to prevent these sites from showing up in search engines and
+#    giving you a duplicate content penalty on your real site we
+#    exclude all robots
+#
+# Q: What do you mean with "real site"?
+# A: If you add a custom domain to this site (e.g.
+#    example.com), then Franklin detects that you are ready for
+#    production and serves your own robots.txt - but only on
+#    example.com
+#
+# Q: This does not answer my questions at all. What can I do?
+# A: head over to #franklin-chat on Slack or
+#    github.com/adobe/helix-home/issues and ask your question
+#    there.
+User-agent: *
+Disallow: /
+`;
+
+/**
+ * Internal domain suffixes, either inner or outer CDN. Every host that
+ * ends with one of those is considered internal.
+ */
+const INTERNAL_DOMAINS = [
+  '.aem.page',
+  '.aem-fastly.page',
+  '.aem-cloudflare.page',
+  '.aem.live',
+  '.aem-fastly.live',
+  '.aem-cloudflare.live',
+  '.hlx.page',
+  '.hlx-fastly.page',
+  '.hlx-cloudflare.page',
+  '.hlx.live',
+  '.hlx-fastly.live',
+  '.hlx-cloudflare.live',
+];
+
+/**
+ * Hosts that should not be treated as internal.
+ */
+const EXCLUDED_HOSTS = [
+  'www.aem.live',
+];
+
+/**
+ * Generate dynamic robots.txt with production host in the sitemap.
+ *
+ * @param {import('./PipelineState.js').PipelineState} state state
+ * @returns {import('./PipelineResponse.js').PipelineResponse} response
+ */
+function generateRobots(state) {
+  const {
+    prodHost,
+  } = state;
+  const txt = [
+    'User-Agent: *',
+    'Allow: /',
+    '',
+    `Sitemap: https://${prodHost}/sitemap.xml`,
+  ].join('\n');
+  return new PipelineResponse(txt, {
+    status: 200,
+    headers: {
+      'content-type': 'text/plain; charset=utf-8',
+    },
+  });
+}
+
+/**
+ * Return the array of hosts in the `X-Forwarded-Host` request header.
+ *
+ * @param {import('./PipelineRequest.js').PipelineRequest} req request
+ * @returns {Array<String>} array of hosts
+ */
+function getForwardedHosts(req) {
+  const xfh = req.headers.get('x-forwarded-host');
+  if (!xfh) {
+    return [];
+  }
+  return xfh.split(',').map((v) => v.trim()).filter((v) => !!v);
+}
+
+/**
+ * Return the array of surrogate keys for a `robots.txt` response.
+ *
+ * @param {import('./PipelineState.js').PipelineState} state state
+ * @returns {Array<String>} surrogate keys
+ */
+async function computeSurrogateKeys(state) {
+  const keys = [];
+
+  const pathKey = `${state.ref}--${state.repo}--${state.owner}${state.info.path}`;
+  keys.push(await computeSurrogateKey(`${state.site}--${state.org}_config.json`));
+  keys.push(pathKey.replace(/\//g, '_')); // TODO: remove
+  keys.push(await computeSurrogateKey(pathKey));
+  return keys;
+}
+
+/**
+ * Serves or renders the robots.txt.
+ *
+ * @param {PipelineState} state
+ * @param {PipelineRequest} req
+ * @returns {PipelineResponse}
+ */
+export async function robotsPipe(state, req) {
+  const { log } = state;
+  state.type = 'robots';
+
+  if (state.info?.path !== '/robots.txt') {
+    // this should not happen as it would mean that the caller used the wrong route. so we respond
+    // with a 500 to indicate that something is wrong.
+    return new PipelineResponse('', {
+      status: 500,
+      headers: {
+        'x-error': 'invalid route',
+      },
+    });
+  }
+
+  /** @type PipelineResponse */
+  const res = new PipelineResponse('', {
+    headers: {
+      'content-type': 'text/plain; charset=utf-8',
+    },
+  });
+
+  const { partition } = state;
+  const forwardedHosts = getForwardedHosts(req);
+
+  if (partition === 'preview' || forwardedHosts.every(
+    (host) => !EXCLUDED_HOSTS.includes(host)
+      && INTERNAL_DOMAINS.some((domain) => host.endsWith(domain)),
+  )) {
+    // return default robots.txt, vary and no surrogate key
+    res.body = DEFAULT_ROBOTS;
+    res.headers.set('vary', 'x-forwarded-host');
+    return res;
+  }
+
+  try {
+    await initConfig(state, req, res);
+
+    // fetch robots.txt
+    state.timer?.update('content-fetch');
+
+    state.content.sourceBus = 'code';
+    await fetchContent(state, req, res);
+    if (res.status === 404) {
+      const robots = state.config?.robots?.txt;
+      if (robots) {
+        state.content.data = robots;
+      } else {
+        const ret = generateRobots(state);
+        state.content.data = ret.body;
+      }
+      res.headers.set('content-type', 'text/plain; charset=utf-8');
+      res.status = 200;
+      delete res.error;
+    }
+    if (res.error) {
+      // if content loading produced an error, we're done.
+      throw new PipelineStatusError(res.status, res.error);
+    }
+
+    state.timer?.update('serialize');
+    await renderCode(state, req, res);
+
+    // set surrogate keys
+    const keys = await computeSurrogateKeys(state);
+    res.headers.set('x-surrogate-key', keys.join(' '));
+    res.headers.set('vary', 'x-forwarded-host');
+
+    await setCustomResponseHeaders(state, req, res);
+  } catch (e) {
+    res.error = e.message;
+    res.status = e.code || 500;
+
+    log.error(`pipeline status: ${res.status} ${res.error}`);
+    res.headers.set('x-error', cleanupHeaderValue(res.error));
+  }
+  return res;
+}
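Taken together, the pipe blocks crawlers on the inner and outer CDN and otherwise falls back through three sources. The selection logic, reduced to a standalone sketch (`selectRobotsBody` is a hypothetical helper name; the real pipe works on PipelineState/PipelineResponse objects and loads the checked-in file via fetchContent):

```js
// Sketch of robotsPipe's source selection as a pure function. `codeBusRobots`
// stands in for a robots.txt fetched from the code bus (undefined on a 404),
// `config` for state.config, `prodHost` for state.prodHost.
function selectRobotsBody(codeBusRobots, config, prodHost) {
  if (codeBusRobots !== undefined) {
    return codeBusRobots; // 1. a robots.txt checked into the repo root wins
  }
  if (config?.robots?.txt) {
    return config.robots.txt; // 2. otherwise the robots.txt from the site config
  }
  return [ // 3. otherwise generate one pointing at the production sitemap
    'User-Agent: *',
    'Allow: /',
    '',
    `Sitemap: https://${prodHost}/sitemap.xml`,
  ].join('\n');
}

console.log(selectRobotsBody(undefined, undefined, 'www.example.com'));
// User-Agent: *
// Allow: /
//
// Sitemap: https://www.example.com/sitemap.xml
```

Requests never reach this fallback when the partition is `preview` or when every `x-forwarded-host` entry is an internal CDN host (any of the `.aem.*`/`.hlx.*` suffixes, minus the excluded `www.aem.live`); those get `DEFAULT_ROBOTS` with `Disallow: /`, no surrogate keys, and a `vary: x-forwarded-host` header.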
package/src/sitemap-pipe.js
CHANGED
@@ -66,9 +66,6 @@ async function generateSitemap(state) {
  * Serves or renders the sitemap xml. The sitemap is always served from the preview content-bus
  * partition.
  *
- * todo: currently only serves an existing sitemap.xml from the contentbus.
- * generate sitemap on the fly based on the sitemap.json
- *
  * @param {PipelineState} state
  * @param {PipelineRequest} req
  * @returns {PipelineResponse}
package/src/steps/utils.js
CHANGED
@@ -24,7 +24,13 @@ const HELIX_URL_REGEXP = /^https:\/\/(?!admin\.|www\.)[^.]+\.(aem|hlx3?)\.(live|
 export function getOriginalHost(headers) {
   const xfh = headers.get('x-forwarded-host');
   if (xfh) {
-    return xfh.split(',')[0].trim();
+    const segs = xfh.split(',');
+    for (const seg of segs) {
+      const host = seg.trim();
+      if (host) {
+        return host;
+      }
+    }
   }
   return headers.get('host');
 }