@adobe/spacecat-shared-utils 1.82.3 → 1.84.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/package.json +1 -1
- package/src/bot-blocker-detect/bot-blocker-detect.js +118 -0
- package/src/bot-blocker-detect/index.d.ts +24 -0
- package/src/index.d.ts +1 -0
- package/src/index.js +1 -0
- package/src/log-wrapper.js +62 -37
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-utils-v1.84.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.83.0...@adobe/spacecat-shared-utils-v1.84.0) (2025-12-10)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Features
|
|
5
|
+
|
|
6
|
+
* add bot-blocker-detect ([#1233](https://github.com/adobe/spacecat-shared/issues/1233)) ([5d73f1b](https://github.com/adobe/spacecat-shared/commit/5d73f1b07ba5ea9735577b0bb0519d9d1cfd278c))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-utils-v1.83.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.82.3...@adobe/spacecat-shared-utils-v1.83.0) (2025-12-10)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
* Implement Structured (JSON) Logging for Spacecat Audits ([#1232](https://github.com/adobe/spacecat-shared/issues/1232)) ([7eae4d6](https://github.com/adobe/spacecat-shared/commit/7eae4d62fe9f0592f8124082fc66e9754803dd2b))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-utils-v1.82.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.82.2...@adobe/spacecat-shared-utils-v1.82.3) (2025-12-10)
|
|
2
16
|
|
|
3
17
|
|
package/package.json
CHANGED
package/src/bot-blocker-detect/bot-blocker-detect.js
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { tracingFetch, SPACECAT_USER_AGENT } from '../tracing-fetch.js';
|
|
14
|
+
import { isValidUrl } from '../functions.js';
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Confidence levels used in bot blocker detection:
|
|
18
|
+
* - 1.0 (ABSOLUTE): Site responds successfully with 200 OK - definitively crawlable
|
|
19
|
+
* - 0.99 (HIGH): Known bot blocker signature detected (Cloudflare cf-ray, Imperva x-iinfo/x-cdn)
|
|
20
|
+
* - 0.95 (MEDIUM): HTTP/2 protocol errors indicating potential blocking
|
|
21
|
+
* - 0.5: Unknown status code without known blocker signature (e.g., 403 without headers)
|
|
22
|
+
* - 0.3: Unknown error occurred during request
|
|
23
|
+
*
|
|
24
|
+
* Only detections with confidence >= 0.95 should be considered reliable indicators of bot blocking.
|
|
25
|
+
* Lower confidence values indicate uncertain situations that may require manual investigation.
|
|
26
|
+
*/
|
|
27
|
+
// Confidence assigned when a known bot blocker signature is matched on a response.
const CONFIDENCE_HIGH = 0.99;
|
|
28
|
+
// Confidence assigned when an HTTP/2 protocol error suggests blocking.
const CONFIDENCE_MEDIUM = 0.95;
|
|
29
|
+
// Confidence assigned when the site answers with 200 OK.
const CONFIDENCE_ABSOLUTE = 1.0;
|
|
30
|
+
// Default request timeout in milliseconds, used when the caller supplies none.
const DEFAULT_TIMEOUT = 5000;
|
|
31
|
+
|
|
32
|
+
/**
 * Classifies a completed HTTP response into a bot-blocker verdict.
 *
 * Only a 403 is inspected for blocker signatures: a `cf-ray` header marks
 * Cloudflare, while an `x-iinfo` header or `x-cdn: Incapsula` marks Imperva.
 * A 200 is reported as crawlable with absolute confidence; every other
 * outcome is reported as crawlable/'unknown' with confidence 0.5.
 *
 * @param {{status: number, headers: {get: function(string): (string|null)}}} response
 *   Response-like object exposing `status` and a `headers.get(name)` accessor
 * @returns {{crawlable: boolean, type: string, confidence: number}} Detection verdict
 */
function analyzeResponse(response) {
  const verdict = (crawlable, type, confidence) => ({ crawlable, type, confidence });
  const { status, headers } = response;

  if (status === 403) {
    // Cloudflare is checked first, so it wins when both vendors' headers are present.
    if (headers.get('cf-ray')) {
      return verdict(false, 'cloudflare', CONFIDENCE_HIGH);
    }

    const impervaSeen = headers.get('x-iinfo') || headers.get('x-cdn') === 'Incapsula';
    if (impervaSeen) {
      return verdict(false, 'imperva', CONFIDENCE_HIGH);
    }
  }

  return status === 200
    ? verdict(true, 'none', CONFIDENCE_ABSOLUTE)
    : verdict(true, 'unknown', 0.5);
}
|
|
65
|
+
|
|
66
|
+
/**
 * Classifies a failed request into a bot-blocker verdict.
 *
 * HTTP/2 stream failures (`NGHTTP2_INTERNAL_ERROR`, `ERR_HTTP2_STREAM_ERROR`)
 * are reported as an 'http2-block' with medium confidence. Any other failure,
 * including a nullish rejection value, is reported as crawlable/'unknown'
 * with confidence 0.3.
 *
 * @param {*} error - The value thrown or rejected by the request
 * @returns {{crawlable: boolean, type: string, confidence: number}} Detection verdict
 */
function analyzeError(error) {
  const http2BlockCodes = ['NGHTTP2_INTERNAL_ERROR', 'ERR_HTTP2_STREAM_ERROR'];

  // The code may sit on the error itself or on a wrapped `cause`, depending on
  // which layer raised it. Optional chaining also keeps a nullish rejection
  // value from throwing here and masking the original failure.
  const observedCodes = [error?.code, error?.cause?.code];

  if (observedCodes.some((code) => http2BlockCodes.includes(code))) {
    return {
      crawlable: false,
      type: 'http2-block',
      confidence: CONFIDENCE_MEDIUM,
    };
  }

  return {
    crawlable: true,
    type: 'unknown',
    confidence: 0.3,
  };
}
|
|
81
|
+
|
|
82
|
+
/**
 * Detects bot blocker technology on a website.
 * Makes a single HEAD request and analyzes the response for blocking patterns.
 *
 * Currently detects:
 * - Cloudflare bot blocking (403 + cf-ray header)
 * - Imperva/Incapsula (403 + x-iinfo or x-cdn: Incapsula header)
 * - HTTP/2 stream errors (NGHTTP2_INTERNAL_ERROR, ERR_HTTP2_STREAM_ERROR)
 *
 * Request failures are not thrown; they are converted into a verdict by
 * `analyzeError`.
 *
 * @param {Object} config - Configuration object
 * @param {string} config.baseUrl - The base URL to check
 * @param {number} [config.timeout=5000] - Request timeout in milliseconds; a value that is
 *   not a finite, non-negative number falls back to the default
 * @returns {Promise<Object>} Detection result with:
 *   - crawlable {boolean}: Whether the site can be crawled by bots
 *   - type {string}: Blocker type ('cloudflare', 'imperva', 'http2-block', 'none', 'unknown')
 *   - confidence {number}: Confidence level (0.0-1.0, see confidence level constants)
 * @throws {Error} If baseUrl is missing or invalid (surfaced as a rejected promise)
 */
export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT } = {}) {
  // The `= {}` default makes a call without arguments fail with the documented
  // 'Invalid baseUrl' error instead of an opaque destructuring TypeError.
  if (!baseUrl || !isValidUrl(baseUrl)) {
    throw new Error('Invalid baseUrl');
  }

  // AbortSignal.timeout() throws on values such as null or NaN. Inside the try
  // block that would be misreported as a request failure without the site ever
  // being contacted, so unusable timeouts fall back to the default up front.
  const effectiveTimeout = Number.isFinite(timeout) && timeout >= 0
    ? Math.trunc(timeout)
    : DEFAULT_TIMEOUT;

  try {
    const response = await tracingFetch(baseUrl, {
      method: 'HEAD',
      headers: {
        'User-Agent': SPACECAT_USER_AGENT,
      },
      signal: AbortSignal.timeout(effectiveTimeout),
    });

    return analyzeResponse(response);
  } catch (error) {
    return analyzeError(error);
  }
}
|
package/src/bot-blocker-detect/index.d.ts
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/** Options accepted by `detectBotBlocker`. */
export interface BotBlockerConfig {
|
|
14
|
+
/** The base URL to check. */
baseUrl: string;
|
|
15
|
+
/** Optional request timeout in milliseconds. */
timeout?: number;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
/** Verdict produced by `detectBotBlocker`. */
export interface BotBlockerResult {
|
|
19
|
+
/** Whether the site can be crawled by bots. */
crawlable: boolean;
|
|
20
|
+
/** Detected blocker type; 'none' and 'unknown' are also possible values. */
type: 'cloudflare' | 'imperva' | 'http2-block' | 'none' | 'unknown';
|
|
21
|
+
/** Confidence level of the verdict, as a number. */
confidence: number;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
/** Takes a `BotBlockerConfig` and returns a promise of a `BotBlockerResult`. */
export function detectBotBlocker(config: BotBlockerConfig): Promise<BotBlockerResult>;
|
package/src/index.d.ts
CHANGED
package/src/index.js
CHANGED
|
@@ -110,6 +110,7 @@ export * as llmoConfig from './llmo-config.js';
|
|
|
110
110
|
export * as schemas from './schemas.js';
|
|
111
111
|
|
|
112
112
|
export { detectLocale } from './locale-detect/locale-detect.js';
|
|
113
|
+
export { detectBotBlocker } from './bot-blocker-detect/bot-blocker-detect.js';
|
|
113
114
|
export { prettifyLogForwardingConfig } from './cdn-helpers.js';
|
|
114
115
|
|
|
115
116
|
export {
|
package/src/log-wrapper.js
CHANGED
|
@@ -13,64 +13,89 @@
|
|
|
13
13
|
import { getTraceId } from './xray.js';
|
|
14
14
|
|
|
15
15
|
/**
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
16
|
+
* Check if a value is a plain object (not Array, not Error, not null, not other special objects)
|
|
17
|
+
* @param {*} value - The value to check
|
|
18
|
+
* @returns {boolean} - True if the value is a plain object
|
|
19
|
+
*/
|
|
20
|
+
function isPlainObject(value) {
  if (value === null || typeof value !== 'object') {
    return false;
  }

  // Decide by prototype instead of `value.constructor`: a null-prototype
  // dictionary has no `constructor` at all, and an ordinary object may carry
  // its own `constructor` key. Both are still plain objects. Arrays, Errors,
  // Dates and class instances have a different prototype and are rejected.
  const prototype = Object.getPrototypeOf(value);
  return prototype === null || prototype === Object.prototype;
}
|
|
27
|
+
|
|
28
|
+
/**
|
|
29
|
+
* A higher-order function that wraps a given function and enhances logging by converting
|
|
30
|
+
* all logs to JSON format and appending `jobId` and `traceId` to log messages when available.
|
|
24
31
|
*
|
|
25
|
-
*
|
|
26
|
-
* message
|
|
27
|
-
*
|
|
28
|
-
* logging and returns the result of the original function.
|
|
32
|
+
* All log messages are automatically converted to structured JSON format:
|
|
33
|
+
* - String messages become: { message: "...", jobId: "...", traceId: "..." }
|
|
34
|
+
* - Object messages are merged with: { ...yourObject, jobId: "...", traceId: "..." }
|
|
29
35
|
*
|
|
30
|
-
*
|
|
31
|
-
*
|
|
36
|
+
* @param {function} fn - The original function to be wrapped
|
|
37
|
+
* @returns {function(object, object): Promise<Response>} - A wrapped function with JSON logging
|
|
32
38
|
*/
|
|
33
39
|
export function logWrapper(fn) {
|
|
34
40
|
return async (message, context) => {
|
|
35
41
|
const { log } = context;
|
|
36
42
|
|
|
37
43
|
if (log && !context.contextualLog) {
|
|
38
|
-
const markers =
|
|
44
|
+
const markers = {};
|
|
39
45
|
|
|
40
46
|
// Extract jobId from message if available
|
|
41
47
|
if (typeof message === 'object' && message !== null && 'jobId' in message) {
|
|
42
|
-
|
|
43
|
-
markers.push(`[jobId=${jobId}]`);
|
|
48
|
+
markers.jobId = message.jobId;
|
|
44
49
|
}
|
|
45
50
|
|
|
46
51
|
// Extract traceId from AWS X-Ray
|
|
47
52
|
const traceId = getTraceId();
|
|
48
53
|
if (traceId) {
|
|
49
|
-
markers.
|
|
54
|
+
markers.traceId = traceId;
|
|
50
55
|
}
|
|
51
56
|
|
|
52
|
-
//
|
|
53
|
-
|
|
54
|
-
const markerString = markers.join(' ');
|
|
57
|
+
// Define log levels
|
|
58
|
+
const logLevels = ['info', 'error', 'debug', 'warn', 'trace', 'verbose', 'silly', 'fatal'];
|
|
55
59
|
|
|
56
|
-
|
|
57
|
-
|
|
60
|
+
// Wrap all log methods to output structured JSON
|
|
61
|
+
context.log = logLevels.reduce((accumulator, level) => {
|
|
62
|
+
if (typeof log[level] === 'function') {
|
|
63
|
+
accumulator[level] = (...args) => {
|
|
64
|
+
// If first argument is a plain object, merge with markers
|
|
65
|
+
if (args.length > 0 && isPlainObject(args[0])) {
|
|
66
|
+
return log[level]({ ...markers, ...args[0] });
|
|
67
|
+
}
|
|
58
68
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
69
|
+
// If first argument is a string, convert to structured format
|
|
70
|
+
if (args.length > 0 && typeof args[0] === 'string') {
|
|
71
|
+
const logObject = {
|
|
72
|
+
...markers,
|
|
73
|
+
message: args[0],
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
// If second argument is a plain object, merge it into the log object
|
|
77
|
+
if (args.length > 1 && isPlainObject(args[1])) {
|
|
78
|
+
Object.assign(logObject, args[1]);
|
|
79
|
+
|
|
80
|
+
// If there are more arguments after the object, add them as 'data'
|
|
81
|
+
if (args.length > 2) {
|
|
82
|
+
logObject.data = args.slice(2);
|
|
83
|
+
}
|
|
84
|
+
} else if (args.length > 1) {
|
|
85
|
+
// If there are additional arguments but second is not a plain object,
|
|
86
|
+
// add all additional args as 'data'
|
|
87
|
+
logObject.data = args.slice(1);
|
|
67
88
|
}
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
89
|
+
|
|
90
|
+
return log[level](logObject);
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
// For other types (arrays, primitives, Error objects), wrap in object
|
|
94
|
+
return log[level]({ ...markers, data: args });
|
|
95
|
+
};
|
|
96
|
+
}
|
|
97
|
+
return accumulator;
|
|
98
|
+
}, {});
|
|
74
99
|
|
|
75
100
|
// Mark that we've processed this context
|
|
76
101
|
context.contextualLog = context.log;
|