@adobe/spacecat-shared-utils 1.83.0 → 1.85.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
1
+ # [@adobe/spacecat-shared-utils-v1.85.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.84.0...@adobe/spacecat-shared-utils-v1.85.0) (2025-12-11)
2
+
3
+
4
+ ### Features
5
+
6
+ * add WIKIPEDIA_ANALYSIS audit type and opportunity type ([#1222](https://github.com/adobe/spacecat-shared/issues/1222)) ([5a1b113](https://github.com/adobe/spacecat-shared/commit/5a1b113ff343a930e80bb5aafedbe2b8c5423534))
7
+
8
+ # [@adobe/spacecat-shared-utils-v1.84.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.83.0...@adobe/spacecat-shared-utils-v1.84.0) (2025-12-10)
9
+
10
+
11
+ ### Features
12
+
13
+ * add bot-blocker-detect ([#1233](https://github.com/adobe/spacecat-shared/issues/1233)) ([5d73f1b](https://github.com/adobe/spacecat-shared/commit/5d73f1b07ba5ea9735577b0bb0519d9d1cfd278c))
14
+
1
15
  # [@adobe/spacecat-shared-utils-v1.83.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-utils-v1.82.3...@adobe/spacecat-shared-utils-v1.83.0) (2025-12-10)
2
16
 
3
17
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-utils",
3
- "version": "1.83.0",
3
+ "version": "1.85.0",
4
4
  "description": "Shared modules of the Spacecat Services - utils",
5
5
  "type": "module",
6
6
  "exports": {
@@ -0,0 +1,118 @@
1
+ /*
2
+ * Copyright 2025 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ import { tracingFetch, SPACECAT_USER_AGENT } from '../tracing-fetch.js';
14
+ import { isValidUrl } from '../functions.js';
15
+
16
/**
 * Confidence values attached to bot blocker detection results.
 *
 * Named levels:
 * - CONFIDENCE_ABSOLUTE (1.0): the site answered with 200 OK, so it is definitively crawlable.
 * - CONFIDENCE_HIGH (0.99): a known bot blocker signature was matched
 *   (Cloudflare cf-ray, Imperva x-iinfo / x-cdn).
 * - CONFIDENCE_MEDIUM (0.95): the request failed with an HTTP/2 protocol error,
 *   which indicates potential blocking.
 *
 * Levels used inline by the analyzers (no named constant):
 * - 0.5: a non-200 status without a known blocker signature (e.g. a 403 without headers).
 * - 0.3: the request failed with an unknown error.
 *
 * Treat only results with confidence >= 0.95 as reliable indicators of bot blocking;
 * anything lower is an uncertain situation that may require manual investigation.
 */

// 200 OK received.
const CONFIDENCE_ABSOLUTE = 1.0;

// Known blocker signature matched.
const CONFIDENCE_HIGH = 0.99;

// HTTP/2 protocol error observed.
const CONFIDENCE_MEDIUM = 0.95;

// Default request timeout, in milliseconds.
const DEFAULT_TIMEOUT = 5000;
31
+
32
/**
 * Classifies a completed HTTP response into a detection result.
 *
 * Decision order:
 * 1. 403 with a `cf-ray` header            -> blocked by Cloudflare (high confidence).
 * 2. 403 with `x-iinfo`, or `x-cdn` equal
 *    to `Incapsula`                         -> blocked by Imperva (high confidence).
 * 3. 200                                    -> crawlable, no blocker (absolute confidence).
 * 4. anything else                          -> reported as crawlable with type `unknown`
 *                                              and confidence 0.5.
 *
 * @param {Object} response - Object exposing a numeric `status` and a `headers`
 *   collection with a `get(name)` accessor.
 * @returns {{crawlable: boolean, type: string, confidence: number}} Detection result.
 */
function analyzeResponse(response) {
  const { status, headers } = response;

  // Builds the "blocked by a known vendor" result.
  const blockedBy = (vendor) => ({
    crawlable: false,
    type: vendor,
    confidence: CONFIDENCE_HIGH,
  });

  // Vendor signatures are only consulted for 403 responses.
  if (status === 403) {
    if (headers.get('cf-ray')) {
      return blockedBy('cloudflare');
    }

    const hasImpervaSignature = headers.get('x-iinfo') || headers.get('x-cdn') === 'Incapsula';
    if (hasImpervaSignature) {
      return blockedBy('imperva');
    }
  }

  if (status === 200) {
    return { crawlable: true, type: 'none', confidence: CONFIDENCE_ABSOLUTE };
  }

  // No recognised signature: low-confidence fallback.
  return { crawlable: true, type: 'unknown', confidence: 0.5 };
}
65
+
66
/**
 * Classifies a failed request into a detection result.
 *
 * An error whose `code` is `NGHTTP2_INTERNAL_ERROR` or `ERR_HTTP2_STREAM_ERROR` is
 * reported as an HTTP/2 block (not crawlable, medium confidence). Every other error
 * is reported as crawlable with type `unknown` and confidence 0.3.
 *
 * @param {Object} error - The value thrown by the request; only its `code` is read.
 * @returns {{crawlable: boolean, type: string, confidence: number}} Detection result.
 */
function analyzeError(error) {
  const { code } = error;
  const isHttp2Failure = ['NGHTTP2_INTERNAL_ERROR', 'ERR_HTTP2_STREAM_ERROR'].includes(code);

  return isHttp2Failure
    ? { crawlable: false, type: 'http2-block', confidence: CONFIDENCE_MEDIUM }
    : { crawlable: true, type: 'unknown', confidence: 0.3 };
}
81
+
82
/**
 * Detects bot blocker technology on a website.
 * Makes a single HEAD request (sent with the Spacecat user agent) and analyzes the
 * response, or the request failure, for blocking patterns.
 *
 * Currently detects:
 * - Cloudflare bot blocking (403 + cf-ray header)
 * - Imperva/Incapsula (403 + x-iinfo or x-cdn: Incapsula header)
 * - HTTP/2 stream errors (NGHTTP2_INTERNAL_ERROR, ERR_HTTP2_STREAM_ERROR)
 *
 * Request failures never reject the returned promise: any error raised while
 * performing the request (including the timeout firing) is converted into a
 * detection result.
 *
 * @param {Object} config - Configuration object
 * @param {string} config.baseUrl - The base URL to check
 * @param {number} [config.timeout=5000] - Request timeout in milliseconds
 * @returns {Promise<Object>} Detection result with:
 *   - crawlable {boolean}: Whether the site can be crawled by bots
 *   - type {string}: Blocker type ('cloudflare', 'imperva', 'http2-block', 'none', 'unknown')
 *   - confidence {number}: Confidence level (0.0-1.0, see confidence level constants)
 * @throws {Error} If baseUrl is missing or invalid (surfaced as a rejected promise,
 *   also when `config` itself is missing or null)
 */
export async function detectBotBlocker(config) {
  // Normalise a missing/null config first so the documented 'Invalid baseUrl' error is
  // raised instead of a TypeError from destructuring undefined/null.
  const { baseUrl, timeout = DEFAULT_TIMEOUT } = config ?? {};

  if (!baseUrl || !isValidUrl(baseUrl)) {
    throw new Error('Invalid baseUrl');
  }

  try {
    const response = await tracingFetch(baseUrl, {
      method: 'HEAD',
      headers: {
        'User-Agent': SPACECAT_USER_AGENT,
      },
      // Aborts the request once `timeout` milliseconds have elapsed.
      signal: AbortSignal.timeout(timeout),
    });

    return analyzeResponse(response);
  } catch (error) {
    // Network/protocol failures are classified rather than propagated.
    return analyzeError(error);
  }
}
@@ -0,0 +1,24 @@
1
+ /*
2
+ * Copyright 2025 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
/** Options accepted by {@link detectBotBlocker}. */
export interface BotBlockerConfig {
  /** The base URL to check. */
  baseUrl: string;
  /** Request timeout in milliseconds. */
  timeout?: number;
}

/** Blocker classifications that a detection result can carry. */
export type BotBlockerType = 'cloudflare' | 'imperva' | 'http2-block' | 'none' | 'unknown';

/** Outcome of a bot blocker detection. */
export interface BotBlockerResult {
  /** Whether the site can be crawled by bots. */
  crawlable: boolean;
  /** Which blocker, if any, was recognised. */
  type: BotBlockerType;
  /** Confidence level of the result, between 0.0 and 1.0. */
  confidence: number;
}

/**
 * Detects bot blocker technology on a website.
 *
 * @param config - Target URL and optional timeout.
 * @returns A promise resolving to the detection result.
 */
export function detectBotBlocker(config: BotBlockerConfig): Promise<BotBlockerResult>;
package/src/constants.js CHANGED
@@ -64,4 +64,7 @@ export const OPPORTUNITY_TYPES = /** @type {const} */ ({
64
64
 
65
65
  // Paid Cookie Consent
66
66
  PAID_COOKIE_CONSENT: 'paid-cookie-consent',
67
+
68
+ // Wikipedia Analysis (LLMO)
69
+ WIKIPEDIA_ANALYSIS: 'wikipedia-analysis',
67
70
  });
package/src/index.d.ts CHANGED
@@ -330,3 +330,4 @@ export * as llmoConfig from './llmo-config.js';
330
330
  export * as schemas from './schemas.js';
331
331
 
332
332
  export { type detectLocale } from './locale-detect/index.js';
333
+ export { type detectBotBlocker } from './bot-blocker-detect/index.js';
package/src/index.js CHANGED
@@ -110,6 +110,7 @@ export * as llmoConfig from './llmo-config.js';
110
110
  export * as schemas from './schemas.js';
111
111
 
112
112
  export { detectLocale } from './locale-detect/locale-detect.js';
113
+ export { detectBotBlocker } from './bot-blocker-detect/bot-blocker-detect.js';
113
114
  export { prettifyLogForwardingConfig } from './cdn-helpers.js';
114
115
 
115
116
  export {