intelwatch 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,11 @@
1
1
  import axios from 'axios';
2
2
 
3
+ // ── Debug stub (remplacer par logger réel en prod) ──────────────────────────
4
+ const debug = (...args) => {
5
+ if (process.env.DEBUG_FETCHER) console.log('[fetcher]', ...args);
6
+ };
7
+
8
+ // ── User-Agent rotation ────────────────────────────────────────────────────
3
9
  const USER_AGENTS = [
4
10
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
5
11
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
@@ -12,18 +18,108 @@ function randomUserAgent() {
12
18
  return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
13
19
  }
14
20
 
15
- function sleep(ms) {
21
+ // ── Utilitaires ────────────────────────────────────────────────────────────
22
+ export function sleep(ms) {
16
23
  return new Promise(resolve => setTimeout(resolve, ms));
17
24
  }
18
25
 
26
+ // ── Domaines protégés (Cloudflare / anti-bot lourd) ────────────────────────
27
+ const PROTECTED_DOMAINS = ['pappers.fr', 'societe.com', 'verif.com', 'score3.fr', 'manageo.fr'];
28
+
29
+ export function isProtectedDomain(url) {
30
+ try {
31
+ const { hostname } = new URL(url);
32
+ return PROTECTED_DOMAINS.some(d => hostname === d || hostname.endsWith(`.${d}`));
33
+ } catch {
34
+ return false;
35
+ }
36
+ }
37
+
38
+ // ── Camofox (anti-bot bypass local) ────────────────────────────────────────
39
+ const CAMOFOX_BASE = 'http://localhost:9377';
40
+ const CAMOFOX_USER_ID = 'intelwatch';
41
+ const CAMOFOX_SESSION_KEY = 'default';
42
+ const CAMOFOX_WAIT_MS = 7000;
43
+
44
+ export async function camofoxFetch(url, options = {}) {
45
+ const { timeout = 30000 } = options;
46
+
47
+ // Vérifier disponibilité Camofox
48
+ let healthCheck;
49
+ try {
50
+ healthCheck = await axios.get(`${CAMOFOX_BASE}/health`, { timeout: 2000 });
51
+ } catch {
52
+ debug('camofox indisponible sur', CAMOFOX_BASE);
53
+ throw new Error(`Camofox unavailable at ${CAMOFOX_BASE} — cannot bypass protection for ${url}`);
54
+ }
55
+
56
+ let tabId;
57
+ try {
58
+ // POST /tabs — ouvrir onglet navigateur
59
+ const createRes = await axios.post(`${CAMOFOX_BASE}/tabs`, {
60
+ userId: CAMOFOX_USER_ID,
61
+ sessionKey: CAMOFOX_SESSION_KEY,
62
+ url,
63
+ }, { timeout });
64
+
65
+ tabId = createRes.data?.tabId || createRes.data?.id;
66
+ if (!tabId) throw new Error('Camofox: no tabId returned from POST /tabs');
67
+
68
+ debug('camofox tab created:', tabId, '— waiting', CAMOFOX_WAIT_MS, 'ms');
69
+
70
+ // Attente résolution challenge CF
71
+ await sleep(CAMOFOX_WAIT_MS);
72
+
73
+ // GET /tabs/{tabId}/snapshot — récupérer HTML rendu
74
+ const snapRes = await axios.get(`${CAMOFOX_BASE}/tabs/${tabId}/snapshot`, {
75
+ params: { userId: CAMOFOX_USER_ID },
76
+ timeout,
77
+ });
78
+
79
+ // Wrapper dans un format compatible response Axios
80
+ return {
81
+ status: snapRes.status,
82
+ statusText: snapRes.statusText,
83
+ headers: snapRes.headers,
84
+ data: snapRes.data,
85
+ config: snapRes.config,
86
+ request: snapRes.request,
87
+ _camofox: true,
88
+ };
89
+ } finally {
90
+ // Toujours cleanup, même en cas d'erreur
91
+ if (tabId) {
92
+ try {
93
+ await axios.delete(`${CAMOFOX_BASE}/tabs/${tabId}`, {
94
+ params: { userId: CAMOFOX_USER_ID },
95
+ timeout: 5000,
96
+ });
97
+ debug('camofox tab cleaned up:', tabId);
98
+ } catch (err) {
99
+ debug('camofox cleanup failed for tab', tabId, err.message);
100
+ }
101
+ }
102
+ }
103
+ }
104
+
105
+ // ── Fetch principal (Axios + fallback Camofox) ─────────────────────────────
19
106
  export async function fetch(url, options = {}) {
20
107
  const {
21
108
  retries = 3,
22
109
  delay = 1500,
23
110
  timeout = 15000,
24
111
  headers = {},
112
+ forceCamofox = false,
25
113
  } = options;
26
114
 
115
+ // Mode force : court-circuiter Axios, aller direct Camofox
116
+ if (forceCamofox) {
117
+ return camofoxFetch(url, options);
118
+ }
119
+
120
+ // Domaine protégé connu : tentative Axios puis fallback si 403
121
+ const protected_ = isProtectedDomain(url);
122
+
27
123
  const config = {
28
124
  url,
29
125
  method: options.method || 'GET',
@@ -41,6 +137,8 @@ export async function fetch(url, options = {}) {
41
137
  };
42
138
 
43
139
  let lastError;
140
+ let needsCamofox = false;
141
+
44
142
  for (let attempt = 1; attempt <= retries; attempt++) {
45
143
  try {
46
144
  if (attempt > 1) {
@@ -59,6 +157,13 @@ export async function fetch(url, options = {}) {
59
157
  throw new Error(`Rate limited (429) after ${retries} attempts`);
60
158
  }
61
159
 
160
+ // 403 = signature Cloudflare → fallback Camofox
161
+ if (response.status === 403) {
162
+ debug('403 détecté pour', url, '— fallback camofox');
163
+ needsCamofox = true;
164
+ break;
165
+ }
166
+
62
167
  return response;
63
168
  } catch (err) {
64
169
  lastError = err;
@@ -68,9 +173,25 @@ export async function fetch(url, options = {}) {
68
173
  }
69
174
  }
70
175
 
71
- throw lastError;
176
+ // Fallback Camofox si 403 ou domaine protégé (et Axios a échoué)
177
+ if (needsCamofox || (protected_ && lastError)) {
178
+ try {
179
+ return await camofoxFetch(url, options);
180
+ } catch (camofoxErr) {
181
+ // Camofox indisponible → propager l'erreur Axios originale
182
+ debug('camofox fallback échoué:', camofoxErr.message);
183
+ if (lastError) throw lastError;
184
+ throw camofoxErr;
185
+ }
186
+ }
187
+
188
+ if (lastError) throw lastError;
189
+
190
+ // Ne devrait jamais arriver, mais sécurité
191
+ throw new Error(`fetch failed for ${url}`);
72
192
  }
73
193
 
194
+ // ── Fetch avec jitter ───────────────────────────────────────────────────────
74
195
  export async function fetchWithDelay(url, options = {}) {
75
196
  const minDelay = options.minDelay ?? 1000;
76
197
  const maxDelay = options.maxDelay ?? 2000;
@@ -78,5 +199,3 @@ export async function fetchWithDelay(url, options = {}) {
78
199
  await sleep(jitter);
79
200
  return fetch(url, options);
80
201
  }
81
-
82
- export { sleep };
@@ -89,9 +89,11 @@ export function extractPricing($, html) {
89
89
  const planKeywords = ['starter', 'basic', 'pro', 'professional', 'business', 'enterprise', 'free', 'premium', 'plus'];
90
90
  const plans = [];
91
91
  for (const kw of planKeywords) {
92
- const regex = new RegExp(`${kw}[^\\n]*?\\$[\\d,]+`, 'gi');
93
- const matches = html.match(regex) || [];
94
- plans.push(...matches.slice(0, 2));
92
+ // Rechercher dans le texte propre (au lieu du code HTML raw) pour éviter de capturer du code source
93
+ const textContent = $.text().replace(/\s+/g, ' ');
94
+ const regex = new RegExp(`(?:^|\\s)${kw}\\s[^$€£]{0,50}?[$€£][\\d,.]+`, 'gi');
95
+ const matches = textContent.match(regex) || [];
96
+ plans.push(...matches.slice(0, 2).map(m => m.trim()));
95
97
  }
96
98
 
97
99
  return {
Binary file
@@ -1,281 +0,0 @@
1
- import axios from 'axios';
2
- import { analyzeSentiment, categorizeMention } from '../utils/sentiment.js';
3
-
4
- const BRAVE_API = 'https://api.search.brave.com/res/v1';
5
-
6
- /**
7
- * Search via Brave Search API — reliable, no rate limiting issues.
8
- * Uses BRAVE_API_KEY env var or falls back to config.
9
- */
10
- function getApiKey() {
11
- return process.env.BRAVE_API_KEY || process.env.BRAVE_SEARCH_API_KEY || null;
12
- }
13
-
14
- /**
15
- * Web search via Brave
16
- */
17
- export async function braveWebSearch(query, options = {}) {
18
- const apiKey = getApiKey();
19
- if (!apiKey) return { results: [], error: 'No BRAVE_API_KEY set' };
20
-
21
- try {
22
- const params = {
23
- q: query,
24
- count: options.count || 20,
25
- country: options.country || 'FR',
26
- search_lang: options.lang || 'fr',
27
- freshness: options.freshness || undefined, // 'pd' (day), 'pw' (week), 'pm' (month)
28
- };
29
-
30
- const resp = await axios.get(`${BRAVE_API}/web/search`, {
31
- headers: { 'X-Subscription-Token': apiKey, 'Accept': 'application/json' },
32
- params,
33
- timeout: 15000,
34
- });
35
-
36
- const results = (resp.data.web?.results || []).map(r => ({
37
- title: r.title,
38
- url: r.url,
39
- domain: r.meta_url?.hostname?.replace('www.', '') || new URL(r.url).hostname.replace('www.', ''),
40
- snippet: r.description || '',
41
- age: r.age || null,
42
- }));
43
-
44
- return { results, error: null };
45
- } catch (err) {
46
- return { results: [], error: err.message };
47
- }
48
- }
49
-
50
- /**
51
- * News search via Brave
52
- */
53
- export async function braveNewsSearch(query, options = {}) {
54
- const apiKey = getApiKey();
55
- if (!apiKey) return { results: [], error: 'No BRAVE_API_KEY set' };
56
-
57
- try {
58
- const params = {
59
- q: query,
60
- count: options.count || 20,
61
- country: options.country || 'FR',
62
- search_lang: options.lang || 'fr',
63
- freshness: options.freshness || 'pm', // last month by default
64
- };
65
-
66
- const resp = await axios.get(`${BRAVE_API}/news/search`, {
67
- headers: { 'X-Subscription-Token': apiKey, 'Accept': 'application/json' },
68
- params,
69
- timeout: 15000,
70
- });
71
-
72
- const results = (resp.data.results || []).map(r => ({
73
- title: r.title,
74
- url: r.url,
75
- domain: r.meta_url?.hostname?.replace('www.', '') || '',
76
- snippet: r.description || '',
77
- age: r.age || null,
78
- source: r.meta_url?.hostname || '',
79
- }));
80
-
81
- return { results, error: null };
82
- } catch (err) {
83
- return { results: [], error: err.message };
84
- }
85
- }
86
-
87
- /**
88
- * Full press & mentions search for a brand/company.
89
- * Combines news + web results, analyzes sentiment, categorizes.
90
- */
91
- export async function searchPressMentions(brandName, options = {}) {
92
- const mentions = [];
93
-
94
- // 1. News search
95
- const news = await braveNewsSearch(brandName, { freshness: 'pm', ...options });
96
- for (const r of news.results) {
97
- const sentiment = analyzeSentiment(r.title + ' ' + r.snippet);
98
- mentions.push({
99
- source: 'news',
100
- url: r.url,
101
- domain: r.domain || r.source,
102
- title: r.title,
103
- snippet: r.snippet?.substring(0, 300),
104
- age: r.age,
105
- sentiment: sentiment.label,
106
- sentimentScore: sentiment.score,
107
- category: categorizeMention(r.url, r.title, r.snippet),
108
- });
109
- }
110
-
111
- // 2. Web search for recent mentions
112
- await new Promise(r => setTimeout(r, 500));
113
- const web = await braveWebSearch(`"${brandName}" avis OR actualité OR news`, { freshness: 'pw', ...options });
114
- for (const r of web.results) {
115
- if (mentions.some(m => m.url === r.url)) continue; // dedupe
116
- const sentiment = analyzeSentiment(r.title + ' ' + r.snippet);
117
- mentions.push({
118
- source: 'web',
119
- url: r.url,
120
- domain: r.domain,
121
- title: r.title,
122
- snippet: r.snippet?.substring(0, 300),
123
- age: r.age,
124
- sentiment: sentiment.label,
125
- sentimentScore: sentiment.score,
126
- category: categorizeMention(r.url, r.title, r.snippet),
127
- });
128
- }
129
-
130
- // 3. Search for reviews specifically
131
- await new Promise(r => setTimeout(r, 500));
132
- const reviews = await braveWebSearch(`"${brandName}" avis clients trustpilot`, { count: 10, ...options });
133
- for (const r of reviews.results) {
134
- if (mentions.some(m => m.url === r.url)) continue;
135
- const sentiment = analyzeSentiment(r.title + ' ' + r.snippet);
136
- if (/trustpilot|avis|review|capterra|g2\.com|glassdoor/.test(r.url + r.title)) {
137
- mentions.push({
138
- source: 'review',
139
- url: r.url,
140
- domain: r.domain,
141
- title: r.title,
142
- snippet: r.snippet?.substring(0, 300),
143
- age: r.age,
144
- sentiment: sentiment.label,
145
- sentimentScore: sentiment.score,
146
- category: 'review',
147
- });
148
- }
149
- }
150
-
151
- // ── Relevance filter: drop results that don't actually mention the brand ──
152
- const brandLower = brandName.toLowerCase().trim();
153
- const brandWords = brandLower.split(/\s+/);
154
- const filtered = mentions.filter(m => {
155
- const text = ((m.title || '') + ' ' + (m.snippet || '') + ' ' + (m.domain || '')).toLowerCase();
156
- // Must contain the exact brand name OR all words of the brand
157
- if (text.includes(brandLower)) return true;
158
- if (brandWords.length > 1 && brandWords.every(w => text.includes(w))) return true;
159
- // Fuzzy: allow 1 char difference for short names (e.g. "Endrix" vs "Endrick" should be EXCLUDED)
160
- return false;
161
- });
162
-
163
- return {
164
- brandName,
165
- checkedAt: new Date().toISOString(),
166
- mentions: filtered,
167
- mentionCount: filtered.length,
168
- unfilteredCount: mentions.length,
169
- error: news.error || web.error || null,
170
- };
171
- }
172
-
173
- /**
174
- * Search SERP rankings for a keyword
175
- */
176
- export async function searchKeywordRankings(keyword, options = {}) {
177
- const search = await braveWebSearch(keyword, { count: 20, ...options });
178
-
179
- return search.results.map((r, i) => ({
180
- position: i + 1,
181
- url: r.url,
182
- domain: r.domain,
183
- title: r.title,
184
- snippet: r.snippet,
185
- }));
186
- }
187
-
188
- /**
189
- * Social media search via Brave — filters by platform.
190
- * platforms: array of 'twitter', 'reddit', 'linkedin'
191
- */
192
- export async function searchSocial(query, platforms = ['twitter', 'reddit', 'linkedin'], options = {}) {
193
- const apiKey = getApiKey();
194
- if (!apiKey) return { results: [], byPlatform: {}, error: 'No BRAVE_API_KEY set' };
195
-
196
- const siteFilters = {
197
- twitter: 'site:x.com OR site:twitter.com',
198
- reddit: 'site:reddit.com',
199
- linkedin: 'site:linkedin.com',
200
- };
201
-
202
- const siteQuery = platforms
203
- .map(p => siteFilters[p])
204
- .filter(Boolean)
205
- .join(' OR ');
206
-
207
- const fullQuery = `${query} (${siteQuery})`;
208
-
209
- const search = await braveWebSearch(fullQuery, { count: options.count || 15, ...options });
210
-
211
- const results = (search.results || []).map(r => {
212
- let platform = 'other';
213
- const urlLower = r.url.toLowerCase();
214
- if (urlLower.includes('x.com') || urlLower.includes('twitter.com')) platform = 'twitter';
215
- else if (urlLower.includes('reddit.com')) platform = 'reddit';
216
- else if (urlLower.includes('linkedin.com')) platform = 'linkedin';
217
- return { ...r, platform };
218
- });
219
-
220
- const byPlatform = {};
221
- for (const r of results) {
222
- if (!byPlatform[r.platform]) byPlatform[r.platform] = [];
223
- byPlatform[r.platform].push(r);
224
- }
225
-
226
- return { results, byPlatform, error: search.error };
227
- }
228
-
229
- /**
230
- * Extract review ratings from search snippets
231
- */
232
- export function extractRatingsFromResults(results) {
233
- const platforms = [];
234
-
235
- for (const r of results) {
236
- const text = `${r.title} ${r.snippet}`.toLowerCase();
237
-
238
- // Trustpilot pattern
239
- if (/trustpilot/.test(r.url) || /trustpilot/.test(text)) {
240
- const ratingMatch = text.match(/(\d[.,]\d)\s*(?:\/\s*5|sur\s*5|out of 5|stars?|étoiles?)/);
241
- const countMatch = text.match(/([\d\s,.]+)\s*(?:avis|reviews?|évaluations?)/);
242
- if (ratingMatch || countMatch) {
243
- platforms.push({
244
- name: 'Trustpilot',
245
- url: r.url,
246
- rating: ratingMatch ? parseFloat(ratingMatch[1].replace(',', '.')) : null,
247
- reviewCount: countMatch ? countMatch[1].replace(/\s/g, '').replace(',', '') : null,
248
- });
249
- }
250
- }
251
-
252
- // Google reviews pattern
253
- if (/google/.test(text) && /avis|review/.test(text)) {
254
- const ratingMatch = text.match(/(\d[.,]\d)\s*(?:\/\s*5|sur\s*5|stars?|étoiles?)/);
255
- const countMatch = text.match(/([\d\s,.]+)\s*(?:avis|reviews?|évaluations?)/);
256
- if (ratingMatch) {
257
- platforms.push({
258
- name: 'Google',
259
- url: r.url,
260
- rating: parseFloat(ratingMatch[1].replace(',', '.')),
261
- reviewCount: countMatch ? countMatch[1].replace(/\s/g, '') : null,
262
- });
263
- }
264
- }
265
-
266
- // Glassdoor (employer reputation)
267
- if (/glassdoor/.test(r.url)) {
268
- const ratingMatch = text.match(/(\d[.,]\d)\s*(?:\/\s*5|sur\s*5|stars?)/);
269
- if (ratingMatch) {
270
- platforms.push({
271
- name: 'Glassdoor',
272
- url: r.url,
273
- rating: parseFloat(ratingMatch[1].replace(',', '.')),
274
- reviewCount: null,
275
- });
276
- }
277
- }
278
- }
279
-
280
- return platforms;
281
- }