glance-cli 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,382 @@
1
+ /**
2
+ * Language Detection Module
3
+ *
4
+ * Smart language detection using multiple signals:
5
+ * - URL patterns (/fr, /es, ?lang=fr, etc.)
6
+ * - HTML lang attributes
7
+ * - Content-based detection
8
+ * - Domain patterns (example.fr, example.es)
9
+ */
10
+
11
+ import * as cheerio from "cheerio";
12
+
13
/**
 * Languages the detector can report: two-letter code mapped to the
 * English display name.
 */
export const SUPPORTED_LANGUAGES = {
  en: "English",
  fr: "French",
  es: "Spanish",
  ht: "Haitian Creole",
} as const;

/** Code of a supported language, i.e. one of the keys of {@link SUPPORTED_LANGUAGES}. */
export type SupportedLanguage = keyof typeof SUPPORTED_LANGUAGES;

/** Outcome of a language detection run. */
export interface LanguageDetectionResult {
  /** The language that was selected. */
  detected: SupportedLanguage;
  /** How much trust to place in `detected`. */
  confidence: "high" | "medium" | "low";
  /** Which kind of evidence produced `detected`. */
  source: "url" | "html" | "content" | "default";
  /** Human-readable notes about the evidence that was collected. */
  signals: string[];
}
30
+
31
/**
 * Lower-cased tokens (two-letter codes and spelled-out names) that identify a
 * language when they appear as a path segment or as a query-parameter value.
 * Keep in sync with the supported language list.
 */
const URL_LANGUAGE_TOKENS: ReadonlyMap<string, SupportedLanguage> = new Map<
  string,
  SupportedLanguage
>([
  ["fr", "fr"],
  ["french", "fr"],
  ["es", "es"],
  ["spanish", "es"],
  ["espanol", "es"],
  ["ht", "ht"],
  ["haitian", "ht"],
  ["kreyol", "ht"],
  ["en", "en"],
  ["english", "en"],
]);

/**
 * Three-letter ISO 639-2 codes, accepted in query parameters only: as path
 * segments they collide with ordinary words (e.g. "/spa/").
 */
const URL_ISO_639_2_CODES: ReadonlyMap<string, SupportedLanguage> = new Map<
  string,
  SupportedLanguage
>([
  ["fra", "fr"],
  ["fre", "fr"],
  ["spa", "es"],
  ["hat", "ht"],
  ["eng", "en"],
]);

/** Path segment shaped like a locale: "en-us", "fr_ca". Group 1 is the language. */
const URL_LOCALE_SEGMENT = /^([a-z]{2})[-_][a-z]{2}$/;

/** Country-code TLDs / leading host labels that hint at a language. */
const URL_HOST_LANGUAGE_CODES = ["fr", "es", "ht"] as const;

/**
 * Detect language from URL patterns.
 *
 * Signals are checked in this order and the first hit wins:
 *  1. path segments (`/fr/`, `/spanish/`, `/en-us/`)      -> "high"
 *  2. query parameters `lang`, `language`, `locale`, `hl` -> "high"
 *  3. top-level domain (`.fr`, `.es`, `.ht`)              -> "medium"
 *  4. first host label (`fr.example.com`)                 -> "medium"
 *
 * @param url - URL to inspect. A string that cannot be parsed as a URL yields
 *   "no match" instead of throwing.
 * @returns The detected language, or `lang: null` (with confidence "medium")
 *   when the URL carries no recognised hint.
 */
function detectLanguageFromURL(url: string): {
  lang: SupportedLanguage | null;
  confidence: "high" | "medium";
} {
  let urlObj: URL;
  try {
    urlObj = new URL(url);
  } catch (_error) {
    // Malformed or relative URL: there is nothing to inspect.
    return { lang: null, confidence: "medium" };
  }

  // 1. Path segments: exact token ("fr", "english") or locale form ("en-us").
  for (const segment of urlObj.pathname.toLowerCase().split("/")) {
    const localePrefix = URL_LOCALE_SEGMENT.exec(segment)?.[1] ?? "";
    const fromSegment =
      URL_LANGUAGE_TOKENS.get(segment) ?? URL_LANGUAGE_TOKENS.get(localePrefix);
    if (fromSegment) {
      return { lang: fromSegment, confidence: "high" };
    }
  }

  // 2. Query parameters; the first non-empty one is the only one considered.
  const langParam =
    urlObj.searchParams.get("lang") ||
    urlObj.searchParams.get("language") ||
    urlObj.searchParams.get("locale") ||
    urlObj.searchParams.get("hl"); // Google uses 'hl'

  if (langParam) {
    const value = langParam.trim().toLowerCase();
    // Compare the primary subtag ("fr" in "fr-CA" / "fr_ca") rather than a
    // blind two-character prefix, which would turn "html" into "ht".
    const primary = value.split(/[-_]/, 1)[0] ?? "";
    const fromQuery =
      URL_LANGUAGE_TOKENS.get(value) ??
      URL_LANGUAGE_TOKENS.get(primary) ??
      URL_ISO_639_2_CODES.get(primary);
    if (fromQuery) {
      return { lang: fromQuery, confidence: "high" };
    }
  }

  // 3. Top-level domain (.fr, .es, .ht)
  const domain = urlObj.hostname.toLowerCase();
  for (const code of URL_HOST_LANGUAGE_CODES) {
    if (domain.endsWith(`.${code}`)) {
      return { lang: code, confidence: "medium" };
    }
  }

  // 4. First host label (fr.example.com, es.example.com)
  const subdomain = domain.split(".")[0];
  for (const code of URL_HOST_LANGUAGE_CODES) {
    if (subdomain === code) {
      return { lang: code, confidence: "medium" };
    }
  }

  return { lang: null, confidence: "medium" };
}
81
+
82
/**
 * Detect language from HTML attributes.
 *
 * Looks, in order, at `<html lang>`, the `content-language` http-equiv meta
 * tag, the Open Graph `og:locale` meta tag and the `language` meta tag. The
 * first attribute whose first two characters (lower-cased) name a supported
 * language wins. Failures while parsing or querying the markup are swallowed
 * and reported as "no match".
 *
 * @param html - Raw HTML markup of the page.
 * @returns The detected language with the confidence attached to the element
 *   that supplied it, or `lang: null` (confidence "medium") when none matched.
 */
function detectLanguageFromHTML(html: string): {
  lang: SupportedLanguage | null;
  confidence: "high" | "medium";
} {
  // Ordered from most to least authoritative.
  const probes: ReadonlyArray<{
    selector: string;
    attribute: string;
    confidence: "high" | "medium";
  }> = [
    { selector: "html", attribute: "lang", confidence: "high" },
    {
      selector: 'meta[http-equiv="content-language"]',
      attribute: "content",
      confidence: "high",
    },
    {
      selector: 'meta[property="og:locale"]',
      attribute: "content",
      confidence: "medium",
    },
    { selector: 'meta[name="language"]', attribute: "content", confidence: "medium" },
  ];

  try {
    const $ = cheerio.load(html);

    for (const { selector, attribute, confidence } of probes) {
      const code = $(selector).attr(attribute)?.toLowerCase().slice(0, 2);
      if (code && code in SUPPORTED_LANGUAGES) {
        return { lang: code as SupportedLanguage, confidence };
      }
    }
  } catch (_error) {
    // Best effort only: unparseable markup simply yields no language.
  }

  return { lang: null, confidence: "medium" };
}
130
+
131
/** Only this many leading characters of the text are analysed. */
const CONTENT_SAMPLE_LENGTH = 1000;

/** A language must score strictly above this value to be reported at all. */
const CONTENT_MIN_SCORE = 10;

/** Lead over the runner-up that is required for "medium" confidence. */
const CONTENT_MEDIUM_LEAD = 20;

/**
 * Per-language fingerprints: frequent function words (weighted 2 per hit) and
 * extra regex patterns (weighted 1 per hit). Array order is the tie-break
 * order when two languages reach the same score.
 *
 * The patterns use Unicode-aware boundaries instead of `\b`, which only knows
 * ASCII word characters and therefore sees a "boundary" next to every
 * accented letter.
 */
const CONTENT_FINGERPRINTS: ReadonlyArray<{
  lang: SupportedLanguage;
  words: ReadonlySet<string>;
  patterns: readonly RegExp[];
}> = [
  {
    lang: "fr",
    words: new Set([
      "le", "la", "les", "de", "et", "est", "un", "une", "pour", "dans",
      "avec", "sur", "par", "vous", "nous", "ils", "elle",
    ]),
    // French contractions: qu', d', l', c'
    patterns: [/(?<![\p{L}\p{M}\p{N}_])(?:qu|d|l|c)'/gu],
  },
  {
    lang: "es",
    words: new Set([
      "el", "la", "los", "las", "de", "y", "es", "en", "por", "para", "con",
      "un", "una", "que", "del",
    ]),
    // Spanish-specific characters
    patterns: [/ñ/g, /¿/g, /¡/g],
  },
  {
    lang: "ht",
    words: new Set([
      "nan", "ak", "pou", "yo", "li", "nou", "mwen", "ou", "se", "ki", "gen",
      "bay", "fè", "ka",
    ]),
    // Haitian Creole contractions: m', l', n'
    patterns: [/(?<![\p{L}\p{M}\p{N}_])(?:m|l|n)'/gu],
  },
  {
    lang: "en",
    words: new Set([
      "the", "is", "at", "of", "and", "to", "in", "for", "with", "on", "by",
      "from", "up", "about", "into",
    ]),
    patterns: [
      /(?<![\p{L}\p{M}\p{N}_])(?:you|your|you're|you'll)(?![\p{L}\p{M}\p{N}_])/gu,
      /(?<![\p{L}\p{M}\p{N}_])(?:it's|isn't|aren't|won't)(?![\p{L}\p{M}\p{N}_])/gu,
    ],
  },
];

/**
 * Simple content-based language detection.
 *
 * Scores the first {@link CONTENT_SAMPLE_LENGTH} characters of `text` against
 * each language fingerprint and reports the best-scoring language, provided
 * its score exceeds {@link CONTENT_MIN_SCORE}.
 *
 * Words are matched as whole Unicode words, so accented words such as "fè"
 * are recognised and fragments of accented words (the "est" in "están") are
 * not. Typographic apostrophes are treated like the ASCII apostrophe.
 *
 * @param text - Plain text extracted from the page.
 * @returns `lang` is the best match or `null` when no language scores high
 *   enough; `confidence` is "medium" when the top score leads the runner-up
 *   by more than {@link CONTENT_MEDIUM_LEAD}, otherwise "low".
 */
function detectLanguageFromContent(text: string): {
  lang: SupportedLanguage | null;
  confidence: "low" | "medium";
} {
  const sample = text
    .toLowerCase()
    .slice(0, CONTENT_SAMPLE_LENGTH)
    .normalize("NFC") // compose "e" + combining grave into "è"
    .replace(/[\u2019\u02bc]/g, "'"); // typographic apostrophes

  // Tokenise once, then look each fingerprint word up in the frequency table
  // instead of building and running one regex per word.
  const wordCounts = new Map<string, number>();
  for (const token of sample.match(/[\p{L}\p{M}\p{N}_]+/gu) ?? []) {
    wordCounts.set(token, (wordCounts.get(token) ?? 0) + 1);
  }

  const scored = CONTENT_FINGERPRINTS.map(({ lang, words, patterns }) => {
    let score = 0;
    for (const word of words) {
      score += (wordCounts.get(word) ?? 0) * 2; // Weight word matches heavily
    }
    for (const pattern of patterns) {
      score += sample.match(pattern)?.length ?? 0;
    }
    return { lang, score };
  });

  // Highest score above the threshold wins; earlier entries win ties.
  let detectedLang: SupportedLanguage | null = null;
  let highestScore = CONTENT_MIN_SCORE;
  for (const { lang, score } of scored) {
    if (score > highestScore) {
      highestScore = score;
      detectedLang = lang;
    }
  }

  // Confidence reflects how clearly the winner beats the runner-up.
  const ranked = scored.map((entry) => entry.score).sort((a, b) => b - a);
  const lead = (ranked[0] ?? 0) - (ranked[1] ?? 0);
  const confidence = lead > CONTENT_MEDIUM_LEAD ? "medium" : "low";

  return { lang: detectedLang, confidence };
}
275
+
276
/**
 * Main language detection function.
 * Combines multiple detection methods for best accuracy.
 *
 * Evidence is consulted in priority order:
 *  1. `userSpecifiedLang`, if it names a supported language (always wins)
 *  2. a high-confidence URL match
 *  3. a high-confidence HTML match
 *  4. content analysis
 *  5. any weaker URL hint, then any weaker HTML hint
 *  6. English as the default
 *
 * @param url - Page URL.
 * @param html - Raw page markup, if available.
 * @param content - Extracted page text, if available.
 * @param userSpecifiedLang - Language code requested by the user; surrounding
 *   whitespace and letter case are ignored. Unsupported values are ignored
 *   and detection proceeds normally.
 * @returns The selected language together with its confidence, the kind of
 *   evidence that decided it and the list of signals that were collected.
 */
export function detectLanguage(
  url: string,
  html?: string,
  content?: string,
  userSpecifiedLang?: string,
): LanguageDetectionResult {
  const signals: string[] = [];

  // Priority 1: User-specified language (always wins).
  // Own-property check: the `in` operator would also accept inherited keys
  // such as "toString" or "constructor".
  const requested = userSpecifiedLang?.trim().toLowerCase();
  if (
    requested &&
    Object.prototype.hasOwnProperty.call(SUPPORTED_LANGUAGES, requested)
  ) {
    return {
      detected: requested as SupportedLanguage,
      confidence: "high",
      // The `source` union has no dedicated value for user input; "url" is
      // kept for compatibility and the signal below identifies the origin.
      source: "url",
      signals: ["user-specified"],
    };
  }

  // Priority 2: URL detection (most explicit)
  const urlResult = detectLanguageFromURL(url);
  if (urlResult.lang) {
    signals.push(`URL: ${urlResult.lang}`);
    if (urlResult.confidence === "high") {
      return {
        detected: urlResult.lang,
        confidence: "high",
        source: "url",
        signals,
      };
    }
  }

  // Priority 3: HTML lang attribute (author's intent)
  const htmlResult = html ? detectLanguageFromHTML(html) : null;
  const htmlLang = htmlResult?.lang ?? null;
  if (htmlResult && htmlLang) {
    signals.push(`HTML: ${htmlLang}`);
    if (htmlResult.confidence === "high") {
      return {
        detected: htmlLang,
        // Full confidence only when the URL hint points the same way.
        confidence: urlResult.lang === htmlLang ? "high" : "medium",
        source: "html",
        signals,
      };
    }
  }

  // Priority 4: Content-based detection (fallback)
  if (content) {
    const contentResult = detectLanguageFromContent(content);
    if (contentResult.lang) {
      signals.push(`Content: ${contentResult.lang}`);

      // An agreeing URL or HTML hint corroborates the content result.
      const corroborated =
        urlResult.lang === contentResult.lang ||
        htmlLang === contentResult.lang;

      return {
        detected: contentResult.lang,
        confidence: corroborated ? "medium" : contentResult.confidence,
        source: "content",
        signals,
      };
    }
  }

  // If we found any URL hint, use it with low confidence
  if (urlResult.lang) {
    return {
      detected: urlResult.lang,
      confidence: "low",
      source: "url",
      signals,
    };
  }

  // Likewise for a weaker HTML hint (og:locale / meta language), which would
  // otherwise be recorded in `signals` and then thrown away.
  if (htmlLang) {
    return {
      detected: htmlLang,
      confidence: "low",
      source: "html",
      signals,
    };
  }

  // Default to English
  return {
    detected: "en",
    confidence: "low",
    source: "default",
    signals: ["fallback to English"],
  };
}
367
+
368
/**
 * Tell whether automatic language detection is enabled.
 *
 * Detection is on unless the `DISABLE_LANGUAGE_DETECTION` environment
 * variable is set to exactly the string "true".
 *
 * @returns `false` only when the variable equals "true".
 */
export function shouldAutoDetectLanguage(): boolean {
  const { DISABLE_LANGUAGE_DETECTION: disableFlag } = process.env;
  const isDisabled = disableFlag === "true";
  return !isDisabled;
}
376
+
377
/**
 * Get language name for display.
 *
 * @param code - Code of a supported language.
 * @returns The display name stored in `SUPPORTED_LANGUAGES` for `code`, or
 *   "Unknown" when the lookup yields a falsy value.
 */
export function getLanguageName(code: SupportedLanguage): string {
  const displayName = SUPPORTED_LANGUAGES[code];
  if (displayName) {
    return displayName;
  }
  return "Unknown";
}