glance-cli 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/README.md +9 -0
- package/dist/cli.js +198 -1064
- package/package.json +4 -2
- package/src/cli/commands.ts +854 -0
- package/src/cli/config.ts +24 -0
- package/src/cli/display.ts +270 -0
- package/src/cli/errors.ts +31 -0
- package/src/cli/index.ts +239 -0
- package/src/cli/logger.ts +43 -0
- package/src/cli/types.ts +114 -0
- package/src/cli/utils.ts +239 -0
- package/src/cli/validators.ts +176 -0
- package/src/cli.ts +17 -0
- package/src/core/compat.ts +96 -0
- package/src/core/extractor.ts +532 -0
- package/src/core/fetcher.ts +592 -0
- package/src/core/formatter.ts +742 -0
- package/src/core/language-detector.ts +382 -0
- package/src/core/screenshot.ts +444 -0
- package/src/core/service-detector.ts +411 -0
- package/src/core/summarizer.ts +656 -0
- package/src/core/text-cleaner.ts +150 -0
- package/src/core/voice.ts +708 -0
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Language Detection Module
|
|
3
|
+
*
|
|
4
|
+
* Smart language detection using multiple signals:
|
|
5
|
+
* - URL patterns (/fr, /es, ?lang=fr, etc.)
|
|
6
|
+
* - HTML lang attributes
|
|
7
|
+
* - Content-based detection
|
|
8
|
+
* - Domain patterns (example.fr, example.es)
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import * as cheerio from "cheerio";
|
|
12
|
+
|
|
13
|
+
/**
 * Languages this module can report.
 *
 * Keys are the two-letter codes used throughout detection; values are the
 * English display names. Declared `as const` so the keys form a literal
 * union (see `SupportedLanguage`).
 */
export const SUPPORTED_LANGUAGES = {
  en: "English",
  fr: "French",
  es: "Spanish",
  ht: "Haitian Creole",
} as const;

/** Two-letter code of a supported language, i.e. a key of `SUPPORTED_LANGUAGES`. */
export type SupportedLanguage = keyof typeof SUPPORTED_LANGUAGES;
|
|
22
|
+
|
|
23
|
+
/**
 * Outcome of a language detection run.
 */
export interface LanguageDetectionResult {
  /** Code of the language that was selected. */
  detected: SupportedLanguage;
  /** How strongly the selected language is supported by the evidence. */
  confidence: "high" | "medium" | "low";
  /** Kind of evidence that decided the result; "default" means nothing matched. */
  source: "url" | "html" | "content" | "default";
  /** Human-readable notes on the evidence that was seen (e.g. "URL: fr"). */
  signals: string[];
}
|
|
30
|
+
|
|
31
|
+
/**
 * Codes and names that identify a supported language when they appear as a
 * whole URL path segment or as the primary part of a language query value.
 */
const URL_LANGUAGE_ALIASES: ReadonlyMap<string, SupportedLanguage> = new Map<
  string,
  SupportedLanguage
>([
  ["fr", "fr"],
  ["french", "fr"],
  ["es", "es"],
  ["spanish", "es"],
  ["espanol", "es"],
  ["ht", "ht"],
  ["haitian", "ht"],
  ["kreyol", "ht"],
  ["en", "en"],
  ["english", "en"],
]);

/**
 * ISO 639-2 three-letter codes. Accepted in query values only: as path
 * segments they collide with ordinary words ("/spa/", "/hat/").
 */
const URL_QUERY_ONLY_ALIASES: ReadonlyMap<string, SupportedLanguage> = new Map<
  string,
  SupportedLanguage
>([
  ["fra", "fr"],
  ["fre", "fr"],
  ["spa", "es"],
  ["hat", "ht"],
  ["eng", "en"],
]);

/** Languages that can be hinted by a country TLD or a leading subdomain. */
const URL_HOST_HINT_CODES = ["fr", "es", "ht"] as const;

/**
 * Map a language query value ("fr", "en-US", "es_MX", "french") to a
 * supported language. Only the primary subtag is considered and it must
 * match exactly, so unrelated values such as "html" are not mistaken for "ht".
 */
function languageFromQueryValue(value: string): SupportedLanguage | null {
  const primary = value.trim().toLowerCase().split(/[-_]/)[0] ?? "";
  return (
    URL_LANGUAGE_ALIASES.get(primary) ??
    URL_QUERY_ONLY_ALIASES.get(primary) ??
    null
  );
}

/**
 * Detect language from URL patterns.
 *
 * Checks, in order:
 *  1. path segments (`/fr/`, `/spanish/`, ...)            -> "high"
 *  2. query parameters `lang`, `language`, `locale`, `hl` -> "high"
 *  3. country TLD (`.fr`, `.es`, `.ht`)                   -> "medium"
 *  4. leading subdomain (`fr.example.com`)                -> "medium"
 *
 * @param url Absolute URL of the page.
 * @returns The hinted language and how explicit the hint is. `lang` is
 *   `null` when the URL carries no hint or cannot be parsed; `confidence`
 *   is a placeholder ("medium") in that case.
 */
function detectLanguageFromURL(url: string): {
  lang: SupportedLanguage | null;
  confidence: "high" | "medium";
} {
  let urlObj: URL;
  try {
    urlObj = new URL(url);
  } catch {
    // An unparseable URL simply provides no hint; other signals may still apply.
    return { lang: null, confidence: "medium" };
  }

  // Path segments: the first segment naming a language wins.
  const pathSegments = urlObj.pathname.toLowerCase().split("/").filter(Boolean);
  for (const segment of pathSegments) {
    const lang = URL_LANGUAGE_ALIASES.get(segment);
    if (lang) return { lang, confidence: "high" };
  }

  // Query parameters. `||` is deliberate: an empty value (`?lang=`) falls
  // through to the next parameter name. Google uses 'hl'.
  const langParam =
    urlObj.searchParams.get("lang") ||
    urlObj.searchParams.get("language") ||
    urlObj.searchParams.get("locale") ||
    urlObj.searchParams.get("hl");

  if (langParam) {
    const lang = languageFromQueryValue(langParam);
    if (lang) return { lang, confidence: "high" };
  }

  // Country TLD (.fr, .es, .ht)
  const domain = urlObj.hostname.toLowerCase();
  for (const code of URL_HOST_HINT_CODES) {
    if (domain.endsWith(`.${code}`)) return { lang: code, confidence: "medium" };
  }

  // Leading subdomain (fr.example.com, es.example.com)
  const subdomain = domain.split(".")[0];
  for (const code of URL_HOST_HINT_CODES) {
    if (subdomain === code) return { lang: code, confidence: "medium" };
  }

  return { lang: null, confidence: "medium" };
}
|
|
81
|
+
|
|
82
|
+
/**
 * Language names and ISO 639-2 codes that page authors put in language
 * metadata instead of a two-letter code (e.g. `content="Spanish"`).
 */
const HTML_LANGUAGE_NAMES: ReadonlyMap<string, SupportedLanguage> = new Map<
  string,
  SupportedLanguage
>([
  ["english", "en"],
  ["eng", "en"],
  ["french", "fr"],
  ["fra", "fr"],
  ["fre", "fr"],
  ["spanish", "es"],
  ["espanol", "es"],
  ["spa", "es"],
  ["haitian", "ht"],
  ["kreyol", "ht"],
  ["hat", "ht"],
]);

/**
 * Attributes inspected for a language declaration, in priority order, with
 * the confidence assigned when each one yields a supported language.
 */
const HTML_LANGUAGE_PROBES = [
  { selector: "html", attribute: "lang", confidence: "high" },
  { selector: 'meta[http-equiv="content-language"]', attribute: "content", confidence: "high" },
  { selector: 'meta[property="og:locale"]', attribute: "content", confidence: "medium" },
  { selector: 'meta[name="language"]', attribute: "content", confidence: "medium" },
] as const;

/**
 * Map a raw language declaration ("fr", "en-US", "es_ES", "fr, en",
 * "Spanish") to a supported language.
 *
 * Only the first listed tag is used and only its primary subtag, which must
 * match exactly. Matching on a two-character prefix would accept unrelated
 * tags such as "frr" and reject the name "Spanish".
 */
function languageFromHtmlValue(raw: string | undefined): SupportedLanguage | null {
  if (!raw) return null;

  const firstTag = raw.split(",")[0] ?? "";
  const primary = firstTag.trim().toLowerCase().split(/[-_\s]/)[0] ?? "";

  // Own-property check: `in` would also accept inherited keys.
  if (Object.prototype.hasOwnProperty.call(SUPPORTED_LANGUAGES, primary)) {
    return primary as SupportedLanguage;
  }
  return HTML_LANGUAGE_NAMES.get(primary) ?? null;
}

/**
 * Detect language from HTML attributes.
 *
 * Looks at `<html lang>`, the `content-language` meta tag, the Open Graph
 * locale and the `language` meta tag, in that order, and returns the first
 * one that names a supported language.
 *
 * @param html Raw HTML of the page.
 * @returns The declared language and its confidence. `lang` is `null` when
 *   nothing usable is declared or the markup cannot be processed;
 *   `confidence` is a placeholder ("medium") in that case.
 */
function detectLanguageFromHTML(html: string): {
  lang: SupportedLanguage | null;
  confidence: "high" | "medium";
} {
  try {
    const $ = cheerio.load(html);

    for (const { selector, attribute, confidence } of HTML_LANGUAGE_PROBES) {
      const lang = languageFromHtmlValue($(selector).attr(attribute));
      if (lang) return { lang, confidence };
    }
  } catch {
    // Best effort: markup that cannot be processed provides no hint.
  }

  return { lang: null, confidence: "medium" };
}
|
|
130
|
+
|
|
131
|
+
/** Per-language fingerprint used by content-based detection. */
interface ContentLanguageProfile {
  readonly lang: SupportedLanguage;
  /** Common words; each occurrence scores 2. */
  readonly words: ReadonlySet<string>;
  /** Extra indicators (contractions, special characters); each match scores 1. */
  readonly patterns: readonly RegExp[];
}

/** Source of a character class covering everything that counts as part of a word. */
const CONTENT_WORD_CHAR = "[\\p{L}\\p{M}\\p{N}_]";

/** Matches whole words, including accented letters such as "fè". */
const CONTENT_WORD_TOKEN = new RegExp(`${CONTENT_WORD_CHAR}+`, "gu");

/** Number of leading characters of the text that are analysed. */
const CONTENT_SAMPLE_LENGTH = 1000;

/** A language must score strictly above this to be reported at all. */
const CONTENT_MIN_SCORE = 10;

/** Lead over the runner-up required for "medium" confidence. */
const CONTENT_MEDIUM_LEAD = 20;

/** Pattern for a fragment that must begin a word (e.g. the elision "qu'"). */
function contentWordStart(fragment: string): RegExp {
  return new RegExp(`(?<!${CONTENT_WORD_CHAR})(?:${fragment})`, "gu");
}

/** Pattern for alternatives that must form a complete word. */
function contentWholeWord(alternatives: string): RegExp {
  return new RegExp(
    `(?<!${CONTENT_WORD_CHAR})(?:${alternatives})(?!${CONTENT_WORD_CHAR})`,
    "gu",
  );
}

/**
 * Language fingerprints, in tie-break order: on equal scores the earlier
 * entry wins.
 */
const CONTENT_LANGUAGE_PROFILES: readonly ContentLanguageProfile[] = [
  {
    lang: "fr",
    words: new Set([
      "le", "la", "les", "de", "et", "est", "un", "une", "pour",
      "dans", "avec", "sur", "par", "vous", "nous", "ils", "elle",
    ]),
    // French contractions
    patterns: ["qu'", "d'", "l'", "c'"].map((f) => contentWordStart(f)),
  },
  {
    lang: "es",
    words: new Set([
      "el", "la", "los", "las", "de", "y", "es", "en", "por",
      "para", "con", "un", "una", "que", "del",
    ]),
    // Spanish-specific characters
    patterns: [/ñ/g, /¿/g, /¡/g],
  },
  {
    lang: "ht",
    words: new Set([
      "nan", "ak", "pou", "yo", "li", "nou", "mwen", "ou", "se",
      "ki", "gen", "bay", "fè", "ka",
    ]),
    // Haitian Creole contractions
    patterns: ["m'", "l'", "n'"].map((f) => contentWordStart(f)),
  },
  {
    lang: "en",
    words: new Set([
      "the", "is", "at", "of", "and", "to", "in", "for", "with",
      "on", "by", "from", "up", "about", "into",
    ]),
    patterns: [
      contentWholeWord("you|your|you're|you'll"),
      contentWholeWord("it's|isn't|aren't|won't"),
    ],
  },
];

/**
 * Simple content-based language detection.
 *
 * Scores the first 1000 characters of `text` against per-language lists of
 * common words (2 points each) and extra patterns (1 point each).
 *
 * Words are found with Unicode-aware tokenisation rather than the ASCII-only
 * `\b`, which treats accented letters as word breaks: "fè" was never matched
 * as a whole word, and "está" was counted as French "est". Typographic
 * apostrophes are normalised so "l’homme" matches like "l'homme".
 *
 * @param text Plain text of the page.
 * @returns `lang` is the top-scoring language if it scores above 10,
 *   otherwise `null`. `confidence` is "medium" when the best score leads the
 *   runner-up by more than 20 points, otherwise "low".
 */
function detectLanguageFromContent(text: string): {
  lang: SupportedLanguage | null;
  confidence: "low" | "medium";
} {
  const sample = text
    .toLowerCase()
    .slice(0, CONTENT_SAMPLE_LENGTH)
    .normalize("NFC")
    .replace(/[\u2019\u02BC]/g, "'");

  const tokens = sample.match(CONTENT_WORD_TOKEN) ?? [];

  const scored = CONTENT_LANGUAGE_PROFILES.map((profile) => {
    let score = 0;
    for (const token of tokens) {
      if (profile.words.has(token)) score += 2; // Weight word matches heavily
    }
    for (const pattern of profile.patterns) {
      score += sample.match(pattern)?.length ?? 0;
    }
    return { lang: profile.lang, score };
  });

  // Pick the best language above the minimum threshold (first one wins ties).
  let detectedLang: SupportedLanguage | null = null;
  let highestScore = CONTENT_MIN_SCORE;
  for (const { lang, score } of scored) {
    if (score > highestScore) {
      highestScore = score;
      detectedLang = lang;
    }
  }

  // Confidence depends on how far the winner is ahead of the runner-up.
  const ranked = scored.map((entry) => entry.score).sort((a, b) => b - a);
  const lead = (ranked[0] ?? 0) - (ranked[1] ?? 0);

  return {
    lang: detectedLang,
    confidence: lead > CONTENT_MEDIUM_LEAD ? "medium" : "low",
  };
}
|
|
275
|
+
|
|
276
|
+
/**
 * Main language detection function.
 *
 * Combines the available signals in priority order:
 *  1. `userSpecifiedLang`, if it names a supported language (always wins)
 *  2. a high-confidence URL hint
 *  3. a high-confidence HTML declaration
 *  4. content analysis, raised to "medium" confidence when a weaker URL or
 *     HTML hint names the same language
 *  5. a weaker URL hint, then a weaker HTML hint, both reported as "low"
 *  6. English as the default
 *
 * @param url Absolute URL of the page.
 * @param html Raw HTML of the page, if available.
 * @param content Plain text of the page, if available.
 * @param userSpecifiedLang Language code requested by the user; matched
 *   case-insensitively after trimming, and ignored if it is not a supported
 *   code.
 * @returns The selected language, its confidence, the kind of evidence that
 *   decided it and the signals that were seen.
 */
export function detectLanguage(
  url: string,
  html?: string,
  content?: string,
  userSpecifiedLang?: string,
): LanguageDetectionResult {
  const signals: string[] = [];

  // Priority 1: User-specified language (always wins).
  // Own-property check: `in` would also accept inherited keys such as
  // "constructor" and return them as the detected language.
  const requested = userSpecifiedLang?.trim().toLowerCase();
  if (
    requested &&
    Object.prototype.hasOwnProperty.call(SUPPORTED_LANGUAGES, requested)
  ) {
    return {
      detected: requested as SupportedLanguage,
      confidence: "high",
      // The result type has no dedicated value for a user override;
      // "url" is kept for compatibility with existing callers.
      source: "url",
      signals: ["user-specified"],
    };
  }

  // Priority 2: URL detection (most explicit)
  const urlResult = detectLanguageFromURL(url);
  if (urlResult.lang) {
    signals.push(`URL: ${urlResult.lang}`);
    if (urlResult.confidence === "high") {
      return {
        detected: urlResult.lang,
        confidence: "high",
        source: "url",
        signals,
      };
    }
  }

  // Priority 3: HTML lang attribute (author's intent)
  let htmlHint: SupportedLanguage | null = null;
  if (html) {
    const htmlResult = detectLanguageFromHTML(html);
    if (htmlResult.lang) {
      signals.push(`HTML: ${htmlResult.lang}`);
      if (htmlResult.confidence === "high") {
        return {
          detected: htmlResult.lang,
          confidence: urlResult.lang === htmlResult.lang ? "high" : "medium",
          source: "html",
          signals,
        };
      }
      // Weaker declaration (e.g. og:locale): keep it for corroboration and
      // as a fallback instead of discarding it.
      htmlHint = htmlResult.lang;
    }
  }

  // Priority 4: Content-based detection (fallback)
  if (content) {
    const contentResult = detectLanguageFromContent(content);
    if (contentResult.lang) {
      signals.push(`Content: ${contentResult.lang}`);

      // Agreement with a URL or HTML hint boosts confidence.
      const corroborated =
        contentResult.lang === urlResult.lang ||
        contentResult.lang === htmlHint;

      return {
        detected: contentResult.lang,
        confidence: corroborated ? "medium" : contentResult.confidence,
        source: "content",
        signals,
      };
    }
  }

  // A weaker URL hint is better than nothing.
  if (urlResult.lang) {
    return {
      detected: urlResult.lang,
      confidence: "low",
      source: "url",
      signals,
    };
  }

  // Likewise for a weaker HTML declaration; previously this case fell
  // through to English even though the page declared another language.
  if (htmlHint) {
    return {
      detected: htmlHint,
      confidence: "low",
      source: "html",
      signals,
    };
  }

  // Default to English
  return {
    detected: "en",
    confidence: "low",
    source: "default",
    signals: ["fallback to English"],
  };
}
|
|
367
|
+
|
|
368
|
+
/**
 * Report whether automatic language detection is enabled.
 *
 * Detection is on unless the environment variable
 * `DISABLE_LANGUAGE_DETECTION` is set to exactly "true".
 *
 * @returns `false` only when the opt-out variable equals "true".
 */
export function shouldAutoDetectLanguage(): boolean {
  const optOut = process.env.DISABLE_LANGUAGE_DETECTION;
  if (optOut === "true") {
    return false;
  }
  return true;
}
|
|
376
|
+
|
|
377
|
+
/**
 * Look up the display name of a language.
 *
 * @param code Code of a supported language.
 * @returns The English name from `SUPPORTED_LANGUAGES`, or "Unknown" when
 *   the lookup yields nothing (only possible if `code` bypassed the type
 *   check).
 */
export function getLanguageName(code: SupportedLanguage): string {
  const displayName: string | undefined = SUPPORTED_LANGUAGES[code];
  return displayName ? displayName : "Unknown";
}
|