bctranslate 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/utils.js CHANGED
@@ -1,18 +1,99 @@
1
1
  import { createHash } from 'crypto';
2
+ import { basename, extname } from 'path';
3
+
4
+ // Lower-case English function words that contextKey() filters out of the candidate slug words (it lower-cases each word before the lookup).
5
+ const STOP_WORDS = new Set([
6
+ 'a', 'an', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
7
+ 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'shall',
8
+ 'should', 'may', 'might', 'must', 'can', 'could',
9
+ 'to', 'for', 'and', 'or', 'but', 'of', 'in', 'on', 'at', 'by',
10
+ 'as', 'if', 'its', 'it', 'this', 'that', 'these', 'those',
11
+ 'my', 'your', 'our', 'their', 'with', 'from', 'up', 'about',
12
+ 'no', 'not', 'so',
13
+ ]);
2
14
 
3
15
  /**
4
- * Generate a deterministic, short key from a string.
5
- * Same input always produces the same key — idempotent across runs.
16
+ * Generate a readable, slug-based i18n key from a string.
17
+ * "Submit" "submit"
18
+ * "Please enter your email" → "please_enter_your_email"
19
+ * Falls back to a hash prefix for non-Latin or symbol-only strings.
6
20
  */
7
- export function hashKey(text) {
8
- const normalized = text.trim().toLowerCase().replace(/\s+/g, ' ');
9
- const hash = createHash('sha256').update(normalized).digest('hex').slice(0, 8);
10
- return `key_${hash}`;
11
- }
21
+ export function textKey(text) {
22
+ const trimmed = text.trim();
23
+
24
+ const slug = trimmed
25
+ .toLowerCase()
26
+ .replace(/[^\w\s]/g, ' ') // punctuation → space
27
+ .replace(/\s+/g, '_') // spaces → underscores
28
+ .replace(/^[^a-z]+/, '') // strip non-alpha prefix
29
+ .replace(/[^a-z0-9_]/g, '') // remove remaining non-ASCII
30
+ .replace(/_+/g, '_') // collapse multiple underscores
31
+ .replace(/^_|_$/g, '') // trim underscores
32
+ .slice(0, 40)
33
+ .replace(/_+$/, '');
34
+
35
+ if (slug && slug.length >= 2 && /[a-z]/.test(slug)) {
36
+ return slug;
37
+ }
38
+
39
+ // Fallback: hash (for Chinese, Arabic, emoji, symbols, etc.)
40
+ const hash = createHash('sha256').update(trimmed).digest('hex').slice(0, 8);
41
+ return `key_${hash}`;
42
+ }
43
+
44
/**
 * Generate a context-aware i18n key using the component name as namespace
 * and up to three meaningful words of the text as a camelCase slug.
 *
 *   "Notes"          in HomeView.vue  → home.notes
 *   "Quick Note"     in HomeView.vue  → home.quickNote
 *   "View livestock" in HomeView.vue  → home.viewLivestock
 *   "Submit"         in LoginForm.vue → loginForm.submit
 *   "你好"            in App.vue       → app.key3d2a1f   (6 hex chars of sha256)
 *
 * Words shorter than two characters, words without an ASCII letter and words
 * listed in STOP_WORDS are ignored. Accented Latin letters are folded to their
 * base letter before the words are extracted. When no usable word remains the
 * slug is `key` followed by a 6-character hash of the trimmed text.
 *
 * @param {string} text     The source string to key
 * @param {string} filePath Absolute or relative path of the source file
 * @returns {string} Key of the form `<namespace>.<slug>`
 */
export function contextKey(text, filePath) {
  const trimmed = text.trim();

  // ── Namespace: derive from filename ──────────────────────────────────────
  const fileName = basename(filePath, extname(filePath));
  // Strip common Vue/React suffixes: HomeView → Home, UserCard → User, etc.
  // If the whole name is a suffix ("View.vue") keep it rather than ending up empty.
  const stripped =
    fileName.replace(
      /(?:View|Component|Page|Screen|Modal|Dialog|Card|Panel|Widget|Layout|Container)$/,
      ''
    ) || fileName;
  // camelCase namespace: "UserProfile" → "userProfile", "home" → "home".
  // A path without a file name (e.g. '') has nothing to derive a namespace
  // from; use a generic one instead of throwing on stripped[0].
  const ns = stripped ? stripped.charAt(0).toLowerCase() + stripped.slice(1) : 'common';

  // ── Slug: 1-3 meaningful words in camelCase ───────────────────────────────
  const words = trimmed
    // Fold accents ("Über" → "Uber") so the ASCII-only filter keeps the letter.
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/[^\w\s]/g, ' ')
    .split(/\s+/)
    .map((w) => w.toLowerCase())
    .filter((w) => w.length >= 2 && /[a-z]/.test(w) && !STOP_WORDS.has(w));

  if (words.length === 0) {
    // Non-Latin scripts, emoji, or all stop words — fall back to hash suffix
    const hash = createHash('sha256').update(trimmed).digest('hex').slice(0, 6);
    return `${ns}.key${hash}`;
  }

  const [first, ...rest] = words.slice(0, 3);
  const slug = first + rest.map((w) => w.charAt(0).toUpperCase() + w.slice(1)).join('');

  return `${ns}.${slug}`;
}
12
88
 
13
89
  /**
14
- * Determine if a string is "translatable" i.e. it contains actual
15
- * human-readable text and not just whitespace, numbers, symbols, or code.
90
+ * @deprecated Legacy alias of textKey(): despite its name it returns a readable slug and only falls back to a hash. Use contextKey() for new code; textKey() is kept for non-file contexts.
91
+ */
92
+ export const hashKey = textKey;
93
+
94
+ /**
95
+ * Determine if a string is "translatable" — contains actual human-readable
96
+ * text rather than whitespace, numbers, symbols, or code identifiers.
16
97
  */
17
98
  export function isTranslatable(text) {
18
99
  if (!text || typeof text !== 'string') return false;
@@ -27,15 +108,19 @@ export function isTranslatable(text) {
27
108
  if (/^[\d.,]+$/.test(trimmed)) return false;
28
109
 
29
110
  // Skip single characters that are punctuation/symbols
30
- if (trimmed.length === 1 && /[^a-zA-Z\u00C0-\u024F\u0400-\u04FF\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]/.test(trimmed)) return false;
111
+ if (
112
+ trimmed.length === 1 &&
113
+ /[^a-zA-Z\u00C0-\u024F\u0400-\u04FF\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]/.test(trimmed)
114
+ )
115
+ return false;
31
116
 
32
- // Skip things that look like code identifiers (camelCase, snake_case with no spaces)
117
+ // Skip code identifiers (camelCase, snake_case with no spaces)
33
118
  if (/^[a-zA-Z_$][a-zA-Z0-9_$.]*$/.test(trimmed) && !trimmed.includes(' ') && trimmed.length > 1) {
34
- // But allow single real words (check if it has vowels or is a common word)
119
+ // Allow capitalised real words: "Submit", "Cancel", "Home"
35
120
  if (/[aeiouAEIOU]/.test(trimmed) && trimmed.length > 2 && /^[A-Z][a-z]+$/.test(trimmed)) {
36
- return true; // Likely a real word like "Submit", "Cancel", "Home"
121
+ return true;
37
122
  }
38
- // Allow ALL CAPS short words (likely labels)
123
+ // Allow ALL-CAPS short labels: "OK", "FAQ"
39
124
  if (/^[A-Z]{2,12}$/.test(trimmed)) return true;
40
125
  return false;
41
126
  }
@@ -44,29 +129,80 @@ export function isTranslatable(text) {
44
129
  if (/^(https?:\/\/|\/|\.\/|\.\.\/)/.test(trimmed)) return false;
45
130
  if (/^[\w.+-]+@[\w.-]+\.\w+$/.test(trimmed)) return false;
46
131
 
47
- // Skip template expressions that are purely code ({{ something }})
132
+ // Skip pure template expressions {{ something }}
48
133
  if (/^\{\{[^}]+\}\}$/.test(trimmed)) return false;
49
134
 
50
135
  // Must contain at least one letter from any script
51
- if (!/[a-zA-Z\u00C0-\u024F\u0400-\u04FF\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uAC00-\uD7AF]/.test(trimmed)) return false;
136
+ if (
137
+ !/[a-zA-Z\u00C0-\u024F\u0400-\u04FF\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uAC00-\uD7AF]/.test(
138
+ trimmed
139
+ )
140
+ )
141
+ return false;
52
142
 
53
143
  return true;
54
144
  }
55
145
 
56
146
  /**
57
- * Check if a string contains interpolation expressions ({{ }}, {}, ${}).
58
- * Returns the parts if so.
147
+ * Shield interpolation variables before sending to Argos Translate.
148
+ * Replaces {{ name }}, {name}, ${name}, %{name} with XML-like tokens <xi/>
149
+ * that NMT models are trained to preserve verbatim.
150
+ *
151
+ * Returns { shielded, tokens } — call unshieldInterpolations() to restore.
152
+ */
153
+ export function shieldInterpolations(text) {
154
+ const tokens = [];
155
+ let shielded = text;
156
+
157
+ // Vue {{ expr }} — must come first to avoid matching inner {
158
+ shielded = shielded.replace(/\{\{[^}]*\}\}/g, (m) => {
159
+ const i = tokens.length;
160
+ tokens.push(m);
161
+ return `<x${i}/>`;
162
+ });
163
+
164
+ // Template literal ${expr}
165
+ shielded = shielded.replace(/\$\{[^}]*\}/g, (m) => {
166
+ const i = tokens.length;
167
+ tokens.push(m);
168
+ return `<x${i}/>`;
169
+ });
170
+
171
+ // i18next / vue-i18n {varName} or {0}
172
+ shielded = shielded.replace(/\{[^{}\s][^{}]*\}/g, (m) => {
173
+ const i = tokens.length;
174
+ tokens.push(m);
175
+ return `<x${i}/>`;
176
+ });
177
+
178
+ // Ruby / Rails %{varName}
179
+ shielded = shielded.replace(/%\{[^}]+\}/g, (m) => {
180
+ const i = tokens.length;
181
+ tokens.push(m);
182
+ return `<x${i}/>`;
183
+ });
184
+
185
+ return { shielded, tokens };
186
+ }
187
+
188
/**
 * Restore interpolation variables after translation.
 *
 * Each <xN/> placeholder is replaced by tokens[N]. Matching ignores case and
 * tolerates whitespace the translator may insert anywhere inside the tag
 * ("<x0 />", "< x0 / >", "<X0/>"). A placeholder whose index has no entry in
 * `tokens` is removed from the output.
 *
 * @param {string} text Translated text containing placeholders
 * @param {string[]} tokens Original expressions from shieldInterpolations()
 * @returns {string} The text with the original expressions restored; returned
 *   unchanged when `tokens` is missing or empty
 */
export function unshieldInterpolations(text, tokens) {
  if (!tokens || tokens.length === 0) return text;
  return text.replace(
    /<\s*x\s*(\d+)\s*\/\s*>/gi,
    (_, idx) => tokens[Number.parseInt(idx, 10)] ?? ''
  );
}
196
+
197
+ /**
198
+ * Check if a string contains interpolation expressions.
59
199
  */
60
200
  export function parseInterpolation(text) {
61
- // Vue-style {{ expr }}
62
201
  const vuePattern = /\{\{\s*([^}]+?)\s*\}\}/g;
63
- // React/JS-style {expr} or ${expr}
64
- const jsPattern = /\$?\{([^}]+)\}/g;
65
202
 
66
203
  const parts = [];
67
204
  let hasInterpolation = false;
68
205
 
69
- // Check for Vue interpolation
70
206
  if (vuePattern.test(text)) {
71
207
  hasInterpolation = true;
72
208
  vuePattern.lastIndex = 0;
@@ -88,4 +224,4 @@ export function parseInterpolation(text) {
88
224
  }
89
225
 
90
226
  return { hasInterpolation, parts };
91
- }
227
+ }