wogiflow 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.workflow/agents/reviewer.md +81 -0
- package/.workflow/agents/security.md +94 -0
- package/.workflow/agents/story-writer.md +58 -0
- package/.workflow/bridges/base-bridge.js +395 -0
- package/.workflow/bridges/claude-bridge.js +434 -0
- package/.workflow/bridges/index.js +130 -0
- package/.workflow/lib/assumption-detector.js +481 -0
- package/.workflow/lib/config-substitution.js +371 -0
- package/.workflow/lib/failure-categories.js +478 -0
- package/.workflow/state/app-map.md.template +15 -0
- package/.workflow/state/architecture.md.template +24 -0
- package/.workflow/state/component-index.json.template +5 -0
- package/.workflow/state/decisions.md.template +15 -0
- package/.workflow/state/feedback-patterns.md.template +9 -0
- package/.workflow/state/knowledge-sync.json.template +6 -0
- package/.workflow/state/progress.md.template +14 -0
- package/.workflow/state/ready.json.template +7 -0
- package/.workflow/state/request-log.md.template +14 -0
- package/.workflow/state/session-state.json.template +11 -0
- package/.workflow/state/stack.md.template +33 -0
- package/.workflow/state/testing.md.template +36 -0
- package/.workflow/templates/claude-md.hbs +257 -0
- package/.workflow/templates/correction-report.md +67 -0
- package/.workflow/templates/gemini-md.hbs +52 -0
- package/README.md +1802 -0
- package/bin/flow +205 -0
- package/lib/index.js +33 -0
- package/lib/installer.js +467 -0
- package/lib/release-channel.js +269 -0
- package/lib/skill-registry.js +526 -0
- package/lib/upgrader.js +401 -0
- package/lib/utils.js +305 -0
- package/package.json +64 -0
- package/scripts/flow +985 -0
- package/scripts/flow-adaptive-learning.js +1259 -0
- package/scripts/flow-aggregate.js +488 -0
- package/scripts/flow-archive +133 -0
- package/scripts/flow-auto-context.js +1015 -0
- package/scripts/flow-auto-learn.js +615 -0
- package/scripts/flow-bridge.js +223 -0
- package/scripts/flow-browser-suggest.js +316 -0
- package/scripts/flow-bug.js +247 -0
- package/scripts/flow-cascade.js +711 -0
- package/scripts/flow-changelog +85 -0
- package/scripts/flow-checkpoint.js +483 -0
- package/scripts/flow-cli.js +403 -0
- package/scripts/flow-code-intelligence.js +760 -0
- package/scripts/flow-complexity.js +502 -0
- package/scripts/flow-config-set.js +152 -0
- package/scripts/flow-constants.js +157 -0
- package/scripts/flow-context +152 -0
- package/scripts/flow-context-init.js +482 -0
- package/scripts/flow-context-monitor.js +384 -0
- package/scripts/flow-context-scoring.js +886 -0
- package/scripts/flow-correct.js +458 -0
- package/scripts/flow-damage-control.js +985 -0
- package/scripts/flow-deps +101 -0
- package/scripts/flow-diff.js +700 -0
- package/scripts/flow-done +151 -0
- package/scripts/flow-done.js +489 -0
- package/scripts/flow-durable-session.js +1541 -0
- package/scripts/flow-entropy-monitor.js +345 -0
- package/scripts/flow-export-profile +349 -0
- package/scripts/flow-export-scanner.js +1046 -0
- package/scripts/flow-figma-confirm.js +400 -0
- package/scripts/flow-figma-extract.js +496 -0
- package/scripts/flow-figma-generate.js +683 -0
- package/scripts/flow-figma-index.js +909 -0
- package/scripts/flow-figma-match.js +617 -0
- package/scripts/flow-figma-mcp-server.js +518 -0
- package/scripts/flow-figma-pipeline.js +414 -0
- package/scripts/flow-file-ops.js +301 -0
- package/scripts/flow-gate-confidence.js +825 -0
- package/scripts/flow-guided-edit.js +659 -0
- package/scripts/flow-health +185 -0
- package/scripts/flow-health.js +413 -0
- package/scripts/flow-hooks.js +556 -0
- package/scripts/flow-http-client.js +249 -0
- package/scripts/flow-hybrid-detect.js +167 -0
- package/scripts/flow-hybrid-interactive.js +591 -0
- package/scripts/flow-hybrid-test.js +152 -0
- package/scripts/flow-import-profile +439 -0
- package/scripts/flow-init +253 -0
- package/scripts/flow-instruction-richness.js +827 -0
- package/scripts/flow-jira-integration.js +579 -0
- package/scripts/flow-knowledge-router.js +522 -0
- package/scripts/flow-knowledge-sync.js +589 -0
- package/scripts/flow-linear-integration.js +631 -0
- package/scripts/flow-links.js +774 -0
- package/scripts/flow-log-manager.js +559 -0
- package/scripts/flow-loop-enforcer.js +1246 -0
- package/scripts/flow-loop-retry-learning.js +630 -0
- package/scripts/flow-lsp.js +923 -0
- package/scripts/flow-map-index +348 -0
- package/scripts/flow-map-sync +201 -0
- package/scripts/flow-memory-blocks.js +668 -0
- package/scripts/flow-memory-compactor.js +350 -0
- package/scripts/flow-memory-db.js +1110 -0
- package/scripts/flow-memory-sync.js +484 -0
- package/scripts/flow-metrics.js +353 -0
- package/scripts/flow-migrate-ids.js +370 -0
- package/scripts/flow-model-adapter.js +802 -0
- package/scripts/flow-model-router.js +884 -0
- package/scripts/flow-models.js +1231 -0
- package/scripts/flow-morning.js +517 -0
- package/scripts/flow-multi-approach.js +660 -0
- package/scripts/flow-new-feature +86 -0
- package/scripts/flow-onboard +1042 -0
- package/scripts/flow-orchestrate-llm.js +459 -0
- package/scripts/flow-orchestrate.js +3592 -0
- package/scripts/flow-output.js +123 -0
- package/scripts/flow-parallel-detector.js +399 -0
- package/scripts/flow-parallel-dispatch.js +987 -0
- package/scripts/flow-parallel.js +428 -0
- package/scripts/flow-pattern-enforcer.js +600 -0
- package/scripts/flow-prd-manager.js +282 -0
- package/scripts/flow-progress.js +323 -0
- package/scripts/flow-project-analyzer.js +975 -0
- package/scripts/flow-prompt-composer.js +487 -0
- package/scripts/flow-providers.js +1381 -0
- package/scripts/flow-queue.js +308 -0
- package/scripts/flow-ready +82 -0
- package/scripts/flow-ready.js +189 -0
- package/scripts/flow-regression.js +396 -0
- package/scripts/flow-response-parser.js +450 -0
- package/scripts/flow-resume.js +284 -0
- package/scripts/flow-rules-sync.js +439 -0
- package/scripts/flow-run-trace.js +718 -0
- package/scripts/flow-safety.js +587 -0
- package/scripts/flow-search +104 -0
- package/scripts/flow-security.js +481 -0
- package/scripts/flow-session-end +106 -0
- package/scripts/flow-session-end.js +437 -0
- package/scripts/flow-session-state.js +671 -0
- package/scripts/flow-setup-hooks +216 -0
- package/scripts/flow-setup-hooks.js +377 -0
- package/scripts/flow-skill-create.js +329 -0
- package/scripts/flow-skill-creator.js +572 -0
- package/scripts/flow-skill-generator.js +1046 -0
- package/scripts/flow-skill-learn.js +880 -0
- package/scripts/flow-skill-matcher.js +578 -0
- package/scripts/flow-spec-generator.js +820 -0
- package/scripts/flow-stack-wizard.js +895 -0
- package/scripts/flow-standup +162 -0
- package/scripts/flow-start +74 -0
- package/scripts/flow-start.js +235 -0
- package/scripts/flow-status +110 -0
- package/scripts/flow-status.js +301 -0
- package/scripts/flow-step-browser.js +83 -0
- package/scripts/flow-step-changelog.js +217 -0
- package/scripts/flow-step-comments.js +306 -0
- package/scripts/flow-step-complexity.js +234 -0
- package/scripts/flow-step-coverage.js +218 -0
- package/scripts/flow-step-knowledge.js +193 -0
- package/scripts/flow-step-pr-tests.js +364 -0
- package/scripts/flow-step-regression.js +89 -0
- package/scripts/flow-step-review.js +516 -0
- package/scripts/flow-step-security.js +162 -0
- package/scripts/flow-step-silent-failures.js +290 -0
- package/scripts/flow-step-simplifier.js +346 -0
- package/scripts/flow-story +105 -0
- package/scripts/flow-story.js +500 -0
- package/scripts/flow-suspend.js +252 -0
- package/scripts/flow-sync-daemon.js +654 -0
- package/scripts/flow-task-analyzer.js +606 -0
- package/scripts/flow-team-dashboard.js +748 -0
- package/scripts/flow-team-sync.js +752 -0
- package/scripts/flow-team.js +977 -0
- package/scripts/flow-tech-options.js +528 -0
- package/scripts/flow-templates.js +812 -0
- package/scripts/flow-tiered-learning.js +728 -0
- package/scripts/flow-trace +204 -0
- package/scripts/flow-transcript-chunking.js +1106 -0
- package/scripts/flow-transcript-digest.js +7918 -0
- package/scripts/flow-transcript-language.js +465 -0
- package/scripts/flow-transcript-parsing.js +1085 -0
- package/scripts/flow-transcript-stories.js +2194 -0
- package/scripts/flow-update-map +224 -0
- package/scripts/flow-utils.js +2242 -0
- package/scripts/flow-verification.js +644 -0
- package/scripts/flow-verify.js +1177 -0
- package/scripts/flow-voice-input.js +638 -0
- package/scripts/flow-watch +168 -0
- package/scripts/flow-workflow-steps.js +521 -0
- package/scripts/flow-workflow.js +1029 -0
- package/scripts/flow-worktree.js +489 -0
- package/scripts/hooks/adapters/base-adapter.js +102 -0
- package/scripts/hooks/adapters/claude-code.js +359 -0
- package/scripts/hooks/adapters/index.js +79 -0
- package/scripts/hooks/core/component-check.js +341 -0
- package/scripts/hooks/core/index.js +35 -0
- package/scripts/hooks/core/loop-check.js +241 -0
- package/scripts/hooks/core/session-context.js +294 -0
- package/scripts/hooks/core/task-gate.js +177 -0
- package/scripts/hooks/core/validation.js +230 -0
- package/scripts/hooks/entry/claude-code/post-tool-use.js +65 -0
- package/scripts/hooks/entry/claude-code/pre-tool-use.js +89 -0
- package/scripts/hooks/entry/claude-code/session-end.js +87 -0
- package/scripts/hooks/entry/claude-code/session-start.js +46 -0
- package/scripts/hooks/entry/claude-code/stop.js +43 -0
- package/scripts/postinstall.js +139 -0
- package/templates/browser-test-flow.json +56 -0
- package/templates/bug-report.md +43 -0
- package/templates/component-detail.md +42 -0
- package/templates/component.stories.tsx +49 -0
- package/templates/context/constraints.md +83 -0
- package/templates/context/conventions.md +177 -0
- package/templates/context/stack.md +60 -0
- package/templates/correction-report.md +90 -0
- package/templates/feature-proposal.md +35 -0
- package/templates/hybrid/_base.md +254 -0
- package/templates/hybrid/_patterns.md +45 -0
- package/templates/hybrid/create-component.md +127 -0
- package/templates/hybrid/create-file.md +56 -0
- package/templates/hybrid/create-hook.md +145 -0
- package/templates/hybrid/create-service.md +70 -0
- package/templates/hybrid/fix-bug.md +33 -0
- package/templates/hybrid/modify-file.md +55 -0
- package/templates/story.md +68 -0
- package/templates/task.json +56 -0
- package/templates/trace.md +69 -0
|
@@ -0,0 +1,465 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Wogi Flow - Language Detection Module
|
|
5
|
+
*
|
|
6
|
+
* Detects languages in transcript content using:
|
|
7
|
+
* - Script/character set detection
|
|
8
|
+
* - Common word analysis
|
|
9
|
+
* - N-gram/trigram profiles
|
|
10
|
+
*
|
|
11
|
+
* Supports multiple languages including RTL scripts.
|
|
12
|
+
* Extracted from flow-transcript-digest.js for modularity.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
// ==========================================================================
|
|
16
|
+
// E5-S1: Language Detection Functions
|
|
17
|
+
// ==========================================================================
|
|
18
|
+
|
|
19
|
+
/**
 * Script patterns for character set detection.
 *
 * Maps a script name to a global regex that matches a single character of
 * that script. detectScript() counts the matches of every entry, so the key
 * order here is also the key order of its result.
 *
 * The `latin` class covers ASCII letters plus the letters of the Latin-1
 * Supplement, Latin Extended-A/B and Latin Extended Additional blocks, so
 * that the accented letters of every language declared as `latin` in
 * LANGUAGE_INFO (Polish, Turkish, Nordic, Portuguese, Vietnamese, ...) are
 * counted. U+00D7 (multiplication sign) and U+00F7 (division sign) are left
 * out because they are symbols, not letters.
 */
const SCRIPT_PATTERNS = {
  latin: /[a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F\u1E00-\u1EFF]/g,
  cyrillic: /[\u0400-\u04FF]/g,
  hebrew: /[\u0590-\u05FF]/g,
  arabic: /[\u0600-\u06FF\u0750-\u077F]/g,
  cjk: /[\u4E00-\u9FFF\u3400-\u4DBF]/g,
  hiragana: /[\u3040-\u309F]/g,
  katakana: /[\u30A0-\u30FF]/g,
  hangul: /[\uAC00-\uD7AF\u1100-\u11FF]/g,
  greek: /[\u0370-\u03FF]/g,
  thai: /[\u0E00-\u0E7F]/g,
  devanagari: /[\u0900-\u097F]/g
};
|
|
35
|
+
|
|
36
|
+
/**
 * Language metadata, keyed by two-letter language code.
 *
 * Every value has the shape `{ name, script, rtl }`: the English display
 * name, the SCRIPT_PATTERNS key the language is written in, and whether the
 * language is written right-to-left.
 */
const LANGUAGE_INFO = Object.fromEntries(
  [
    // [code, name, script, rtl]
    ['en', 'English', 'latin', false],
    ['es', 'Spanish', 'latin', false],
    ['fr', 'French', 'latin', false],
    ['de', 'German', 'latin', false],
    ['pt', 'Portuguese', 'latin', false],
    ['it', 'Italian', 'latin', false],
    ['nl', 'Dutch', 'latin', false],
    ['ru', 'Russian', 'cyrillic', false],
    ['he', 'Hebrew', 'hebrew', true],
    ['ar', 'Arabic', 'arabic', true],
    ['zh', 'Chinese', 'cjk', false],
    ['ja', 'Japanese', 'cjk', false],
    ['ko', 'Korean', 'hangul', false],
    ['el', 'Greek', 'greek', false],
    ['th', 'Thai', 'thai', false],
    ['hi', 'Hindi', 'devanagari', false],
    ['pl', 'Polish', 'latin', false],
    ['tr', 'Turkish', 'latin', false],
    ['sv', 'Swedish', 'latin', false],
    ['no', 'Norwegian', 'latin', false],
    ['da', 'Danish', 'latin', false],
    ['fi', 'Finnish', 'latin', false],
    ['vi', 'Vietnamese', 'latin', false]
  ].map(([code, name, script, rtl]) => [code, { name, script, rtl }])
);
|
|
64
|
+
|
|
65
|
+
/**
 * Common words by language (up to 30 of the most frequent words each).
 *
 * Consumed by analyzeCommonWords(), which compares these entries against the
 * lower-cased tokens produced by extractWords(). Entries must therefore be
 * lower-case, and each list must be free of duplicates so that the
 * "unique matches" figure is not inflated.
 *
 * NOTE: extractWords() discards one-character tokens, so the single-letter
 * entries below (for example 'o' / 'a' in Portuguese or 'и' / 'в' in Russian)
 * cannot match as things stand.
 */
const COMMON_WORDS = {
  en: ['the', 'is', 'are', 'was', 'were', 'have', 'has', 'had', 'do', 'does',
    'did', 'will', 'would', 'could', 'should', 'can', 'may', 'might',
    'this', 'that', 'these', 'those', 'with', 'from', 'about', 'into',
    'through', 'during', 'before', 'after'],

  es: ['el', 'la', 'los', 'las', 'un', 'una', 'de', 'del', 'que', 'en',
    'es', 'son', 'por', 'para', 'con', 'sin', 'sobre', 'como', 'pero',
    'muy', 'ya', 'aunque', 'porque', 'cuando', 'donde', 'quien',
    'cual', 'todo', 'nada', 'algo'],

  fr: ['le', 'la', 'les', 'un', 'une', 'de', 'du', 'des', 'et', 'en',
    'est', 'sont', 'avoir', 'pour', 'que', 'qui', 'dans', 'sur',
    'avec', 'plus', 'pas', 'ce', 'cette', 'ces', 'nous', 'vous',
    'ils', 'elle', 'elles', 'mais'],

  de: ['der', 'die', 'das', 'den', 'dem', 'ein', 'eine', 'und', 'ist', 'sind',
    'war', 'waren', 'hat', 'haben', 'wird', 'werden', 'kann',
    'mit', 'von', 'zu', 'bei', 'nach', 'auch', 'nur', 'noch',
    'aber', 'oder', 'wenn', 'wie', 'nicht'],

  pt: ['o', 'a', 'os', 'as', 'um', 'uma', 'de', 'do', 'da', 'que', 'em',
    'no', 'na', 'para', 'por', 'com', 'mais', 'como', 'esse',
    'essa', 'este', 'esta', 'seu', 'sua', 'ele', 'ela', 'eles',
    'elas', 'mas', 'ou'],

  it: ['il', 'la', 'i', 'le', 'lo', 'gli', 'un', 'una', 'di', 'che', 'e',
    'in', 'per', 'con', 'non', 'da', 'su', 'come', 'ma', 'anche',
    'questo', 'quella', 'questi', 'quelle', 'essere', 'avere',
    'fare', 'dire', 'potere', 'volere'],

  nl: ['de', 'het', 'een', 'van', 'en', 'in', 'is', 'zijn', 'op', 'te',
    'dat', 'die', 'voor', 'met', 'niet', 'aan', 'er', 'om', 'ook', 'als',
    'maar', 'bij', 'nog', 'wel', 'dan', 'naar', 'kan', 'zou', 'worden', 'heeft'],

  // 'את' used to be listed twice; the second copy was removed, leaving 29.
  he: ['של', 'את', 'על', 'הוא', 'היא', 'הם', 'הן', 'לא', 'זה', 'כי', 'אם',
    'גם', 'יש', 'אין', 'עם', 'אל', 'מה', 'כל', 'היה', 'להיות', 'אני',
    'אתה', 'אנחנו', 'הזה', 'הזאת', 'עוד', 'רק', 'כמו', 'אבל'],

  ru: ['и', 'в', 'не', 'на', 'я', 'что', 'он', 'с', 'как', 'это',
    'она', 'они', 'но', 'по', 'из', 'за', 'все', 'так', 'его', 'же',
    'от', 'для', 'или', 'было', 'бы', 'мне', 'вы', 'мы', 'был', 'быть']
};
|
|
111
|
+
|
|
112
|
+
/**
 * Common trigrams by language.
 *
 * Each profile is an array of 20 three-letter sequences; analyzeNgrams()
 * scores a text by how many of them occur in it. The profiles are written as
 * space-separated strings purely for compactness.
 */
const TRIGRAM_PROFILES = {
  en: 'the and ing ion tio ent ati for her ter hat tha ere ate his con res ver all ons'.split(' '),
  es: 'que ent ade los del est con nte par las cia era ien com res sta tra pro una por'.split(' '),
  fr: 'ent que les ion tio men ait ons ant our des eur par est eme com ous ter con dan'.split(' '),
  de: 'der und den ein che die sch ung ich ter ent gen das ber ine eit mit ren nen ver'.split(' ')
};
|
|
125
|
+
|
|
126
|
+
/**
 * Count how many characters of each known script occur in a text.
 *
 * @param {string} text - Text to inspect.
 * @returns {{counts: Object<string, number>, percentages: Object<string, number>, total: number}}
 *   `counts` holds the number of matching characters per script (scripts with
 *   no match are omitted), `percentages` holds each count divided by `total`,
 *   and `total` is the sum of all counts. Characters that match no pattern
 *   (digits, punctuation, whitespace, ...) are not part of `total`.
 */
function detectScript(text) {
  const counts = {};
  let total = 0;

  Object.keys(SCRIPT_PATTERNS).forEach((script) => {
    const found = text.match(SCRIPT_PATTERNS[script]);
    const hits = found ? found.length : 0;
    if (hits === 0) {
      return;
    }
    counts[script] = hits;
    total += hits;
  });

  // Share of each script among the characters that matched any pattern.
  const percentages = {};
  Object.keys(counts).forEach((script) => {
    percentages[script] = total > 0 ? counts[script] / total : 0;
  });

  return { counts, percentages, total };
}
|
|
149
|
+
|
|
150
|
+
/**
 * Clean transcript text for language detection.
 *
 * Strips the parts of a transcript that carry no language signal:
 * timestamps, "First Last:" speaker labels, WebVTT voice tags and URLs, then
 * collapses whitespace.
 *
 * @param {string} text - Raw transcript text.
 * @returns {string} The cleaned text; an empty string when `text` is not a
 *   string.
 */
function cleanForDetection(text) {
  if (typeof text !== 'string') {
    return '';
  }

  return text
    // Remove timestamps (m:ss, h:mm:ss, optional fractional seconds)
    .replace(/\d{1,2}:\d{2}(:\d{2})?(\.\d+)?/g, '')
    // Remove "First Last:" speaker labels at the start of a line. Leading
    // blanks are allowed because removing a timestamp in front of the label
    // leaves its separating space behind.
    .replace(/^[ \t]*[A-Z][a-z]+\s[A-Z][a-z]+:/gm, '')
    // Remove WebVTT voice tags, both opening (<v Name>) and closing (</v>)
    .replace(/<v\s+[^>]+>/g, '')
    .replace(/<\/v>/g, '')
    // Remove URLs
    .replace(/https?:\/\/[^\s]+/g, '')
    // Collapse runs of whitespace
    .replace(/\s+/g, ' ')
    .trim();
}
|
|
166
|
+
|
|
167
|
+
/**
 * Extract lower-cased word tokens from text.
 *
 * The text is normalized to NFC first so that decomposed accents compare
 * equal to their precomposed form. A token is a letter followed by any
 * letters or combining marks; keeping the marks stops words in scripts that
 * rely on them (for example Devanagari or Thai vowel signs) from being cut
 * into fragments.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens longer than one UTF-16 code unit, in order of
 *   appearance; an empty array when `text` is not a string.
 */
function extractWords(text) {
  if (typeof text !== 'string') {
    return [];
  }
  const tokens = text
    .normalize('NFC')
    .toLowerCase()
    .match(/\p{L}[\p{L}\p{M}]*/gu) || [];
  // One-character tokens are dropped, as before.
  return tokens.filter((token) => token.length > 1);
}
|
|
175
|
+
|
|
176
|
+
/**
 * Score each language by how many of its common words occur in the text.
 *
 * @param {string} text - Text to analyze (tokenized with extractWords()).
 * @returns {Object<string, {uniqueMatches: number, totalOccurrences: number, score: number}>}
 *   One entry per language in COMMON_WORDS. `uniqueMatches` is the number of
 *   distinct common words found, `totalOccurrences` the number of tokens that
 *   are common words, and `score` is `totalOccurrences` divided by the token
 *   count (0 when the text has no tokens).
 */
function analyzeCommonWords(text) {
  const words = extractWords(text);
  const seenWords = new Set(words);
  const scores = {};

  for (const [lang, commonList] of Object.entries(COMMON_WORDS)) {
    // A Set gives constant-time lookups and guarantees that a word listed
    // twice is only counted once in uniqueMatches.
    const vocabulary = new Set(commonList);

    let uniqueMatches = 0;
    for (const entry of vocabulary) {
      if (seenWords.has(entry)) {
        uniqueMatches++;
      }
    }

    let totalOccurrences = 0;
    for (const word of words) {
      if (vocabulary.has(word)) {
        totalOccurrences++;
      }
    }

    scores[lang] = {
      uniqueMatches,
      totalOccurrences,
      score: words.length > 0 ? totalOccurrences / words.length : 0
    };
  }

  return scores;
}
|
|
207
|
+
|
|
208
|
+
/**
 * Extract overlapping three-letter sequences from text.
 *
 * The text is lower-cased and every character other than a-z is removed
 * before slicing, so trigrams run across word boundaries and across any
 * removed character.
 *
 * @param {string} text - Text to slice.
 * @returns {string[]} All trigrams in order, duplicates included; empty when
 *   fewer than three a-z letters remain.
 */
function extractTrigrams(text) {
  const letters = text.toLowerCase().replace(/[^a-z]/g, '');
  const trigramCount = Math.max(0, letters.length - 2);
  return Array.from(
    { length: trigramCount },
    (_unused, start) => letters.slice(start, start + 3)
  );
}
|
|
219
|
+
|
|
220
|
+
/**
 * Score each language by how much of its trigram profile occurs in the text.
 *
 * @param {string} text - Text to analyze (sliced with extractTrigrams()).
 * @returns {Object<string, {matches: number, score: number}>} One entry per
 *   language in TRIGRAM_PROFILES. `matches` is the number of profile trigrams
 *   present in the text and `score` is `matches` divided by the profile size
 *   (0 for an empty profile).
 */
function analyzeNgrams(text) {
  const present = new Set(extractTrigrams(text));
  const scores = {};

  Object.keys(TRIGRAM_PROFILES).forEach((lang) => {
    const profile = TRIGRAM_PROFILES[lang];
    const matches = profile.filter((trigram) => present.has(trigram)).length;
    scores[lang] = {
      matches,
      score: profile.length > 0 ? matches / profile.length : 0
    };
  });

  return scores;
}
|
|
243
|
+
|
|
244
|
+
/**
 * Combine the script, common-word and trigram signals into final scores.
 *
 * A script contributes its share of the text to its language when that share
 * exceeds 30%. Han characters are attributed to Japanese when kana
 * (hiragana + katakana) make up more than 10% of the script characters and
 * Han + kana together exceed 30%; otherwise to Chinese when Han alone
 * exceeds 30%. Counting the kana share means that Japanese text written
 * mostly or entirely in kana is still recognised.
 *
 * Word scores are added with weight 0.5 and trigram scores with weight 0.3.
 * Finally every score is divided by the highest one (floored at 0.001), so
 * the result is relative: the best language scores 1 whenever the highest
 * raw score is at least 0.001.
 *
 * @param {{percentages: Object<string, number>}} scriptResult - Result of
 *   detectScript().
 * @param {Object<string, {score: number}>} wordResult - Result of
 *   analyzeCommonWords().
 * @param {Object<string, {score: number}>} ngramResult - Result of
 *   analyzeNgrams().
 * @returns {Object<string, number>} Normalized score per language code.
 */
function combineLanguageScores(scriptResult, wordResult, ngramResult) {
  const SCRIPT_THRESHOLD = 0.3;
  const KANA_THRESHOLD = 0.1;
  const WORD_WEIGHT = 0.5;
  const NGRAM_WEIGHT = 0.3;

  const scores = {};
  const share = (script) => scriptResult.percentages[script] || 0;
  const addScore = (lang, amount) => {
    scores[lang] = (scores[lang] || 0) + amount;
  };
  // Credit a language whose script is used by that language only.
  const scoreSingleScript = (script, lang) => {
    if (share(script) > SCRIPT_THRESHOLD) {
      addScore(lang, share(script));
    }
  };

  // Script-based detection for non-Latin scripts
  scoreSingleScript('hebrew', 'he');
  scoreSingleScript('arabic', 'ar');
  scoreSingleScript('cyrillic', 'ru');

  // Han characters are shared by Chinese and Japanese; kana decides.
  const han = share('cjk');
  const kana = share('hiragana') + share('katakana');
  if (kana > KANA_THRESHOLD && han + kana > SCRIPT_THRESHOLD) {
    addScore('ja', han + kana);
  } else if (han > SCRIPT_THRESHOLD) {
    addScore('zh', han);
  }

  scoreSingleScript('hangul', 'ko');
  scoreSingleScript('greek', 'el');
  scoreSingleScript('thai', 'th');
  scoreSingleScript('devanagari', 'hi');

  // Word-based scoring
  for (const [lang, data] of Object.entries(wordResult)) {
    addScore(lang, data.score * WORD_WEIGHT);
  }

  // N-gram scoring
  for (const [lang, data] of Object.entries(ngramResult)) {
    addScore(lang, data.score * NGRAM_WEIGHT);
  }

  // Normalize relative to the best score; the floor avoids division by zero.
  const maxScore = Math.max(...Object.values(scores), 0.001);
  for (const lang of Object.keys(scores)) {
    scores[lang] = scores[lang] / maxScore;
  }

  return scores;
}
|
|
299
|
+
|
|
300
|
+
/**
 * Detect the primary language of a text.
 *
 * The text is cleaned with cleanForDetection(), analyzed by script, common
 * words and trigrams, and the three signals are merged by
 * combineLanguageScores().
 *
 * NOTE: combineLanguageScores() divides every score by the highest one, so
 * `confidence` of the primary language is in practice 1 whenever any signal
 * was found. Treat it as a relative rank, not as a probability.
 *
 * @param {string} text - Text to analyze. Non-string input is treated as
 *   empty text.
 * @param {Object} [options]
 * @param {number} [options.minLength=20] - Minimum length of the cleaned
 *   text; shorter input yields an "unknown" result. An explicit 0 disables
 *   the check.
 * @returns {Object} On success: `language`, `languageName`, `confidence`,
 *   `secondary` (runner-up with a score above 0.3, or null), `scripts`
 *   (character counts per script), `wordMatches` (common-word occurrences per
 *   language) and `allScores`. On failure: `language: 'unknown'`,
 *   `languageName: 'Unknown'`, `confidence: 0` and a `reason` of
 *   'insufficient_text' or 'no_match'.
 */
function detectLanguage(text, options = {}) {
  // `??` rather than `||` so that a caller can pass minLength: 0.
  const minLength = options.minLength ?? 20;

  // Clean text
  const cleaned = cleanForDetection(typeof text === 'string' ? text : '');
  if (cleaned.length < minLength) {
    return {
      language: 'unknown',
      languageName: 'Unknown',
      confidence: 0,
      reason: 'insufficient_text'
    };
  }

  // Analyze
  const scriptResult = detectScript(cleaned);
  const wordResult = analyzeCommonWords(cleaned);
  const ngramResult = analyzeNgrams(cleaned);

  // Combine scores
  const scores = combineLanguageScores(scriptResult, wordResult, ngramResult);

  // Rank the languages that received any score, best first
  const ranked = Object.entries(scores)
    .filter(([, score]) => score > 0)
    .sort((a, b) => b[1] - a[1]);

  if (ranked.length === 0) {
    return {
      language: 'unknown',
      languageName: 'Unknown',
      confidence: 0,
      reason: 'no_match'
    };
  }

  const [primaryCode, primaryScore] = ranked[0];
  // Report a runner-up only when it is reasonably close to the winner.
  const secondary = ranked.length > 1 && ranked[1][1] > 0.3 ? ranked[1] : null;

  return {
    language: primaryCode,
    languageName: LANGUAGE_INFO[primaryCode]?.name || primaryCode,
    confidence: Math.min(primaryScore, 1),
    secondary: secondary ? {
      language: secondary[0],
      languageName: LANGUAGE_INFO[secondary[0]]?.name || secondary[0],
      confidence: Math.min(secondary[1], 1)
    } : null,
    scripts: scriptResult.counts,
    wordMatches: Object.fromEntries(
      Object.entries(wordResult)
        .filter(([, data]) => data.totalOccurrences > 0)
        .map(([lang, data]) => [lang, data.totalOccurrences])
    ),
    allScores: scores
  };
}
|
|
359
|
+
|
|
360
|
+
/**
 * Detect multiple languages in text (for mixed content).
 *
 * The text is split into consecutive segments of `segmentSize / 5` words
 * (presumably `segmentSize` is meant in characters at roughly five characters
 * per word - TODO confirm). Each segment of more than 10 words is classified
 * with detectLanguage(), and the share of segments per language forms the
 * distribution. When no segment is long enough the whole text is classified
 * once instead.
 *
 * @param {string} text - Text to analyze.
 * @param {Object} [options]
 * @param {number} [options.segmentSize=300] - Segment size; see above.
 *   Values that do not yield at least one whole word per segment are clamped
 *   to one word.
 * @returns {Object} `language`, `languageName`, `confidence` (share of
 *   classified segments in the primary language), `isMultilingual`,
 *   `distribution` (share per language), `segmentCount` (all segments,
 *   classified or not) and `segments` (the first 10 classified segments with
 *   a 50-character preview). In the single-pass fallback the detectLanguage()
 *   result is returned with the same extra fields added (`segmentCount: 0`).
 */
function detectMultipleLanguages(text, options = {}) {
  const segmentSize = options.segmentSize || 300;
  // Must be a positive integer: a fractional step misaligns the slices and a
  // non-positive one would keep the loop below from ever terminating.
  const wordsPerSegment = Math.max(1, Math.floor(segmentSize / 5));

  // Split into segments; empty tokens from leading/trailing whitespace are
  // not words.
  const words = typeof text === 'string' ? text.split(/\s+/).filter(Boolean) : [];
  const segments = [];
  for (let i = 0; i < words.length; i += wordsPerSegment) {
    const segmentWords = words.slice(i, i + wordsPerSegment);
    if (segmentWords.length > 10) {
      segments.push(segmentWords.join(' '));
    }
  }

  if (segments.length === 0) {
    // Too short to segment: classify once, but keep the return shape uniform.
    const single = detectLanguage(text, options);
    const isKnown = single.language !== 'unknown';
    return {
      ...single,
      isMultilingual: false,
      distribution: isKnown ? { [single.language]: 1 } : {},
      segmentCount: 0,
      segments: []
    };
  }

  // Analyze each segment
  const languageCounts = {};
  const segmentResults = [];

  for (const segment of segments) {
    const result = detectLanguage(segment, { minLength: 10 });
    if (result.language !== 'unknown' && result.confidence > 0.3) {
      languageCounts[result.language] = (languageCounts[result.language] || 0) + 1;
      segmentResults.push({
        preview: segment.substring(0, 50) + (segment.length > 50 ? '...' : ''),
        language: result.language,
        confidence: result.confidence
      });
    }
  }

  // Calculate distribution over the segments that could be classified
  const total = Object.values(languageCounts).reduce((sum, count) => sum + count, 0);
  const distribution = {};
  for (const [lang, count] of Object.entries(languageCounts)) {
    distribution[lang] = total > 0 ? count / total : 0;
  }

  const sortedLangs = Object.entries(distribution).sort((a, b) => b[1] - a[1]);
  const primaryLang = sortedLangs[0]?.[0] || 'unknown';

  return {
    language: primaryLang,
    languageName: LANGUAGE_INFO[primaryLang]?.name || primaryLang,
    confidence: distribution[primaryLang] || 0,
    isMultilingual: Object.keys(distribution).length > 1,
    distribution,
    segmentCount: segments.length,
    segments: segmentResults.slice(0, 10) // Limit to first 10
  };
}
|
|
416
|
+
|
|
417
|
+
/**
 * Get language info by code.
 *
 * Only own keys of the lookup tables count, so names inherited from
 * Object.prototype (such as 'constructor' or 'toString') are reported as
 * unsupported instead of being mistaken for languages.
 *
 * @param {string} code - Language code, e.g. 'en'.
 * @returns {Object} For a known code: `code`, the LANGUAGE_INFO fields
 *   (`name`, `script`, `rtl`), `hasCommonWords`, `hasTrigrams` and
 *   `supported: true`. For an unknown code: `code`, `name: 'Unknown'`,
 *   `script: 'unknown'`, `rtl: false` and `supported: false`.
 */
function getLanguageInfo(code) {
  const hasEntry = (table) => Object.prototype.hasOwnProperty.call(table, code);

  if (!hasEntry(LANGUAGE_INFO)) {
    return { code, name: 'Unknown', script: 'unknown', rtl: false, supported: false };
  }

  return {
    code,
    ...LANGUAGE_INFO[code],
    hasCommonWords: hasEntry(COMMON_WORDS),
    hasTrigrams: hasEntry(TRIGRAM_PROFILES),
    supported: true
  };
}
|
|
433
|
+
|
|
434
|
+
/**
 * List all supported languages.
 *
 * @returns {Array<Object>} One object per LANGUAGE_INFO entry, holding
 *   `code`, the entry's fields and a `tier`:
 *   1 - the language has a common-word list and a trigram profile,
 *   2 - it has a common-word list only,
 *   3 - it has no common-word list.
 */
function listSupportedLanguages() {
  const languages = [];

  for (const code of Object.keys(LANGUAGE_INFO)) {
    let tier = 3;
    if (COMMON_WORDS[code]) {
      tier = TRIGRAM_PROFILES[code] ? 1 : 2;
    }
    languages.push({ code, ...LANGUAGE_INFO[code], tier });
  }

  return languages;
}
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
module.exports = {
|
|
448
|
+
// Constants
|
|
449
|
+
SCRIPT_PATTERNS,
|
|
450
|
+
LANGUAGE_INFO,
|
|
451
|
+
COMMON_WORDS,
|
|
452
|
+
TRIGRAM_PROFILES,
|
|
453
|
+
// Functions
|
|
454
|
+
detectScript,
|
|
455
|
+
cleanForDetection,
|
|
456
|
+
extractWords,
|
|
457
|
+
analyzeCommonWords,
|
|
458
|
+
extractTrigrams,
|
|
459
|
+
analyzeNgrams,
|
|
460
|
+
combineLanguageScores,
|
|
461
|
+
detectLanguage,
|
|
462
|
+
detectMultipleLanguages,
|
|
463
|
+
getLanguageInfo,
|
|
464
|
+
listSupportedLanguages
|
|
465
|
+
};
|