whatlanguage 1.0.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/whatlanguage.rb CHANGED
@@ -1,66 +1,280 @@
1
- require 'whatlanguage/bloominsimple'
2
- require 'whatlanguage/bitfield'
3
- require 'digest/sha1'
4
-
5
- class WhatLanguage
6
- HASHER = lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") }
7
-
8
- BITFIELD_WIDTH = 2_000_000
9
-
10
- @@data = {}
11
-
12
- def initialize(*selection)
13
- @selection = (selection.empty?) ? [:all] : selection
14
- languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
15
- Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
16
- @@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.new(File.join(languages_folder, lang), 'rb').read, &HASHER)
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require_relative 'whatlanguage/languages'
5
+ require_relative 'whatlanguage/version'
6
+
7
+ class WhatLanguage
8
+ MAX_TRIGRAM_DISTANCE = 300
9
+ MAX_TOTAL_DISTANCE = MAX_TRIGRAM_DISTANCE * MAX_TRIGRAM_DISTANCE # 90_000
10
+ TEXT_TRIGRAMS_SIZE = 600
11
+ DEFAULT_MIN_CHARS = 10
12
+
13
+ Result = Struct.new(:language, :iso, :score, :ranked, keyword_init: true) do
14
+ alias scores ranked
15
+ end
16
+
17
+ # Scripts that resolve to a single language by their Unicode block alone.
18
+ # (Hiragana and Katakana both indicate Japanese.) Scripts NOT listed here but
19
+ # present in the trigram dataset are disambiguated statistically instead.
20
+ DETERMINISTIC = {
21
+ 'Mandarin' => 'cmn', 'Bengali' => 'ben', 'Hangul' => 'kor',
22
+ 'Georgian' => 'kat', 'Greek' => 'ell', 'Kannada' => 'kan',
23
+ 'Tamil' => 'tam', 'Thai' => 'tha', 'Gujarati' => 'guj',
24
+ 'Gurmukhi' => 'pan', 'Telugu' => 'tel', 'Malayalam'=> 'mal',
25
+ 'Oriya' => 'ori', 'Myanmar' => 'mya', 'Sinhala' => 'sin',
26
+ 'Khmer' => 'khm', 'Armenian'=> 'hye', 'Hiragana' => 'jpn',
27
+ 'Katakana' => 'jpn'
28
+ }.freeze
29
+
30
+ # Unicode ranges per script, in detection priority order (mirrors whatlang's
31
+ # scripts/detect.rs). The first script whose range contains a character claims
32
+ # that character; the script with the most characters wins.
33
+ SCRIPT_RANGES = [
34
+ ['Latin', [[0x61,0x7A],[0x41,0x5A],[0x80,0xFF],[0x100,0x17F],[0x180,0x24F],
35
+ [0x250,0x2AF],[0x1D00,0x1D7F],[0x1D80,0x1DBF],[0x1E00,0x1EFF],
36
+ [0x2100,0x214F],[0x2C60,0x2C7F],[0xA720,0xA7FF],[0xAB30,0xAB6F]]],
37
+ ['Cyrillic', [[0x400,0x484],[0x487,0x52F],[0x2DE0,0x2DFF],[0xA640,0xA69D],
38
+ [0x1D2B,0x1D2B],[0x1D78,0x1D78],[0xA69F,0xA69F]]],
39
+ ['Arabic', [[0x600,0x6FF],[0x750,0x7FF],[0x8A0,0x8FF],[0xFB50,0xFDFF],
40
+ [0xFE70,0xFEFF],[0x10E60,0x10E7F],[0x1EE00,0x1EEFF]]],
41
+ ['Mandarin', [[0x2E80,0x2E99],[0x2E9B,0x2EF3],[0x2F00,0x2FD5],[0x3005,0x3005],
42
+ [0x3007,0x3007],[0x3021,0x3029],[0x3038,0x303B],[0x3400,0x4DB5],
43
+ [0x4E00,0x9FCC],[0xF900,0xFA6D],[0xFA70,0xFAD9]]],
44
+ ['Devanagari', [[0x900,0x97F],[0xA8E0,0xA8FF],[0x1CD0,0x1CFF]]],
45
+ ['Hebrew', [[0x590,0x5FF]]],
46
+ ['Ethiopic', [[0x1200,0x139F],[0x2D80,0x2DDF],[0xAB00,0xAB2F]]],
47
+ ['Georgian', [[0x10A0,0x10FF]]],
48
+ ['Bengali', [[0x980,0x9FF]]],
49
+ ['Hangul', [[0xAC00,0xD7AF],[0x1100,0x11FF],[0x3130,0x318F],[0x3200,0x32FF],
50
+ [0xA960,0xA97F],[0xD7B0,0xD7FF]]],
51
+ ['Hiragana', [[0x3040,0x309F]]],
52
+ ['Katakana', [[0x30A0,0x30FF]]],
53
+ ['Greek', [[0x370,0x3FF]]],
54
+ ['Kannada', [[0xC80,0xCFF]]],
55
+ ['Tamil', [[0xB80,0xBFF]]],
56
+ ['Thai', [[0xE00,0xE7F]]],
57
+ ['Gujarati', [[0xA80,0xAFF]]],
58
+ ['Gurmukhi', [[0xA00,0xA7F]]],
59
+ ['Telugu', [[0xC00,0xC7F]]],
60
+ ['Malayalam', [[0xD00,0xD7F]]],
61
+ ['Oriya', [[0xB00,0xB7F]]],
62
+ ['Myanmar', [[0x1000,0x109F]]],
63
+ ['Sinhala', [[0xD80,0xDFF]]],
64
+ ['Khmer', [[0x1780,0x17FF],[0x19E0,0x19FF]]],
65
+ ['Armenian', [[0x530,0x58F],[0xFB13,0xFB17]]]
66
+ ].freeze
67
+
68
+ # ISO 639-1 (with 639-3 fallback) lookup by language-name symbol, plus the
69
+ # historical nil => nil entry. Internal; kept for backward compatibility.
70
+ ISO_CODES = CODE_INFO.each_with_object(nil => nil) do |(_code, (name, iso)), h|
71
+ h[name] = iso
72
+ end.freeze
73
+
74
+ NAME_TO_CODE = CODE_INFO.each_with_object({}) do |(code, (name, _iso)), h|
75
+ h[name] ||= code
76
+ end.freeze
77
+
78
+ private_constant :MAX_TRIGRAM_DISTANCE, :MAX_TOTAL_DISTANCE, :TEXT_TRIGRAMS_SIZE,
79
+ :DEFAULT_MIN_CHARS, :DETERMINISTIC, :SCRIPT_RANGES, :ISO_CODES,
80
+ :NAME_TO_CODE
81
+
82
+ class << self
83
+ def detect(text)
84
+ default_detector.detect(text)
85
+ end
86
+
87
+ def language(text)
88
+ default_detector.language(text)
89
+ end
90
+
91
+ def language_iso(text)
92
+ default_detector.language_iso(text)
93
+ end
94
+
95
+ def ranked(text)
96
+ default_detector.ranked(text)
97
+ end
98
+
99
+ def score_hash(text)
100
+ default_detector.score_hash(text)
101
+ end
102
+
103
+ alias scores score_hash
104
+ alias process_text score_hash
105
+
106
+ def languages
107
+ NAME_TO_CODE.keys
108
+ end
109
+
110
+ # script name => [[code, [trigram, ...]], ...], loaded once and memoized.
111
+ def profiles
112
+ @profiles ||= JSON.parse(File.read(File.join(__dir__, 'whatlanguage', 'trigrams.json')))
113
+ .transform_values { |langs| langs.map { |code, str| [code, str.split('|')] } }
114
+ end
115
+
116
+ private
117
+
118
+ def default_detector
119
+ @default_detector ||= new
17
120
  end
18
121
  end
19
-
20
- # Very inefficient method for now.. but still beats the non-Bloom alternatives.
21
- # Change to better bit comparison technique later..
22
- def process_text(text)
23
- results = Hash.new(0)
24
- it = 0
25
- text.downcase.split.each do |word|
26
- it += 1
27
122
 
123
+ def initialize(*selection, only: nil, min_chars: DEFAULT_MIN_CHARS)
124
+ @selection = Array(only || (selection.empty? ? [:all] : selection))
125
+ validate_selection!
126
+ @min_chars = min_chars
127
+ end
128
+
129
+ # Language-name symbols this instance scores against: every supported language
130
+ # for :all, otherwise the requested selection intersected with the supported
131
+ # set (legacy aliases such as :pinyin resolved to their modern names).
132
+ def languages
133
+ @languages ||=
28
134
  if @selection.include?(:all)
29
- languages = @@data.keys
135
+ self.class.languages
30
136
  else
31
- languages = @@data.keys & @selection # intersection
137
+ wanted = @selection.map { |s| NAME_ALIASES.fetch(s, s) }
138
+ self.class.languages & wanted
32
139
  end
140
+ end
33
141
 
34
- languages.each do |lang|
35
- results[lang] += 1 if @@data[lang].includes?(word)
36
- end
37
-
38
- # Every now and then check to see if we have a really convincing result.. if so, exit early.
39
- if it % 4 == 0 && results.size > 1
40
- top_results = results.sort_by{|a,b| -b}[0..1]
41
-
42
- # Next line may need some tweaking one day..
43
- break if top_results[0][1] > 4 && ((top_results[0][1] > top_results[1][1] * 2) || (top_results[0][1] - top_results[1][1] > 25))
44
- end
45
-
46
- #break if it > 100
142
+ # Per-language scores for the text (higher = more likely). Languages outside
143
+ # the current selection, or not under the detected script, are absent; the
144
+ # hash defaults to 0. Only the relative ranking is meaningful.
145
+ def score_hash(text)
146
+ results = Hash.new(0)
147
+ text = normalize_text(text)
148
+ script = detect_script(text)
149
+ return results unless script
150
+
151
+ if (code = DETERMINISTIC[script])
152
+ name = CODE_INFO[code].first
153
+ results[name] = MAX_TOTAL_DISTANCE if allowed?(name)
154
+ return results
155
+ end
156
+
157
+ candidates = self.class.profiles[script]
158
+ return results unless candidates
159
+ return results if significant_char_count(text) < @min_chars
160
+
161
+ positions = trigram_positions(text)
162
+ candidates.each do |code, trigrams|
163
+ name = CODE_INFO[code].first
164
+ next unless allowed?(name)
165
+
166
+ results[name] = MAX_TOTAL_DISTANCE - distance(trigrams, positions)
47
167
  end
48
168
  results
49
169
  end
50
-
170
+
171
+ alias scores score_hash
172
+ alias process_text score_hash
173
+
174
+ # Per-language scores as an array sorted from most likely to least likely.
175
+ def ranked(text)
176
+ score_hash(text).sort_by { |_name, score| -score }
177
+ end
178
+
179
+ # Detection result with the winning language, ISO code, winning score, and
180
+ # full ranked scores. Returns nil when the text is too short or unrecognized.
181
+ def detect(text)
182
+ ranked_scores = ranked(text)
183
+ return nil if ranked_scores.empty?
184
+
185
+ name, score = ranked_scores.first
186
+ Result.new(language: name, iso: ISO_CODES[name], score: score, ranked: ranked_scores)
187
+ end
188
+
189
+ # Most likely language as a name symbol, or nil when no language is detected.
51
190
  def language(text)
52
- process_text(text).max { |a,b| a[1] <=> b[1] }.first rescue nil
191
+ detect(text)&.language
53
192
  end
54
-
55
- def self.filter_from_dictionary(filename)
56
- bf = BloominSimple.new(BITFIELD_WIDTH, &HASHER)
57
- File.open(filename).each { |word| bf.add(word) }
58
- bf
193
+
194
+ # Most likely language as an ISO 639-1 symbol (639-3 fallback), or nil.
195
+ def language_iso(text)
196
+ detect(text)&.iso
197
+ end
198
+
199
+ private
200
+
201
+ def normalize_text(text)
202
+ text.to_s.unicode_normalize(:nfkc)
203
+ end
204
+
205
+ def allowed?(name)
206
+ @selection.include?(:all) || languages.include?(name)
207
+ end
208
+
209
+ def validate_selection!
210
+ requested = @selection.reject { |name| name == :all }
211
+ unknown = requested.reject { |name| self.class.languages.include?(NAME_ALIASES.fetch(name, name)) }
212
+ return if unknown.empty?
213
+
214
+ raise ArgumentError, "Unknown language selection: #{unknown.map(&:inspect).join(', ')}"
215
+ end
216
+
217
+ def significant_char_count(text)
218
+ text.each_char.count { |ch| !stop_char?(ch.ord) }
219
+ end
220
+
221
+ # Dominant Unicode script of the text, or nil if it has no script characters.
222
+ def detect_script(text)
223
+ counts = Hash.new(0)
224
+ text.each_char do |ch|
225
+ cp = ch.ord
226
+ next if stop_char?(cp)
227
+
228
+ SCRIPT_RANGES.each do |name, ranges|
229
+ if ranges.any? { |lo, hi| cp >= lo && cp <= hi }
230
+ counts[name] += 1
231
+ break
232
+ end
233
+ end
234
+ end
235
+ return nil if counts.empty?
236
+
237
+ counts.max_by { |_name, n| n }.first
238
+ end
239
+
240
+ # Text trigrams ranked by descending frequency, mapped to their rank index.
241
+ # Mirrors whatlang's trigram extraction: punctuation/digits become spaces,
242
+ # the stream is bounded by spaces, and runs of spaces are collapsed.
243
+ def trigram_positions(text)
244
+ chars = text.downcase.each_char.map { |c| stop_char?(c.ord) ? ' ' : c }
245
+ return {} if chars.empty?
246
+
247
+ occurrences = Hash.new(0)
248
+ c1 = ' '
249
+ c2 = chars[0]
250
+ (chars[1..] + [' ']).each do |c3|
251
+ occurrences[c1 + c2 + c3] += 1 unless c2 == ' ' && (c1 == ' ' || c3 == ' ')
252
+ c1 = c2
253
+ c2 = c3
254
+ end
255
+
256
+ ranked = occurrences.to_a.sort { |a, b| [b[1], b[0]] <=> [a[1], a[0]] }.first(TEXT_TRIGRAMS_SIZE)
257
+ positions = {}
258
+ ranked.each_with_index { |(trigram, _count), i| positions[trigram] = i }
259
+ positions
260
+ end
261
+
262
+ # Out-of-place distance between a language's ordered trigram profile and the
263
+ # text's ranked trigrams. Lower means a closer match.
264
+ def distance(profile, positions)
265
+ total = 0
266
+ profile.each_with_index do |trigram, i|
267
+ pos = positions[trigram]
268
+ total += pos ? (pos - i).abs : MAX_TRIGRAM_DISTANCE
269
+ end
270
+
271
+ count = positions.size
272
+ total -= (MAX_TRIGRAM_DISTANCE - count) * MAX_TRIGRAM_DISTANCE if MAX_TRIGRAM_DISTANCE > count
273
+ total.clamp(0, MAX_TOTAL_DISTANCE)
59
274
  end
60
- end
61
275
 
62
- class String
63
- def language
64
- WhatLanguage.new(:all).language(self)
276
+ # Space, ASCII punctuation, or digit: no value for script/language detection.
277
+ def stop_char?(codepoint)
278
+ codepoint <= 0x40 || (codepoint >= 0x5B && codepoint <= 0x60) || (codepoint >= 0x7B && codepoint <= 0x7E)
65
279
  end
66
280
  end
data/whatlanguage.gemspec CHANGED
@@ -1,19 +1,30 @@
1
- # -*- encoding: utf-8 -*-
2
- lib = File.expand_path('../lib', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path('lib', __dir__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
5
  require 'whatlanguage/version'
5
6
 
6
7
  Gem::Specification.new do |gem|
7
- gem.name = "whatlanguage"
8
+ gem.name = 'whatlanguage'
8
9
  gem.version = WhatLanguage::VERSION
9
- gem.authors = ["Peter Cooper"]
10
- gem.email = ["git@peterc.org"]
11
- gem.description = %q{WhatLanguage rapidly detects the language of a sample of text}
12
- gem.summary = %q{Natural language detection for text samples}
13
- gem.homepage = "https://github.com/peterc/whatlanguage"
10
+ gem.authors = ['Peter Cooper']
11
+ gem.email = ['git@peterc.org']
12
+ gem.description = 'WhatLanguage rapidly detects the language of a sample of text'
13
+ gem.summary = 'Natural language detection for text samples'
14
+ gem.homepage = 'https://github.com/peterc/whatlanguage'
15
+ gem.license = 'MIT'
16
+ gem.required_ruby_version = '>= 3.0'
17
+
18
+ gem.files = Dir['lib/**/*'] + [
19
+ 'README.md',
20
+ 'CHANGELOG.md',
21
+ 'LICENSE.txt',
22
+ 'Gemfile',
23
+ 'Rakefile',
24
+ 'whatlanguage.gemspec'
25
+ ]
26
+ gem.require_paths = ['lib']
14
27
 
15
- gem.files = `git ls-files`.split($/).reject { |f| f.start_with?("wordlists") }
16
- gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
- gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
- gem.require_paths = ["lib"]
19
- end
28
+ gem.add_development_dependency 'minitest', '~> 5.0'
29
+ gem.add_development_dependency 'rake'
30
+ end
metadata CHANGED
@@ -1,16 +1,42 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whatlanguage
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.5
5
- prerelease:
4
+ version: 2.0.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Peter Cooper
9
- autorequire:
10
8
  bindir: bin
11
9
  cert_chain: []
12
- date: 2013-10-05 00:00:00.000000000 Z
13
- dependencies: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: minitest
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '5.0'
19
+ type: :development
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '5.0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: rake
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
14
40
  description: WhatLanguage rapidly detects the language of a sample of text
15
41
  email:
16
42
  - git@peterc.org
@@ -18,65 +44,35 @@ executables: []
18
44
  extensions: []
19
45
  extra_rdoc_files: []
20
46
  files:
21
- - .gitignore
47
+ - CHANGELOG.md
22
48
  - Gemfile
23
- - History.txt
24
49
  - LICENSE.txt
25
- - Manifest.txt
26
50
  - README.md
27
51
  - Rakefile
28
- - build_filter.rb
29
- - build_lang_from_wordlists.rb
30
- - copyright-en
31
- - example.rb
32
- - lang/arabic.lang
33
- - lang/dutch.lang
34
- - lang/english.lang
35
- - lang/farsi.lang
36
- - lang/finnish.lang
37
- - lang/french.lang
38
- - lang/german.lang
39
- - lang/greek.lang
40
- - lang/hebrew.lang
41
- - lang/hungarian.lang
42
- - lang/italian.lang
43
- - lang/korean.lang
44
- - lang/norwegian.lang
45
- - lang/pinyin.lang
46
- - lang/polish.lang
47
- - lang/portuguese.lang
48
- - lang/russian.lang
49
- - lang/spanish.lang
50
- - lang/swedish.lang
51
52
  - lib/whatlanguage.rb
52
- - lib/whatlanguage/bitfield.rb
53
- - lib/whatlanguage/bloominsimple.rb
53
+ - lib/whatlanguage/languages.rb
54
+ - lib/whatlanguage/trigrams.json
54
55
  - lib/whatlanguage/version.rb
55
- - test/test_whatlanguage.rb
56
56
  - whatlanguage.gemspec
57
57
  homepage: https://github.com/peterc/whatlanguage
58
- licenses: []
59
- post_install_message:
58
+ licenses:
59
+ - MIT
60
+ metadata: {}
60
61
  rdoc_options: []
61
62
  require_paths:
62
63
  - lib
63
64
  required_ruby_version: !ruby/object:Gem::Requirement
64
- none: false
65
65
  requirements:
66
- - - ! '>='
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: '0'
68
+ version: '3.0'
69
69
  required_rubygems_version: !ruby/object:Gem::Requirement
70
- none: false
71
70
  requirements:
72
- - - ! '>='
71
+ - - ">="
73
72
  - !ruby/object:Gem::Version
74
73
  version: '0'
75
74
  requirements: []
76
- rubyforge_project:
77
- rubygems_version: 1.8.24
78
- signing_key:
79
- specification_version: 3
75
+ rubygems_version: 4.0.6
76
+ specification_version: 4
80
77
  summary: Natural language detection for text samples
81
- test_files:
82
- - test/test_whatlanguage.rb
78
+ test_files: []
data/.gitignore DELETED
@@ -1,17 +0,0 @@
1
- *.gem
2
- *.rbc
3
- .bundle
4
- .config
5
- .yardoc
6
- Gemfile.lock
7
- InstalledFiles
8
- _yardoc
9
- coverage
10
- doc/
11
- lib/bundler/man
12
- pkg
13
- rdoc
14
- spec/reports
15
- test/tmp
16
- test/version_tmp
17
- tmp
data/History.txt DELETED
@@ -1,15 +0,0 @@
1
- == 1.0.5 / 2013-10-05
2
-
3
- * Many more languages supported
4
-
5
- == 1.0.4 / 2013-03-07
6
-
7
- == 1.0.1 / 2008-08-22
8
-
9
- * Public release
10
- * Removed wordlists from distribution to reduce size
11
-
12
- == 1.0.0 / 2007-07-02
13
-
14
- * First version with pre-built English, French, and Spanish filters
15
-
data/Manifest.txt DELETED
@@ -1,19 +0,0 @@
1
- History.txt
2
- Manifest.txt
3
- README.txt
4
- Rakefile
5
- build_filter.rb
6
- example.rb
7
- lang/dutch.lang
8
- lang/farsi.lang
9
- lang/german.lang
10
- lang/pinyin.lang
11
- lang/russian.lang
12
- lang/english.lang
13
- lang/portuguese.lang
14
- lang/french.lang
15
- lang/spanish.lang
16
- lib/bitfield.rb
17
- lib/bloominsimple.rb
18
- lib/whatlanguage.rb
19
- test/test_whatlanguage.rb
data/build_filter.rb DELETED
@@ -1,9 +0,0 @@
1
- # Use this to build new filters (for other languages, ideally) from /usr/share/dict/words style dictionaries..
2
- #
3
- # Call like so..
4
- # ruby build_filter.rb /usr/share/dict/words lang/english.lang
5
- # (replace params as necessary)
6
-
7
- require 'lib/whatlanguage'
8
- filter = WhatLanguage.filter_from_dictionary(ARGV[0])
9
- File.open(ARGV[1], 'wb') { |f| f.write filter.dump }
data/build_lang_from_wordlists.rb DELETED
@@ -1,13 +0,0 @@
1
- # Builds all of the word lists in ./wordlists/ into filter files in ./lang/
2
-
3
- require 'lib/whatlanguage'
4
-
5
- languages_folder = File.join(File.dirname(__FILE__), "lang")
6
- wordlists_folder = File.join(File.dirname(__FILE__), "wordlists")
7
-
8
- Dir.entries(wordlists_folder).grep(/\w/).each do |lang|
9
- next if lang == 'generators'
10
- puts "Doing #{lang}"
11
- filter = WhatLanguage.filter_from_dictionary(File.join(wordlists_folder, lang))
12
- File.open(File.join(languages_folder, lang + ".lang"), 'wb') { |f| f.write filter.dump }
13
- end