whatlanguage 1.0.6 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/whatlanguage.rb CHANGED
@@ -1,101 +1,280 @@
1
- require 'whatlanguage/bloominsimple'
2
- require 'whatlanguage/bitfield'
3
- require 'digest/sha1'
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require_relative 'whatlanguage/languages'
5
+ require_relative 'whatlanguage/version'
4
6
 
5
7
  class WhatLanguage
6
- HASHER = lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") }
7
-
8
- BITFIELD_WIDTH = 2_000_000
9
-
10
- ISO_CODES = {
11
- nil => nil,
12
- :arabic => :ar,
13
- :danish => :da,
14
- :dutch => :nl,
15
- :english => :en,
16
- :farsi => :fa,
17
- :finnish => :fi,
18
- :french => :fr,
19
- :german => :de,
20
- :greek => :el,
21
- :hebrew => :he,
22
- :hungarian => :hu,
23
- :italian => :it,
24
- :korean => :ko,
25
- :norwegian => :no,
26
- :pinyin => :zh,
27
- :polish => :pl,
28
- :portuguese => :pt,
29
- :russian => :ru,
30
- :spanish => :es,
31
- :swedish => :sv
32
- }
33
-
34
- @@data = {}
35
-
36
- def initialize(*selection)
37
- @selection = (selection.empty?) ? [:all] : selection
38
- if @@data.empty?
39
- languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
40
- Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
41
- @@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.new(File.join(languages_folder, lang), 'rb').read, &HASHER)
42
- end
8
+ MAX_TRIGRAM_DISTANCE = 300
9
+ MAX_TOTAL_DISTANCE = MAX_TRIGRAM_DISTANCE * MAX_TRIGRAM_DISTANCE # 90_000
10
+ TEXT_TRIGRAMS_SIZE = 600
11
+ DEFAULT_MIN_CHARS = 10
12
+
13
+ Result = Struct.new(:language, :iso, :score, :ranked, keyword_init: true) do
14
+ alias scores ranked
15
+ end
16
+
17
+ # Scripts that resolve to a single language by their Unicode block alone.
18
+ # (Hiragana and Katakana both indicate Japanese.) Scripts NOT listed here but
19
+ # present in the trigram dataset are disambiguated statistically instead.
20
+ DETERMINISTIC = {
21
+ 'Mandarin' => 'cmn', 'Bengali' => 'ben', 'Hangul' => 'kor',
22
+ 'Georgian' => 'kat', 'Greek' => 'ell', 'Kannada' => 'kan',
23
+ 'Tamil' => 'tam', 'Thai' => 'tha', 'Gujarati' => 'guj',
24
+ 'Gurmukhi' => 'pan', 'Telugu' => 'tel', 'Malayalam'=> 'mal',
25
+ 'Oriya' => 'ori', 'Myanmar' => 'mya', 'Sinhala' => 'sin',
26
+ 'Khmer' => 'khm', 'Armenian'=> 'hye', 'Hiragana' => 'jpn',
27
+ 'Katakana' => 'jpn'
28
+ }.freeze
29
+
30
+ # Unicode ranges per script, in detection priority order (mirrors whatlang's
31
+ # scripts/detect.rs). The first script whose range contains a character claims
32
+ # that character; the script with the most characters wins.
33
+ SCRIPT_RANGES = [
34
+ ['Latin', [[0x61,0x7A],[0x41,0x5A],[0x80,0xFF],[0x100,0x17F],[0x180,0x24F],
35
+ [0x250,0x2AF],[0x1D00,0x1D7F],[0x1D80,0x1DBF],[0x1E00,0x1EFF],
36
+ [0x2100,0x214F],[0x2C60,0x2C7F],[0xA720,0xA7FF],[0xAB30,0xAB6F]]],
37
+ ['Cyrillic', [[0x400,0x484],[0x487,0x52F],[0x2DE0,0x2DFF],[0xA640,0xA69D],
38
+ [0x1D2B,0x1D2B],[0x1D78,0x1D78],[0xA69F,0xA69F]]],
39
+ ['Arabic', [[0x600,0x6FF],[0x750,0x7FF],[0x8A0,0x8FF],[0xFB50,0xFDFF],
40
+ [0xFE70,0xFEFF],[0x10E60,0x10E7F],[0x1EE00,0x1EEFF]]],
41
+ ['Mandarin', [[0x2E80,0x2E99],[0x2E9B,0x2EF3],[0x2F00,0x2FD5],[0x3005,0x3005],
42
+ [0x3007,0x3007],[0x3021,0x3029],[0x3038,0x303B],[0x3400,0x4DB5],
43
+ [0x4E00,0x9FCC],[0xF900,0xFA6D],[0xFA70,0xFAD9]]],
44
+ ['Devanagari', [[0x900,0x97F],[0xA8E0,0xA8FF],[0x1CD0,0x1CFF]]],
45
+ ['Hebrew', [[0x590,0x5FF]]],
46
+ ['Ethiopic', [[0x1200,0x139F],[0x2D80,0x2DDF],[0xAB00,0xAB2F]]],
47
+ ['Georgian', [[0x10A0,0x10FF]]],
48
+ ['Bengali', [[0x980,0x9FF]]],
49
+ ['Hangul', [[0xAC00,0xD7AF],[0x1100,0x11FF],[0x3130,0x318F],[0x3200,0x32FF],
50
+ [0xA960,0xA97F],[0xD7B0,0xD7FF]]],
51
+ ['Hiragana', [[0x3040,0x309F]]],
52
+ ['Katakana', [[0x30A0,0x30FF]]],
53
+ ['Greek', [[0x370,0x3FF]]],
54
+ ['Kannada', [[0xC80,0xCFF]]],
55
+ ['Tamil', [[0xB80,0xBFF]]],
56
+ ['Thai', [[0xE00,0xE7F]]],
57
+ ['Gujarati', [[0xA80,0xAFF]]],
58
+ ['Gurmukhi', [[0xA00,0xA7F]]],
59
+ ['Telugu', [[0xC00,0xC7F]]],
60
+ ['Malayalam', [[0xD00,0xD7F]]],
61
+ ['Oriya', [[0xB00,0xB7F]]],
62
+ ['Myanmar', [[0x1000,0x109F]]],
63
+ ['Sinhala', [[0xD80,0xDFF]]],
64
+ ['Khmer', [[0x1780,0x17FF],[0x19E0,0x19FF]]],
65
+ ['Armenian', [[0x530,0x58F],[0xFB13,0xFB17]]]
66
+ ].freeze
67
+
68
+ # ISO 639-1 (with 639-3 fallback) lookup by language-name symbol, plus the
69
+ # historical nil => nil entry. Internal; kept for backward compatibility.
70
+ ISO_CODES = CODE_INFO.each_with_object(nil => nil) do |(_code, (name, iso)), h|
71
+ h[name] = iso
72
+ end.freeze
73
+
74
+ NAME_TO_CODE = CODE_INFO.each_with_object({}) do |(code, (name, _iso)), h|
75
+ h[name] ||= code
76
+ end.freeze
77
+
78
+ private_constant :MAX_TRIGRAM_DISTANCE, :MAX_TOTAL_DISTANCE, :TEXT_TRIGRAMS_SIZE,
79
+ :DEFAULT_MIN_CHARS, :DETERMINISTIC, :SCRIPT_RANGES, :ISO_CODES,
80
+ :NAME_TO_CODE
81
+
82
+ class << self
83
+ def detect(text)
84
+ default_detector.detect(text)
85
+ end
86
+
87
+ def language(text)
88
+ default_detector.language(text)
89
+ end
90
+
91
+ def language_iso(text)
92
+ default_detector.language_iso(text)
93
+ end
94
+
95
+ def ranked(text)
96
+ default_detector.ranked(text)
43
97
  end
98
+
99
+ def score_hash(text)
100
+ default_detector.score_hash(text)
101
+ end
102
+
103
+ alias scores score_hash
104
+ alias process_text score_hash
105
+
106
+ def languages
107
+ NAME_TO_CODE.keys
108
+ end
109
+
110
+ # script name => [[code, [trigram, ...]], ...], loaded once and memoized.
111
+ def profiles
112
+ @profiles ||= JSON.parse(File.read(File.join(__dir__, 'whatlanguage', 'trigrams.json')))
113
+ .transform_values { |langs| langs.map { |code, str| [code, str.split('|')] } }
114
+ end
115
+
116
+ private
117
+
118
+ def default_detector
119
+ @default_detector ||= new
120
+ end
121
+ end
122
+
123
+ def initialize(*selection, only: nil, min_chars: DEFAULT_MIN_CHARS)
124
+ @selection = Array(only || (selection.empty? ? [:all] : selection))
125
+ validate_selection!
126
+ @min_chars = min_chars
44
127
  end
45
128
 
129
+ # Language-name symbols this instance scores against: every supported language
130
+ # for :all, otherwise the requested selection intersected with the supported
131
+ # set (legacy aliases such as :pinyin resolved to their modern names).
46
132
  def languages
47
133
  @languages ||=
48
- begin
49
- if @selection.include?(:all)
50
- languages = @@data.keys
51
- else
52
- languages = @@data.keys & @selection # intersection
53
- end
134
+ if @selection.include?(:all)
135
+ self.class.languages
136
+ else
137
+ wanted = @selection.map { |s| NAME_ALIASES.fetch(s, s) }
138
+ self.class.languages & wanted
54
139
  end
55
140
  end
56
141
 
57
- # Very inefficient method for now.. but still beats the non-Bloom alternatives.
58
- # Change to better bit comparison technique later..
59
- def process_text(text)
142
+ # Per-language scores for the text (higher = more likely). Languages outside
143
+ # the current selection, or not under the detected script, are absent; the
144
+ # hash defaults to 0. Only the relative ranking is meaningful.
145
+ def score_hash(text)
60
146
  results = Hash.new(0)
61
- it = 0
62
- to_lowercase(text).split.each do |word|
63
- it += 1
147
+ text = normalize_text(text)
148
+ script = detect_script(text)
149
+ return results unless script
64
150
 
65
- languages.each do |lang|
66
- results[lang] += 1 if @@data[lang].includes?(word)
67
- end
151
+ if (code = DETERMINISTIC[script])
152
+ name = CODE_INFO[code].first
153
+ results[name] = MAX_TOTAL_DISTANCE if allowed?(name)
154
+ return results
155
+ end
68
156
 
69
- # Every now and then check to see if we have a really convincing result.. if so, exit early.
70
- if it % 4 == 0 && results.size > 1
71
- top_results = results.sort_by{|a,b| -b}[0..1]
157
+ candidates = self.class.profiles[script]
158
+ return results unless candidates
159
+ return results if significant_char_count(text) < @min_chars
72
160
 
73
- # Next line may need some tweaking one day..
74
- break if top_results[0][1] > 4 && ((top_results[0][1] > top_results[1][1] * 2) || (top_results[0][1] - top_results[1][1] > 25))
75
- end
161
+ positions = trigram_positions(text)
162
+ candidates.each do |code, trigrams|
163
+ name = CODE_INFO[code].first
164
+ next unless allowed?(name)
76
165
 
77
- #break if it > 100
166
+ results[name] = MAX_TOTAL_DISTANCE - distance(trigrams, positions)
78
167
  end
79
168
  results
80
169
  end
81
170
 
171
+ alias scores score_hash
172
+ alias process_text score_hash
173
+
174
+ # Per-language scores as an array sorted from most likely to least likely.
175
+ def ranked(text)
176
+ score_hash(text).sort_by { |_name, score| -score }
177
+ end
178
+
179
+ # Detection result with the winning language, ISO code, winning score, and
180
+ # full ranked scores. Returns nil when the text is too short or unrecognized.
181
+ def detect(text)
182
+ ranked_scores = ranked(text)
183
+ return nil if ranked_scores.empty?
184
+
185
+ name, score = ranked_scores.first
186
+ Result.new(language: name, iso: ISO_CODES[name], score: score, ranked: ranked_scores)
187
+ end
188
+
189
+ # Most likely language as a name symbol, or nil when no language is detected.
82
190
  def language(text)
83
- process_text(text).max { |a,b| a[1] <=> b[1] }.first rescue nil
191
+ detect(text)&.language
84
192
  end
85
193
 
194
+ # Most likely language as an ISO 639-1 symbol (639-3 fallback), or nil.
86
195
  def language_iso(text)
87
- ISO_CODES[language(text)]
196
+ detect(text)&.iso
197
+ end
198
+
199
+ private
200
+
201
+ def normalize_text(text)
202
+ text.to_s.unicode_normalize(:nfkc)
203
+ end
204
+
205
+ def allowed?(name)
206
+ @selection.include?(:all) || languages.include?(name)
88
207
  end
89
208
 
90
- def self.filter_from_dictionary(filename)
91
- bf = BloominSimple.new(BITFIELD_WIDTH, &HASHER)
92
- File.open(filename).each { |word| bf.add(word) }
93
- bf
209
+ def validate_selection!
210
+ requested = @selection.reject { |name| name == :all }
211
+ unknown = requested.reject { |name| self.class.languages.include?(NAME_ALIASES.fetch(name, name)) }
212
+ return if unknown.empty?
213
+
214
+ raise ArgumentError, "Unknown language selection: #{unknown.map(&:inspect).join(', ')}"
215
+ end
216
+
217
+ def significant_char_count(text)
218
+ text.each_char.count { |ch| !stop_char?(ch.ord) }
219
+ end
220
+
221
+ # Dominant Unicode script of the text, or nil if it has no script characters.
222
+ def detect_script(text)
223
+ counts = Hash.new(0)
224
+ text.each_char do |ch|
225
+ cp = ch.ord
226
+ next if stop_char?(cp)
227
+
228
+ SCRIPT_RANGES.each do |name, ranges|
229
+ if ranges.any? { |lo, hi| cp >= lo && cp <= hi }
230
+ counts[name] += 1
231
+ break
232
+ end
233
+ end
234
+ end
235
+ return nil if counts.empty?
236
+
237
+ counts.max_by { |_name, n| n }.first
238
+ end
239
+
240
+ # Text trigrams ranked by descending frequency, mapped to their rank index.
241
+ # Mirrors whatlang's trigram extraction: punctuation/digits become spaces,
242
+ # the stream is bounded by spaces, and runs of spaces are collapsed.
243
+ def trigram_positions(text)
244
+ chars = text.downcase.each_char.map { |c| stop_char?(c.ord) ? ' ' : c }
245
+ return {} if chars.empty?
246
+
247
+ occurrences = Hash.new(0)
248
+ c1 = ' '
249
+ c2 = chars[0]
250
+ (chars[1..] + [' ']).each do |c3|
251
+ occurrences[c1 + c2 + c3] += 1 unless c2 == ' ' && (c1 == ' ' || c3 == ' ')
252
+ c1 = c2
253
+ c2 = c3
254
+ end
255
+
256
+ ranked = occurrences.to_a.sort { |a, b| [b[1], b[0]] <=> [a[1], a[0]] }.first(TEXT_TRIGRAMS_SIZE)
257
+ positions = {}
258
+ ranked.each_with_index { |(trigram, _count), i| positions[trigram] = i }
259
+ positions
260
+ end
261
+
262
+ # Out-of-place distance between a language's ordered trigram profile and the
263
+ # text's ranked trigrams. Lower means a closer match.
264
+ def distance(profile, positions)
265
+ total = 0
266
+ profile.each_with_index do |trigram, i|
267
+ pos = positions[trigram]
268
+ total += pos ? (pos - i).abs : MAX_TRIGRAM_DISTANCE
269
+ end
270
+
271
+ count = positions.size
272
+ total -= (MAX_TRIGRAM_DISTANCE - count) * MAX_TRIGRAM_DISTANCE if MAX_TRIGRAM_DISTANCE > count
273
+ total.clamp(0, MAX_TOTAL_DISTANCE)
94
274
  end
95
275
 
96
- if !defined? UnicodeUtils
97
- define_method(:to_lowercase) { |str| str.downcase }
98
- else
99
- define_method(:to_lowercase) { |str| UnicodeUtils.casefold(str) }
276
+ # Space, ASCII punctuation, or digit: no value for script/language detection.
277
+ def stop_char?(codepoint)
278
+ codepoint <= 0x40 || (codepoint >= 0x5B && codepoint <= 0x60) || (codepoint >= 0x7B && codepoint <= 0x7E)
100
279
  end
101
280
  end
data/whatlanguage.gemspec CHANGED
@@ -1,19 +1,30 @@
1
- # -*- encoding: utf-8 -*-
2
- lib = File.expand_path('../lib', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path('lib', __dir__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
5
  require 'whatlanguage/version'
5
6
 
6
7
  Gem::Specification.new do |gem|
7
- gem.name = "whatlanguage"
8
+ gem.name = 'whatlanguage'
8
9
  gem.version = WhatLanguage::VERSION
9
- gem.authors = ["Peter Cooper"]
10
- gem.email = ["git@peterc.org"]
11
- gem.description = %q{WhatLanguage rapidly detects the language of a sample of text}
12
- gem.summary = %q{Natural language detection for text samples}
13
- gem.homepage = "https://github.com/peterc/whatlanguage"
10
+ gem.authors = ['Peter Cooper']
11
+ gem.email = ['git@peterc.org']
12
+ gem.description = 'WhatLanguage rapidly detects the language of a sample of text'
13
+ gem.summary = 'Natural language detection for text samples'
14
+ gem.homepage = 'https://github.com/peterc/whatlanguage'
15
+ gem.license = 'MIT'
16
+ gem.required_ruby_version = '>= 3.0'
17
+
18
+ gem.files = Dir['lib/**/*'] + [
19
+ 'README.md',
20
+ 'CHANGELOG.md',
21
+ 'LICENSE.txt',
22
+ 'Gemfile',
23
+ 'Rakefile',
24
+ 'whatlanguage.gemspec'
25
+ ]
26
+ gem.require_paths = ['lib']
14
27
 
15
- gem.files = `git ls-files`.split($/).reject { |f| f.start_with?("wordlists") }
16
- gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
- gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
- gem.require_paths = ["lib"]
19
- end
28
+ gem.add_development_dependency 'minitest', '~> 5.0'
29
+ gem.add_development_dependency 'rake'
30
+ end
metadata CHANGED
@@ -1,15 +1,42 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whatlanguage
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.6
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Cooper
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2016-01-28 00:00:00.000000000 Z
12
- dependencies: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: minitest
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '5.0'
19
+ type: :development
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '5.0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: rake
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
13
40
  description: WhatLanguage rapidly detects the language of a sample of text
14
41
  email:
15
42
  - git@peterc.org
@@ -17,48 +44,20 @@ executables: []
17
44
  extensions: []
18
45
  extra_rdoc_files: []
19
46
  files:
20
- - ".gitignore"
47
+ - CHANGELOG.md
21
48
  - Gemfile
22
- - History.txt
23
49
  - LICENSE.txt
24
- - Manifest.txt
25
50
  - README.md
26
51
  - Rakefile
27
- - build_filter.rb
28
- - build_lang_from_wordlists.rb
29
- - copyright-en
30
- - example.rb
31
- - lang/arabic.lang
32
- - lang/danish.lang
33
- - lang/dutch.lang
34
- - lang/english.lang
35
- - lang/farsi.lang
36
- - lang/finnish.lang
37
- - lang/french.lang
38
- - lang/german.lang
39
- - lang/greek.lang
40
- - lang/hebrew.lang
41
- - lang/hungarian.lang
42
- - lang/italian.lang
43
- - lang/korean.lang
44
- - lang/norwegian.lang
45
- - lang/pinyin.lang
46
- - lang/polish.lang
47
- - lang/portuguese.lang
48
- - lang/russian.lang
49
- - lang/spanish.lang
50
- - lang/swedish.lang
51
52
  - lib/whatlanguage.rb
52
- - lib/whatlanguage/bitfield.rb
53
- - lib/whatlanguage/bloominsimple.rb
54
- - lib/whatlanguage/string.rb
53
+ - lib/whatlanguage/languages.rb
54
+ - lib/whatlanguage/trigrams.json
55
55
  - lib/whatlanguage/version.rb
56
- - test/test_whatlanguage.rb
57
56
  - whatlanguage.gemspec
58
57
  homepage: https://github.com/peterc/whatlanguage
59
- licenses: []
58
+ licenses:
59
+ - MIT
60
60
  metadata: {}
61
- post_install_message:
62
61
  rdoc_options: []
63
62
  require_paths:
64
63
  - lib
@@ -66,17 +65,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
66
65
  requirements:
67
66
  - - ">="
68
67
  - !ruby/object:Gem::Version
69
- version: '0'
68
+ version: '3.0'
70
69
  required_rubygems_version: !ruby/object:Gem::Requirement
71
70
  requirements:
72
71
  - - ">="
73
72
  - !ruby/object:Gem::Version
74
73
  version: '0'
75
74
  requirements: []
76
- rubyforge_project:
77
- rubygems_version: 2.4.5
78
- signing_key:
75
+ rubygems_version: 4.0.6
79
76
  specification_version: 4
80
77
  summary: Natural language detection for text samples
81
- test_files:
82
- - test/test_whatlanguage.rb
78
+ test_files: []
data/.gitignore DELETED
@@ -1,17 +0,0 @@
1
- *.gem
2
- *.rbc
3
- .bundle
4
- .config
5
- .yardoc
6
- Gemfile.lock
7
- InstalledFiles
8
- _yardoc
9
- coverage
10
- doc/
11
- lib/bundler/man
12
- pkg
13
- rdoc
14
- spec/reports
15
- test/tmp
16
- test/version_tmp
17
- tmp
data/History.txt DELETED
@@ -1,20 +0,0 @@
1
- == 1.0.6 / 2016-01-28
2
-
3
- * Minor test fixes and tweaks
4
- * New release taking into account a handful of pull requests
5
-
6
- == 1.0.5 / 2013-10-05
7
-
8
- * Many more languages supported
9
-
10
- == 1.0.4 / 2013-03-07
11
-
12
- == 1.0.1 / 2008-08-22
13
-
14
- * Public release
15
- * Removed wordlists from distribution to reduce size
16
-
17
- == 1.0.0 / 2007-07-02
18
-
19
- * First version with pre-built English, French, and Spanish filters
20
-
data/Manifest.txt DELETED
@@ -1,19 +0,0 @@
1
- History.txt
2
- Manifest.txt
3
- README.txt
4
- Rakefile
5
- build_filter.rb
6
- example.rb
7
- lang/dutch.lang
8
- lang/farsi.lang
9
- lang/german.lang
10
- lang/pinyin.lang
11
- lang/russian.lang
12
- lang/english.lang
13
- lang/portuguese.lang
14
- lang/french.lang
15
- lang/spanish.lang
16
- lib/bitfield.rb
17
- lib/bloominsimple.rb
18
- lib/whatlanguage.rb
19
- test/test_whatlanguage.rb
data/build_filter.rb DELETED
@@ -1,9 +0,0 @@
1
- # Use this to build new filters (for other languages, ideally) from /usr/share/dict/words style dictionaries..
2
- #
3
- # Call like so..
4
- # ruby build_filter.rb /usr/share/dict/words lang/english.lang
5
- # (replace params as necessary)
6
-
7
- require 'lib/whatlanguage'
8
- filter = WhatLanguage.filter_from_dictionary(ARGV[0])
9
- File.open(ARGV[1], 'wb') { |f| f.write filter.dump }
@@ -1,13 +0,0 @@
1
- # Builds all of the word lists in ./wordlists/ into filter files in ./lang/
2
-
3
- require 'lib/whatlanguage'
4
-
5
- languages_folder = File.join(File.dirname(__FILE__), "lang")
6
- wordlists_folder = File.join(File.dirname(__FILE__), "wordlists")
7
-
8
- Dir.entries(wordlists_folder).grep(/\w/).each do |lang|
9
- next if lang == 'generators'
10
- puts "Doing #{lang}"
11
- filter = WhatLanguage.filter_from_dictionary(File.join(wordlists_folder, lang))
12
- File.open(File.join(languages_folder, lang + ".lang"), 'wb') { |f| f.write filter.dump }
13
- end