RubyGems - whatlanguage - Versions diffs - 1.0.6 → 2.0.0 - Mend

whatlanguage 1.0.6 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

checksums.yaml +5 -5
data/CHANGELOG.md +41 -0
data/Gemfile +2 -2
data/LICENSE.txt +42 -9
data/README.md +42 -83
data/Rakefile +9 -3
data/lib/whatlanguage/languages.rb +180 -0
data/lib/whatlanguage/trigrams.json +1 -0
data/lib/whatlanguage/version.rb +3 -1
data/lib/whatlanguage.rb +251 -72
data/whatlanguage.gemspec +24 -13
metadata +39 -43
data/.gitignore +0 -17
data/History.txt +0 -20
data/Manifest.txt +0 -19
data/build_filter.rb +0 -9
data/build_lang_from_wordlists.rb +0 -13
data/copyright-en +0 -243
data/example.rb +0 -51
data/lang/arabic.lang +0 -0
data/lang/danish.lang +0 -0
data/lang/dutch.lang +0 -0
data/lang/english.lang +0 -0
data/lang/farsi.lang +0 -0
data/lang/finnish.lang +0 -0
data/lang/french.lang +0 -0
data/lang/german.lang +0 -0
data/lang/greek.lang +0 -0
data/lang/hebrew.lang +0 -0
data/lang/hungarian.lang +0 -0
data/lang/italian.lang +0 -0
data/lang/korean.lang +0 -0
data/lang/norwegian.lang +0 -0
data/lang/pinyin.lang +0 -0
data/lang/polish.lang +0 -0
data/lang/portuguese.lang +0 -0
data/lang/russian.lang +0 -0
data/lang/spanish.lang +0 -0
data/lang/swedish.lang +0 -0
data/lib/whatlanguage/bitfield.rb +0 -64
data/lib/whatlanguage/bloominsimple.rb +0 -88
data/lib/whatlanguage/string.rb +0 -11
data/test/test_whatlanguage.rb +0 -129

data/lib/whatlanguage.rb CHANGED Viewed

@@ -1,101 +1,280 @@
-require 'whatlanguage/bloominsimple'
-require 'whatlanguage/bitfield'
-require 'digest/sha1'
+# frozen_string_literal: true
+require 'json'
+require_relative 'whatlanguage/languages'
+require_relative 'whatlanguage/version'
 class WhatLanguage
-  HASHER = lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") }
-  BITFIELD_WIDTH = 2_000_000
-  ISO_CODES = {
-    nil => nil,
-    :arabic => :ar,
-    :danish => :da,
-    :dutch  => :nl,
-    :english => :en,
-    :farsi => :fa,
-    :finnish => :fi,
-    :french => :fr,
-    :german => :de,
-    :greek => :el,
-    :hebrew => :he,
-    :hungarian => :hu,
-    :italian => :it,
-    :korean => :ko,
-    :norwegian => :no,
-    :pinyin => :zh,
-    :polish => :pl,
-    :portuguese => :pt,
-    :russian => :ru,
-    :spanish => :es,
-    :swedish => :sv
-  }
-  @@data = {}
-  def initialize(*selection)
-    @selection = (selection.empty?) ? [:all] : selection
-    if @@data.empty?
-      languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
-      Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
-        @@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.new(File.join(languages_folder, lang), 'rb').read, &HASHER)
-      end
+  MAX_TRIGRAM_DISTANCE = 300
+  MAX_TOTAL_DISTANCE   = MAX_TRIGRAM_DISTANCE * MAX_TRIGRAM_DISTANCE # 90_000
+  TEXT_TRIGRAMS_SIZE   = 600
+  DEFAULT_MIN_CHARS    = 10
+  Result = Struct.new(:language, :iso, :score, :ranked, keyword_init: true) do
+    alias scores ranked
+  end
+  # Scripts that resolve to a single language by their Unicode block alone.
+  # (Hiragana and Katakana both indicate Japanese.) Scripts NOT listed here but
+  # present in the trigram dataset are disambiguated statistically instead.
+  DETERMINISTIC = {
+    'Mandarin'  => 'cmn', 'Bengali' => 'ben', 'Hangul'   => 'kor',
+    'Georgian'  => 'kat', 'Greek'   => 'ell', 'Kannada'  => 'kan',
+    'Tamil'     => 'tam', 'Thai'    => 'tha', 'Gujarati' => 'guj',
+    'Gurmukhi'  => 'pan', 'Telugu'  => 'tel', 'Malayalam'=> 'mal',
+    'Oriya'     => 'ori', 'Myanmar' => 'mya', 'Sinhala'  => 'sin',
+    'Khmer'     => 'khm', 'Armenian'=> 'hye', 'Hiragana' => 'jpn',
+    'Katakana'  => 'jpn'
+  }.freeze
+  # Unicode ranges per script, in detection priority order (mirrors whatlang's
+  # scripts/detect.rs). The first script whose range contains a character claims
+  # that character; the script with the most characters wins.
+  SCRIPT_RANGES = [
+    ['Latin',      [[0x61,0x7A],[0x41,0x5A],[0x80,0xFF],[0x100,0x17F],[0x180,0x24F],
+                    [0x250,0x2AF],[0x1D00,0x1D7F],[0x1D80,0x1DBF],[0x1E00,0x1EFF],
+                    [0x2100,0x214F],[0x2C60,0x2C7F],[0xA720,0xA7FF],[0xAB30,0xAB6F]]],
+    ['Cyrillic',   [[0x400,0x484],[0x487,0x52F],[0x2DE0,0x2DFF],[0xA640,0xA69D],
+                    [0x1D2B,0x1D2B],[0x1D78,0x1D78],[0xA69F,0xA69F]]],
+    ['Arabic',     [[0x600,0x6FF],[0x750,0x7FF],[0x8A0,0x8FF],[0xFB50,0xFDFF],
+                    [0xFE70,0xFEFF],[0x10E60,0x10E7F],[0x1EE00,0x1EEFF]]],
+    ['Mandarin',   [[0x2E80,0x2E99],[0x2E9B,0x2EF3],[0x2F00,0x2FD5],[0x3005,0x3005],
+                    [0x3007,0x3007],[0x3021,0x3029],[0x3038,0x303B],[0x3400,0x4DB5],
+                    [0x4E00,0x9FCC],[0xF900,0xFA6D],[0xFA70,0xFAD9]]],
+    ['Devanagari', [[0x900,0x97F],[0xA8E0,0xA8FF],[0x1CD0,0x1CFF]]],
+    ['Hebrew',     [[0x590,0x5FF]]],
+    ['Ethiopic',   [[0x1200,0x139F],[0x2D80,0x2DDF],[0xAB00,0xAB2F]]],
+    ['Georgian',   [[0x10A0,0x10FF]]],
+    ['Bengali',    [[0x980,0x9FF]]],
+    ['Hangul',     [[0xAC00,0xD7AF],[0x1100,0x11FF],[0x3130,0x318F],[0x3200,0x32FF],
+                    [0xA960,0xA97F],[0xD7B0,0xD7FF]]],
+    ['Hiragana',   [[0x3040,0x309F]]],
+    ['Katakana',   [[0x30A0,0x30FF]]],
+    ['Greek',      [[0x370,0x3FF]]],
+    ['Kannada',    [[0xC80,0xCFF]]],
+    ['Tamil',      [[0xB80,0xBFF]]],
+    ['Thai',       [[0xE00,0xE7F]]],
+    ['Gujarati',   [[0xA80,0xAFF]]],
+    ['Gurmukhi',   [[0xA00,0xA7F]]],
+    ['Telugu',     [[0xC00,0xC7F]]],
+    ['Malayalam',  [[0xD00,0xD7F]]],
+    ['Oriya',      [[0xB00,0xB7F]]],
+    ['Myanmar',    [[0x1000,0x109F]]],
+    ['Sinhala',    [[0xD80,0xDFF]]],
+    ['Khmer',      [[0x1780,0x17FF],[0x19E0,0x19FF]]],
+    ['Armenian',   [[0x530,0x58F],[0xFB13,0xFB17]]]
+  ].freeze
+  # ISO 639-1 (with 639-3 fallback) lookup by language-name symbol, plus the
+  # historical nil => nil entry. Internal; kept for backward compatibility.
+  ISO_CODES = CODE_INFO.each_with_object(nil => nil) do |(_code, (name, iso)), h|
+    h[name] = iso
+  end.freeze
+  NAME_TO_CODE = CODE_INFO.each_with_object({}) do |(code, (name, _iso)), h|
+    h[name] ||= code
+  end.freeze
+  private_constant :MAX_TRIGRAM_DISTANCE, :MAX_TOTAL_DISTANCE, :TEXT_TRIGRAMS_SIZE,
+                   :DEFAULT_MIN_CHARS, :DETERMINISTIC, :SCRIPT_RANGES, :ISO_CODES,
+                   :NAME_TO_CODE
+  class << self
+    def detect(text)
+      default_detector.detect(text)
+    end
+    def language(text)
+      default_detector.language(text)
+    end
+    def language_iso(text)
+      default_detector.language_iso(text)
+    end
+    def ranked(text)
+      default_detector.ranked(text)
     end
+    def score_hash(text)
+      default_detector.score_hash(text)
+    end
+    alias scores score_hash
+    alias process_text score_hash
+    def languages
+      NAME_TO_CODE.keys
+    end
+    # script name => [[code, [trigram, ...]], ...], loaded once and memoized.
+    def profiles
+      @profiles ||= JSON.parse(File.read(File.join(__dir__, 'whatlanguage', 'trigrams.json')))
+                        .transform_values { |langs| langs.map { |code, str| [code, str.split('|')] } }
+    end
+    private
+    def default_detector
+      @default_detector ||= new
+    end
+  end
+  def initialize(*selection, only: nil, min_chars: DEFAULT_MIN_CHARS)
+    @selection = Array(only || (selection.empty? ? [:all] : selection))
+    validate_selection!
+    @min_chars = min_chars
   end
+  # Language-name symbols this instance scores against: every supported language
+  # for :all, otherwise the requested selection intersected with the supported
+  # set (legacy aliases such as :pinyin resolved to their modern names).
   def languages
     @languages ||=
-      begin
-        if @selection.include?(:all)
-          languages = @@data.keys
-        else
-          languages = @@data.keys & @selection  # intersection
-        end
+      if @selection.include?(:all)
+        self.class.languages
+      else
+        wanted = @selection.map { |s| NAME_ALIASES.fetch(s, s) }
+        self.class.languages & wanted
       end
   end
-  # Very inefficient method for now.. but still beats the non-Bloom alternatives.
-  # Change to better bit comparison technique later..
-  def process_text(text)
+  # Per-language scores for the text (higher = more likely). Languages outside
+  # the current selection, or not under the detected script, are absent; the
+  # hash defaults to 0. Only the relative ranking is meaningful.
+  def score_hash(text)
     results = Hash.new(0)
-    it = 0
-    to_lowercase(text).split.each do |word|
-      it += 1
+    text = normalize_text(text)
+    script = detect_script(text)
+    return results unless script
-      languages.each do |lang|
-        results[lang] += 1 if @@data[lang].includes?(word)
-      end
+    if (code = DETERMINISTIC[script])
+      name = CODE_INFO[code].first
+      results[name] = MAX_TOTAL_DISTANCE if allowed?(name)
+      return results
+    end
-      # Every now and then check to see if we have a really convincing result.. if so, exit early.
-      if it % 4 == 0 && results.size > 1
-        top_results = results.sort_by{|a,b| -b}[0..1]
+    candidates = self.class.profiles[script]
+    return results unless candidates
+    return results if significant_char_count(text) < @min_chars
-        # Next line may need some tweaking one day..
-        break if top_results[0][1] > 4 && ((top_results[0][1] > top_results[1][1] * 2) || (top_results[0][1] - top_results[1][1] > 25))
-      end
+    positions = trigram_positions(text)
+    candidates.each do |code, trigrams|
+      name = CODE_INFO[code].first
+      next unless allowed?(name)
-      #break if it > 100
+      results[name] = MAX_TOTAL_DISTANCE - distance(trigrams, positions)
     end
     results
   end
+  alias scores score_hash
+  alias process_text score_hash
+  # Per-language scores as an array sorted from most likely to least likely.
+  def ranked(text)
+    score_hash(text).sort_by { |_name, score| -score }
+  end
+  # Detection result with the winning language, ISO code, winning score, and
+  # full ranked scores. Returns nil when the text is too short or unrecognized.
+  def detect(text)
+    ranked_scores = ranked(text)
+    return nil if ranked_scores.empty?
+    name, score = ranked_scores.first
+    Result.new(language: name, iso: ISO_CODES[name], score: score, ranked: ranked_scores)
+  end
+  # Most likely language as a name symbol, or nil when no language is detected.
   def language(text)
-    process_text(text).max { |a,b| a[1] <=> b[1] }.first rescue nil
+    detect(text)&.language
   end
+  # Most likely language as an ISO 639-1 symbol (639-3 fallback), or nil.
   def language_iso(text)
-    ISO_CODES[language(text)]
+    detect(text)&.iso
+  end
+  private
+  def normalize_text(text)
+    text.to_s.unicode_normalize(:nfkc)
+  end
+  def allowed?(name)
+    @selection.include?(:all) || languages.include?(name)
   end
-  def self.filter_from_dictionary(filename)
-    bf = BloominSimple.new(BITFIELD_WIDTH, &HASHER)
-    File.open(filename).each { |word| bf.add(word) }
-    bf
+  def validate_selection!
+    requested = @selection.reject { |name| name == :all }
+    unknown = requested.reject { |name| self.class.languages.include?(NAME_ALIASES.fetch(name, name)) }
+    return if unknown.empty?
+    raise ArgumentError, "Unknown language selection: #{unknown.map(&:inspect).join(', ')}"
+  end
+  def significant_char_count(text)
+    text.each_char.count { |ch| !stop_char?(ch.ord) }
+  end
+  # Dominant Unicode script of the text, or nil if it has no script characters.
+  def detect_script(text)
+    counts = Hash.new(0)
+    text.each_char do |ch|
+      cp = ch.ord
+      next if stop_char?(cp)
+      SCRIPT_RANGES.each do |name, ranges|
+        if ranges.any? { |lo, hi| cp >= lo && cp <= hi }
+          counts[name] += 1
+          break
+        end
+      end
+    end
+    return nil if counts.empty?
+    counts.max_by { |_name, n| n }.first
+  end
+  # Text trigrams ranked by descending frequency, mapped to their rank index.
+  # Mirrors whatlang's trigram extraction: punctuation/digits become spaces,
+  # the stream is bounded by spaces, and runs of spaces are collapsed.
+  def trigram_positions(text)
+    chars = text.downcase.each_char.map { |c| stop_char?(c.ord) ? ' ' : c }
+    return {} if chars.empty?
+    occurrences = Hash.new(0)
+    c1 = ' '
+    c2 = chars[0]
+    (chars[1..] + [' ']).each do |c3|
+      occurrences[c1 + c2 + c3] += 1 unless c2 == ' ' && (c1 == ' ' || c3 == ' ')
+      c1 = c2
+      c2 = c3
+    end
+    ranked = occurrences.to_a.sort { |a, b| [b[1], b[0]] <=> [a[1], a[0]] }.first(TEXT_TRIGRAMS_SIZE)
+    positions = {}
+    ranked.each_with_index { |(trigram, _count), i| positions[trigram] = i }
+    positions
+  end
+  # Out-of-place distance between a language's ordered trigram profile and the
+  # text's ranked trigrams. Lower means a closer match.
+  def distance(profile, positions)
+    total = 0
+    profile.each_with_index do |trigram, i|
+      pos = positions[trigram]
+      total += pos ? (pos - i).abs : MAX_TRIGRAM_DISTANCE
+    end
+    count = positions.size
+    total -= (MAX_TRIGRAM_DISTANCE - count) * MAX_TRIGRAM_DISTANCE if MAX_TRIGRAM_DISTANCE > count
+    total.clamp(0, MAX_TOTAL_DISTANCE)
   end
-  if !defined? UnicodeUtils
-    define_method(:to_lowercase) { |str| str.downcase }
-  else
-    define_method(:to_lowercase) { |str| UnicodeUtils.casefold(str) }
+  # Space, ASCII punctuation, or digit: no value for script/language detection.
+  def stop_char?(codepoint)
+    codepoint <= 0x40 || (codepoint >= 0x5B && codepoint <= 0x60) || (codepoint >= 0x7B && codepoint <= 0x7E)
   end
 end

data/whatlanguage.gemspec CHANGED Viewed

@@ -1,19 +1,30 @@
-# -*- encoding: utf-8 -*-
-lib = File.expand_path('../lib', __FILE__)
+# frozen_string_literal: true
+lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'whatlanguage/version'
 Gem::Specification.new do |gem|
-  gem.name          = "whatlanguage"
+  gem.name          = 'whatlanguage'
   gem.version       = WhatLanguage::VERSION
-  gem.authors       = ["Peter Cooper"]
-  gem.email         = ["git@peterc.org"]
-  gem.description   = %q{WhatLanguage rapidly detects the language of a sample of text}
-  gem.summary       = %q{Natural language detection for text samples}
-  gem.homepage      = "https://github.com/peterc/whatlanguage"
+  gem.authors       = ['Peter Cooper']
+  gem.email         = ['git@peterc.org']
+  gem.description   = 'WhatLanguage rapidly detects the language of a sample of text'
+  gem.summary       = 'Natural language detection for text samples'
+  gem.homepage      = 'https://github.com/peterc/whatlanguage'
+  gem.license       = 'MIT'
+  gem.required_ruby_version = '>= 3.0'
+  gem.files = Dir['lib/**/*'] + [
+    'README.md',
+    'CHANGELOG.md',
+    'LICENSE.txt',
+    'Gemfile',
+    'Rakefile',
+    'whatlanguage.gemspec'
+  ]
+  gem.require_paths = ['lib']
-  gem.files         = `git ls-files`.split($/).reject { |f| f.start_with?("wordlists") }
-  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
-  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
-  gem.require_paths = ["lib"]
-end
+  gem.add_development_dependency 'minitest', '~> 5.0'
+  gem.add_development_dependency 'rake'
+end

metadata CHANGED Viewed

@@ -1,15 +1,42 @@
 --- !ruby/object:Gem::Specification
 name: whatlanguage
 version: !ruby/object:Gem::Version
-  version: 1.0.6
+  version: 2.0.0
 platform: ruby
 authors:
 - Peter Cooper
-autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-28 00:00:00.000000000 Z
-dependencies: []
+date: 1980-01-02 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: WhatLanguage rapidly detects the language of a sample of text
 email:
 - git@peterc.org
@@ -17,48 +44,20 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- ".gitignore"
+- CHANGELOG.md
 - Gemfile
-- History.txt
 - LICENSE.txt
-- Manifest.txt
 - README.md
 - Rakefile
-- build_filter.rb
-- build_lang_from_wordlists.rb
-- copyright-en
-- example.rb
-- lang/arabic.lang
-- lang/danish.lang
-- lang/dutch.lang
-- lang/english.lang
-- lang/farsi.lang
-- lang/finnish.lang
-- lang/french.lang
-- lang/german.lang
-- lang/greek.lang
-- lang/hebrew.lang
-- lang/hungarian.lang
-- lang/italian.lang
-- lang/korean.lang
-- lang/norwegian.lang
-- lang/pinyin.lang
-- lang/polish.lang
-- lang/portuguese.lang
-- lang/russian.lang
-- lang/spanish.lang
-- lang/swedish.lang
 - lib/whatlanguage.rb
-- lib/whatlanguage/bitfield.rb
-- lib/whatlanguage/bloominsimple.rb
-- lib/whatlanguage/string.rb
+- lib/whatlanguage/languages.rb
+- lib/whatlanguage/trigrams.json
 - lib/whatlanguage/version.rb
-- test/test_whatlanguage.rb
 - whatlanguage.gemspec
 homepage: https://github.com/peterc/whatlanguage
-licenses: []
+licenses:
+- MIT
 metadata: {}
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -66,17 +65,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: '3.0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.4.5
-signing_key:
+rubygems_version: 4.0.6
 specification_version: 4
 summary: Natural language detection for text samples
-test_files:
-- test/test_whatlanguage.rb
+test_files: []

data/.gitignore DELETED Viewed

@@ -1,17 +0,0 @@
-*.gem
-*.rbc
-.bundle
-.config
-.yardoc
-Gemfile.lock
-InstalledFiles
-_yardoc
-coverage
-doc/
-lib/bundler/man
-pkg
-rdoc
-spec/reports
-test/tmp
-test/version_tmp
-tmp

data/History.txt DELETED Viewed

@@ -1,20 +0,0 @@
-== 1.0.6 / 2016-01-28
-* Minor test fixes and tweaks
-* New release taking into account a handful of pull requests
-== 1.0.5 / 2013-10-05
-* Many more languages supported
-== 1.0.4 / 2013-03-07
-== 1.0.1 / 2008-08-22
-* Public release
-* Removed wordlists from distribution to reduce size
-== 1.0.0 / 2007-07-02
-* First version with pre-built English, French, and Spanish filters

data/Manifest.txt DELETED Viewed

@@ -1,19 +0,0 @@
-History.txt
-Manifest.txt
-README.txt
-Rakefile
-build_filter.rb
-example.rb
-lang/dutch.lang
-lang/farsi.lang
-lang/german.lang
-lang/pinyin.lang
-lang/russian.lang
-lang/english.lang
-lang/portuguese.lang
-lang/french.lang
-lang/spanish.lang
-lib/bitfield.rb
-lib/bloominsimple.rb
-lib/whatlanguage.rb
-test/test_whatlanguage.rb

data/build_filter.rb DELETED Viewed

@@ -1,9 +0,0 @@
-# Use this to build new filters (for other languages, ideally) from /usr/share/dict/words style dictionaries..
-#
-# Call like so..
-#   ruby build_filter.rb /usr/share/dict/words lang/english.lang
-# (replace params as necessary)
-require 'lib/whatlanguage'
-filter = WhatLanguage.filter_from_dictionary(ARGV[0])
-File.open(ARGV[1], 'wb') { |f| f.write filter.dump }

data/build_lang_from_wordlists.rb DELETED Viewed

@@ -1,13 +0,0 @@
-# Builds all of the word lists in ./wordlists/ into filter files in ./lang/
-require 'lib/whatlanguage'
-languages_folder = File.join(File.dirname(__FILE__), "lang")
-wordlists_folder = File.join(File.dirname(__FILE__), "wordlists")
-Dir.entries(wordlists_folder).grep(/\w/).each do |lang|
-  next if lang == 'generators'
-  puts "Doing #{lang}"
-  filter = WhatLanguage.filter_from_dictionary(File.join(wordlists_folder, lang))
-  File.open(File.join(languages_folder, lang + ".lang"), 'wb') { |f| f.write filter.dump }
-end