RubyGems - whatlanguage - Versions diffs - 1.0.5 → 2.0.0 - Mend

whatlanguage 1.0.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

checksums.yaml +7 -0
data/CHANGELOG.md +41 -0
data/Gemfile +2 -2
data/LICENSE.txt +42 -9
data/README.md +50 -76
data/Rakefile +9 -3
data/lib/whatlanguage/languages.rb +180 -0
data/lib/whatlanguage/trigrams.json +1 -0
data/lib/whatlanguage/version.rb +3 -1
data/lib/whatlanguage.rb +264 -50
data/whatlanguage.gemspec +24 -13
metadata +43 -47
data/.gitignore +0 -17
data/History.txt +0 -15
data/Manifest.txt +0 -19
data/build_filter.rb +0 -9
data/build_lang_from_wordlists.rb +0 -13
data/copyright-en +0 -243
data/example.rb +0 -51
data/lang/arabic.lang +0 -0
data/lang/dutch.lang +0 -0
data/lang/english.lang +0 -0
data/lang/farsi.lang +0 -0
data/lang/finnish.lang +0 -0
data/lang/french.lang +0 -0
data/lang/german.lang +0 -0
data/lang/greek.lang +0 -0
data/lang/hebrew.lang +0 -0
data/lang/hungarian.lang +0 -0
data/lang/italian.lang +0 -0
data/lang/korean.lang +0 -0
data/lang/norwegian.lang +0 -0
data/lang/pinyin.lang +0 -0
data/lang/polish.lang +0 -0
data/lang/portuguese.lang +0 -0
data/lang/russian.lang +0 -0
data/lang/spanish.lang +0 -0
data/lang/swedish.lang +0 -0
data/lib/whatlanguage/bitfield.rb +0 -64
data/lib/whatlanguage/bloominsimple.rb +0 -88
data/test/test_whatlanguage.rb +0 -113

data/lib/whatlanguage.rb CHANGED Viewed

@@ -1,66 +1,280 @@
-require 'whatlanguage/bloominsimple'
-require 'whatlanguage/bitfield'
-require 'digest/sha1'
-class WhatLanguage
-  HASHER = lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") }
-  BITFIELD_WIDTH = 2_000_000
-  @@data = {}
-  def initialize(*selection)
-    @selection = (selection.empty?) ? [:all] : selection
-    languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
-    Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
-      @@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.new(File.join(languages_folder, lang), 'rb').read, &HASHER)
+# frozen_string_literal: true
+require 'json'
+require_relative 'whatlanguage/languages'
+require_relative 'whatlanguage/version'
+class WhatLanguage
+  MAX_TRIGRAM_DISTANCE = 300
+  MAX_TOTAL_DISTANCE   = MAX_TRIGRAM_DISTANCE * MAX_TRIGRAM_DISTANCE # 90_000
+  TEXT_TRIGRAMS_SIZE   = 600
+  DEFAULT_MIN_CHARS    = 10
+  Result = Struct.new(:language, :iso, :score, :ranked, keyword_init: true) do
+    alias scores ranked
+  end
+  # Scripts that resolve to a single language by their Unicode block alone.
+  # (Hiragana and Katakana both indicate Japanese.) Scripts NOT listed here but
+  # present in the trigram dataset are disambiguated statistically instead.
+  DETERMINISTIC = {
+    'Mandarin'  => 'cmn', 'Bengali' => 'ben', 'Hangul'   => 'kor',
+    'Georgian'  => 'kat', 'Greek'   => 'ell', 'Kannada'  => 'kan',
+    'Tamil'     => 'tam', 'Thai'    => 'tha', 'Gujarati' => 'guj',
+    'Gurmukhi'  => 'pan', 'Telugu'  => 'tel', 'Malayalam'=> 'mal',
+    'Oriya'     => 'ori', 'Myanmar' => 'mya', 'Sinhala'  => 'sin',
+    'Khmer'     => 'khm', 'Armenian'=> 'hye', 'Hiragana' => 'jpn',
+    'Katakana'  => 'jpn'
+  }.freeze
+  # Unicode ranges per script, in detection priority order (mirrors whatlang's
+  # scripts/detect.rs). The first script whose range contains a character claims
+  # that character; the script with the most characters wins.
+  SCRIPT_RANGES = [
+    ['Latin',      [[0x61,0x7A],[0x41,0x5A],[0x80,0xFF],[0x100,0x17F],[0x180,0x24F],
+                    [0x250,0x2AF],[0x1D00,0x1D7F],[0x1D80,0x1DBF],[0x1E00,0x1EFF],
+                    [0x2100,0x214F],[0x2C60,0x2C7F],[0xA720,0xA7FF],[0xAB30,0xAB6F]]],
+    ['Cyrillic',   [[0x400,0x484],[0x487,0x52F],[0x2DE0,0x2DFF],[0xA640,0xA69D],
+                    [0x1D2B,0x1D2B],[0x1D78,0x1D78],[0xA69F,0xA69F]]],
+    ['Arabic',     [[0x600,0x6FF],[0x750,0x7FF],[0x8A0,0x8FF],[0xFB50,0xFDFF],
+                    [0xFE70,0xFEFF],[0x10E60,0x10E7F],[0x1EE00,0x1EEFF]]],
+    ['Mandarin',   [[0x2E80,0x2E99],[0x2E9B,0x2EF3],[0x2F00,0x2FD5],[0x3005,0x3005],
+                    [0x3007,0x3007],[0x3021,0x3029],[0x3038,0x303B],[0x3400,0x4DB5],
+                    [0x4E00,0x9FCC],[0xF900,0xFA6D],[0xFA70,0xFAD9]]],
+    ['Devanagari', [[0x900,0x97F],[0xA8E0,0xA8FF],[0x1CD0,0x1CFF]]],
+    ['Hebrew',     [[0x590,0x5FF]]],
+    ['Ethiopic',   [[0x1200,0x139F],[0x2D80,0x2DDF],[0xAB00,0xAB2F]]],
+    ['Georgian',   [[0x10A0,0x10FF]]],
+    ['Bengali',    [[0x980,0x9FF]]],
+    ['Hangul',     [[0xAC00,0xD7AF],[0x1100,0x11FF],[0x3130,0x318F],[0x3200,0x32FF],
+                    [0xA960,0xA97F],[0xD7B0,0xD7FF]]],
+    ['Hiragana',   [[0x3040,0x309F]]],
+    ['Katakana',   [[0x30A0,0x30FF]]],
+    ['Greek',      [[0x370,0x3FF]]],
+    ['Kannada',    [[0xC80,0xCFF]]],
+    ['Tamil',      [[0xB80,0xBFF]]],
+    ['Thai',       [[0xE00,0xE7F]]],
+    ['Gujarati',   [[0xA80,0xAFF]]],
+    ['Gurmukhi',   [[0xA00,0xA7F]]],
+    ['Telugu',     [[0xC00,0xC7F]]],
+    ['Malayalam',  [[0xD00,0xD7F]]],
+    ['Oriya',      [[0xB00,0xB7F]]],
+    ['Myanmar',    [[0x1000,0x109F]]],
+    ['Sinhala',    [[0xD80,0xDFF]]],
+    ['Khmer',      [[0x1780,0x17FF],[0x19E0,0x19FF]]],
+    ['Armenian',   [[0x530,0x58F],[0xFB13,0xFB17]]]
+  ].freeze
+  # ISO 639-1 (with 639-3 fallback) lookup by language-name symbol, plus the
+  # historical nil => nil entry. Internal; kept for backward compatibility.
+  ISO_CODES = CODE_INFO.each_with_object(nil => nil) do |(_code, (name, iso)), h|
+    h[name] = iso
+  end.freeze
+  NAME_TO_CODE = CODE_INFO.each_with_object({}) do |(code, (name, _iso)), h|
+    h[name] ||= code
+  end.freeze
+  private_constant :MAX_TRIGRAM_DISTANCE, :MAX_TOTAL_DISTANCE, :TEXT_TRIGRAMS_SIZE,
+                   :DEFAULT_MIN_CHARS, :DETERMINISTIC, :SCRIPT_RANGES, :ISO_CODES,
+                   :NAME_TO_CODE
+  class << self
+    def detect(text)
+      default_detector.detect(text)
+    end
+    def language(text)
+      default_detector.language(text)
+    end
+    def language_iso(text)
+      default_detector.language_iso(text)
+    end
+    def ranked(text)
+      default_detector.ranked(text)
+    end
+    def score_hash(text)
+      default_detector.score_hash(text)
+    end
+    alias scores score_hash
+    alias process_text score_hash
+    def languages
+      NAME_TO_CODE.keys
+    end
+    # script name => [[code, [trigram, ...]], ...], loaded once and memoized.
+    def profiles
+      @profiles ||= JSON.parse(File.read(File.join(__dir__, 'whatlanguage', 'trigrams.json')))
+                        .transform_values { |langs| langs.map { |code, str| [code, str.split('|')] } }
+    end
+    private
+    def default_detector
+      @default_detector ||= new
     end
   end
-  # Very inefficient method for now.. but still beats the non-Bloom alternatives.
-  # Change to better bit comparison technique later..
-  def process_text(text)
-    results = Hash.new(0)
-    it = 0
-    text.downcase.split.each do |word|
-      it += 1
+  def initialize(*selection, only: nil, min_chars: DEFAULT_MIN_CHARS)
+    @selection = Array(only || (selection.empty? ? [:all] : selection))
+    validate_selection!
+    @min_chars = min_chars
+  end
+  # Language-name symbols this instance scores against: every supported language
+  # for :all, otherwise the requested selection intersected with the supported
+  # set (legacy aliases such as :pinyin resolved to their modern names).
+  def languages
+    @languages ||=
       if @selection.include?(:all)
-        languages = @@data.keys
+        self.class.languages
       else
-        languages = @@data.keys & @selection  # intersection
+        wanted = @selection.map { |s| NAME_ALIASES.fetch(s, s) }
+        self.class.languages & wanted
       end
+  end
-      languages.each do |lang|
-        results[lang] += 1 if @@data[lang].includes?(word)
-      end
-      # Every now and then check to see if we have a really convincing result.. if so, exit early.
-      if it % 4 == 0 && results.size > 1
-        top_results = results.sort_by{|a,b| -b}[0..1]
-        # Next line may need some tweaking one day..
-        break if top_results[0][1] > 4 && ((top_results[0][1] > top_results[1][1] * 2) || (top_results[0][1] - top_results[1][1] > 25))
-      end
-      #break if it > 100
+  # Per-language scores for the text (higher = more likely). Languages outside
+  # the current selection, or not under the detected script, are absent; the
+  # hash defaults to 0. Only the relative ranking is meaningful.
+  def score_hash(text)
+    results = Hash.new(0)
+    text = normalize_text(text)
+    script = detect_script(text)
+    return results unless script
+    if (code = DETERMINISTIC[script])
+      name = CODE_INFO[code].first
+      results[name] = MAX_TOTAL_DISTANCE if allowed?(name)
+      return results
+    end
+    candidates = self.class.profiles[script]
+    return results unless candidates
+    return results if significant_char_count(text) < @min_chars
+    positions = trigram_positions(text)
+    candidates.each do |code, trigrams|
+      name = CODE_INFO[code].first
+      next unless allowed?(name)
+      results[name] = MAX_TOTAL_DISTANCE - distance(trigrams, positions)
     end
     results
   end
+  alias scores score_hash
+  alias process_text score_hash
+  # Per-language scores as an array sorted from most likely to least likely.
+  def ranked(text)
+    score_hash(text).sort_by { |_name, score| -score }
+  end
+  # Detection result with the winning language, ISO code, winning score, and
+  # full ranked scores. Returns nil when the text is too short or unrecognized.
+  def detect(text)
+    ranked_scores = ranked(text)
+    return nil if ranked_scores.empty?
+    name, score = ranked_scores.first
+    Result.new(language: name, iso: ISO_CODES[name], score: score, ranked: ranked_scores)
+  end
+  # Most likely language as a name symbol, or nil when no language is detected.
   def language(text)
-    process_text(text).max { |a,b| a[1] <=> b[1] }.first rescue nil
+    detect(text)&.language
   end
-  def self.filter_from_dictionary(filename)
-    bf = BloominSimple.new(BITFIELD_WIDTH, &HASHER)
-    File.open(filename).each { |word| bf.add(word) }
-    bf
+  # Most likely language as an ISO 639-1 symbol (639-3 fallback), or nil.
+  def language_iso(text)
+    detect(text)&.iso
+  end
+  private
+  def normalize_text(text)
+    text.to_s.unicode_normalize(:nfkc)
+  end
+  def allowed?(name)
+    @selection.include?(:all) || languages.include?(name)
+  end
+  def validate_selection!
+    requested = @selection.reject { |name| name == :all }
+    unknown = requested.reject { |name| self.class.languages.include?(NAME_ALIASES.fetch(name, name)) }
+    return if unknown.empty?
+    raise ArgumentError, "Unknown language selection: #{unknown.map(&:inspect).join(', ')}"
+  end
+  def significant_char_count(text)
+    text.each_char.count { |ch| !stop_char?(ch.ord) }
+  end
+  # Dominant Unicode script of the text, or nil if it has no script characters.
+  def detect_script(text)
+    counts = Hash.new(0)
+    text.each_char do |ch|
+      cp = ch.ord
+      next if stop_char?(cp)
+      SCRIPT_RANGES.each do |name, ranges|
+        if ranges.any? { |lo, hi| cp >= lo && cp <= hi }
+          counts[name] += 1
+          break
+        end
+      end
+    end
+    return nil if counts.empty?
+    counts.max_by { |_name, n| n }.first
+  end
+  # Text trigrams ranked by descending frequency, mapped to their rank index.
+  # Mirrors whatlang's trigram extraction: punctuation/digits become spaces,
+  # the stream is bounded by spaces, and runs of spaces are collapsed.
+  def trigram_positions(text)
+    chars = text.downcase.each_char.map { |c| stop_char?(c.ord) ? ' ' : c }
+    return {} if chars.empty?
+    occurrences = Hash.new(0)
+    c1 = ' '
+    c2 = chars[0]
+    (chars[1..] + [' ']).each do |c3|
+      occurrences[c1 + c2 + c3] += 1 unless c2 == ' ' && (c1 == ' ' || c3 == ' ')
+      c1 = c2
+      c2 = c3
+    end
+    ranked = occurrences.to_a.sort { |a, b| [b[1], b[0]] <=> [a[1], a[0]] }.first(TEXT_TRIGRAMS_SIZE)
+    positions = {}
+    ranked.each_with_index { |(trigram, _count), i| positions[trigram] = i }
+    positions
+  end
+  # Out-of-place distance between a language's ordered trigram profile and the
+  # text's ranked trigrams. Lower means a closer match.
+  def distance(profile, positions)
+    total = 0
+    profile.each_with_index do |trigram, i|
+      pos = positions[trigram]
+      total += pos ? (pos - i).abs : MAX_TRIGRAM_DISTANCE
+    end
+    count = positions.size
+    total -= (MAX_TRIGRAM_DISTANCE - count) * MAX_TRIGRAM_DISTANCE if MAX_TRIGRAM_DISTANCE > count
+    total.clamp(0, MAX_TOTAL_DISTANCE)
   end
-end
-class String
-  def language
-    WhatLanguage.new(:all).language(self)
+  # Space, ASCII punctuation, or digit: no value for script/language detection.
+  def stop_char?(codepoint)
+    codepoint <= 0x40 || (codepoint >= 0x5B && codepoint <= 0x60) || (codepoint >= 0x7B && codepoint <= 0x7E)
   end
 end

data/whatlanguage.gemspec CHANGED Viewed

@@ -1,19 +1,30 @@
-# -*- encoding: utf-8 -*-
-lib = File.expand_path('../lib', __FILE__)
+# frozen_string_literal: true
+lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'whatlanguage/version'
 Gem::Specification.new do |gem|
-  gem.name          = "whatlanguage"
+  gem.name          = 'whatlanguage'
   gem.version       = WhatLanguage::VERSION
-  gem.authors       = ["Peter Cooper"]
-  gem.email         = ["git@peterc.org"]
-  gem.description   = %q{WhatLanguage rapidly detects the language of a sample of text}
-  gem.summary       = %q{Natural language detection for text samples}
-  gem.homepage      = "https://github.com/peterc/whatlanguage"
+  gem.authors       = ['Peter Cooper']
+  gem.email         = ['git@peterc.org']
+  gem.description   = 'WhatLanguage rapidly detects the language of a sample of text'
+  gem.summary       = 'Natural language detection for text samples'
+  gem.homepage      = 'https://github.com/peterc/whatlanguage'
+  gem.license       = 'MIT'
+  gem.required_ruby_version = '>= 3.0'
+  gem.files = Dir['lib/**/*'] + [
+    'README.md',
+    'CHANGELOG.md',
+    'LICENSE.txt',
+    'Gemfile',
+    'Rakefile',
+    'whatlanguage.gemspec'
+  ]
+  gem.require_paths = ['lib']
-  gem.files         = `git ls-files`.split($/).reject { |f| f.start_with?("wordlists") }
-  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
-  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
-  gem.require_paths = ["lib"]
-end
+  gem.add_development_dependency 'minitest', '~> 5.0'
+  gem.add_development_dependency 'rake'
+end

metadata CHANGED Viewed

@@ -1,16 +1,42 @@
 --- !ruby/object:Gem::Specification
 name: whatlanguage
 version: !ruby/object:Gem::Version
-  version: 1.0.5
-  prerelease:
+  version: 2.0.0
 platform: ruby
 authors:
 - Peter Cooper
-autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-10-05 00:00:00.000000000 Z
-dependencies: []
+date: 1980-01-02 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: WhatLanguage rapidly detects the language of a sample of text
 email:
 - git@peterc.org
@@ -18,65 +44,35 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
+- CHANGELOG.md
 - Gemfile
-- History.txt
 - LICENSE.txt
-- Manifest.txt
 - README.md
 - Rakefile
-- build_filter.rb
-- build_lang_from_wordlists.rb
-- copyright-en
-- example.rb
-- lang/arabic.lang
-- lang/dutch.lang
-- lang/english.lang
-- lang/farsi.lang
-- lang/finnish.lang
-- lang/french.lang
-- lang/german.lang
-- lang/greek.lang
-- lang/hebrew.lang
-- lang/hungarian.lang
-- lang/italian.lang
-- lang/korean.lang
-- lang/norwegian.lang
-- lang/pinyin.lang
-- lang/polish.lang
-- lang/portuguese.lang
-- lang/russian.lang
-- lang/spanish.lang
-- lang/swedish.lang
 - lib/whatlanguage.rb
-- lib/whatlanguage/bitfield.rb
-- lib/whatlanguage/bloominsimple.rb
+- lib/whatlanguage/languages.rb
+- lib/whatlanguage/trigrams.json
 - lib/whatlanguage/version.rb
-- test/test_whatlanguage.rb
 - whatlanguage.gemspec
 homepage: https://github.com/peterc/whatlanguage
-licenses: []
-post_install_message:
+licenses:
+- MIT
+metadata: {}
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: '3.0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 1.8.24
-signing_key:
-specification_version: 3
+rubygems_version: 4.0.6
+specification_version: 4
 summary: Natural language detection for text samples
-test_files:
-- test/test_whatlanguage.rb
+test_files: []

data/.gitignore DELETED Viewed

@@ -1,17 +0,0 @@
-*.gem
-*.rbc
-.bundle
-.config
-.yardoc
-Gemfile.lock
-InstalledFiles
-_yardoc
-coverage
-doc/
-lib/bundler/man
-pkg
-rdoc
-spec/reports
-test/tmp
-test/version_tmp
-tmp

data/History.txt DELETED Viewed

@@ -1,15 +0,0 @@
-== 1.0.5 / 2013-10-05
-* Many more languages supported
-== 1.0.4 / 2013-03-07
-== 1.0.1 / 2008-08-22
-* Public release
-* Removed wordlists from distribution to reduce size
-== 1.0.0 / 2007-07-02
-* First version with pre-built English, French, and Spanish filters

data/Manifest.txt DELETED Viewed

@@ -1,19 +0,0 @@
-History.txt
-Manifest.txt
-README.txt
-Rakefile
-build_filter.rb
-example.rb
-lang/dutch.lang
-lang/farsi.lang
-lang/german.lang
-lang/pinyin.lang
-lang/russian.lang
-lang/english.lang
-lang/portuguese.lang
-lang/french.lang
-lang/spanish.lang
-lib/bitfield.rb
-lib/bloominsimple.rb
-lib/whatlanguage.rb
-test/test_whatlanguage.rb

data/build_filter.rb DELETED Viewed

@@ -1,9 +0,0 @@
-# Use this to build new filters (for other languages, ideally) from /usr/share/dict/words style dictionaries..
-#
-# Call like so..
-#   ruby build_filter.rb /usr/share/dict/words lang/english.lang
-# (replace params as necessary)
-require 'lib/whatlanguage'
-filter = WhatLanguage.filter_from_dictionary(ARGV[0])
-File.open(ARGV[1], 'wb') { |f| f.write filter.dump }

data/build_lang_from_wordlists.rb DELETED Viewed

@@ -1,13 +0,0 @@
-# Builds all of the word lists in ./wordlists/ into filter files in ./lang/
-require 'lib/whatlanguage'
-languages_folder = File.join(File.dirname(__FILE__), "lang")
-wordlists_folder = File.join(File.dirname(__FILE__), "wordlists")
-Dir.entries(wordlists_folder).grep(/\w/).each do |lang|
-  next if lang == 'generators'
-  puts "Doing #{lang}"
-  filter = WhatLanguage.filter_from_dictionary(File.join(wordlists_folder, lang))
-  File.open(File.join(languages_folder, lang + ".lang"), 'wb') { |f| f.write filter.dump }
-end