stuff-classifier-chinese 0.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
+ # -*- encoding : utf-8 -*-
2
+ module StuffClassifier
3
+
4
class Storage
  # Mixed in (via `extend`) by classifier classes to declare which
  # instance variables a storage backend should persist.
  module ActAsStorable
    # Declare the instance-variable names (symbols) to persist.
    def storable(*to_store)
      @to_store = to_store
    end

    # Names declared via #storable, or [] when nothing was declared.
    def to_store
      @to_store || []
    end
  end

  # In-memory cache: { classifier_name => { ivar_name => value } }
  attr_accessor :storage

  def initialize(*opts)
    @storage = {}
  end

  # Restore previously saved instance variables onto +classifier+.
  # No-op when nothing is stored under the classifier's name.
  def storage_to_classifier(classifier)
    return unless @storage.key?(classifier.name)

    @storage[classifier.name].each do |var, value|
      classifier.instance_variable_set("@#{var}", value)
    end
  end

  # Snapshot the storable instance variables of +classifier+ into the
  # cache, combining the lists declared on its class and its direct
  # superclass.
  def classifier_to_storage(classifier)
    to_store = classifier.class.to_store + classifier.class.superclass.to_store
    @storage[classifier.name] = to_store.each_with_object({}) do |var, snapshot|
      snapshot[var] = classifier.instance_variable_get("@#{var}")
    end
  end

  # Drop any saved state for +classifier+.
  def clear_storage(classifier)
    @storage.delete(classifier.name)
  end
end
38
+
39
# Keeps classifier state only for the lifetime of the process; nothing
# is written to disk or any external service. (The redundant
# `initialize` that merely called `super` has been removed — the
# inherited Storage#initialize is identical.)
class InMemoryStorage < Storage
  # Push any cached state onto +classifier+.
  def load_state(classifier)
    storage_to_classifier(classifier)
  end

  # Cache the classifier's storable instance variables.
  def save_state(classifier)
    classifier_to_storage(classifier)
  end

  # Forget everything saved for +classifier+.
  def purge_state(classifier)
    clear_storage(classifier)
  end
end
57
+
58
# Persists classifier state to a single file using Marshal.
class FileStorage < Storage
  # path: file the marshalled state is read from / written to.
  def initialize(path)
    super
    @path = path
  end

  # Lazily read the marshalled state from disk (only while the cache is
  # still empty), then push it onto +classifier+.
  def load_state(classifier)
    # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
    if @storage.empty? && File.exist?(@path)
      # NOTE: Marshal.load can execute arbitrary code on crafted input —
      # only ever point @path at trusted files.
      @storage = Marshal.load(File.binread(@path))
    end
    storage_to_classifier(classifier)
  end

  # Cache the classifier's state and flush the whole cache to disk.
  def save_state(classifier)
    classifier_to_storage(classifier)
    _write_to_file
  end

  # Forget the classifier's state and flush the whole cache to disk.
  def purge_state(classifier)
    clear_storage(classifier)
    _write_to_file
  end

  # Serialize the entire cache to @path under an exclusive lock.
  # (Kept public as in the original; RedisStorage keeps its counterpart
  # private — making this private would break existing callers.)
  def _write_to_file
    File.open(@path, 'wb') do |fh|
      fh.flock(File::LOCK_EX)
      fh.write(Marshal.dump(@storage))
    end
  end
end
90
+
91
# Persists classifier state in a single Redis key using Marshal.
class RedisStorage < Storage
  # key: the Redis key holding the marshalled state.
  # redis_options: passed straight to Redis.new (nil -> {}).
  def initialize(key, redis_options=nil)
    super
    @key = key
    @redis = Redis.new(redis_options || {})
  end

  # Lazily fetch the marshalled state from Redis (only while the cache
  # is still empty), then push it onto +classifier+.
  def load_state(classifier)
    # NOTE(review): Redis#exists returns an Integer (not a boolean) in
    # redis-rb >= 4.2 — confirm the pinned gem version, or use #exists?.
    if @storage.length == 0 && @redis.exists(@key)
      data = @redis.get(@key)
      # NOTE: Marshal.load can execute arbitrary code on crafted input —
      # only safe if the Redis instance is trusted.
      @storage = Marshal.load(data)
    end
    storage_to_classifier(classifier)
  end

  # Cache the classifier's state and flush the whole cache to Redis.
  def save_state(classifier)
    classifier_to_storage(classifier)
    _write_to_redis
  end

  # Forget the classifier's state and flush the whole cache to Redis.
  def purge_state(classifier)
    clear_storage(classifier)
    _write_to_redis
  end

  private

  # Dump the entire cache into the configured Redis key.
  def _write_to_redis
    data = Marshal.dump(@storage)
    @redis.set(@key, data)
  end
end
122
+ end
@@ -0,0 +1,45 @@
1
+ # -*- encoding : utf-8 -*-
2
# TF-IDF classifier: scores a document for each category by summing the
# tf-idf weights of its words.
class StuffClassifier::TfIdf < StuffClassifier::Base
  extend StuffClassifier::Storage::ActAsStorable

  def initialize(name, opts={})
    super(name, opts)
  end

  # tf-idf weight of +word+ within category +cat+.
  #   tf : relative frequency of the word inside the category.
  #   idf: log10-scaled inverse of how many categories contain the word;
  #        the +2 / +1.0 smoothing avoids division by zero and log10(<=0).
  def word_prob(word, cat)
    tf = 1.0 * word_count(word, cat) / cat_count(cat)
    idf = Math.log10((total_categories + 2) / (categories_with_word_count(word) + 1.0))
    tf * idf
  end

  # Sum of tf-idf weights over every token of +text+ for +cat+.
  def text_prob(text, cat)
    @tokenizer.each_word(text).map { |w| word_prob(w, cat) }.sum
  end

  # [[category, score], ...] sorted by descending score.
  def cat_scores(text)
    categories
      .map { |cat| [cat, text_prob(text, cat)] }
      .sort_by { |_cat, score| -score }
  end

  # Debugging helper: prints per-category tf-idf and text scores for a
  # single word (uses awesome_print's `ap`, a development dependency).
  def word_classification_detail(word)
    p "tf_idf"
    ap categories.each_with_object({}) { |cat, h| h[cat] = word_prob(word, cat) }

    p "text_prob"
    ap categories.each_with_object({}) { |cat, h| h[cat] = text_prob(word, cat) }
  end
end
@@ -0,0 +1,96 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ require "lingua/stemmer"
4
+ require 'rmmseg'
5
+ require 'debugger'
6
+
7
# Splits text into normalized tokens: RMMSeg segmentation (handles
# Chinese), stop-word filtering, downcasing and optional stemming.
class StuffClassifier::Tokenizer
  require "stuff-classifier/tokenizer/tokenizer_properties"

  include RMMSeg
  # Load the mmseg dictionaries once, at class-definition time.
  RMMSeg::Dictionary.load_dictionaries

  # opts:
  #   :language - language code for stop words / stemmer (default "en")
  #   :stemming - whether to stem tokens (default true)
  def initialize(opts={})
    @language = opts.key?(:language) ? opts[:language] : "en"
    @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language]

    @stemming = opts.key?(:stemming) ? opts[:stemming] : true
    @stemmer = Lingua::Stemmer.new(:language => @language) if @stemming
  end

  attr_reader :language
  attr_writer :preprocessing_regexps, :ignore_words

  # Regexp => replacement pairs applied to each line before tokenizing;
  # falls back to the per-language defaults (may be nil, e.g. for "fr").
  def preprocessing_regexps
    @preprocessing_regexps || @properties[:preprocessing_regexps]
  end

  # Stop-word set; falls back to the per-language defaults.
  def ignore_words
    @ignore_words || @properties[:stop_word]
  end

  def stemming?
    @stemming || false
  end

  # Tokenize +string+. Yields each normalized word when a block is
  # given; returns the array of words. Blank input now yields []
  # (previously nil), so callers can always map/iterate the result.
  def each_word(string)
    string = string.strip
    return [] if string == ''

    words = []

    string.split("\n").each do |line|
      # Apply the per-language preprocessing substitutions in place.
      if preprocessing_regexps
        preprocessing_regexps.each { |regexp, replace_by| line.gsub!(regexp, replace_by) }
      end

      segment(line).each do |w|
        next if w == '' || ignore_words.member?(w.downcase)

        if stemming? and stemable?(w)
          w = @stemmer.stem(w).downcase
          # The stemmed form may itself be a stop word.
          next if ignore_words.member?(w)
        else
          w = w.downcase
        end

        words << (block_given? ? (yield w) : w)
      end
    end

    words
  end

  private

  # Whether +word+ should be passed through the stemmer.
  # Currently everything is considered stemable.
  def stemable?(word)
    true
    #word =~ /^\p{Alpha}+$/
  end

  # Segment +text+ into tokens with RMMSeg.
  def segment(text)
    algor = RMMSeg::Algorithm.new(text)
    result = []
    loop do
      tok = algor.next_token
      break if tok.nil?
      result << tok.text
    end
    result
  end
end
@@ -0,0 +1,81 @@
1
+ # -*- encoding : utf-8 -*-
2
require 'set'
# Per-language tokenizer configuration: preprocessing substitutions and
# stop-word sets. The outer hash is frozen so the shared constant cannot
# be mutated accidentally (the nested Sets are intentionally left as-is).
StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES = {
  "en" => {
    :preprocessing_regexps => {/['`]/ => '', /[_]/ => ' '},
    :stop_word => Set.new([
      # NOTE(review): Chinese particles in the "en" set — this fork
      # targets mixed Chinese/English text; confirm this is intentional.
      '的','个','得',
      'a', 'about', 'above', 'across', 'after', 'afterwards',
      'again', 'against', 'all', 'almost', 'alone', 'along',
      'already', 'also', 'although', 'always', 'am', 'among',
      'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
      'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
      'are', 'around', 'as', 'at', 'back', 'be',
      'became', 'because', 'become', 'becomes', 'becoming', 'been',
      'before', 'beforehand', 'behind', 'being', 'below', 'beside',
      'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
      'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
      'co', 'computer', 'con', 'could', 'couldnt', 'cry',
      'de', 'describe', 'detail', 'do', 'done', 'down',
      'due', 'during', 'each', 'eg', 'eight', 'either',
      'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
      'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
      'fify', 'fill', 'find', 'fire', 'first', 'five',
      'for', 'former', 'formerly', 'forty', 'found', 'four',
      'from', 'front', 'full', 'further', 'get', 'give',
      'go', 'had', 'has', 'hasnt', 'have', 'he',
      'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
      'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
      'how', 'however', 'hundred', 'i', 'ie', 'if',
      'in', 'inc', 'indeed', 'interest', 'into', 'is',
      'it', 'its', 'itself', 'keep', 'last', 'latter',
      'latterly', 'least', 'less', 'ltd', 'made', 'many',
      'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
      'more', 'moreover', 'most', 'mostly', 'move', 'much',
      'must', 'my', 'myself', 'name', 'namely', 'neither',
      'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
      'none', 'noone', 'nor', 'not', 'nothing', 'now',
      'nowhere', 'of', 'off', 'often', 'on', 'once',
      'one', 'only', 'onto', 'or', 'other', 'others',
      'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
      'own', 'part', 'per', 'perhaps', 'please', 'put',
      'rather', 're', 'same', 'see', 'seem', 'seemed',
      'seeming', 'seems', 'serious', 'several', 'she', 'should',
      'show', 'side', 'since', 'sincere', 'six', 'sixty',
      'so', 'some', 'somehow', 'someone', 'something', 'sometime',
      'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
      'ten', 'than', 'that', 'the', 'their', 'them',
      'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
      'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
      'thin', 'third', 'this', 'those', 'though', 'three',
      'through', 'throughout', 'thru', 'thus', 'to', 'together',
      'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
      'two', 'un', 'under', 'until', 'up', 'upon',
      'us', 'very', 'via', 'was', 'we', 'well',
      'were', 'what', 'whatever', 'when', 'whence', 'whenever',
      'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
      'wherever', 'whether', 'which', 'while', 'whither', 'who',
      'whoever', 'whole', 'whom', 'whose', 'why', 'will',
      'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
      'yourself', 'yourselves'
    ])
  },"fr" => {
    # NOTE(review): no :preprocessing_regexps entry for "fr" —
    # Tokenizer#preprocessing_regexps returns nil and skips that step.
    :stop_word => Set.new([
      'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux',
      'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon',
      'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa',
      'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre',
      'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées',
      'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
      'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient',
      'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes',
      'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût',
      'fussions', 'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as',
      'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais',
      'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient',
      'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse',
      # NOTE(review): 'celà ' carries a trailing space (likely a typo);
      # kept byte-for-byte to preserve matching behavior.
      'eusses', 'eût', 'eussions', 'eussiez', 'eussent', 'ceci', 'celà ', 'cet', 'cette', 'ici',
      'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi'
    ])
  }
}.freeze
@@ -0,0 +1,4 @@
1
+ # -*- encoding : utf-8 -*-
2
module StuffClassifier
  # Gem version string, frozen so it cannot be mutated by callers.
  VERSION = '0.51'.freeze
end
@@ -0,0 +1,36 @@
1
+ # -*- encoding: utf-8 -*-
2
$:.push File.expand_path("../lib", __FILE__)
require "stuff-classifier/version"

Gem::Specification.new do |s|
  s.name        = "stuff-classifier-chinese"
  s.version     = StuffClassifier::VERSION
  s.authors     = ["Tim Lang"]
  s.email       = ["langyong135@gmail.com"]
  s.homepage    = "https://github.com/TimLang/stuff-classifier/"
  # Typo fixed: "implemetation" -> "implementation".
  s.summary     = %q{Simple text classifier(s) implementation Chinese version}
  s.description = %q{forked from https://github.com/alexandru/stuff-classifier, 2 methods are provided for now - (1) naive bayes implementation + (2) tf-idf weights}

  # File lists are derived from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.required_ruby_version = '>= 1.9.1'

  s.add_runtime_dependency "ruby-stemmer"
  s.add_runtime_dependency "sequel"
  s.add_runtime_dependency "redis"

  s.add_development_dependency "bundler"
  s.add_development_dependency "rake", ">= 0.9.2"
  s.add_development_dependency "minitest", "~> 4"
  s.add_development_dependency "turn", ">= 0.8.3"
  s.add_development_dependency "simplecov"
  s.add_development_dependency "awesome_print"
  s.add_development_dependency "rmmseg-cpp-huacnlee"
  s.add_development_dependency "debugger"
end
36
+
data/test/helper.rb ADDED
@@ -0,0 +1,50 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'simplecov'
3
+ SimpleCov.start
4
+
5
+ require 'turn'
6
+ require 'minitest/autorun'
7
+ require 'stuff-classifier'
8
+
9
# Configure the Turn test reporter used by this suite.
Turn.config do |c|
  # use one of output formats:
  # :outline - turn's original case/test outline mode [default]
  # :progress - indicates progress with progress bar
  # :dotted - test/unit's traditional dot-progress mode
  # :pretty - new pretty reporter
  # :marshal - dump output as YAML (normal run mode only)
  # :cue - interactive testing
  # NOTE(review): :cue is the interactive reporter — confirm it is
  # intended for automated/CI runs and not just local development.
  c.format = :cue
  # turn on invoke/execute tracing, enable full backtrace
  c.trace = true
  # use humanized test names (works only with :outline format)
  c.natural = true
end
23
+
24
# Shared base class for this gem's test cases: a minimal before-hook
# mechanism plus helpers for training and asserting classifications.
class TestBase < MiniTest::Unit::TestCase
  # With a block: register it to run before each test (stored on the
  # subclass). Without a block: return the registered hook, if any.
  def self.before(&block)
    @on_setup = block if block
    @on_setup
  end

  # Run the subclass's registered before-hook in test-instance context.
  def setup
    hook = self.class.before
    instance_eval(&hook) if hook
  end

  # Install the classifier instance the helpers below operate on.
  def set_classifier(instance)
    @classifier = instance
  end

  # The classifier under test.
  def classifier
    @classifier
  end

  # Train the classifier with one example.
  def train(category, value)
    @classifier.train(category, value)
  end

  # Assert that +value+ is classified as +category+.
  def should_be(category, value)
    assert_equal category, @classifier.classify(value), value
  end
end
@@ -0,0 +1,51 @@
1
+ # -*- encoding : utf-8 -*-
2
+ # -*- coding: utf-8 -*-
3
# `require './helper.rb'` only worked when the tests ran from this
# directory (Ruby >= 1.9 removed "." from the load path);
# require_relative resolves against this file instead.
require_relative 'helper'

# Exercises the tokenizer with the default English setup and a French one.
class Test001Tokenizer < TestBase
  before do
    @en_tokenizer = StuffClassifier::Tokenizer.new
    @fr_tokenizer = StuffClassifier::Tokenizer.new(:language => "fr")
  end

  # Stop words and punctuation are dropped; tokens are downcased.
  def test_simple_tokens
    words = @en_tokenizer.each_word('Hello world! How are you?')
    should_return = ["hello", "world"]

    assert_equal should_return, words
  end

  # Plurals and suffixes are reduced to stems ("really" -> "realli").
  def test_with_stemming
    words = @en_tokenizer.each_word('Lots of dogs, lots of cats! This really is the information highway')
    should_return = ["lot", "dog", "lot", "cat", "realli", "inform", "highway"]

    assert_equal should_return, words
  end

  # Mixed punctuation, identifiers and a multi-line string.
  def test_complicated_tokens
    words = @en_tokenizer.each_word("I don't really get what you want to
      accomplish. There is a class TestEval2, you can do test_eval2 =
      TestEval2.new afterwards. And: class A ... end always yields nil, so
      your output is ok I guess ;-)")

    should_return = [
      "realli", "want", "accomplish", "class",
      "testeval2", "test", "eval2", "testeval2", "new", "class", "end",
      "yield", "nil", "output", "ok", "guess"]

    assert_equal should_return, words
  end

  # Accented input goes through the French stemmer and stop-word list.
  def test_unicode
    words = @fr_tokenizer.each_word("il s'appelle le vilain petit canard : en référence à Hans Christian Andersen, se démarquer négativement")

    should_return = [
      "appel", "vilain", "pet", "canard", "référent",
      "han", "christian", "andersen", "démarqu", "négat"]

    assert_equal should_return, words
  end
end