RubyGems - stuff-classifier-zh - Versions diffs - 0.5.2 - Mend

stuff-classifier-zh 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

checksums.yaml +7 -0
data/.gitignore +6 -0
data/Gemfile +3 -0
data/LICENSE.txt +20 -0
data/README.md +162 -0
data/Rakefile +12 -0
data/lib/stuff-classifier.rb +16 -0
data/lib/stuff-classifier/base.rb +190 -0
data/lib/stuff-classifier/bayes.rb +81 -0
data/lib/stuff-classifier/storage.rb +122 -0
data/lib/stuff-classifier/tf-idf.rb +44 -0
data/lib/stuff-classifier/tokenizer.rb +94 -0
data/lib/stuff-classifier/tokenizer/tokenizer_properties.rb +107 -0
data/lib/stuff-classifier/version.rb +3 -0
data/stuff-classifier.gemspec +36 -0
data/test/helper.rb +49 -0
data/test/test_001_tokenizer.rb +62 -0
data/test/test_002_base.rb +38 -0
data/test/test_003_naive_bayes.rb +56 -0
data/test/test_004_tf_idf.rb +37 -0
data/test/test_005_in_memory_storage.rb +31 -0
data/test/test_006_file_storage.rb +77 -0
data/test/test_007_redis_storage.rb +81 -0
metadata +228 -0

data/lib/stuff-classifier/storage.rb ADDED Viewed

@@ -0,0 +1,122 @@
+# encoding : UTF-8
+module StuffClassifier
+  class Storage
+    module ActAsStorable
+        def storable(*to_store)
+          @to_store = to_store
+        end
+        def to_store
+          @to_store || []
+        end
+    end
+    attr_accessor :storage
+    def initialize(*opts)
+      @storage = {}
+    end
+    def storage_to_classifier(classifier)
+      if @storage.key? classifier.name
+        @storage[classifier.name].each do |var,value|
+          classifier.instance_variable_set "@#{var}",value
+        end
+      end
+    end
+    def classifier_to_storage(classifier)
+      to_store = classifier.class.to_store + classifier.class.superclass.to_store
+      @storage[classifier.name] =  to_store.inject({}) {|h,var| h[var] = classifier.instance_variable_get("@#{var}");h}
+    end
+    def clear_storage(classifier)
+      @storage.delete(classifier.name)
+    end
+  end
+  class InMemoryStorage < Storage
+    def initialize
+      super
+    end
+    def load_state(classifier)
+      storage_to_classifier(classifier)
+    end
+    def save_state(classifier)
+      classifier_to_storage(classifier)
+    end
+    def purge_state(classifier)
+      clear_storage(classifier)
+    end
+  end
+  class FileStorage < Storage
+    def initialize(path)
+      super
+      @path = path
+    end
+    def load_state(classifier)
+      if @storage.length == 0 && File.exists?(@path)
+        data = File.open(@path, 'rb') { |f| f.read }
+        @storage = Marshal.load(data)
+      end
+      storage_to_classifier(classifier)
+    end
+    def save_state(classifier)
+      classifier_to_storage(classifier)
+      _write_to_file
+    end
+    def purge_state(classifier)
+      clear_storage(classifier)
+      _write_to_file
+    end
+    def _write_to_file
+      File.open(@path, 'wb') do |fh|
+        fh.flock(File::LOCK_EX)
+        fh.write(Marshal.dump(@storage))
+      end
+    end
+  end
+  class RedisStorage < Storage
+    def initialize(key, redis_options=nil)
+      super
+      @key = key
+      @redis = Redis.new(redis_options || {})
+    end
+    def load_state(classifier)
+      if @storage.length == 0 && @redis.exists(@key)
+        data = @redis.get(@key)
+        @storage = Marshal.load(data)
+      end
+      storage_to_classifier(classifier)
+    end
+    def save_state(classifier)
+      classifier_to_storage(classifier)
+      _write_to_redis
+    end
+    def purge_state(classifier)
+      clear_storage(classifier)
+      _write_to_redis
+    end
+    private
+    def _write_to_redis
+      data = Marshal.dump(@storage)
+      @redis.set(@key, data)
+    end
+  end
+end

data/lib/stuff-classifier/tf-idf.rb ADDED Viewed

@@ -0,0 +1,44 @@
+class StuffClassifier::TfIdf < StuffClassifier::Base
+  extend StuffClassifier::Storage::ActAsStorable
+  def initialize(name, opts={})
+    super(name, opts)
+  end
+  def word_prob(word, cat)
+    word_cat_nr = word_count(word, cat)
+    cat_nr = cat_count(cat)
+    tf = 1.0 * word_cat_nr / cat_nr
+    idf = Math.log10((total_categories + 2) / (categories_with_word_count(word) + 1.0))
+    tf * idf
+  end
+  def text_prob(text, cat)
+    @tokenizer.each_word(text).map{|w| word_prob(w, cat)}.inject(0){|s,p| s + p}
+  end
+  def cat_scores(text)
+    probs = {}
+    categories.each do |cat|
+      p = text_prob(text, cat)
+      probs[cat] = p
+    end
+    probs.map{|k,v| [k,v]}.sort{|a,b| b[1] <=> a[1]}
+  end
+  def word_classification_detail(word)
+    p "tf_idf"
+    result=self.categories.inject({}) do |h,cat| h[cat]=self.word_prob(word,cat);h end
+    ap result
+    p "text_prob"
+    result=categories.inject({}) do |h,cat| h[cat]=text_prob(word,cat);h end
+    ap result
+  end
+end

data/lib/stuff-classifier/tokenizer.rb ADDED Viewed

@@ -0,0 +1,94 @@
+# encoding: utf-8
+require "lingua/stemmer"
+require 'rmmseg'
+require 'rmmseg/dictionary'
+RMMSeg::Dictionary.load_dictionaries
+class StuffClassifier::Tokenizer
+  require  "stuff-classifier/tokenizer/tokenizer_properties"
+  def initialize(opts={})
+    @language = opts.key?(:language) ? opts[:language] : "en"
+    @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language]
+    @stemming = opts.key?(:stemming) ? opts[:stemming] : true
+    if @stemming
+      @stemmer = Lingua::Stemmer.new(:language => @language)
+    end
+  end
+  def language
+    @language
+  end
+  def preprocessing_regexps=(value)
+    @preprocessing_regexps = value
+  end
+  def preprocessing_regexps
+    @preprocessing_regexps || @properties[:preprocessing_regexps]
+  end
+  def ignore_words=(value)
+    @ignore_words = value
+  end
+  def ignore_words
+    @ignore_words || @properties[:stop_word]
+  end
+  def stemming?
+    @stemming || false
+  end
+  def each_word(string)
+    string = string.strip
+    return if string == ''
+    words = []
+    # tokenize string
+    string.split("\n").each do |line|
+      # Apply preprocessing regexps
+      if preprocessing_regexps
+        preprocessing_regexps.each { |regexp,replace_by| line.gsub!(regexp, replace_by) }
+      end
+      list = language == 'zh' ? segment(line) : line.gsub(/\p{Word}+/)
+      list.each do |w|
+          next if w == '' || ignore_words.member?(w.downcase)
+        if stemming? and stemable?(w)
+          w = @stemmer.stem(w).downcase
+          next if ignore_words.member?(w)
+        else
+          w = w.downcase
+        end
+        words << (block_given? ? (yield w) : w)
+      end
+    end
+    return words
+  end
+private
+  def stemable?(word)
+    word =~ /^\p{Alpha}+$/
+  end
+  def segment text
+    algor = RMMSeg::Algorithm.new(text)
+    result = []
+    loop do
+      tok = algor.next_token
+      break if tok.nil?
+      result << tok.text.force_encoding('utf-8')
+    end
+    result
+   end
+end

data/lib/stuff-classifier/tokenizer/tokenizer_properties.rb ADDED Viewed

@@ -0,0 +1,107 @@
+# encoding: utf-8
+require 'set'
+StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES = {
+  "en" => {
+    :preprocessing_regexps => {/['`]/ => '',/[_]/ => ' '},
+    :stop_word => Set.new([
+    'a', 'about', 'above', 'across', 'after', 'afterwards',
+    'again', 'against', 'all', 'almost', 'alone', 'along',
+    'already', 'also', 'although', 'always', 'am', 'among',
+    'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
+    'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
+    'are', 'around', 'as', 'at', 'back', 'be',
+    'became', 'because', 'become', 'becomes', 'becoming', 'been',
+    'before', 'beforehand', 'behind', 'being', 'below', 'beside',
+    'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
+    'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
+    'co', 'computer', 'con', 'could', 'couldnt', 'cry',
+    'de', 'describe', 'detail', 'do', 'done', 'down',
+    'due', 'during', 'each', 'eg', 'eight', 'either',
+    'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
+    'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
+    'fify', 'fill', 'find', 'fire', 'first', 'five',
+    'for', 'former', 'formerly', 'forty', 'found', 'four',
+    'from', 'front', 'full', 'further', 'get', 'give',
+    'go', 'had', 'has', 'hasnt', 'have', 'he',
+    'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
+    'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
+    'how', 'however', 'hundred', 'i', 'ie', 'if',
+    'in', 'inc', 'indeed', 'interest', 'into', 'is',
+    'it', 'its', 'itself', 'keep', 'last', 'latter',
+    'latterly', 'least', 'less', 'ltd', 'made', 'many',
+    'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
+    'more', 'moreover', 'most', 'mostly', 'move', 'much',
+    'must', 'my', 'myself', 'name', 'namely', 'neither',
+    'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
+    'none', 'noone', 'nor', 'not', 'nothing', 'now',
+    'nowhere', 'of', 'off', 'often', 'on', 'once',
+    'one', 'only', 'onto', 'or', 'other', 'others',
+    'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
+    'own', 'part', 'per', 'perhaps', 'please', 'put',
+    'rather', 're', 'same', 'see', 'seem', 'seemed',
+    'seeming', 'seems', 'serious', 'several', 'she', 'should',
+    'show', 'side', 'since', 'sincere', 'six', 'sixty',
+    'so', 'some', 'somehow', 'someone', 'something', 'sometime',
+    'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
+    'ten', 'than', 'that', 'the', 'their', 'them',
+    'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
+    'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
+    'thin', 'third', 'this', 'those', 'though', 'three',
+    'through', 'throughout', 'thru', 'thus', 'to', 'together',
+    'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
+    'two', 'un', 'under', 'until', 'up', 'upon',
+    'us', 'very', 'via', 'was', 'we', 'well',
+    'were', 'what', 'whatever', 'when', 'whence', 'whenever',
+    'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
+    'wherever', 'whether', 'which', 'while', 'whither', 'who',
+    'whoever', 'whole', 'whom', 'whose', 'why', 'will',
+    'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
+    'yourself', 'yourselves'
+    ])
+  },
+  "fr" => {
+    :stop_word => Set.new([
+    'au',  'aux',  'avec',  'ce',  'ces',  'dans',  'de',  'des',  'du',  'elle',  'en',  'et',  'eux',
+    'il',  'je',  'la',  'le',  'leur',  'lui',  'ma',  'mais',  'me',  'même',  'mes',  'moi',  'mon',
+    'ne',  'nos',  'notre',  'nous',  'on',  'ou',  'par',  'pas',  'pour',  'qu',  'que',  'qui',  'sa',
+    'se',  'ses',  'son',  'sur',  'ta',  'te',  'tes',  'toi',  'ton',  'tu',  'un',  'une',  'vos',  'votre',
+    'vous',  'c',  'd',  'j',  'l',  'à',  'm',  'n',  's',  't',  'y',  'été',  'étée',  'étées',
+    'étés',  'étant',  'suis',  'es',  'est',  'sommes',  'êtes',  'sont',  'serai',  'seras',
+    'sera',  'serons',  'serez',  'seront',  'serais',  'serait',  'serions',  'seriez',  'seraient',
+    'étais',  'était',  'étions',  'étiez',  'étaient',  'fus',  'fut',  'fûmes',  'fûtes',
+    'furent',  'sois',  'soit',  'soyons',  'soyez',  'soient',  'fusse',  'fusses',  'fût',
+    'fussions',  'fussiez',  'fussent',  'ayant',  'eu',  'eue',  'eues',  'eus',  'ai',  'as',
+    'avons',  'avez',  'ont',  'aurai',  'auras',  'aura',  'aurons',  'aurez',  'auront',  'aurais',
+    'aurait',  'aurions',  'auriez',  'auraient',  'avais',  'avait',  'avions',  'aviez',  'avaient',
+    'eut',  'eûmes',  'eûtes',  'eurent',  'aie',  'aies',  'ait',  'ayons',  'ayez',  'aient',  'eusse',
+    'eusses',  'eût',  'eussions',  'eussiez',  'eussent',  'ceci',  'celà ',  'cet',  'cette',  'ici',
+    'ils',  'les',  'leurs',  'quel',  'quels',  'quelle',  'quelles',  'sans',  'soi'
+    ])
+  },
+  "de" => {
+    :stop_word => Set.new([
+    'aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an', 'ander', 'andere',
+    'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr', 'anders', 'auch', 'auf',
+    'aus', 'bei', 'bin', 'bis', 'bist', 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', 'das',
+    'daß', 'dass', 'derselbe', 'derselben', 'denselben', 'desselben', 'demselben', 'dieselbe', 'dieselben', 'dasselbe',
+    'dazu', 'dein', 'deine', 'deinem', 'deinen', 'deiner', 'deines', 'denn', 'derer', 'dessen', 'dich', 'dir', 'du',
+    'dies', 'diese', 'diesem', 'diesen', 'dieser', 'dieses', 'doch', 'dort', 'durch', 'ein', 'eine', 'einem', 'einen',
+    'einer', 'eines', 'einig', 'einige', 'einigem', 'einigen', 'einiger', 'einiges', 'einmal', 'er', 'ihn', 'ihm', 'es',
+    'etwas', 'euer', 'eure', 'eurem', 'euren', 'eurer', 'eures', 'für', 'gegen', 'gewesen', 'hab', 'habe', 'haben', 'hat',
+    'hatte', 'hatten', 'hier', 'hin', 'hinter', 'ich', 'mich', 'mir', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres',
+    'euch', 'im', 'in', 'indem', 'ins', 'ist', 'jede', 'jedem', 'jeden', 'jeder', 'jedes', 'jene', 'jenem', 'jenen', 'jener',
+    'jenes', 'jetzt', 'kann', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'keines', 'können', 'könnte', 'machen', 'man', 'manche',
+    'manchem', 'manchen', 'mancher', 'manches', 'mein', 'meine', 'meinem', 'meinen', 'meiner', 'meines', 'mit', 'muss', 'musste', 'nach',
+    'nicht', 'nichts', 'noch', 'nun', 'nur', 'ob', 'oder', 'ohne', 'sehr', 'sein', 'seine', 'seinem', 'seinen', 'seiner', 'seines', 'selbst',
+    'sich', 'sie', 'ihnen', 'sind', 'so', 'solche', 'solchem', 'solchen', 'solcher', 'solches', 'soll', 'sollte', 'sondern', 'sonst', 'über',
+    'um', 'und', 'uns', 'unse', 'unsem', 'unsen', 'unser', 'unses', 'unter', 'viel', 'vom', 'von', 'vor', 'während', 'war', 'waren', 'warst',
+    'was', 'weg', 'weil', 'weiter', 'welche', 'welchem', 'welchen', 'welcher', 'welches', 'wenn', 'werde', 'werden', 'wie', 'wieder', 'will',
+    'wir', 'wird', 'wirst', 'wo', 'wollen', 'wollte', 'würde', 'würden', 'zu', 'zum', 'zur', 'zwar', 'zwischen'
+    ])
+  },
+  "zh" =>{
+    :preprocessing_regexps => {/['`]/ => '',/[_]/ => ' '},
+    :stop_word => Set.new([
+    ])
+  }
+}

data/lib/stuff-classifier/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module StuffClassifier
+  VERSION = '0.5.2'
+end

data/stuff-classifier.gemspec ADDED Viewed

@@ -0,0 +1,36 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "stuff-classifier/version"
+Gem::Specification.new do |s|
+  s.name        = "stuff-classifier-zh"
+  s.version     = StuffClassifier::VERSION
+  s.authors     = ["Alexandru Nedelcu"]
+  s.email       = ["github@contact.bionicspirit.com"]
+  s.homepage    = "https://github.com/alexandru/stuff-classifier/"
+  s.summary     = %q{Simple text classifier(s) implemetation}
+  s.description = %q{2 methods are provided for now - (1) naive bayes implementation + (2) tf-idf weights}
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  s.required_ruby_version = '>= 1.9.1'
+  s.add_runtime_dependency "ruby-stemmer"
+  s.add_runtime_dependency "sequel"
+  s.add_runtime_dependency "redis"
+  s.add_runtime_dependency "rmmseg-cpp"
+  s.add_development_dependency "bundler"
+  s.add_development_dependency "rake", ">= 0.9.2"
+  s.add_development_dependency "minitest", "~> 4"
+  s.add_development_dependency "turn", ">= 0.8.3"
+  s.add_development_dependency "simplecov"
+  s.add_development_dependency "awesome_print"
+  s.add_development_dependency "byebug"
+end

data/test/helper.rb ADDED Viewed

@@ -0,0 +1,49 @@
+require 'simplecov'
+SimpleCov.start
+require 'turn'
+require 'minitest/autorun'
+require 'stuff-classifier'
+Turn.config do |c|
+ # use one of output formats:
+ # :outline  - turn's original case/test outline mode [default]
+ # :progress - indicates progress with progress bar
+ # :dotted   - test/unit's traditional dot-progress mode
+ # :pretty   - new pretty reporter
+ # :marshal  - dump output as YAML (normal run mode only)
+ # :cue      - interactive testing
+ c.format  = :outline
+ # turn on invoke/execute tracing, enable full backtrace
+ c.trace   = true
+ # use humanized test names (works only with :outline format)
+ c.natural = true
+end
+class TestBase < MiniTest::Unit::TestCase
+  def self.before(&block)
+    @on_setup = block if block
+    @on_setup
+  end
+  def setup
+    on_setup = self.class.before
+    instance_eval(&on_setup) if on_setup
+  end
+  def set_classifier(instance)
+    @classifier = instance
+  end
+  def classifier
+    @classifier
+  end
+  def train(category, value)
+    @classifier.train(category, value)
+  end
+  def should_be(category, value)
+    assert_equal category, @classifier.classify(value), value
+  end
+end