RubyGems - omnicat - Versions diffs - 0.1.3 → 0.2.0 - Mend

omnicat 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

data/.gitignore +1 -0
data/.travis.yml +1 -3
data/CHANGELOG.txt +8 -0
data/README.md +22 -66
data/lib/omnicat.rb +17 -2
data/lib/omnicat/classifier.rb +58 -0
data/lib/omnicat/classifiers/strategy.rb +178 -0
data/lib/omnicat/classifiers/strategy_internals/category.rb +18 -0
data/lib/omnicat/configuration.rb +35 -0
data/lib/omnicat/doc.rb +52 -0
data/lib/omnicat/result.rb +2 -0
data/lib/omnicat/version.rb +1 -1
data/lib/test/unit/classifier_test.rb +1 -0
data/lib/test/unit/classifiers/strategy_test.rb +46 -0
data/lib/test/unit/doc_test.rb +40 -0
data/lib/test/unit/hash_test.rb +4 -2
metadata +24 -17
checksums.yaml +0 -7
data/lib/omnicat/bayes.rb +0 -3
data/lib/omnicat/classifiers/base.rb +0 -55
data/lib/omnicat/classifiers/bayes.rb +0 -174
data/lib/omnicat/classifiers/bayes_internals/category.rb +0 -16
data/lib/omnicat/string.rb +0 -10
data/lib/test/unit/base_test.rb +0 -49
data/lib/test/unit/bayes_test.rb +0 -85
data/lib/test/unit/string_test.rb +0 -17

data/.gitignore ADDED Viewed

	@@ -0,0 +1 @@
1	+ /omnicat*.gem

data/.travis.yml CHANGED Viewed

@@ -1,6 +1,4 @@
 language: ruby
 rvm:
   - 1.9.3
-  - 2.0.0
-  - jruby-19mode
-  - rbx-19mode
+  - 2.0.0

data/CHANGELOG.txt CHANGED Viewed

@@ -1,3 +1,11 @@
+Master Branch
+0.2.0
+# bayes classifier moved to another gem which is 'omnicat-bayes'
+# applied 'Strategy Software Design Pattern' for classifiers
+# configuration added with Singleton Software Design Pattern
+# string methods moved to OmniCat::Doc class
 0.1.3
 # refactoring at bayes algorithm

data/README.md CHANGED Viewed

@@ -2,7 +2,7 @@
 [![Build Status](https://travis-ci.org/mustafaturan/omnicat.png)](https://travis-ci.org/mustafaturan/omnicat) [![Code Climate](https://codeclimate.com/github/mustafaturan/omnicat.png)](https://codeclimate.com/github/mustafaturan/omnicat)
-A generalized framework for text classifications. For now, it only supports Naive Bayes algorithm for text classification.
+A generalized framework for text classifications.
 ## Installation
@@ -20,76 +20,32 @@ Or install it yourself as:
 ## Usage
-See rdoc for detailed usage.
+The stand-alone version of omnicat is just a strategy holder for developers. Its aim is to provide a unified ("omnified") set of methods for text classification gems, with lossless conversion from one strategy to another. End users should see the 'Classifier strategies' section and the 'Changing classifier strategy' subsection.
-### Bayes classifier
-Create a Bayes classifier object.
+### Changing classifier strategy
-    bayes = OmniCat::Classifiers::Bayes.new
+OmniCat allows you to change the strategy at runtime.
-### Create categories
-Create a classification category.
+    # Declare classifier with Naive Bayes classifier
+    classifier = OmniCat::Classifier.new(OmniCat::Classifiers::Bayes.new())
+    ...
+    # do some operations like adding category, training, etc...
+    ...
+    # make some classification using Bayes
+    classifier.classify('I am happy :)')
+    ...
+    # change strategy to Support Vector Machine (SVM) on runtime
+    classifier.strategy = OmniCat::Classifiers::SVM.new()
+    # now you do not need to re-train, add category and so on..
+    # just classify with new strategy
+    classifier.classify('I am happy :)')
-    bayes.add_category('positive')
-    bayes.add_category('negative')
+## Classifier strategies
+Here is the list of classifiers available for OmniCat.
-### Train
-Train category with a document.
-    bayes.train('positive', 'great if you are in a slap happy mood .')
-    bayes.train('negative', 'bad tracking issue')
-### Train batch
-Train category with multiple documents.
-    bayes.train_batch('positive', [
-      'a feel-good picture in the best sense of the term...',
-      'it is a feel-good movie about which you can actually feel good.',
-      'love and money both of them are good choises'
-    ])
-    bayes.train_batch('negative', [
-      'simplistic , silly and tedious .',
-      'interesting , but not compelling . ',
-      'seems clever but not especially compelling'
-    ])
-### Classify
-Classify a document.
-    result = bayes.classify('I feel so good and happy')
-    => #<OmniCat::Result:0x007fd20296aad8 @category={:name=>"positive", :percentage=>73}, @scores={"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, @total_score=7.385426829679431e-09>
-    result.to_hash
-    => {:category=>{:name=>"positive", :percentage=>73}, :scores=>{"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, :total_score=>7.385426829679431e-09}
-### Classify batch
-Classify multiple documents at a time.
-    results = bayes.classify_batch(
-      [
-        'the movie is silly so not compelling enough',
-        'a good piece of work'
-      ]
-    )
-    => [#<OmniCat::Result:0x007fd2029341b8 @category={:name=>"negative", :percentage=>78}, @scores={"positive"=>2.5521869888765736e-14, "negative"=>9.074442627116706e-14}, @total_score=1.162662961599328e-13>, #<OmniCat::Result:0x007fd20292e7e0 @category={:name=>"positive", :percentage=>80}, @scores={"positive"=>2.411265432098765e-07, "negative"=>5.880238822371627e-08}, @total_score=2.999289314335928e-07>]
-### Convert to hash
-Convert full Bayes object to hash.
-    # For storing, restoring modal data
-    bayes_hash = bayes.to_hash
-    => {:categories=>{"positive"=>{:doc_count=>4, :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37}, "negative"=>{:doc_count=>4, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17}}, :category_count=>2, :doc_count=>8, :k_value=>1.0, :token_count=>54, :uniq_token_count=>43}
-### Load from hash
-Load full Bayes object from hash.
-    another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
-    => #<OmniCat::Classifiers::Bayes:0x007fd20308cff0 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf78 @doc_count=4, @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf00 @doc_count=4, @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17>}, @category_count=2, @doc_count=8, @k_value=1.0, @token_count=54, @uniq_token_count=43>
-    another_bayes_obj.classify('best senses')
-    => #<OmniCat::Result:0x007fd203075008 @category={:name=>"positive", :percentage=>57}, @scores={"positive"=>0.0002314814814814815, "negative"=>0.00017146776406035664}, @total_score=0.00040294924554183816>
-## Todo
-* Add more text classification modules such as Support Vector Machine (SVM).
-* Add text cleaning/manipulating extensions such as stopwords cleaner, stemmer, and pos-tagger, etc...
+### Naive Bayes classifier
+* gem 'omnicat-bayes'
+* Details: http://github.com/mustafaturan/omnicat-bayes
 ## Contributing

data/lib/omnicat.rb CHANGED Viewed

@@ -1,7 +1,22 @@
 require File.dirname(__FILE__) + '/omnicat/version'
-require File.dirname(__FILE__) + '/omnicat/string'
+require File.dirname(__FILE__) + '/omnicat/configuration'
 require File.dirname(__FILE__) + '/omnicat/array'
 require File.dirname(__FILE__) + '/omnicat/hash'
 require File.dirname(__FILE__) + '/omnicat/base'
+require File.dirname(__FILE__) + '/omnicat/doc'
 require File.dirname(__FILE__) + '/omnicat/result'
-require File.dirname(__FILE__) + '/omnicat/bayes'
+require File.dirname(__FILE__) + '/omnicat/classifier'
+module OmniCat
+  def self.config
+    OmniCat::Configuration.instance
+  end
+  def self.configure
+    yield config
+  end
+  def self.logger
+    config.logger
+  end
+end

data/lib/omnicat/classifier.rb ADDED Viewed

@@ -0,0 +1,58 @@
+require File.dirname(__FILE__) + '/classifiers/strategy'
+require File.dirname(__FILE__) + '/classifiers/strategy_internals/category'
+require 'forwardable'
+module OmniCat
+  class Classifier
+    extend Forwardable
+    # classification strategy
+    attr_accessor :strategy
+    # delegate category methods
+    def_delegators :@strategy, :add_category, :add_categories
+    # delegate training methods
+    def_delegators :@strategy, :train, :train_batch, :untrain, :untrain_batch
+    # delegate classification methods
+    def_delegators :@strategy, :classify, :classify_batch
+    # delegate base methods
+    def_delegator :@strategy, :to_hash
+    # nodoc
+    def initialize(classifier)
+      @strategy = classifier
+    end
+    def strategy=(classifier)
+      is_interchangeable?(classifier)
+      if @strategy && classifier.doc_count == 0
+        previous_strategy = @strategy
+        @strategy = classifier
+        # pass previous strategy contents into the new one
+        previous_strategy.categories.each do |category_name, category|
+          @strategy.add_category(category_name)
+          category.docs.each do |_, doc|
+            doc.count.times do
+              @strategy.train(category_name, doc.content)
+            end
+          end
+        end
+      else
+        @strategy = classifier
+      end
+    end
+    private
+      def is_interchangeable?(classifier)
+        if classifier.category_size_limit
+          if @strategy.category_count > classifier.category_size_limit
+            raise StandardError,
+              'New classifier category size limit is less than the current classifier\'s category count.'
+          end
+        end
+      end
+  end
+end

data/lib/omnicat/classifiers/strategy.rb ADDED Viewed

@@ -0,0 +1,178 @@
+require 'omnicat'
+module OmniCat
+  module Classifiers
+    #
+    # Author::    Mustafa Turan (mailto:mustafaturan.net@gmail.com)
+    # Copyright:: Copyright (c) 2013 Mustafa Turan
+    # License::   MIT
+    #
+    # The class supplies abstract methods for possible text classifiers
+    class Strategy < ::OmniCat::Base
+      attr_accessor :categories # ::OmniCat::Hash - Hash of categories
+      attr_accessor :category_count # Integer - Total category count
+      attr_accessor :category_size_limit # Integer - Max allowed category count
+      attr_accessor :doc_count # Integer - Total document count
+      attr_accessor :token_count # Integer - Total token count
+      attr_accessor :uniq_token_count # Integer - Total uniq token count
+      def initialize(strategy_hash = {})
+        @categories = ::OmniCat::Hash.new
+        @category_count = strategy_hash[:category_count].to_i
+        @category_size_limit = strategy_hash[:category_size_limit].to_i
+        @doc_count = strategy_hash[:doc_count].to_i
+        @token_count = strategy_hash[:token_count].to_i
+        @uniq_token_count = strategy_hash[:uniq_token_count].to_i
+      end
+      # Abstract method for adding new classification category
+      #
+      # ==== Parameters
+      #
+      # * +name+ - Name for category
+      #
+      def add_category(name)
+        not_implemented_error(__callee__)
+      end
+      # Allows adding multiple classification categories
+      #
+      # ==== Parameters
+      #
+      # * +names+ - Array of categories
+      #
+      def add_categories(names)
+        names.each { |name| add_category(name) }
+      end
+      # Abstract method for training the desired category with a document
+      #
+      # ==== Parameters
+      #
+      # * +category_name+ - Name of the category from added categories list
+      # * +doc+ - Document text
+      #
+      def train(category_name, doc)
+        not_implemented_error(__callee__)
+      end
+      # Train the desired category with multiple documents
+      #
+      # ==== Parameters
+      #
+      # * +category+ - Name of the category from added categories list
+      # * +docs+ - Array of documents
+      #
+      def train_batch(category, docs)
+        docs.each { |doc| train(category, doc) }
+      end
+      # Abstract method for untraining the desired category with a document
+      #
+      # ==== Parameters
+      #
+      # * +category_name+ - Name of the category from added categories list
+      # * +doc+ - Document text
+      #
+      def untrain(category_name, doc)
+        not_implemented_error(__callee__)
+      end
+      # Untrain the desired category with multiple documents
+      #
+      # ==== Parameters
+      #
+      # * +category+ - Name of the category from added categories list
+      # * +docs+ - Array of documents
+      #
+      def untrain_batch(category, docs)
+        docs.each { |doc| untrain(category, doc) }
+      end
+      # Abstract method for classifying the given document
+      #
+      # ==== Parameters
+      #
+      # * +doc+ - The document for classification
+      #
+      # ==== Returns
+      #
+      # * +result+ - OmniCat::Result object
+      #
+      def classify(doc)
+        not_implemented_error(__callee__)
+      end
+      # Classify the multiple documents at a time
+      #
+      # ==== Parameters
+      #
+      # * +docs+ - Array of documents
+      #
+      # ==== Returns
+      #
+      # * +result_set+ - Array of OmniCat::Result objects
+      #
+      def classify_batch(docs)
+        docs.collect { |doc| classify(doc) }
+      end
+      private
+        # nodoc
+        def not_implemented_error(method_name)
+          raise NotImplementedError.new("#{self.class.name}##{method_name} method is not implemented!")
+        end
+      protected
+        # nodoc
+        def category_exists?(category_name)
+          categories.has_key?(category_name)
+        end
+        # nodoc
+        def increment_category_count
+          @category_count += 1
+        end
+        # nodoc
+        def decrement_category_count
+          @category_count -= 1
+        end
+        # nodoc
+        def increment_doc_counts(category_name)
+          @doc_count += 1
+          @categories[category_name].doc_count += 1
+        end
+        # nodoc
+        def decrement_doc_counts(category_name)
+          @doc_count -= 1
+          @categories[category_name].doc_count -= 1
+        end
+        # nodoc
+        def classifiable?
+          if category_count < 2
+            raise StandardError,
+                  'At least 2 categories needed for classification process!'
+            false
+          elsif doc_avability? == false
+            raise StandardError,
+                  'Each category must trained with at least one document!'
+            false
+          else
+            true
+          end
+        end
+        # nodoc
+        def doc_avability?
+          @categories.each do |_, category|
+            return false if category.doc_count == 0
+          end
+          true
+        end
+    end
+  end
+end

data/lib/omnicat/classifiers/strategy_internals/category.rb ADDED Viewed

@@ -0,0 +1,18 @@
+require 'omnicat'
+module OmniCat
+  module Classifiers
+    module StrategyInternals
+      class Category < ::OmniCat::Base
+        attr_accessor :doc_count, :docs, :tokens, :token_count
+        def initialize(category_hash = {})
+          @doc_count = category_hash[:doc_count].to_i
+          @docs = category_hash[:docs] || {}
+          @tokens = category_hash[:tokens] || {}
+          @token_count = category_hash[:token_count].to_i
+        end
+      end
+    end
+  end
+end

data/lib/omnicat/configuration.rb ADDED Viewed

@@ -0,0 +1,35 @@
+# encoding: UTF-8
+require 'singleton'
+require 'logger'
+module OmniCat
+  class Configuration
+    include Singleton
+    attr_accessor :logger
+    attr_accessor :exclude_tokens, :logger, :token_patterns
+    def self.default_logger
+      logger = Logger.new(STDOUT)
+      logger.progname = 'omnicat'
+      logger
+    end
+    @@defaults = {
+      exclude_tokens: ['a','about','across','after','all','almost','also','am','among','an','and','are','as','at','be','because','been','by','did','do','does','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','may','me','might','most','must','my','of','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','will','with','would','yet','you','your'],
+      logger: default_logger,
+      token_patterns: {
+        minus: [/[\s\t\n\r]+/, /(@[\w\d]+)/],
+        plus: [/[\p{L}\-0-9]{2,}/, /[\!\?]/, /[\:\)\(\;\-\|]{2,3}/]
+      }
+    }
+    def self.defaults
+      @@defaults
+    end
+    def initialize
+      @@defaults.each_pair{|k,v| self.send("#{k}=",v)}
+    end
+  end
+end

data/lib/omnicat/doc.rb ADDED Viewed

@@ -0,0 +1,52 @@
+# encoding: UTF-8
+require File.dirname(__FILE__) + '/base'
+module OmniCat
+  class Doc < ::OmniCat::Base
+    attr_reader :content, :count, :tokens
+    def initialize(doc_hash = {})
+      @content = doc_hash[:content]
+      @count = (doc_hash[:count] || 1).to_i
+      @tokens = tokenize_with_counts unless @tokens.is_a?(Hash)
+    end
+    def increment_count
+      @count += 1
+    end
+    def decrement_count
+      @count -= 1 if @count > 0
+    end
+    private
+      # nodoc
+      def minus_tokens
+        body = @content
+        OmniCat.config.token_patterns[:minus].each { |p| body.gsub!(p, ' ') }
+        body
+      end
+      # nodoc
+      def plus_tokens(body)
+        body_tokens = []
+        OmniCat.config.token_patterns[:plus].each { |p| body_tokens += body.scan(p) }
+        body_tokens
+      end
+      # nodoc
+      def exclude_tokens
+        OmniCat.config.exclude_tokens
+      end
+      # nodoc
+      def tokenize_with_counts
+        tokenize.hashify_with_counts
+      end
+      # nodoc
+      def tokenize
+        plus_tokens(minus_tokens) - exclude_tokens
+      end
+  end
+end

data/lib/omnicat/result.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require File.dirname(__FILE__) + '/base'
 module OmniCat
   class Result < ::OmniCat::Base
     attr_accessor :category, :scores, :total_score

data/lib/omnicat/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OmniCat
-  VERSION = "0.1.3"
+  VERSION = "0.2.0"
 end

data/lib/test/unit/classifier_test.rb ADDED Viewed

	@@ -0,0 +1 @@
1	+ require File.dirname(__FILE__) + '/classifiers/strategy_test'

data/lib/test/unit/classifiers/strategy_test.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'test_helper'))
+class TestStrategy < Test::Unit::TestCase
+  def setup
+    @strategy = OmniCat::Classifiers::Strategy.new
+  end
+  def test_add_category
+    assert_raise(NotImplementedError) { @strategy.add_category("positive") }
+  end
+  def test_add_categories
+    assert_raise(NotImplementedError) { @strategy.add_categories(
+      ["neutral", "positive", "negative"]) }
+  end
+  def test_train
+    assert_raise(NotImplementedError) { @strategy.train("positive", "good") }
+  end
+  def test_train_batch
+    assert_raise(NotImplementedError) {
+      @strategy.train_batch("positive", ["good job ever", "valid syntax",
+      "best moments of my life"])
+    }
+  end
+  def test_untrain
+    assert_raise(NotImplementedError) { @strategy.untrain("positive", "good") }
+  end
+  def test_untrain_batch
+    assert_raise(NotImplementedError) { @strategy.untrain_batch(
+      "positive", ["good work", "well done"]) }
+  end
+  def test_classify
+    assert_raise(NotImplementedError) { @strategy.classify("good job") }
+  end
+  def test_classify_batch
+    assert_raise(NotImplementedError) {
+      @strategy.classify_batch(["good job", "you did well"])
+    }
+  end
+end

data/lib/test/unit/doc_test.rb ADDED Viewed

@@ -0,0 +1,40 @@
+# encoding: UTF-8
+require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
+class TestDoc < Test::Unit::TestCase
+  def setup
+    OmniCat.configure do |config|
+      config.exclude_tokens = ["was", "at", "by"]
+      config.token_patterns = {
+        minus: [/[\s\t\n\r]+/, /(@[\w\d]+)/],
+        plus: [/[\p{L}\-0-9]{2,}/, /[\!\?]/, /[\:\)\(\;\-\|]{2,3}/]
+      }
+    end
+    @doc = OmniCat::Doc.new(
+      content: "omnicat v-01 was written at 2011, omnicat by @mustafaturan"
+    )
+  end
+  def test_omnicat_tokenize
+    assert_equal(
+      {"omnicat" => 2, "v-01" => 1, "written" => 1, "2011" => 1},
+      @doc.tokens
+    )
+  end
+  def test_increment_count
+    @doc.increment_count
+    assert_equal(2, @doc.count)
+  end
+  def test_decrement_count
+    @doc.decrement_count
+    assert_equal(0, @doc.count)
+  end
+  def test_decrement_count_if_zero
+    @doc.decrement_count
+    @doc.decrement_count
+    assert_equal(0, @doc.count)
+  end
+end

data/lib/test/unit/hash_test.rb CHANGED Viewed

@@ -2,9 +2,11 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
 class TestHash < Test::Unit::TestCase
   def test_to_hash
-    categories_hash = { "pos" => { doc_count: 0, prior: 0.0, tokens: {}, token_count: 0 } }
+    categories_hash = {
+      "pos" => { doc_count: 0, docs: {}, tokens: {}, token_count: 0 }
+    }
     categories = OmniCat::Hash.new
-    categories["pos"] = OmniCat::Classifiers::BayesInternals::Category.new(categories_hash["pos"])
+    categories["pos"] = OmniCat::Classifiers::StrategyInternals::Category.new(categories_hash["pos"])
     assert_equal(categories_hash, categories.to_hash)
   end
 end

metadata CHANGED Viewed

@@ -1,18 +1,20 @@
 --- !ruby/object:Gem::Specification
 name: omnicat
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.2.0
+  prerelease:
 platform: ruby
 authors:
 - Mustafa Turan
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-06-18 00:00:00.000000000 Z
+date: 2013-07-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -20,6 +22,7 @@ dependencies:
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -27,15 +30,17 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 description: A generalized framework for text classifications.
@@ -45,6 +50,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- .gitignore
 - .travis.yml
 - CHANGELOG.txt
 - Gemfile
@@ -54,43 +60,44 @@ files:
 - lib/omnicat.rb
 - lib/omnicat/array.rb
 - lib/omnicat/base.rb
-- lib/omnicat/bayes.rb
-- lib/omnicat/classifiers/base.rb
-- lib/omnicat/classifiers/bayes.rb
-- lib/omnicat/classifiers/bayes_internals/category.rb
+- lib/omnicat/classifier.rb
+- lib/omnicat/classifiers/strategy.rb
+- lib/omnicat/classifiers/strategy_internals/category.rb
+- lib/omnicat/configuration.rb
+- lib/omnicat/doc.rb
 - lib/omnicat/hash.rb
 - lib/omnicat/result.rb
-- lib/omnicat/string.rb
 - lib/omnicat/version.rb
 - lib/test/test_helper.rb
 - lib/test/unit/array_test.rb
-- lib/test/unit/base_test.rb
-- lib/test/unit/bayes_test.rb
+- lib/test/unit/classifier_test.rb
+- lib/test/unit/classifiers/strategy_test.rb
+- lib/test/unit/doc_test.rb
 - lib/test/unit/hash_test.rb
-- lib/test/unit/string_test.rb
 - omnicat.gemspec
 homepage: https://github.com/mustafaturan/omnicat
 licenses:
 - MIT
-metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.3
+rubygems_version: 1.8.23
 signing_key:
-specification_version: 4
+specification_version: 3
 summary: A generalized framework for text classifications.
 test_files: []

checksums.yaml DELETED Viewed

@@ -1,7 +0,0 @@
----
-SHA1:
-  metadata.gz: ea920e881bd63f956dd1237f666d008f893668af
-  data.tar.gz: f9d1ec2fe73eb047c5ac661c42600cff033fd35f
-SHA512:
-  metadata.gz: 4c65cec9bf29fc07b9b0f0eee51da3bfc40f2ba8e443daf287b3e76f499b9084e8526baeb7b7319acd7eeda826ff9a892a0e761848d23e52af2e4545cfbd60ff
-  data.tar.gz: 3f153307273e1c94bea62399a1d1f8d039b4c17956187779f08726429329a84acbce2ede51c7ade3c2ef2b1a778f37da664ae9855144f07a2c906f23d0ee5d80

data/lib/omnicat/bayes.rb DELETED Viewed

@@ -1,3 +0,0 @@
-require File.dirname(__FILE__) + '/classifiers/base'
-require File.dirname(__FILE__) + '/classifiers/bayes'
-require File.dirname(__FILE__) + '/classifiers/bayes_internals/category'

data/lib/omnicat/classifiers/base.rb DELETED Viewed

@@ -1,55 +0,0 @@
-module OmniCat
-  module Classifiers
-    class Base < ::OmniCat::Base
-      # Allows adding multiple classification categories
-      #
-      # ==== Parameters
-      #
-      # * +names+ - Array of categories
-      #
-      # ==== Examples
-      #
-      #   # Add multiple categories for classification
-      #   bayes.add_categories(["positive", "negative", "neutral"])
-      def add_categories(names)
-        names.each { |name| add_category(name) }
-      end
-      # Train the desired category with multiple documents
-      #
-      # ==== Parameters
-      #
-      # * +category+ - Name of the category from added categories list
-      # * +docs+ - Array of documents
-      #
-      # ==== Examples
-      #
-      #   # Add multiple docs for training the category
-      #   bayes.train("positive", ["clear documentation", "good, very well"])
-      #   bayes.train("negative", ["bad interface", "damn"])
-      def train_batch(category, docs)
-        docs.each { |doc| train(category, doc) }
-      end
-      # Classify the multiple documents at a time
-      #
-      # ==== Parameters
-      #
-      # * +docs+ - Array of documents
-      #
-      # ==== Returns
-      #
-      # * +result_set+ - Array of OmniCat::Result objects
-      #
-      # ==== Examples
-      #
-      #   # Classify multiple documents
-      #   bayes.classify_batch(["good documentation", "damn workin again"])
-      #   =>
-      def classify_batch(docs)
-        docs.collect { |doc| classify(doc) }
-      end
-    end
-  end
-end

data/lib/omnicat/classifiers/bayes.rb DELETED Viewed

@@ -1,174 +0,0 @@
-module OmniCat
-  module Classifiers
-    class Bayes < ::OmniCat::Classifiers::Base
-      attr_accessor :categories # ::OmniCat::Hash - Hash of categories
-      attr_accessor :category_count # Integer - Total category count
-      attr_accessor :doc_count # Integer - Total token count
-      attr_accessor :token_count # Integer - Total token count
-      attr_accessor :uniq_token_count # Integer - Total uniq token count
-      attr_accessor :k_value # Integer - Helper value for skipping some Bayes algorithm errors
-      def initialize(bayes_hash = {})
-        self.categories = ::OmniCat::Hash.new
-        if bayes_hash.has_key?(:categories)
-          bayes_hash[:categories].each do |name, category|
-            self.categories[name] = ::OmniCat::Classifiers::BayesInternals::Category.new(category)
-          end
-        end
-        self.category_count = bayes_hash[:category_count].to_i
-        self.doc_count = bayes_hash[:doc_count].to_i
-        self.k_value = bayes_hash[:k_value] || 1.0
-        self.token_count = bayes_hash[:token_count].to_i
-        self.uniq_token_count = bayes_hash[:uniq_token_count].to_i
-      end
-      # Allows adding new classification category
-      #
-      # ==== Parameters
-      #
-      # * +name+ - Name for category
-      #
-      # ==== Examples
-      #
-      #   # Create a classification category
-      #   bayes = Bayes.new
-      #   bayes.add_category("positive")
-      def add_category(name)
-        if category_exists?(name)
-          raise StandardError,
-                "Category with name '#{name}' is already exists!"
-        else
-          self.category_count +=1
-          self.categories[name] = ::OmniCat::Classifiers::BayesInternals::Category.new
-        end
-      end
-      # Train the desired category with a document
-      #
-      # ==== Parameters
-      #
-      # * +category+ - Name of the category from added categories list
-      # * +doc+ - Document text
-      #
-      # ==== Examples
-      #
-      #   # Train the desired category
-      #   bayes.train("positive", "clear documentation")
-      #   bayes.train("positive", "good, very well")
-      #   bayes.train("negative", "bad dog")
-      #   bayes.train("neutral", "how is the management gui")
-      def train(category_name, doc)
-        if category_exists?(category_name)
-          increment_doc_counts(category_name)
-          update_priors
-          doc.tokenize_with_counts.each do |token, count|
-            increment_token_counts(category_name, token, count)
-            self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
-          end
-        else
-          raise StandardError,
-                "Category with name '#{category_name}' does not exist!"
-        end
-      end
-      # Classify the given document
-      #
-      # ==== Parameters
-      #
-      # * +doc+ - The document for classification
-      #
-      # ==== Returns
-      #
-      # * +result+ - OmniCat::Result object
-      #
-      # ==== Examples
-      #
-      #   # Classify a document
-      #   bayes.classify("good documentation")
-      #   =>
-      def classify(doc)
-        if category_count < 2
-          return raise StandardError,
-                       "At least 2 categories needed for classification process!"
-        end
-        score = -1000000
-        result = ::OmniCat::Result.new
-        self.categories.each do |category_name, category|
-          result.scores[category_name] = doc_probability(category, doc)
-          if result.scores[category_name] > score
-            result.category[:name] = category_name
-            score = result.scores[category_name]
-          end
-          result.total_score += result.scores[category_name]
-        end
-        result.total_score = 1 if result.total_score == 0
-        result.category[:percentage] = (
-          result.scores[result.category[:name]] * 100.0 /
-          result.total_score
-        ).floor
-        result
-      end
-      private
-        # nodoc
-        def category_exists?(category_name)
-          categories.has_key?(category_name)
-        end
-        # nodoc
-        def increment_doc_counts(category_name)
-          self.doc_count += 1
-          self.categories[category_name].doc_count += 1
-        end
-        # nodoc
-        def update_priors
-          self.categories.each do |_, category|
-            category.prior = category.doc_count / doc_count.to_f
-          end
-        end
-        # nodoc
-        def increment_token_counts(category_name, token, count)
-          increment_uniq_token_count(token)
-          self.token_count += count
-          self.categories[category_name].token_count += count
-        end
-        # nodoc
-        def increment_uniq_token_count(token)
-          uniq_token_addition = 0
-          categories.each do |_, category|
-             if category.tokens.has_key?(token)
-               uniq_token_addition = 1
-               break
-             end
-          end
-          self.uniq_token_count += 1 if uniq_token_addition == 0
-        end
-        # nodoc
-        def doc_probability(category, doc)
-          score = k_value
-          doc.tokenize_with_counts.each do |token, count|
-            score *= token_probability(category, token, count)
-          end
-          category.prior * score
-        end
-        # nodoc
-        def token_probability(category, token, count)
-          if category.tokens[token].to_i == 0
-            k_value / token_count
-          else
-            count * (
-              (category.tokens[token].to_i + k_value) /
-              (category.token_count + uniq_token_count)
-            )
-          end
-        end
-    end
-  end
-end

data/lib/omnicat/classifiers/bayes_internals/category.rb DELETED Viewed

@@ -1,16 +0,0 @@
-module OmniCat
-  module Classifiers
-    module BayesInternals
-      class Category < ::OmniCat::Base
-        attr_accessor :doc_count, :prior, :tokens, :token_count
-        def initialize(category_hash = {})
-          self.doc_count = category_hash[:doc_count].to_i
-          self.prior = category_hash[:prior].to_f
-          self.tokens = category_hash[:tokens] || {}
-          self.token_count = category_hash[:token_count].to_i
-        end
-      end
-    end
-  end
-end

data/lib/omnicat/string.rb DELETED Viewed

@@ -1,10 +0,0 @@
-# encoding: UTF-8
-class String
-  def omnicat_tokenize
-    self.scan(/([\p{L}\-0-9]{2,})/).collect{ |str_arr| str_arr.first }
-  end
-  def tokenize_with_counts
-    self.omnicat_tokenize.hashify_with_counts
-  end
-end

data/lib/test/unit/base_test.rb DELETED Viewed

@@ -1,49 +0,0 @@
-require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
-class TestBase < Test::Unit::TestCase
-  def setup
-    @bayes = OmniCat::Classifiers::Bayes.new
-  end
-  def test_add_categories
-    @bayes.add_categories ["neutral", "positive", "negative"]
-    assert_not_nil(@bayes.categories["neutral"])
-    assert_equal(
-      ["neutral", "positive", "negative"],
-      @bayes.categories.keys
-    )
-  end
-  def test_train_batch
-    @bayes.add_category "positive"
-    @bayes.train_batch "positive", ["good job ever", "valid syntax",
-      "best moments of my life"]
-    assert_equal(
-      3,
-      @bayes.categories["positive"].doc_count
-    )
-  end
-  def test_classify_batch
-    @bayes.add_category "positive"
-    @bayes.add_category "negative"
-    @bayes.train_batch "positive", ["good job ever", "valid syntax",
-      "best moments of my life"]
-    @bayes.train_batch("negative", ["bad work", "awfull day", "never liked it"])
-    results = @bayes.classify_batch(
-      ["good sytanx research", "bad words"]
-    )
-    assert_equal(2, results.count)
-    assert_equal(
-      "positive",
-      results[0].category[:name]
-    )
-    assert_equal(
-      "negative",
-      results[1].category[:name]
-    )
-  end
-end

data/lib/test/unit/bayes_test.rb DELETED Viewed

@@ -1,85 +0,0 @@
-require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
-class TestBayes < Test::Unit::TestCase
-  def setup
-    @bayes = OmniCat::Classifiers::Bayes.new
-  end
-  def test_add_category
-    @bayes.add_category "neutral"
-    assert_not_nil(@bayes.categories["neutral"])
-    assert_equal(
-      ["neutral"],
-      @bayes.categories.keys
-    )
-    assert_equal(
-      0,
-      @bayes.categories["neutral"].doc_count
-    )
-    assert_equal(
-      {},
-      @bayes.categories["neutral"].tokens
-    )
-    assert_equal(
-      0,
-      @bayes.categories["neutral"].token_count
-    )
-  end
-  def test_add_category_that_already_exists
-    @bayes.add_category "neutral"
-    assert_raise(StandardError) { @bayes.add_category "neutral" }
-  end
-  def test_train_valid_category
-    @bayes.add_category "neutral"
-    @bayes.train "neutral", "how are you?"
-    assert_equal(
-      1,
-      @bayes.categories["neutral"].doc_count
-    )
-    assert_equal(
-      {"how" => 1, "are" => 1, "you" => 1},
-      @bayes.categories["neutral"].tokens
-    )
-    assert_equal(
-      3,
-      @bayes.categories["neutral"].token_count
-    )
-  end
-  def test_train_missing_category
-    assert_raise(StandardError) { @bayes.train "neutral", "how are you?" }
-  end
-  def test_classify
-    @bayes.add_category "positive"
-    @bayes.add_category "negative"
-    @bayes.train("positive", "good job")
-    @bayes.train("negative", "bad work")
-    assert_equal(
-      "positive",
-      @bayes.classify("very good position for this sentence").category[:name]
-    )
-    assert_equal(
-      "negative",
-      @bayes.classify("bad words").category[:name]
-    )
-  end
-  def test_initialize_with_hash
-    bayes1 = ::OmniCat::Classifiers::Bayes.new
-    bayes1.add_category "positive"
-    bayes1.add_category "negative"
-    bayes1.train("positive", "good job")
-    bayes1.train("negative", "bad work")
-    h1 = bayes1.to_hash
-    bayes2 = ::OmniCat::Classifiers::Bayes.new(h1)
-    assert_equal(h1, bayes2.to_hash)
-  end
-  def test_classify_with_insufficient_categories
-    assert_raise(StandardError) { @bayes.classify "blank" }
-  end
-end

data/lib/test/unit/string_test.rb DELETED Viewed

@@ -1,17 +0,0 @@
-require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
-class TestString < Test::Unit::TestCase
-  def test_omnicat_tokenize
-    assert_equal(
-      ["mustafa", "turan", "omni-cat-v0", "1986"],
-      "mustafa turan omni-cat-v0 1986 1 a s d".omnicat_tokenize
-    )
-  end
-  def test_tokenize_with_counts
-    assert_equal(
-      {"omnicat" => 2, "written" => 1, "at" => 1, "2011" => 1},
-      "omnicat written at 2011, omnicat".tokenize_with_counts
-    )
-  end
-end