RubyGems - tiny-classifier - Versions diffs - 1.2 → 1.3 - Mend

tiny-classifier 1.2 → 1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/README.md +23 -3
data/bin/tc-classify +1 -1
data/bin/tc-generate-classifier +20 -0
data/bin/tc-train +1 -1
data/lib/tiny-classifier/base.rb +72 -88
data/lib/tiny-classifier/classifier-generator.rb +88 -0
data/lib/tiny-classifier/classifier.rb +20 -17
data/lib/tiny-classifier/tokenizer.rb +51 -0
data/lib/tiny-classifier/trainer.rb +44 -42
data/tiny-classifier.gemspec +1 -1
metadata +5 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e4640ed17e08cec0d4c7e06a7acedc0527669506
-  data.tar.gz: 982f06aad8df916300a18b04ca00ac9e0ae2dceb
+  metadata.gz: 9f1938c2074b9e055cba5fcfe02fd5dc89d39d89
+  data.tar.gz: da79e06a3acbe42ebad1b4576194cf774314298c
 SHA512:
-  metadata.gz: dd7a63f5ff86e02b32a63a852f40d17ffce45ccd9ee5d68ee1fc131f68d770f595fae5c12a039d207ef557e2ee23229c2bea0755d7c36b84444bd0721e27e29e
-  data.tar.gz: 67ad761ba648d8718b1dd6339e83c221856223e180787d14e7761106b7589e588468f6d47d6bc3f3471d2c591868a0d72756532d5e530f944bc7f8f5e10409f4
+  metadata.gz: 4b5f7099909ca9da6f754829881ee7de94a703f6c4abc32d958fe94f3a10be7f056acc17f5e1e3aadd66b8e3caa2a72f8c6ed2bcdfe6a8b6e20540114324a685
+  data.tar.gz: fb1b1aa131bee28c831c81a59ede9fa7f909b50e03bafb3652397111acfccc09c714480b097a0b6dd38de5ea9baa47d02df7eb3354317bd954efa257d8c0747e

data/README.md CHANGED Viewed

@@ -33,15 +33,30 @@ Training:
 % echo "Oh my god!"           | tc-train --labels=positive,negative negative
 ```
-The training data will be saved as `tc.positive-negative.dat` (`tc.` is the fixed prefix, `.dat` is the fixed suffix. The middle part is filled by given labels automatically.) in the current directory. If you hope the file to be saved in any different place, please specify `--base-dir=/path/to/data/directory`.
+The training data will be saved as `tc.negative-positive.dat` in the current directory (`tc.` is the fixed prefix and `.dat` is the fixed suffix; the middle part is filled in automatically from the given labels, sorted alphabetically). If you want the file to be saved somewhere else, specify `--data-dir=/path/to/data/directory`.
-Classifying:
+Testing classification:
 ~~~
 % echo "Happy day?" | tc-classify --labels=positive,negative
 positive
 ~~~
+If you think that the classifier has been sufficiently trained, you can generate a fixed classifier:
+~~~
+% tc-generate-classifier --labels=positive,negative --output-dir=/path/to/dir
+~~~
+A fixed classifier (an executable Ruby script) will then be generated as `tc-classify-negative-positive` (`tc-classify-` is the fixed prefix; the rest is filled in automatically from the given labels).
+~~~
+% ls /path/to/dir/
+tc-classify-negative-positive
+% echo "Happy day?" | /path/to/dir/tc-classify-negative-positive
+positive
+~~~
 ## Command line parameters
 ### Common
@@ -55,10 +70,15 @@ positive
 `-t`, `--tokenizer=TOKENIZER` (optional)
 : Tokenizer for input that is not separated by whitespace. The only possible value is `mecab`.
-### Trainer
+### `tc-train` specific parameters
 `tc-train` requires one command-line argument: the label. It must be one of the labels given via the `--labels` parameter.
+### `tc-generate-classifier` specific parameters
+`-o`, `--output-dir=PATH` (optional)
+: The path to the directory where the classifier will be saved. Defaults to the current directory.
 ## Copyright
 Copyright (c) 2017 YUKI "Piro" Hiroshi

data/bin/tc-classify CHANGED Viewed

@@ -17,4 +17,4 @@
 require "tiny-classifier/classifier"
-Classifier.run
+TinyClassifier::Classifier.run

data/bin/tc-generate-classifier ADDED Viewed

@@ -0,0 +1,20 @@
+#!/usr/bin/env ruby
+# Copyright (C) 2017 YUKI "Piro" Hiroshi
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+require "tiny-classifier/classifier-generator"
+TinyClassifier::ClassifierGenerator.run

data/bin/tc-train CHANGED Viewed

@@ -17,4 +17,4 @@
 require "tiny-classifier/trainer"
-Trainer.run
+TinyClassifier::Trainer.run

data/lib/tiny-classifier/base.rb CHANGED Viewed

@@ -16,119 +16,103 @@
 require "pathname"
 require "optparse"
 require "classifier-reborn"
+require "tiny-classifier/tokenizer"
-class TinyClassifierBase
-  TOKENIZERS = [:none, :mecab]
+module TinyClassifier
+  class Base
+    attr_reader :tokenizer
-  def initialize
-    @tokenizer = :none
-    @data_dir = Dir.pwd
-  end
-  def parse_command_line_options(command_line_options)
-    option_parser.parse!(command_line_options)
-  end
-  private
-  def option_parser
-    @option_parser ||= create_option_parser
-  end
-  def create_option_parser
-    parser = OptionParser.new
+    def initialize
+      @tokenizer = Tokenizer.new
+      @data_dir = Dir.pwd
+    end
-    parser.on("-d PATH", "--data-dir=PATH",
-              "Path to the directory to store training data file (default=current directory)") do |data_dir|
-      @data_dir = data_dir
+    def parse_command_line_options(command_line_options)
+      option_parser.parse!(command_line_options)
     end
-    parser.on("-l LABELS", "--labels=LABELS",
-              "List of labels (comma-separated)") do |labels|
-      @labels = normalize_labels(labels)
+    def classifier
+      @classifier ||= prepare_classifier
     end
-    parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
-              "Tokenizer (default=#{@tokenizer})") do |tokenizer|
-      @tokenizer = tokenizer.downcase.to_sym
+    private
+    def option_parser
+      @option_parser ||= create_option_parser
     end
-    parser
-  end
+    def create_option_parser
+      parser = OptionParser.new
-  def normalize_labels(labels)
-    labels
-      .strip
-      .downcase
-      .split(",")
-      .collect(&:strip)
-      .reject do |label|
-        label.empty?
+      parser.on("-d PATH", "--data-dir=PATH",
+                "Path to the directory to store training data file (default=current directory)") do |data_dir|
+        @data_dir = data_dir
       end
-      .sort
-      .collect(&:capitalize)
-  end
-  def data_file_name
-    @data_file_basename ||= prepare_data_file_name
-  end
+      parser.on("-l LABELS", "--labels=LABELS",
+                "List of labels (comma-separated)") do |labels|
+        @labels = normalize_labels(labels)
+      end
-  def prepare_data_file_name
-    labels = @labels.join("-").downcase
-    "tc.#{labels}.dat"
-  end
+      parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
+                "Tokenizer (default=#{@tokenizer})") do |tokenizer|
+        @tokenizer.type = tokenizer
+      end
-  def data_file_path
-    @data_file_path ||= prepare_data_file_path
-  end
+      parser
+    end
-  def prepare_data_file_path
-    path = Pathname(@data_dir)
-    path + data_file_name
-  end
+    def normalize_labels(labels)
+      labels
+        .strip
+        .downcase
+        .split(",")
+        .collect(&:strip)
+        .reject do |label|
+          label.empty?
+        end
+        .sort
+        .collect(&:capitalize)
+    end
-  def classifier
-    @classifier ||= prepare_classifier
-  end
+    def data_file_name
+      @data_file_basename ||= prepare_data_file_name
+    end
-  def prepare_classifier
-    if data_file_path.exist?
-      data = File.read(data_file_path.to_s)
-      Marshal.load(data)
-    else
-      ClassifierReborn::Bayes.new(*@labels)
+    def prepare_data_file_name
+      labels = @labels.join("-").downcase
+      "tc.#{labels}.dat"
     end
-  end
-  def input
-    @input ||= prepare_input
-  end
+    def data_file_path
+      @data_file_path ||= prepare_data_file_path
+    end
-  def prepare_input
-    unless File.pipe?(STDIN)
-      STDERR.puts("Error: No effective input. You need to give any input via the STDIN.")
-      exit(false)
+    def prepare_data_file_path
+      path = Pathname(@data_dir)
+      path + data_file_name
+    end
+    def prepare_classifier
+      if data_file_path.exist?
+        data = File.read(data_file_path.to_s)
+        Marshal.load(data)
+      else
+        ClassifierReborn::Bayes.new(*@labels)
+      end
     end
-    @input = $stdin.readlines.join("\n")
-    tokenize
-    @input.strip!
-  end
-  def tokenize
-    case @tokenizer
-    when :mecab
-      tokenize_by_mecab
+    def input
+      @input ||= prepare_input
     end
-  end
-  def tokenize_by_mecab
-    require "natto"
-    natto = Natto::MeCab.new
-    terms = []
-    natto.parse(@input) do |term|
-      if term.feature =~ /\A(名詞|形容詞|動詞)/
-        terms << term.surface
+    def prepare_input
+      unless File.pipe?(STDIN)
+        STDERR.puts("Error: No effective input. You need to give any input via the STDIN.")
+        exit(false)
       end
+      @input = $stdin.readlines.join(" ")
+      @tokenizer.tokenize(@input)
+      @input.strip!
     end
-    @input = terms.join(" ").strip
   end
 end

data/lib/tiny-classifier/classifier-generator.rb ADDED Viewed

@@ -0,0 +1,88 @@
+# Copyright (C) 2017 YUKI "Piro" Hiroshi
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+require "tiny-classifier/base"
+require "tiny-classifier/classifier"
+require "fileutils"
+require "base64"
+module TinyClassifier
+  class ClassifierGenerator < Base
+    class << self
+      def run(argv=nil)
+        argv ||= ARGV.dup
+        generator = new
+        generator.parse_command_line_options(argv)
+        generator.run
+      end
+    end
+    def initialize
+      super
+      @output_dir = Dir.pwd
+      option_parser.on("-o PATH", "--output-dir=PATH",
+                       "Path to the classifier command to be saved (default=current directory)") do |output_dir|
+        @output_dir = output_dir
+      end
+    end
+    def run
+      File.open(output_file_path, "w") do |file|
+        file.puts("#!/usr/bin/env ruby")
+        file.puts("require \"base64\"")
+        file.puts("require \"classifier-reborn\"")
+        file.puts("require \"tiny-classifier/classifier\"")
+        file.puts("classifier_code = Base64.strict_decode64(\"#{encoded_classifier}\")")
+        file.puts("classifier = TinyClassifier::Classifier.new")
+        file.puts("classifier.classifier = Marshal.load(classifier_code)")
+        file.puts("classifier.tokenizer.type = \"#{@tokenizer.type}\"")
+        file.puts("classifier.run")
+      end
+      FileUtils.chmod("a+x", output_file_path)
+    end
+    private
+    def encoded_classifier
+      @encoded_classifier ||= prepare_encoded_classifier
+    end
+    def prepare_encoded_classifier
+      classifier = Classifier.new
+      classifier.parse_command_line_options(ARGV.dup)
+      FileUtils.mkdir_p(output_file_path.parent)
+      classifier_code = Marshal.dump(classifier.classifier)
+      Base64.strict_encode64(classifier_code)
+    end
+    def classifier_name
+      @classifier_name ||= prepare_classifier_name
+    end
+    def prepare_classifier_name
+      labels = @labels.join("-").downcase
+      "tc-classify-#{labels}"
+    end
+    def output_file_path
+      @output_file_path ||= prepare_output_file_path
+    end
+    def prepare_output_file_path
+      path = Pathname(@output_dir)
+      path + classifier_name
+    end
+  end
+end

data/lib/tiny-classifier/classifier.rb CHANGED Viewed

@@ -15,25 +15,28 @@
 require "tiny-classifier/base"
-class Classifier < TinyClassifierBase
-  class << self
-    def run(argv=nil)
-      argv ||= ARGV.dup
-      classifier = new
-      classifier.parse_command_line_options(argv)
-      classifier.run
+module TinyClassifier
+  class Classifier < Base
+    class << self
+      def run(argv=nil)
+        argv ||= ARGV.dup
+        classifier = new
+        classifier.parse_command_line_options(argv)
+        classifier.run
+      end
     end
-  end
-  def run(params)
-    @label = params[:label]
-    if input.empty?
-      STDERR.puts("Error: No effective input.")
-      false
-    else
-      label = classifier.classify(input)
-      puts label.downcase
-      true
+    attr_writer :classifier
+    def run
+      if input.empty?
+        STDERR.puts("Error: No effective input.")
+        false
+      else
+        label = classifier.classify(input)
+        puts label.downcase
+        true
+      end
     end
   end
 end

data/lib/tiny-classifier/tokenizer.rb ADDED Viewed

@@ -0,0 +1,51 @@
+# Copyright (C) 2017 YUKI "Piro" Hiroshi
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+module TinyClassifier
+  class Tokenizer
+    TOKENIZERS = [:none, :mecab]
+    attr_accessor :type
+    def initialize(params = nil)
+      if params
+        @type = params[:type]
+      end
+      @type ||= :none
+    end
+    def tokenize(input)
+      case @tokenizer.to_s.downcase.to_sym
+      when :mecab
+        tokenize_by_mecab(input)
+      else
+        input
+      end
+    end
+    private
+    def tokenize_by_mecab(input)
+      require "natto"
+      natto = Natto::MeCab.new
+      terms = []
+      natto.parse(input) do |term|
+        if term.feature =~ /\A(名詞|形容詞|動詞)/
+          terms << term.surface
+        end
+      end
+      terms.join(" ").strip
+    end
+  end
+end

data/lib/tiny-classifier/trainer.rb CHANGED Viewed

@@ -15,58 +15,60 @@
 require "tiny-classifier/base"
-class Trainer < TinyClassifierBase
-  class << self
-    def run(argv=nil)
-      argv ||= ARGV.dup
-      trainer = new
-      *labels = trainer.parse_command_line_options(argv)
-      trainer.run(label: labels.first)
+module TinyClassifier
+  class Trainer < Base
+    class << self
+      def run(argv=nil)
+        argv ||= ARGV.dup
+        trainer = new
+        *labels = trainer.parse_command_line_options(argv)
+        trainer.run(label: labels.first)
+      end
     end
-  end
-  def initialize
-    super
-    option_parser.banner += " LABEL"
-  end
-  def run(params)
-    @label = params[:label]
-    prepare_label
-    if input.empty?
-      STDERR.puts("Error: No effective input.")
-      false
-    else
-      classifier.send("train_#{@label}", input)
-      save
-      true
+    def initialize
+      super
+      option_parser.banner += " LABEL"
     end
-  end
-  private
-  def prepare_label
-    unless @label
-      STDERR.puts("Error: You need to specify the label for the input.")
-      exit(false)
+    def run(params)
+      @label = params[:label]
+      prepare_label
+      if input.empty?
+        STDERR.puts("Error: No effective input.")
+        false
+      else
+        classifier.send("train_#{@label}", input)
+        save
+        true
+      end
     end
-    @label = @label.downcase.strip
+    private
+    def prepare_label
+      unless @label
+        STDERR.puts("Error: You need to specify the label for the input.")
+        exit(false)
+      end
-    if @label.empty?
-      STDERR.puts("Error: You need to specify the label for the input.")
-      exit(false)
-    end
+      @label = @label.downcase.strip
+      if @label.empty?
+        STDERR.puts("Error: You need to specify the label for the input.")
+        exit(false)
+      end
-    unless @labels.include?(@label.capitalize)
-      STDERR.puts("Error: You need to specify one of valid labels: #{@labels.join(', ')}")
-      exit(false)
+      unless @labels.include?(@label.capitalize)
+        STDERR.puts("Error: You need to specify one of valid labels: #{@labels.join(', ')}")
+        exit(false)
+      end
     end
-  end
-  def save
-    data = Marshal.dump(classifier)
-    File.open(data_file_path, "w") do |file|
-      file.write(data)
+    def save
+      data = Marshal.dump(classifier)
+      File.open(data_file_path, "w") do |file|
+        file.write(data)
+      end
     end
   end
 end

data/tiny-classifier.gemspec CHANGED Viewed

@@ -21,7 +21,7 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))
 Gem::Specification.new do |spec|
   spec.name = "tiny-classifier"
-  spec.version = "1.2"
+  spec.version = "1.3"
   spec.homepage = "https://github.com/piroor/tiny-classifier"
   spec.authors = ["YUKI \"Piro\" Hiroshi"]
   spec.email = ["piro.outsider.reflex@gmail.com"]

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tiny-classifier
 version: !ruby/object:Gem::Version
-  version: '1.2'
+  version: '1.3'
 platform: ruby
 authors:
 - YUKI "Piro" Hiroshi
@@ -42,6 +42,7 @@ description: ''
 email:
 - piro.outsider.reflex@gmail.com
 executables:
+- tc-generate-classifier
 - tc-train
 - tc-classify
 extensions: []
@@ -51,9 +52,12 @@ files:
 - README.md
 - Rakefile
 - bin/tc-classify
+- bin/tc-generate-classifier
 - bin/tc-train
 - lib/tiny-classifier/base.rb
+- lib/tiny-classifier/classifier-generator.rb
 - lib/tiny-classifier/classifier.rb
+- lib/tiny-classifier/tokenizer.rb
 - lib/tiny-classifier/trainer.rb
 - tiny-classifier.gemspec
 homepage: https://github.com/piroor/tiny-classifier