tiny-classifier 1.0 → 1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ef03bed349d3c266bab512446b6dff8946d5c14f
4
- data.tar.gz: 768126f03759f1fe38ef2e155dad3484836f4422
3
+ metadata.gz: 962697672b93b80fa4ca5efcf46c9e820d184b96
4
+ data.tar.gz: 199917cf6cae91ae3fed243818bf924426b929ca
5
5
  SHA512:
6
- metadata.gz: 9c59f06bef1b3fcc30914a410c29ac52444ea77790f5f4f588515c268bb181a2f353083bfe1ab3ceeba94964052807a6cb556ded863712b09f6488c7d1feff07
7
- data.tar.gz: aa188deccf010883b6d04f3ae63ba0124d9ff4f586690cfccf921be5e1466deed11e72b021083142d0e37134f80a1b462af4e7989af0d3c69c830a19391000c9
6
+ metadata.gz: 08b97dcd3859de27e4b39796fecfb57e5a978e55c8d47dcda19e700acee7cd8a7451e3e60f7e74c750530b17ee038222853babd278b63450b1f71cd82c3d5b12
7
+ data.tar.gz: 5c7be101d3209ff3079e597e069e502be00769aa11bb025825047bcc651bbbda5b1037ac361be4a4531dcea1a4cc32c9f82e922bd461ca4ab438036abbe75446
data/README.md CHANGED
@@ -14,6 +14,14 @@ Command line tool to run text classifier based on naive bayes.
14
14
  % gem install tiny-classifier
15
15
  ```
16
16
 
17
+ If you want to use `--tokenizer=mecab`, you need to install MeCab, for example:
18
+
19
+ ```
20
+ % sudo apt install mecab mecab-ipadic-utf8
21
+ ```
22
+
23
+ This is an example for Ubuntu.
24
+
17
25
  ## Basic usage
18
26
 
19
27
  Training:
@@ -36,13 +44,13 @@ positive
36
44
 
37
45
  ### Common
38
46
 
39
- `--labels=LABELS` (required)
47
+ `-l`, `--labels=LABELS` (required)
40
48
  : A comma-separated list of labels. You should use only alphabetic characters. (Non-alphabetical characters will cause problems.)
41
49
 
42
- `--data-dir=PATH` (optional)
43
- : The path to the directory that the training data to be saved. The current directory.
50
+ `-d`, `--data-dir=PATH` (optional)
51
+ : The path to the directory where the training data is saved. The current directory is the default value.
44
52
 
45
- `--tokenizer=TOKENIZER` (optional)
53
+ `-t`, `--tokenizer=TOKENIZER` (optional)
46
54
  : Tokenizer for input that is not separated by whitespace. The only possible value is `mecab`.
47
55
 
48
56
  ### Trainer
@@ -26,27 +26,30 @@ class TinyClassifierBase
26
26
  end
27
27
 
28
28
  def parse_command_line_options(command_line_options)
29
- option_parser = create_option_parser
30
29
  option_parser.parse!(command_line_options)
31
30
  end
32
31
 
33
32
  private
33
+ def option_parser
34
+ @option_parser ||= create_option_parser
35
+ end
36
+
34
37
  def create_option_parser
35
38
  parser = OptionParser.new
36
39
 
37
- parser.on("--data-dir=PATH",
40
+ parser.on("-d PATH", "--data-dir=PATH",
38
41
  "Path to the directory to store training data file (default=current directory)") do |data_dir|
39
42
  @data_dir = data_dir
40
43
  end
41
44
 
42
- parser.on("--labels=LABELS",
45
+ parser.on("-l LABELS", "--labels=LABELS",
43
46
  "List of labels (comma-separated)") do |labels|
44
47
  @labels = normalize_labels(labels)
45
48
  end
46
49
 
47
- parser.on("--tokenizer=TOKENIZER",
50
+ parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
48
51
  "Tokenizer (default=#{@tokenizer})") do |tokenizer|
49
- @tokenizer = tokenizer.to_sym
52
+ @tokenizer = tokenizer.downcase.to_sym
50
53
  end
51
54
 
52
55
  parser
@@ -113,7 +116,7 @@ class TinyClassifierBase
113
116
  natto = Natto::MeCab.new
114
117
  terms = []
115
118
  natto.parse(@input) do |term|
116
- if term.feature =~ /名詞|形容詞|動詞/
119
+ if term.feature =~ /\A(名詞|形容詞|動詞)/
117
120
  terms << term.surface
118
121
  end
119
122
  end
@@ -31,11 +31,12 @@ class Classifier < TinyClassifierBase
31
31
  @input = params[:input]
32
32
  prepare_input
33
33
  if @input.empty?
34
- exit(1)
34
+ STDERR.puts("Error: No effective input.")
35
+ false
35
36
  else
36
37
  label = classifier.classify(@input)
37
38
  puts label.downcase
38
- exit(0)
39
+ true
39
40
  end
40
41
  end
41
42
  end
@@ -27,20 +27,46 @@ class Trainer < TinyClassifierBase
27
27
  end
28
28
  end
29
29
 
30
+ def initialize
31
+ super
32
+ option_parser.banner += " LABEL"
33
+ end
34
+
30
35
  def run(params)
31
36
  @label = params[:label]
32
37
  @input = params[:input]
38
+ prepare_label
33
39
  prepare_input
34
40
  if @input.empty?
35
- exit(1)
41
+ STDERR.puts("Error: No effective input.")
42
+ false
36
43
  else
37
- classifier.send("train_#{@label.downcase}", @input)
44
+ classifier.send("train_#{@label}", @input)
38
45
  save
39
- exit(0)
46
+ true
40
47
  end
41
48
  end
42
49
 
43
50
  private
51
+ def prepare_label
52
+ unless @label
53
+ STDERR.puts("Error: You need to specify the label for the input.")
54
+ exit(false)
55
+ end
56
+
57
+ @label = @label.downcase.strip
58
+
59
+ if @label.empty?
60
+ STDERR.puts("Error: You need to specify the label for the input.")
61
+ exit(false)
62
+ end
63
+
64
+ unless @labels.include?(@label.capitalize)
65
+ STDERR.puts("Error: You need to specify one of valid labels: #{@labels.join(', ')}")
66
+ exit(false)
67
+ end
68
+ end
69
+
44
70
  def save
45
71
  data = Marshal.dump(classifier)
46
72
  File.open(data_file_path, "w") do |file|
@@ -21,7 +21,7 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))
21
21
 
22
22
  Gem::Specification.new do |spec|
23
23
  spec.name = "tiny-classifier"
24
- spec.version = "1.0"
24
+ spec.version = "1.1"
25
25
  spec.homepage = "https://github.com/piroor/tiny-classifier"
26
26
  spec.authors = ["YUKI \"Piro\" Hiroshi"]
27
27
  spec.email = ["piro.outsider.reflex@gmail.com"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiny-classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.0'
4
+ version: '1.1'
5
5
  platform: ruby
6
6
  authors:
7
7
  - YUKI "Piro" Hiroshi