stuff-classifier-chinese 0.51

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,122 @@
1
+ # -*- encoding : utf-8 -*-
2
+ module StuffClassifier
3
+
4
# Base class for classifier persistence backends.
#
# Holds an in-memory hash (@storage) keyed by classifier name; subclasses
# decide how that hash is loaded from / flushed to a durable medium.
class Storage
  # Mixin for classifier classes: declares which instance variables a
  # Storage backend should persist.
  module ActAsStorable
    # Record the instance-variable names (symbols, without the "@") to persist.
    def storable(*to_store)
      @to_store = to_store
    end

    # Names declared via #storable, or [] when nothing was declared.
    def to_store
      @to_store || []
    end
  end

  attr_accessor :storage

  # Accepts (and ignores) arbitrary options so subclasses can call `super`.
  def initialize(*opts)
    @storage = {}
  end

  # Restore the persisted instance variables onto +classifier+, if any
  # state was previously saved under its name. No-op otherwise.
  def storage_to_classifier(classifier)
    return unless @storage.key?(classifier.name)
    @storage[classifier.name].each do |var, value|
      classifier.instance_variable_set("@#{var}", value)
    end
  end

  # Snapshot the classifier's storable instance variables (its own class's
  # declarations plus its superclass's) into @storage under its name.
  def classifier_to_storage(classifier)
    to_store = classifier.class.to_store + classifier.class.superclass.to_store
    # each_with_object replaces the old `inject({}) { ...; h }` idiom.
    @storage[classifier.name] = to_store.each_with_object({}) do |var, h|
      h[var] = classifier.instance_variable_get("@#{var}")
    end
  end

  # Drop any saved state for +classifier+.
  def clear_storage(classifier)
    @storage.delete(classifier.name)
  end
end
38
+
39
# Keeps classifier state only in the process-local @storage hash inherited
# from Storage; nothing survives the process.
#
# The redundant `def initialize; super; end` was removed — the inherited
# Storage#initialize already sets up @storage.
class InMemoryStorage < Storage
  # Copy any previously saved state back onto +classifier+.
  def load_state(classifier)
    storage_to_classifier(classifier)
  end

  # Snapshot +classifier+'s storable variables into memory.
  def save_state(classifier)
    classifier_to_storage(classifier)
  end

  # Forget any saved state for +classifier+.
  def purge_state(classifier)
    clear_storage(classifier)
  end
end
57
+
58
# Persists classifier state to a single file via Marshal.
class FileStorage < Storage
  # +path+ — filesystem location of the marshalled state file.
  def initialize(path)
    super
    @path = path
  end

  # Lazily read the state file (once, on first use) and restore
  # +classifier+'s variables from it.
  def load_state(classifier)
    # File.exist? replaces File.exists?, which was deprecated for years
    # and removed in Ruby 3.2.
    if @storage.empty? && File.exist?(@path)
      data = File.open(@path, 'rb') { |f| f.read }
      # NOTE(review): Marshal.load is unsafe if the state file can be
      # written by untrusted parties — confirm the file is trusted.
      @storage = Marshal.load(data)
    end
    storage_to_classifier(classifier)
  end

  # Snapshot +classifier+ and flush everything to disk.
  def save_state(classifier)
    classifier_to_storage(classifier)
    _write_to_file
  end

  # Remove +classifier+'s state and rewrite the file.
  def purge_state(classifier)
    clear_storage(classifier)
    _write_to_file
  end

  # Serialize @storage to @path under an exclusive advisory lock.
  def _write_to_file
    File.open(@path, 'wb') do |fh|
      fh.flock(File::LOCK_EX)
      fh.write(Marshal.dump(@storage))
    end
  end
end
90
+
91
# Persists classifier state in a single Redis key via Marshal.
class RedisStorage < Storage
  # +key+ — Redis key holding the marshalled state blob.
  # +redis_options+ — options forwarded to Redis.new (host, port, ...).
  def initialize(key, redis_options=nil)
    super
    @key = key
    @redis = Redis.new(redis_options || {})
  end

  # Lazily fetch the state blob (once) and restore +classifier+.
  def load_state(classifier)
    if @storage.empty?
      # GET returns nil for a missing key, so no separate EXISTS call is
      # needed. The old `if @redis.exists(@key)` guard broke on
      # redis-rb >= 4.2, where `exists` returns an Integer — and 0 is
      # truthy in Ruby, so Marshal.load(nil) would raise.
      data = @redis.get(@key)
      @storage = Marshal.load(data) if data
    end
    storage_to_classifier(classifier)
  end

  # Snapshot +classifier+ and write the blob back to Redis.
  def save_state(classifier)
    classifier_to_storage(classifier)
    _write_to_redis
  end

  # Remove +classifier+'s state and rewrite the blob.
  def purge_state(classifier)
    clear_storage(classifier)
    _write_to_redis
  end

  private

  # Marshal @storage and SET it under @key.
  def _write_to_redis
    data = Marshal.dump(@storage)
    @redis.set(@key, data)
  end
end
122
+ end
@@ -0,0 +1,45 @@
1
+ # -*- encoding : utf-8 -*-
2
# TF-IDF classifier: scores a text against each category by summing the
# per-word tf-idf weights of its tokens.
class StuffClassifier::TfIdf < StuffClassifier::Base
  extend StuffClassifier::Storage::ActAsStorable

  def initialize(name, opts={})
    super(name, opts)
  end

  # tf-idf weight of +word+ within category +cat+.
  #   tf  = occurrences of word in cat / total words in cat
  #   idf = log10((total categories + 2) / (categories containing word + 1))
  # The +2 / +1 smoothing keeps the ratio above 1, so idf stays positive
  # even for words present in every category.
  def word_prob(word, cat)
    tf = 1.0 * word_count(word, cat) / cat_count(cat)
    idf = Math.log10((total_categories + 2) / (categories_with_word_count(word) + 1.0))
    tf * idf
  end

  # Score of +text+ for +cat+: sum of the tf-idf weights of its tokens.
  def text_prob(text, cat)
    @tokenizer.each_word(text).map { |w| word_prob(w, cat) }.inject(0) { |sum, p| sum + p }
  end

  # All categories paired with their score for +text+, as [category, score]
  # pairs sorted by descending score. (The old code built a hash, then
  # redundantly mapped it back to pairs before sorting.)
  def cat_scores(text)
    categories.map { |cat| [cat, text_prob(text, cat)] }.sort_by { |_, score| -score }
  end

  # Debug helper: prints per-category tf-idf and text scores for +word+
  # (relies on awesome_print's `ap`).
  def word_classification_detail(word)
    p "tf_idf"
    result = categories.each_with_object({}) { |cat, h| h[cat] = word_prob(word, cat) }
    ap result

    p "text_prob"
    result = categories.each_with_object({}) { |cat, h| h[cat] = text_prob(word, cat) }
    ap result
  end
end
@@ -0,0 +1,96 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ require "lingua/stemmer"
4
+ require 'rmmseg'
5
+ require 'debugger'
6
+
7
# Splits text into normalized word tokens: RMMSeg segmentation (so CJK
# text is handled), stop-word removal, and optional stemming.
class StuffClassifier::Tokenizer
  require "stuff-classifier/tokenizer/tokenizer_properties"

  include RMMSeg
  # Load the segmentation dictionaries once, at class-definition time.
  RMMSeg::Dictionary.load_dictionaries

  # attr_reader/attr_writer replace the hand-written accessor methods.
  attr_reader :language
  attr_writer :preprocessing_regexps, :ignore_words

  # opts:
  #   :language — key into TOKENIZER_PROPERTIES (default "en")
  #   :stemming — apply a Lingua stemmer to each token (default true)
  def initialize(opts={})
    @language = opts.key?(:language) ? opts[:language] : "en"
    @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language]

    @stemming = opts.key?(:stemming) ? opts[:stemming] : true
    if @stemming
      @stemmer = Lingua::Stemmer.new(:language => @language)
    end
  end

  # Regexp => replacement pairs applied to each line before segmentation;
  # falls back to the per-language defaults.
  def preprocessing_regexps
    @preprocessing_regexps || @properties[:preprocessing_regexps]
  end

  # Stop-word set; falls back to the per-language defaults.
  def ignore_words
    @ignore_words || @properties[:stop_word]
  end

  def stemming?
    @stemming || false
  end

  # Tokenize +string+ and return the array of kept words (downcased,
  # stemmed when enabled, stop words dropped). When a block is given,
  # each word is yielded and the block's results are collected instead.
  #
  # Returns [] for blank input — the old code returned nil, which crashed
  # callers that immediately call .map on the result (e.g. TfIdf#text_prob).
  def each_word(string)
    string = string.strip
    return [] if string == ''

    words = []

    string.split("\n").each do |line|
      # Apply preprocessing regexps before segmentation.
      if preprocessing_regexps
        preprocessing_regexps.each { |regexp, replace_by| line.gsub!(regexp, replace_by) }
      end

      segment(line).each do |w|
        next if w == '' || ignore_words.member?(w.downcase)

        if stemming? && stemable?(w)
          w = @stemmer.stem(w).downcase
          # The stemmed form may itself be a stop word.
          next if ignore_words.member?(w)
        else
          w = w.downcase
        end

        words << (block_given? ? (yield w) : w)
      end
    end

    words
  end

  private

  # Whether +word+ may be passed to the stemmer. Currently always true;
  # the commented pattern would restrict stemming to alphabetic tokens.
  def stemable?(word)
    true
    #word =~ /^\p{Alpha}+$/
  end

  # Run RMMSeg over +text+ and return the token strings in order.
  def segment(text)
    algor = RMMSeg::Algorithm.new(text)
    result = []
    loop do
      tok = algor.next_token
      break if tok.nil?
      result << tok.text
    end
    result
  end
end
@@ -0,0 +1,81 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'set'
3
# Per-language tokenizer configuration: preprocessing regexps applied
# before segmentation, and stop-word sets. Frozen — it is shared,
# read-only configuration.
StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES = {
  "en" => {
    :preprocessing_regexps => {/['`]/ => '', /[_]/ => ' '},
    :stop_word => Set.new([
      # Chinese function words live in the "en" set — presumably because
      # this fork tokenizes mixed Chinese/English text under the default
      # language; verify against callers before moving them.
      '的','个','得',
      'a', 'about', 'above', 'across', 'after', 'afterwards',
      'again', 'against', 'all', 'almost', 'alone', 'along',
      'already', 'also', 'although', 'always', 'am', 'among',
      'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
      'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
      'are', 'around', 'as', 'at', 'back', 'be',
      'became', 'because', 'become', 'becomes', 'becoming', 'been',
      'before', 'beforehand', 'behind', 'being', 'below', 'beside',
      'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
      'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
      'co', 'computer', 'con', 'could', 'couldnt', 'cry',
      'de', 'describe', 'detail', 'do', 'done', 'down',
      'due', 'during', 'each', 'eg', 'eight', 'either',
      'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
      'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
      'fify', 'fill', 'find', 'fire', 'first', 'five',
      'for', 'former', 'formerly', 'forty', 'found', 'four',
      'from', 'front', 'full', 'further', 'get', 'give',
      'go', 'had', 'has', 'hasnt', 'have', 'he',
      'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
      'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
      'how', 'however', 'hundred', 'i', 'ie', 'if',
      'in', 'inc', 'indeed', 'interest', 'into', 'is',
      'it', 'its', 'itself', 'keep', 'last', 'latter',
      'latterly', 'least', 'less', 'ltd', 'made', 'many',
      'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
      'more', 'moreover', 'most', 'mostly', 'move', 'much',
      'must', 'my', 'myself', 'name', 'namely', 'neither',
      'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
      'none', 'noone', 'nor', 'not', 'nothing', 'now',
      'nowhere', 'of', 'off', 'often', 'on', 'once',
      'one', 'only', 'onto', 'or', 'other', 'others',
      'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
      'own', 'part', 'per', 'perhaps', 'please', 'put',
      'rather', 're', 'same', 'see', 'seem', 'seemed',
      'seeming', 'seems', 'serious', 'several', 'she', 'should',
      'show', 'side', 'since', 'sincere', 'six', 'sixty',
      'so', 'some', 'somehow', 'someone', 'something', 'sometime',
      'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
      'ten', 'than', 'that', 'the', 'their', 'them',
      'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
      'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
      'thin', 'third', 'this', 'those', 'though', 'three',
      'through', 'throughout', 'thru', 'thus', 'to', 'together',
      'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
      'two', 'un', 'under', 'until', 'up', 'upon',
      'us', 'very', 'via', 'was', 'we', 'well',
      'were', 'what', 'whatever', 'when', 'whence', 'whenever',
      'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
      'wherever', 'whether', 'which', 'while', 'whither', 'who',
      'whoever', 'whole', 'whom', 'whose', 'why', 'will',
      'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
      'yourself', 'yourselves'
    ])
  },"fr" => {
    :stop_word => Set.new([
      'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux',
      'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon',
      'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa',
      'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre',
      'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées',
      'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
      'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient',
      'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes',
      'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût',
      'fussions', 'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as',
      'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais',
      'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient',
      'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse',
      # 'celà ' had a trailing space, which a segmented token can never
      # contain — the entry was dead. Fixed to 'celà'.
      'eusses', 'eût', 'eussions', 'eussiez', 'eussent', 'ceci', 'celà', 'cet', 'cette', 'ici',
      'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi'
    ])
  }
}.freeze
@@ -0,0 +1,4 @@
1
+ # -*- encoding : utf-8 -*-
2
module StuffClassifier
  # Gem version string (kept in its own file so the gemspec can load it
  # without pulling in the rest of the library). Frozen — constant
  # strings should not be mutable.
  VERSION = '0.51'.freeze
end
@@ -0,0 +1,36 @@
1
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "stuff-classifier/version"

Gem::Specification.new do |s|
  s.name        = "stuff-classifier-chinese"
  s.version     = StuffClassifier::VERSION
  s.authors     = ["Tim Lang"]
  s.email       = ["langyong135@gmail.com"]
  s.homepage    = "https://github.com/TimLang/stuff-classifier/"
  # Typo fixed: "implemetation" -> "implementation".
  s.summary     = %q{Simple text classifier(s) implementation Chinese version}
  s.description = %q{forked from https://github.com/alexandru/stuff-classifier, 2 methods are provided for now - (1) naive bayes implementation + (2) tf-idf weights}

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.required_ruby_version = '>= 1.9.1'

  # The library requires 'rmmseg' at load time (lib tokenizer), so the
  # segmenter must be a runtime dependency, not a development one.
  s.add_runtime_dependency "ruby-stemmer"
  s.add_runtime_dependency "sequel"
  s.add_runtime_dependency "redis"
  s.add_runtime_dependency "rmmseg-cpp-huacnlee"

  s.add_development_dependency "bundler"
  s.add_development_dependency "rake", ">= 0.9.2"
  s.add_development_dependency "minitest", "~> 4"
  s.add_development_dependency "turn", ">= 0.8.3"
  s.add_development_dependency "simplecov"
  s.add_development_dependency "awesome_print"
  # NOTE(review): lib tokenizer also does `require 'debugger'`; either
  # drop that require or promote debugger to a runtime dependency.
  s.add_development_dependency "debugger"
end
36
+
data/test/helper.rb ADDED
@@ -0,0 +1,50 @@
1
# -*- encoding : utf-8 -*-
# Shared test bootstrap: starts coverage, loads the library, and
# configures Turn's test output before TestBase is defined below.
require 'simplecov'
SimpleCov.start

require 'turn'
require 'minitest/autorun'
require 'stuff-classifier'

Turn.config do |c|
  # use one of output formats:
  # :outline - turn's original case/test outline mode [default]
  # :progress - indicates progress with progress bar
  # :dotted - test/unit's traditional dot-progress mode
  # :pretty - new pretty reporter
  # :marshal - dump output as YAML (normal run mode only)
  # :cue - interactive testing
  c.format = :cue
  # turn on invoke/execute tracing, enable full backtrace
  c.trace = true
  # use humanized test names (works only with :outline format)
  c.natural = true
end
23
+
24
# Small MiniTest base class providing a class-level `before` hook plus
# classifier helpers shared by the whole suite.
class TestBase < MiniTest::Unit::TestCase
  attr_reader :classifier

  # With a block: registers it as this class's setup hook.
  # Without: returns the previously registered hook (or nil).
  def self.before(&block)
    @on_setup = block if block
    @on_setup
  end

  # Runs the registered hook in the context of the test instance, so the
  # hook can assign instance variables used by the tests.
  def setup
    hook = self.class.before
    instance_eval(&hook) if hook
  end

  # Stores the classifier under test (readable via #classifier).
  def set_classifier(instance)
    @classifier = instance
  end

  # Feed one training sample to the current classifier.
  def train(category, value)
    @classifier.train(category, value)
  end

  # Assert that +value+ classifies as +category+; the value itself is the
  # failure message.
  def should_be(category, value)
    assert_equal category, @classifier.classify(value), value
  end
end
@@ -0,0 +1,51 @@
1
+ # -*- encoding : utf-8 -*-
2
+ # -*- coding: utf-8 -*-
3
+ require './helper.rb'
4
+
5
# Exercises StuffClassifier::Tokenizer: stop-word removal, stemming,
# multi-line input, and non-ASCII (French) text.
class Test001Tokenizer < TestBase
  before do
    # Default tokenizer is English; a second instance is built for French.
    @en_tokenizer = StuffClassifier::Tokenizer.new
    @fr_tokenizer = StuffClassifier::Tokenizer.new(:language => "fr")
  end

  # Stop words ("how", "are", "you") and punctuation are stripped.
  def test_simple_tokens
    words = @en_tokenizer.each_word('Hello world! How are you?')
    should_return = ["hello", "world"]

    assert_equal should_return, words
  end

  # Stemming reduces plurals/suffixes: "dogs" -> "dog", "really" -> "realli".
  def test_with_stemming
    words = @en_tokenizer.each_word('Lots of dogs, lots of cats! This really is the information highway')
    should_return =["lot", "dog", "lot", "cat", "realli" ,"inform", "highway" ]

    assert_equal should_return, words

  end

  # Multi-line input with mixed-case identifiers and punctuation; note
  # "test_eval2" splits into "test" + "eval2" via the '_' preprocessing.
  def test_complicated_tokens
    words = @en_tokenizer.each_word("I don't really get what you want to
    accomplish. There is a class TestEval2, you can do test_eval2 =
    TestEval2.new afterwards. And: class A ... end always yields nil, so
    your output is ok I guess ;-)")

    should_return = [
      "realli", "want", "accomplish", "class",
      "testeval2", "test", "eval2","testeval2", "new", "class", "end",
      "yield", "nil", "output", "ok", "guess"]

    assert_equal should_return, words
  end

  # French stemming with accented characters and elisions ("s'appelle").
  def test_unicode

    words = @fr_tokenizer.each_word("il s'appelle le vilain petit canard : en référence à Hans Christian Andersen, se démarquer négativement")

    should_return = [
      "appel", "vilain", "pet", "canard", "référent",
      "han", "christian", "andersen", "démarqu", "négat"]

    assert_equal should_return, words
  end

end