stuff-classifier-zh 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,122 @@
1
+ # encoding : UTF-8
2
module StuffClassifier

  # Base class for classifier persistence back-ends.
  #
  # A storage keeps one Hash per classifier, keyed by the classifier's
  # +name+. Each of those Hashes maps an instance-variable name (without the
  # leading "@") to the value captured from the classifier.
  class Storage
    # Class-level mixin (use with +extend+) that lets a classifier class
    # declare which of its instance variables should be persisted.
    module ActAsStorable
      # Declares the instance-variable names (without "@") to persist.
      def storable(*to_store)
        @to_store = to_store
      end

      # Returns the names declared through +storable+, or an empty Array
      # when nothing was declared on this class.
      def to_store
        @to_store || []
      end
    end

    attr_accessor :storage

    # Extra arguments are accepted and ignored so subclasses can call a
    # bare +super+ from constructors that take their own parameters.
    def initialize(*opts)
      @storage = {}
    end

    # Copies every value saved under +classifier.name+ back into the
    # classifier's instance variables. Does nothing (returns nil) when no
    # state is stored for that name.
    def storage_to_classifier(classifier)
      return unless @storage.key?(classifier.name)

      @storage[classifier.name].each do |var, value|
        classifier.instance_variable_set("@#{var}", value)
      end
    end

    # Snapshots the storable instance variables of +classifier+ into
    # +@storage+ under its name and returns the snapshot Hash.
    #
    # The variable list is the one declared on the classifier's class plus
    # the one declared on its direct superclass. A superclass that is not
    # storable is skipped instead of raising NoMethodError.
    def classifier_to_storage(classifier)
      klass = classifier.class
      names = klass.to_store
      parent = klass.superclass
      names += parent.to_store if parent.respond_to?(:to_store)

      @storage[classifier.name] = names.each_with_object({}) do |var, snapshot|
        snapshot[var] = classifier.instance_variable_get("@#{var}")
      end
    end

    # Drops the saved state for +classifier+; returns the removed Hash or nil.
    def clear_storage(classifier)
      @storage.delete(classifier.name)
    end

  end

  # Storage that lives only in the current process' memory.
  class InMemoryStorage < Storage
    def initialize
      super
    end

    # Restores the classifier from the in-memory snapshot, if any.
    def load_state(classifier)
      storage_to_classifier(classifier)
    end

    # Snapshots the classifier into memory.
    def save_state(classifier)
      classifier_to_storage(classifier)
    end

    # Forgets the classifier's snapshot.
    def purge_state(classifier)
      clear_storage(classifier)
    end

  end

  # Storage persisted as a single Marshal dump in the file at +path+.
  class FileStorage < Storage
    def initialize(path)
      super
      @path = path
    end

    # Lazily reads the dump file (only while the in-memory storage is still
    # empty) and then restores the classifier from it.
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects; only point
    # this storage at files written by a trusted process.
    def load_state(classifier)
      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
      if @storage.empty? && File.exist?(@path)
        data = File.open(@path, 'rb') do |fh|
          # Shared lock so a dump that is being rewritten is not read halfway.
          fh.flock(File::LOCK_SH)
          fh.read
        end
        # A zero-length file holds no dump; treat it as "nothing saved yet".
        @storage = Marshal.load(data) unless data.empty?
      end
      storage_to_classifier(classifier)
    end

    # Snapshots the classifier and rewrites the dump file.
    def save_state(classifier)
      classifier_to_storage(classifier)
      _write_to_file
    end

    # Forgets the classifier's snapshot and rewrites the dump file.
    def purge_state(classifier)
      clear_storage(classifier)
      _write_to_file
    end

    # Writes the whole storage Hash to +@path+ and returns the number of
    # bytes written.
    #
    # The file is opened WITHOUT truncation and is emptied only after the
    # exclusive lock is held; opening with 'wb' would wipe the file before
    # the lock could protect it.
    def _write_to_file
      File.open(@path, File::RDWR | File::CREAT | File::BINARY, 0644) do |fh|
        fh.flock(File::LOCK_EX)
        fh.truncate(0)
        fh.write(Marshal.dump(@storage))
      end
    end

  end

  # Storage persisted as a single Marshal dump under one Redis key.
  class RedisStorage < Storage
    # key           - Redis key holding the dump.
    # redis_options - options Hash handed to Redis.new (nil means defaults).
    def initialize(key, redis_options=nil)
      super
      @key = key
      @redis = Redis.new(redis_options || {})
    end

    # Lazily fetches the dump (only while the in-memory storage is still
    # empty) and then restores the classifier from it.
    #
    # The value is fetched directly and checked for nil rather than asking
    # EXISTS first: newer redis-rb releases return an Integer from +exists+,
    # and 0 is truthy in Ruby, which made a missing key reach
    # Marshal.load(nil).
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects; only use a
    # Redis instance that is written to by trusted processes.
    def load_state(classifier)
      if @storage.empty?
        data = @redis.get(@key)
        @storage = Marshal.load(data) if data
      end
      storage_to_classifier(classifier)
    end

    # Snapshots the classifier and rewrites the Redis key.
    def save_state(classifier)
      classifier_to_storage(classifier)
      _write_to_redis
    end

    # Forgets the classifier's snapshot and rewrites the Redis key.
    def purge_state(classifier)
      clear_storage(classifier)
      _write_to_redis
    end

    private

    # Serializes the whole storage Hash into the Redis key.
    def _write_to_redis
      data = Marshal.dump(@storage)
      @redis.set(@key, data)
    end
  end
end
@@ -0,0 +1,44 @@
1
# Classifier that scores a text against each category by summing the
# tf-idf weight of its words.
class StuffClassifier::TfIdf < StuffClassifier::Base
  extend StuffClassifier::Storage::ActAsStorable

  def initialize(name, opts={})
    super(name, opts)
  end

  # Returns the tf-idf weight of +word+ within category +cat+.
  #
  # tf  = occurrences of the word in the category / count for the category
  # idf = log10((total_categories + 2) / (categories containing the word + 1))
  #
  # Returns 0.0 when the category count is missing or zero, so the division
  # cannot produce NaN/Infinity (NaN would later make the sort in
  # +cat_scores+ raise).
  # NOTE(review): the counting helpers come from StuffClassifier::Base,
  # which is not visible here - confirm a zero count is really reachable.
  def word_prob(word, cat)
    word_cat_nr = word_count(word, cat)
    cat_nr = cat_count(cat)
    return 0.0 unless cat_nr && cat_nr > 0

    tf = 1.0 * word_cat_nr / cat_nr

    idf = Math.log10((total_categories + 2) / (categories_with_word_count(word) + 1.0))
    tf * idf
  end

  # Sums +word_prob+ over every token of +text+ for category +cat+.
  # Array() guards against a tokenizer that returns nil for blank text;
  # the result is then 0.
  def text_prob(text, cat)
    Array(@tokenizer.each_word(text)).inject(0) { |sum, w| sum + word_prob(w, cat) }
  end

  # Returns [[category, score], ...] sorted by descending score.
  def cat_scores(text)
    probs = categories.each_with_object({}) do |cat, scores|
      scores[cat] = text_prob(text, cat)
    end
    probs.to_a.sort { |a, b| b[1] <=> a[1] }
  end

  # Debug helper: prints the per-category tf-idf weight and text score of
  # +word+ to standard output.
  def word_classification_detail(word)
    p "tf_idf"
    result = categories.inject({}) { |h, cat| h[cat] = word_prob(word, cat); h }
    print_detail(result)

    p "text_prob"
    result = categories.inject({}) { |h, cat| h[cat] = text_prob(word, cat); h }
    print_detail(result)
  end

  private

  # Pretty-prints with awesome_print's +ap+ when it is loaded and falls
  # back to Kernel#p otherwise (awesome_print is only a development
  # dependency of the gem).
  def print_detail(result)
    respond_to?(:ap, true) ? ap(result) : p(result)
  end

end
@@ -0,0 +1,94 @@
1
+ # encoding: utf-8
2
+
3
+ require "lingua/stemmer"
4
+ require 'rmmseg'
5
+ require 'rmmseg/dictionary'
6
+ RMMSeg::Dictionary.load_dictionaries
7
+
8
# Splits text into normalized word tokens: applies per-language
# preprocessing regexps, segments Chinese text with RMMSeg, drops stop
# words, and optionally stems words with Lingua::Stemmer.
class StuffClassifier::Tokenizer
  require "stuff-classifier/tokenizer/tokenizer_properties"

  # Language code given at construction ("en" by default).
  attr_reader :language

  # Overrides for the per-language defaults from TOKENIZER_PROPERTIES.
  attr_writer :preprocessing_regexps, :ignore_words

  # opts[:language] - language code, defaults to "en".
  # opts[:stemming] - whether to stem words, defaults to true.
  #
  # NOTE(review): with :language => "zh" and stemming left enabled, this
  # asks Lingua::Stemmer for a "zh" stemmer - confirm that the stemmer
  # library accepts that language, or pass :stemming => false.
  def initialize(opts={})
    @language = opts.fetch(:language, "en")
    # Languages without an entry get empty properties instead of nil, so
    # the accessors below do not fail with NoMethodError.
    @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language] || {}

    @stemming = opts.fetch(:stemming, true)
    @stemmer = Lingua::Stemmer.new(:language => @language) if @stemming
  end

  # Regexp => replacement pairs applied to every line before tokenizing;
  # nil when neither an override nor a language default exists.
  def preprocessing_regexps
    @preprocessing_regexps || @properties[:preprocessing_regexps]
  end

  # Collection of stop words (anything responding to +member?+); an empty
  # Array when neither an override nor a language default exists.
  def ignore_words
    @ignore_words || @properties[:stop_word] || []
  end

  # True when stemming was requested at construction.
  def stemming?
    @stemming || false
  end

  # Tokenizes +string+ and returns the Array of resulting words
  # (lower-cased, stop words removed, stemmed when enabled). When a block
  # is given, each word is yielded and the block's results are collected
  # instead.
  #
  # Blank input returns an empty Array (rather than nil) so callers can
  # chain Enumerable methods on the result.
  def each_word(string)
    string = string.strip
    return [] if string.empty?

    regexps = preprocessing_regexps
    stop_words = ignore_words
    words = []

    string.split("\n").each do |line|
      # Apply preprocessing regexps; +line+ is a fresh String produced by
      # split, so the in-place gsub! never touches the caller's argument.
      regexps.each { |regexp, replace_by| line.gsub!(regexp, replace_by) } if regexps

      list = language == 'zh' ? segment(line) : line.scan(/\p{Word}+/)

      list.each do |w|
        next if w == '' || stop_words.member?(w.downcase)

        if stemming? && stemable?(w)
          w = @stemmer.stem(w).downcase
          # The stem itself may be a stop word even if the original was not.
          next if stop_words.member?(w)
        else
          w = w.downcase
        end

        words << (block_given? ? (yield w) : w)
      end
    end

    words
  end

  private

  # Only purely alphabetic words are handed to the stemmer.
  def stemable?(word)
    word =~ /^\p{Alpha}+$/
  end

  # Segments +text+ with RMMSeg and returns the token texts tagged as UTF-8.
  def segment(text)
    algor = RMMSeg::Algorithm.new(text)
    tokens = []
    while (tok = algor.next_token)
      tokens << tok.text.force_encoding('utf-8')
    end
    tokens
  end
end
@@ -0,0 +1,107 @@
1
+ # encoding: utf-8
2
+ require 'set'
3
# Per-language tokenizer defaults, keyed by language code.
#
# Each entry may define:
#   :preprocessing_regexps - Regexp => replacement pairs applied to a line
#                            before it is split into words
#   :stop_word             - Set of words dropped from the token stream
StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES = {
  "en" => {
    :preprocessing_regexps => {/['`]/ => '',/[_]/ => ' '},
    :stop_word => Set.new([
      'a', 'about', 'above', 'across', 'after', 'afterwards',
      'again', 'against', 'all', 'almost', 'alone', 'along',
      'already', 'also', 'although', 'always', 'am', 'among',
      'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
      'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
      'are', 'around', 'as', 'at', 'back', 'be',
      'became', 'because', 'become', 'becomes', 'becoming', 'been',
      'before', 'beforehand', 'behind', 'being', 'below', 'beside',
      'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
      'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
      'co', 'computer', 'con', 'could', 'couldnt', 'cry',
      'de', 'describe', 'detail', 'do', 'done', 'down',
      'due', 'during', 'each', 'eg', 'eight', 'either',
      'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
      'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
      'fify', 'fill', 'find', 'fire', 'first', 'five',
      'for', 'former', 'formerly', 'forty', 'found', 'four',
      'from', 'front', 'full', 'further', 'get', 'give',
      'go', 'had', 'has', 'hasnt', 'have', 'he',
      'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
      'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
      'how', 'however', 'hundred', 'i', 'ie', 'if',
      'in', 'inc', 'indeed', 'interest', 'into', 'is',
      'it', 'its', 'itself', 'keep', 'last', 'latter',
      'latterly', 'least', 'less', 'ltd', 'made', 'many',
      'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
      'more', 'moreover', 'most', 'mostly', 'move', 'much',
      'must', 'my', 'myself', 'name', 'namely', 'neither',
      'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
      'none', 'noone', 'nor', 'not', 'nothing', 'now',
      'nowhere', 'of', 'off', 'often', 'on', 'once',
      'one', 'only', 'onto', 'or', 'other', 'others',
      'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
      'own', 'part', 'per', 'perhaps', 'please', 'put',
      'rather', 're', 'same', 'see', 'seem', 'seemed',
      'seeming', 'seems', 'serious', 'several', 'she', 'should',
      'show', 'side', 'since', 'sincere', 'six', 'sixty',
      'so', 'some', 'somehow', 'someone', 'something', 'sometime',
      'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
      'ten', 'than', 'that', 'the', 'their', 'them',
      'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
      'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
      'thin', 'third', 'this', 'those', 'though', 'three',
      'through', 'throughout', 'thru', 'thus', 'to', 'together',
      'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
      'two', 'un', 'under', 'until', 'up', 'upon',
      'us', 'very', 'via', 'was', 'we', 'well',
      'were', 'what', 'whatever', 'when', 'whence', 'whenever',
      'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
      'wherever', 'whether', 'which', 'while', 'whither', 'who',
      'whoever', 'whole', 'whom', 'whose', 'why', 'will',
      'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
      'yourself', 'yourselves'
    ])
  },
  "fr" => {
    :stop_word => Set.new([
      'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux',
      'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon',
      'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa',
      'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre',
      'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées',
      'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
      'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient',
      'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes',
      'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût',
      'fussions', 'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as',
      'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais',
      'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient',
      'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse',
      # 'celà' previously carried a trailing space and so could never match a token.
      'eusses', 'eût', 'eussions', 'eussiez', 'eussent', 'ceci', 'celà', 'cet', 'cette', 'ici',
      'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi'
    ])
  },
  "de" => {
    :stop_word => Set.new([
      'aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an', 'ander', 'andere',
      'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr', 'anders', 'auch', 'auf',
      'aus', 'bei', 'bin', 'bis', 'bist', 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', 'das',
      'daß', 'dass', 'derselbe', 'derselben', 'denselben', 'desselben', 'demselben', 'dieselbe', 'dieselben', 'dasselbe',
      'dazu', 'dein', 'deine', 'deinem', 'deinen', 'deiner', 'deines', 'denn', 'derer', 'dessen', 'dich', 'dir', 'du',
      'dies', 'diese', 'diesem', 'diesen', 'dieser', 'dieses', 'doch', 'dort', 'durch', 'ein', 'eine', 'einem', 'einen',
      'einer', 'eines', 'einig', 'einige', 'einigem', 'einigen', 'einiger', 'einiges', 'einmal', 'er', 'ihn', 'ihm', 'es',
      'etwas', 'euer', 'eure', 'eurem', 'euren', 'eurer', 'eures', 'für', 'gegen', 'gewesen', 'hab', 'habe', 'haben', 'hat',
      'hatte', 'hatten', 'hier', 'hin', 'hinter', 'ich', 'mich', 'mir', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres',
      'euch', 'im', 'in', 'indem', 'ins', 'ist', 'jede', 'jedem', 'jeden', 'jeder', 'jedes', 'jene', 'jenem', 'jenen', 'jener',
      'jenes', 'jetzt', 'kann', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'keines', 'können', 'könnte', 'machen', 'man', 'manche',
      'manchem', 'manchen', 'mancher', 'manches', 'mein', 'meine', 'meinem', 'meinen', 'meiner', 'meines', 'mit', 'muss', 'musste', 'nach',
      'nicht', 'nichts', 'noch', 'nun', 'nur', 'ob', 'oder', 'ohne', 'sehr', 'sein', 'seine', 'seinem', 'seinen', 'seiner', 'seines', 'selbst',
      'sich', 'sie', 'ihnen', 'sind', 'so', 'solche', 'solchem', 'solchen', 'solcher', 'solches', 'soll', 'sollte', 'sondern', 'sonst', 'über',
      'um', 'und', 'uns', 'unse', 'unsem', 'unsen', 'unser', 'unses', 'unter', 'viel', 'vom', 'von', 'vor', 'während', 'war', 'waren', 'warst',
      'was', 'weg', 'weil', 'weiter', 'welche', 'welchem', 'welchen', 'welcher', 'welches', 'wenn', 'werde', 'werden', 'wie', 'wieder', 'will',
      'wir', 'wird', 'wirst', 'wo', 'wollen', 'wollte', 'würde', 'würden', 'zu', 'zum', 'zur', 'zwar', 'zwischen'
    ])
  },
  # Chinese: same preprocessing as English, with an empty stop-word set.
  "zh" =>{
    :preprocessing_regexps => {/['`]/ => '',/[_]/ => ' '},
    :stop_word => Set.new([
    ])
  }
}
@@ -0,0 +1,3 @@
1
module StuffClassifier
  # Gem version string; the gemspec reads it as the gem's version.
  VERSION = '0.5.2'
end
@@ -0,0 +1,36 @@
1
+ # -*- encoding: utf-8 -*-
2
# Make lib/ loadable so the version constant can be required below.
$:.push File.expand_path("../lib", __FILE__)
require "stuff-classifier/version"

# Gem specification for stuff-classifier-zh.
Gem::Specification.new do |s|
  s.name        = "stuff-classifier-zh"
  s.version     = StuffClassifier::VERSION
  s.authors     = ["Alexandru Nedelcu"]
  s.email       = ["github@contact.bionicspirit.com"]
  s.homepage    = "https://github.com/alexandru/stuff-classifier/"
  s.summary     = %q{Simple text classifier(s) implementation}
  s.description = %q{2 methods are provided for now - (1) naive bayes implementation + (2) tf-idf weights}

  # File lists are taken from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.required_ruby_version = '>= 1.9.1'

  s.add_runtime_dependency "ruby-stemmer"
  s.add_runtime_dependency "sequel"
  s.add_runtime_dependency "redis"
  s.add_runtime_dependency "rmmseg-cpp"

  s.add_development_dependency "bundler"
  s.add_development_dependency "rake", ">= 0.9.2"
  s.add_development_dependency "minitest", "~> 4"
  s.add_development_dependency "turn", ">= 0.8.3"
  s.add_development_dependency "simplecov"
  s.add_development_dependency "awesome_print"
  s.add_development_dependency "byebug"
end
36
+
data/test/helper.rb ADDED
@@ -0,0 +1,49 @@
1
+ require 'simplecov'
2
+ SimpleCov.start
3
+
4
+ require 'turn'
5
+ require 'minitest/autorun'
6
+ require 'stuff-classifier'
7
+
8
# Configure the Turn test reporter used by the suite.
Turn.config do |c|
  # use one of output formats:
  # :outline  - turn's original case/test outline mode [default]
  # :progress - indicates progress with progress bar
  # :dotted   - test/unit's traditional dot-progress mode
  # :pretty   - new pretty reporter
  # :marshal  - dump output as YAML (normal run mode only)
  # :cue      - interactive testing
  c.format  = :outline
  # turn on invoke/execute tracing, enable full backtrace
  c.trace   = true
  # use humanized test names (works only with :outline format)
  c.natural = true
end
22
+
23
# Shared base class for the classifier test cases: provides a class-level
# +before+ hook plus small helpers around the classifier under test.
class TestBase < MiniTest::Unit::TestCase
  # Registers (when a block is given) and returns the setup block for this
  # test class. It is stored in a class-level instance variable, so it
  # belongs to the exact class it was declared on.
  def self.before(&block)
    @on_setup = block if block
    @on_setup
  end

  # Runs the registered +before+ block, if any, in the context of the test
  # instance.
  def setup
    on_setup = self.class.before
    instance_eval(&on_setup) if on_setup
  end

  # The classifier under test, as set through +set_classifier+.
  attr_reader :classifier

  # Sets the classifier under test.
  def set_classifier(instance)
    @classifier = instance
  end

  # Trains the classifier under test with +value+ for +category+.
  def train(category, value)
    @classifier.train(category, value)
  end

  # Asserts that +value+ is classified as +category+; the value itself is
  # used as the failure message.
  def should_be(category, value)
    assert_equal category, @classifier.classify(value), value
  end
end