stuff-classifier-zh 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
+ # encoding : UTF-8
2
module StuffClassifier

  # Base class for classifier state persistence.
  #
  # State is kept in the +storage+ hash, keyed by classifier name; each value
  # is a hash mapping a storable variable name to the value of the matching
  # instance variable on the classifier. Subclasses decide where that hash is
  # persisted (memory, file, Redis).
  class Storage
    # Class-level mixin (use with +extend+) that lets a classifier class
    # declare which of its instance variables should be persisted.
    module ActAsStorable
      # Declares the instance variable names (without the leading "@") that
      # make up the persisted state of the extending class.
      def storable(*to_store)
        @to_store = to_store
      end

      # Returns the names declared with +storable+ on this very class, or an
      # empty array when nothing was declared.
      def to_store
        @to_store || []
      end
    end

    attr_accessor :storage

    # Extra arguments are accepted and ignored so that subclasses with their
    # own constructor arguments can call a bare +super+.
    def initialize(*opts)
      @storage = {}
    end

    # Copies the saved state for +classifier.name+ (if any) back into the
    # classifier's instance variables.
    def storage_to_classifier(classifier)
      if @storage.key?(classifier.name)
        @storage[classifier.name].each do |var, value|
          classifier.instance_variable_set("@#{var}", value)
        end
      end
    end

    # Snapshots every storable instance variable of +classifier+ into
    # +storage+ under the classifier's name and returns that snapshot.
    def classifier_to_storage(classifier)
      snapshot = storable_variables(classifier.class).each_with_object({}) do |var, state|
        state[var] = classifier.instance_variable_get("@#{var}")
      end
      @storage[classifier.name] = snapshot
    end

    # Forgets any saved state for +classifier.name+.
    def clear_storage(classifier)
      @storage.delete(classifier.name)
    end

    private

    # Collects the storable variable names of +klass+ and of every ancestor
    # class that also answers +to_store+, nearest class first. Walking stops
    # at the first class that is not storable, so a classifier whose direct
    # superclass is a plain Object no longer raises NoMethodError.
    def storable_variables(klass)
      names = []
      while klass.respond_to?(:to_store)
        names.concat(klass.to_store)
        klass = klass.superclass
      end
      names
    end

  end

  # Keeps classifier state only in the process' memory.
  class InMemoryStorage < Storage
    def initialize
      super
    end

    def load_state(classifier)
      storage_to_classifier(classifier)
    end

    def save_state(classifier)
      classifier_to_storage(classifier)
    end

    def purge_state(classifier)
      clear_storage(classifier)
    end

  end

  # Persists the whole +storage+ hash as a Marshal dump in a single file.
  class FileStorage < Storage
    # path - location of the dump file; created on first write.
    def initialize(path)
      super()
      @path = path
    end

    # Lazily reads the dump file (only while nothing is loaded yet) and then
    # restores +classifier+ from it.
    def load_state(classifier)
      # File.exist? is used because File.exists? was removed in Ruby 3.2.
      if @storage.length == 0 && File.exist?(@path)
        data = File.open(@path, 'rb') do |fh|
          # Shared lock: wait for a writer holding the exclusive lock.
          fh.flock(File::LOCK_SH)
          fh.read
        end
        # SECURITY: Marshal.load can instantiate arbitrary objects; the dump
        # file must only ever come from a trusted location.
        # An empty file (e.g. created but not yet written) is treated as
        # "no state" instead of raising "marshal data too short".
        @storage = Marshal.load(data) unless data.empty?
      end
      storage_to_classifier(classifier)
    end

    def save_state(classifier)
      classifier_to_storage(classifier)
      _write_to_file
    end

    def purge_state(classifier)
      clear_storage(classifier)
      _write_to_file
    end

    # Writes the full +storage+ hash to the dump file.
    #
    # The file is opened without truncation and only emptied once the
    # exclusive lock is held; opening with 'wb' would wipe the contents
    # before the lock is acquired and let a concurrent reader see a
    # half-written file.
    def _write_to_file
      File.open(@path, File::RDWR | File::CREAT | File::BINARY) do |fh|
        fh.binmode
        fh.flock(File::LOCK_EX)
        fh.rewind
        fh.truncate(0)
        fh.write(Marshal.dump(@storage))
      end
    end

  end

  # Persists the whole +storage+ hash as a Marshal dump under one Redis key.
  class RedisStorage < Storage
    # key           - Redis key holding the dump.
    # redis_options - options hash handed to Redis.new (defaults to {}).
    def initialize(key, redis_options=nil)
      super()
      @key = key
      @redis = Redis.new(redis_options || {})
    end

    # Lazily fetches the dump (only while nothing is loaded yet) and then
    # restores +classifier+ from it.
    def load_state(classifier)
      if @storage.length == 0
        # GET returns nil for a missing key, so no separate EXISTS call is
        # made. Newer redis-rb returns an Integer from #exists, and 0 is
        # truthy in Ruby, which led to Marshal.load(nil).
        data = @redis.get(@key)
        # SECURITY: Marshal.load can instantiate arbitrary objects; the Redis
        # instance must be trusted.
        @storage = Marshal.load(data) if data
      end
      storage_to_classifier(classifier)
    end

    def save_state(classifier)
      classifier_to_storage(classifier)
      _write_to_redis
    end

    def purge_state(classifier)
      clear_storage(classifier)
      _write_to_redis
    end

    private

    # Stores the full +storage+ hash under the configured key.
    def _write_to_redis
      data = Marshal.dump(@storage)
      @redis.set(@key, data)
    end
  end
end
@@ -0,0 +1,44 @@
1
# Classifier that scores a text per category by summing a tf * idf weight
# for each of its words. Word/category counters come from the base class.
class StuffClassifier::TfIdf < StuffClassifier::Base
  extend StuffClassifier::Storage::ActAsStorable

  def initialize(name, opts={})
    super(name, opts)
  end

  # Returns the tf-idf weight of +word+ within category +cat+.
  #
  # tf  = occurrences of the word in the category / category count
  # idf = log10((total categories + 2) / (categories containing the word + 1))
  def word_prob(word, cat)
    word_cat_nr = word_count(word, cat)
    cat_nr = cat_count(cat)

    # A zero category count would turn tf into NaN or Infinity, which later
    # breaks the <=> based sort in cat_scores. Report "no weight" instead.
    # NOTE(review): assumes cat_count returns a number; confirm in Base.
    return 0.0 if cat_nr == 0

    tf = 1.0 * word_cat_nr / cat_nr

    idf = Math.log10((total_categories + 2) / (categories_with_word_count(word) + 1.0))
    tf * idf
  end

  # Returns the summed tf-idf weight of every token of +text+ for +cat+.
  def text_prob(text, cat)
    # Array() guards against a tokenizer that returns nil for blank text.
    Array(@tokenizer.each_word(text)).inject(0) { |sum, w| sum + word_prob(w, cat) }
  end

  # Returns [category, score] pairs for +text+, highest score first.
  def cat_scores(text)
    probs = {}
    categories.each { |cat| probs[cat] = text_prob(text, cat) }
    probs.to_a.sort { |a, b| b[1] <=> a[1] }
  end

  # Debug helper: prints the per-category word weight and text weight of
  # +word+ to standard output.
  def word_classification_detail(word)
    p "tf_idf"
    print_detail(categories.each_with_object({}) { |cat, h| h[cat] = word_prob(word, cat) })

    p "text_prob"
    print_detail(categories.each_with_object({}) { |cat, h| h[cat] = text_prob(word, cat) })
  end

  private

  # Pretty-prints with awesome_print when it is loaded; that gem is only a
  # development dependency, so fall back to Kernel#p otherwise.
  def print_detail(result)
    respond_to?(:ap, true) ? ap(result) : p(result)
  end

end
@@ -0,0 +1,94 @@
1
+ # encoding: utf-8
2
+
3
+ require "lingua/stemmer"
4
+ require 'rmmseg'
5
+ require 'rmmseg/dictionary'
6
+ RMMSeg::Dictionary.load_dictionaries
7
+
8
# Splits text into normalized words: per-line preprocessing, word
# extraction (RMMSeg segmentation for Chinese, Unicode word runs otherwise),
# stop-word removal, optional stemming and downcasing.
class StuffClassifier::Tokenizer
  require "stuff-classifier/tokenizer/tokenizer_properties"

  attr_reader :language
  attr_writer :preprocessing_regexps, :ignore_words

  # opts - :language (default "en") selects the stop words, preprocessing
  #        rules and stemmer; :stemming (default true) toggles stemming.
  def initialize(opts={})
    @language = opts.key?(:language) ? opts[:language] : "en"
    # Languages without an entry get empty properties instead of nil, so the
    # accessors below do not raise NoMethodError.
    @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language] || {}

    @stemming = opts.key?(:stemming) ? opts[:stemming] : true
    # NOTE(review): the Snowball stemmer behind Lingua::Stemmer has, as far
    # as we know, no Chinese algorithm and raises for 'zh'; stemming is
    # therefore switched off for Chinese. Confirm against ruby-stemmer.
    @stemming = false if @language == 'zh'
    @stemmer = Lingua::Stemmer.new(:language => @language) if @stemming
  end

  # Returns the regexp => replacement rules applied to every line, either
  # the custom ones or the language defaults (nil when there are none).
  def preprocessing_regexps
    @preprocessing_regexps || @properties[:preprocessing_regexps]
  end

  # Returns the words to drop: the custom list, else the language stop
  # words, else an empty list.
  def ignore_words
    @ignore_words || @properties[:stop_word] || []
  end

  def stemming?
    @stemming || false
  end

  # Tokenizes +string+ and returns the array of resulting words. When a
  # block is given each word is yielded and the block's results are
  # collected instead. Blank input yields an empty array (never nil), so
  # callers can chain Enumerable methods safely.
  def each_word(string)
    string = string.strip
    return [] if string.empty?

    words = []
    stop_words = ignore_words
    rules = preprocessing_regexps

    string.split("\n").each do |line|
      # Apply preprocessing regexps; the lines are fresh copies, so the
      # caller's string is never mutated.
      rules.each { |regexp, replace_by| line.gsub!(regexp, replace_by) } if rules

      tokens = language == 'zh' ? segment(line) : line.scan(/\p{Word}+/)

      tokens.each do |w|
        next if w == '' || stop_words.member?(w.downcase)

        if stemming? && stemable?(w)
          w = @stemmer.stem(w).downcase
          # The stem itself may be a stop word.
          next if stop_words.member?(w)
        else
          w = w.downcase
        end

        words << (block_given? ? (yield w) : w)
      end
    end

    words
  end

  private

  # Only purely alphabetic words are handed to the stemmer.
  def stemable?(word)
    word =~ /\A\p{Alpha}+\z/
  end

  # Segments +text+ with RMMSeg and returns the token texts as UTF-8 strings.
  def segment(text)
    algor = RMMSeg::Algorithm.new(text)
    result = []
    loop do
      tok = algor.next_token
      break if tok.nil?
      result << tok.text.force_encoding('utf-8')
    end
    result
  end
end
@@ -0,0 +1,107 @@
1
+ # encoding: utf-8
2
+ require 'set'
3
# Per-language tokenizer defaults, keyed by language code.
#   :preprocessing_regexps - regexp => replacement pairs applied to each line
#                            before word extraction (optional)
#   :stop_word             - Set of lower-case words dropped from the output
StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES = {
  "en" => {
    :preprocessing_regexps => {/['`]/ => '',/[_]/ => ' '},
    :stop_word => Set.new([
      'a', 'about', 'above', 'across', 'after', 'afterwards',
      'again', 'against', 'all', 'almost', 'alone', 'along',
      'already', 'also', 'although', 'always', 'am', 'among',
      'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
      'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
      'are', 'around', 'as', 'at', 'back', 'be',
      'became', 'because', 'become', 'becomes', 'becoming', 'been',
      'before', 'beforehand', 'behind', 'being', 'below', 'beside',
      'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
      'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
      'co', 'computer', 'con', 'could', 'couldnt', 'cry',
      'de', 'describe', 'detail', 'do', 'done', 'down',
      'due', 'during', 'each', 'eg', 'eight', 'either',
      'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
      'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
      # 'fify' is a historical typo kept for compatibility; 'fifty' is the
      # word that was meant.
      'fifty', 'fify', 'fill', 'find', 'fire', 'first', 'five',
      'for', 'former', 'formerly', 'forty', 'found', 'four',
      'from', 'front', 'full', 'further', 'get', 'give',
      'go', 'had', 'has', 'hasnt', 'have', 'he',
      'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
      'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
      'how', 'however', 'hundred', 'i', 'ie', 'if',
      'in', 'inc', 'indeed', 'interest', 'into', 'is',
      'it', 'its', 'itself', 'keep', 'last', 'latter',
      'latterly', 'least', 'less', 'ltd', 'made', 'many',
      'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
      'more', 'moreover', 'most', 'mostly', 'move', 'much',
      'must', 'my', 'myself', 'name', 'namely', 'neither',
      'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
      'none', 'noone', 'nor', 'not', 'nothing', 'now',
      'nowhere', 'of', 'off', 'often', 'on', 'once',
      'one', 'only', 'onto', 'or', 'other', 'others',
      'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
      'own', 'part', 'per', 'perhaps', 'please', 'put',
      'rather', 're', 'same', 'see', 'seem', 'seemed',
      'seeming', 'seems', 'serious', 'several', 'she', 'should',
      'show', 'side', 'since', 'sincere', 'six', 'sixty',
      'so', 'some', 'somehow', 'someone', 'something', 'sometime',
      'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
      'ten', 'than', 'that', 'the', 'their', 'them',
      'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
      'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
      'thin', 'third', 'this', 'those', 'though', 'three',
      'through', 'throughout', 'thru', 'thus', 'to', 'together',
      'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
      'two', 'un', 'under', 'until', 'up', 'upon',
      'us', 'very', 'via', 'was', 'we', 'well',
      'were', 'what', 'whatever', 'when', 'whence', 'whenever',
      'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
      'wherever', 'whether', 'which', 'while', 'whither', 'who',
      'whoever', 'whole', 'whom', 'whose', 'why', 'will',
      'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
      'yourself', 'yourselves'
    ])
  },
  "fr" => {
    :stop_word => Set.new([
      'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux',
      'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon',
      'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa',
      'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre',
      'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées',
      'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
      'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient',
      'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes',
      'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût',
      'fussions', 'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as',
      'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais',
      'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient',
      'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse',
      # The entry used to carry a trailing space and therefore never matched
      # a token; both spellings are listed without it.
      'eusses', 'eût', 'eussions', 'eussiez', 'eussent', 'ceci', 'cela', 'celà', 'cet', 'cette', 'ici',
      'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi'
    ])
  },
  "de" => {
    :stop_word => Set.new([
      'aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an', 'ander', 'andere',
      'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr', 'anders', 'auch', 'auf',
      'aus', 'bei', 'bin', 'bis', 'bist', 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', 'das',
      'daß', 'dass', 'derselbe', 'derselben', 'denselben', 'desselben', 'demselben', 'dieselbe', 'dieselben', 'dasselbe',
      'dazu', 'dein', 'deine', 'deinem', 'deinen', 'deiner', 'deines', 'denn', 'derer', 'dessen', 'dich', 'dir', 'du',
      'dies', 'diese', 'diesem', 'diesen', 'dieser', 'dieses', 'doch', 'dort', 'durch', 'ein', 'eine', 'einem', 'einen',
      'einer', 'eines', 'einig', 'einige', 'einigem', 'einigen', 'einiger', 'einiges', 'einmal', 'er', 'ihn', 'ihm', 'es',
      'etwas', 'euer', 'eure', 'eurem', 'euren', 'eurer', 'eures', 'für', 'gegen', 'gewesen', 'hab', 'habe', 'haben', 'hat',
      'hatte', 'hatten', 'hier', 'hin', 'hinter', 'ich', 'mich', 'mir', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres',
      'euch', 'im', 'in', 'indem', 'ins', 'ist', 'jede', 'jedem', 'jeden', 'jeder', 'jedes', 'jene', 'jenem', 'jenen', 'jener',
      'jenes', 'jetzt', 'kann', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'keines', 'können', 'könnte', 'machen', 'man', 'manche',
      'manchem', 'manchen', 'mancher', 'manches', 'mein', 'meine', 'meinem', 'meinen', 'meiner', 'meines', 'mit', 'muss', 'musste', 'nach',
      'nicht', 'nichts', 'noch', 'nun', 'nur', 'ob', 'oder', 'ohne', 'sehr', 'sein', 'seine', 'seinem', 'seinen', 'seiner', 'seines', 'selbst',
      'sich', 'sie', 'ihnen', 'sind', 'so', 'solche', 'solchem', 'solchen', 'solcher', 'solches', 'soll', 'sollte', 'sondern', 'sonst', 'über',
      'um', 'und', 'uns', 'unse', 'unsem', 'unsen', 'unser', 'unses', 'unter', 'viel', 'vom', 'von', 'vor', 'während', 'war', 'waren', 'warst',
      'was', 'weg', 'weil', 'weiter', 'welche', 'welchem', 'welchen', 'welcher', 'welches', 'wenn', 'werde', 'werden', 'wie', 'wieder', 'will',
      'wir', 'wird', 'wirst', 'wo', 'wollen', 'wollte', 'würde', 'würden', 'zu', 'zum', 'zur', 'zwar', 'zwischen'
    ])
  },
  # Chinese ships with the same preprocessing rules as English and no
  # default stop words.
  "zh" => {
    :preprocessing_regexps => {/['`]/ => '',/[_]/ => ' '},
    :stop_word => Set.new([
    ])
  }
}
@@ -0,0 +1,3 @@
1
module StuffClassifier
  # Version of the gem; frozen so the constant cannot be mutated in place.
  VERSION = '0.5.2'.freeze
end
@@ -0,0 +1,36 @@
1
+ # -*- encoding: utf-8 -*-
2
# Make lib/ loadable so the version constant can be read while the
# specification is being evaluated.
$LOAD_PATH.push File.expand_path("../lib", __FILE__)
require "stuff-classifier/version"

Gem::Specification.new do |s|
  s.name        = "stuff-classifier-zh"
  s.version     = StuffClassifier::VERSION
  s.authors     = ["Alexandru Nedelcu"]
  s.email       = ["github@contact.bionicspirit.com"]
  s.homepage    = "https://github.com/alexandru/stuff-classifier/"
  s.summary     = %q{Simple text classifier(s) implementation}
  s.description = %q{2 methods are provided for now - (1) naive bayes implementation + (2) tf-idf weights}

  # The packaged file lists are taken from git, so the gem has to be built
  # from inside a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.required_ruby_version = '>= 1.9.1'

  s.add_runtime_dependency "ruby-stemmer"
  s.add_runtime_dependency "sequel"
  s.add_runtime_dependency "redis"
  s.add_runtime_dependency "rmmseg-cpp"

  s.add_development_dependency "bundler"
  s.add_development_dependency "rake", ">= 0.9.2"
  s.add_development_dependency "minitest", "~> 4"
  s.add_development_dependency "turn", ">= 0.8.3"
  s.add_development_dependency "simplecov"
  s.add_development_dependency "awesome_print"
  s.add_development_dependency "byebug"
end
36
+
data/test/helper.rb ADDED
@@ -0,0 +1,49 @@
1
+ require 'simplecov'
2
+ SimpleCov.start
3
+
4
+ require 'turn'
5
+ require 'minitest/autorun'
6
+ require 'stuff-classifier'
7
+
8
# Reporter settings for the turn test runner.
Turn.config do |config|
  # Output format; the choices listed by the original configuration are:
  #   :outline  - turn's original case/test outline mode [default]
  #   :progress - indicates progress with progress bar
  #   :dotted   - test/unit's traditional dot-progress mode
  #   :pretty   - new pretty reporter
  #   :marshal  - dump output as YAML (normal run mode only)
  #   :cue      - interactive testing
  config.format = :outline

  # Invoke/execute tracing with full backtraces.
  config.trace = true

  # Humanized test names (works only with the :outline format).
  config.natural = true
end
22
+
23
# Shared base class for the classifier tests: offers a class-level `before`
# hook plus small helpers around the classifier under test.
class TestBase < MiniTest::Unit::TestCase
  # Reader for the classifier registered through set_classifier.
  attr_reader :classifier

  # With a block: remembers it as this class' setup hook.
  # Always returns the currently remembered hook (nil when none was set).
  def self.before(&block)
    @on_setup = block unless block.nil?
    @on_setup
  end

  # Runs the class' `before` hook, if any, in the context of the test instance.
  def setup
    hook = self.class.before
    return unless hook

    instance_eval(&hook)
  end

  # Registers the classifier instance the helpers below operate on.
  def set_classifier(instance)
    @classifier = instance
  end

  # Trains the registered classifier with +value+ under +category+.
  def train(category, value)
    @classifier.train(category, value)
  end

  # Asserts that +value+ is classified as +category+; the value itself is
  # used as the failure message.
  def should_be(category, value)
    actual = @classifier.classify(value)
    assert_equal category, actual, value
  end
end