stuff-classifier-chinese 0.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
+ # -*- encoding : utf-8 -*-
2
+ module StuffClassifier
3
+
4
class Storage
  # Mixed in (via `extend`) by classifier classes to declare which
  # instance variables a storage backend should persist.
  module ActAsStorable
    # Declare the instance-variable names (symbols) to persist.
    def storable(*to_store)
      @to_store = to_store
    end

    # Names declared via #storable, or [] when nothing was declared.
    def to_store
      @to_store || []
    end
  end

  # In-memory cache: { classifier_name => { ivar_name => value } }
  attr_accessor :storage

  def initialize(*opts)
    @storage = {}
  end

  # Restore previously saved instance variables onto +classifier+.
  # No-op when nothing is stored under the classifier's name.
  def storage_to_classifier(classifier)
    return unless @storage.key?(classifier.name)

    @storage[classifier.name].each do |var, value|
      classifier.instance_variable_set("@#{var}", value)
    end
  end

  # Snapshot the storable instance variables of +classifier+ into the
  # cache, combining the lists declared on its class and its direct
  # superclass.
  def classifier_to_storage(classifier)
    to_store = classifier.class.to_store + classifier.class.superclass.to_store
    @storage[classifier.name] = to_store.each_with_object({}) do |var, snapshot|
      snapshot[var] = classifier.instance_variable_get("@#{var}")
    end
  end

  # Drop any saved state for +classifier+.
  def clear_storage(classifier)
    @storage.delete(classifier.name)
  end
end
38
+
39
# Keeps classifier state only for the lifetime of the process; nothing
# is written to disk or any external service. (The redundant
# `initialize` that merely called `super` has been removed — the
# inherited Storage#initialize is identical.)
class InMemoryStorage < Storage
  # Push any cached state onto +classifier+.
  def load_state(classifier)
    storage_to_classifier(classifier)
  end

  # Cache the classifier's storable instance variables.
  def save_state(classifier)
    classifier_to_storage(classifier)
  end

  # Forget everything saved for +classifier+.
  def purge_state(classifier)
    clear_storage(classifier)
  end
end
57
+
58
# Persists classifier state to a single file using Marshal.
class FileStorage < Storage
  # path: file the marshalled state is read from / written to.
  def initialize(path)
    super
    @path = path
  end

  # Lazily read the marshalled state from disk (only while the cache is
  # still empty), then push it onto +classifier+.
  def load_state(classifier)
    # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
    if @storage.empty? && File.exist?(@path)
      # NOTE: Marshal.load can execute arbitrary code on crafted input —
      # only ever point @path at trusted files.
      @storage = Marshal.load(File.binread(@path))
    end
    storage_to_classifier(classifier)
  end

  # Cache the classifier's state and flush the whole cache to disk.
  def save_state(classifier)
    classifier_to_storage(classifier)
    _write_to_file
  end

  # Forget the classifier's state and flush the whole cache to disk.
  def purge_state(classifier)
    clear_storage(classifier)
    _write_to_file
  end

  # Serialize the entire cache to @path under an exclusive lock.
  # (Kept public as in the original; RedisStorage keeps its counterpart
  # private — making this private would break existing callers.)
  def _write_to_file
    File.open(@path, 'wb') do |fh|
      fh.flock(File::LOCK_EX)
      fh.write(Marshal.dump(@storage))
    end
  end
end
90
+
91
# Persists classifier state in a single Redis key using Marshal.
class RedisStorage < Storage
  # key: the Redis key holding the marshalled state.
  # redis_options: passed straight to Redis.new (nil -> {}).
  def initialize(key, redis_options=nil)
    super
    @key = key
    @redis = Redis.new(redis_options || {})
  end

  # Lazily fetch the marshalled state from Redis (only while the cache
  # is still empty), then push it onto +classifier+.
  def load_state(classifier)
    # NOTE(review): Redis#exists returns an Integer (not a boolean) in
    # redis-rb >= 4.2 — confirm the pinned gem version, or use #exists?.
    if @storage.length == 0 && @redis.exists(@key)
      data = @redis.get(@key)
      # NOTE: Marshal.load can execute arbitrary code on crafted input —
      # only safe if the Redis instance is trusted.
      @storage = Marshal.load(data)
    end
    storage_to_classifier(classifier)
  end

  # Cache the classifier's state and flush the whole cache to Redis.
  def save_state(classifier)
    classifier_to_storage(classifier)
    _write_to_redis
  end

  # Forget the classifier's state and flush the whole cache to Redis.
  def purge_state(classifier)
    clear_storage(classifier)
    _write_to_redis
  end

  private

  # Dump the entire cache into the configured Redis key.
  def _write_to_redis
    data = Marshal.dump(@storage)
    @redis.set(@key, data)
  end
end
122
+ end
@@ -0,0 +1,45 @@
1
+ # -*- encoding : utf-8 -*-
2
# TF-IDF classifier: scores a document for each category by summing the
# tf-idf weights of its words.
class StuffClassifier::TfIdf < StuffClassifier::Base
  extend StuffClassifier::Storage::ActAsStorable

  def initialize(name, opts={})
    super(name, opts)
  end

  # tf-idf weight of +word+ within category +cat+.
  #   tf : relative frequency of the word inside the category.
  #   idf: log10-scaled inverse of how many categories contain the word;
  #        the +2 / +1.0 smoothing avoids division by zero and log10(<=0).
  def word_prob(word, cat)
    tf = 1.0 * word_count(word, cat) / cat_count(cat)
    idf = Math.log10((total_categories + 2) / (categories_with_word_count(word) + 1.0))
    tf * idf
  end

  # Sum of tf-idf weights over every token of +text+ for +cat+.
  def text_prob(text, cat)
    @tokenizer.each_word(text).map { |w| word_prob(w, cat) }.sum
  end

  # [[category, score], ...] sorted by descending score.
  def cat_scores(text)
    categories
      .map { |cat| [cat, text_prob(text, cat)] }
      .sort_by { |_cat, score| -score }
  end

  # Debugging helper: prints per-category tf-idf and text scores for a
  # single word (uses awesome_print's `ap`, a development dependency).
  def word_classification_detail(word)
    p "tf_idf"
    ap categories.each_with_object({}) { |cat, h| h[cat] = word_prob(word, cat) }

    p "text_prob"
    ap categories.each_with_object({}) { |cat, h| h[cat] = text_prob(word, cat) }
  end
end
@@ -0,0 +1,96 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ require "lingua/stemmer"
4
+ require 'rmmseg'
5
+ require 'debugger'
6
+
7
# Splits text into normalized tokens: RMMSeg segmentation (handles
# Chinese), stop-word filtering, downcasing and optional stemming.
class StuffClassifier::Tokenizer
  require "stuff-classifier/tokenizer/tokenizer_properties"

  include RMMSeg
  # Load the mmseg dictionaries once, at class-definition time.
  RMMSeg::Dictionary.load_dictionaries

  # opts:
  #   :language - language code for stop words / stemmer (default "en")
  #   :stemming - whether to stem tokens (default true)
  def initialize(opts={})
    @language = opts.key?(:language) ? opts[:language] : "en"
    @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language]

    @stemming = opts.key?(:stemming) ? opts[:stemming] : true
    @stemmer = Lingua::Stemmer.new(:language => @language) if @stemming
  end

  attr_reader :language
  attr_writer :preprocessing_regexps, :ignore_words

  # Regexp => replacement pairs applied to each line before tokenizing;
  # falls back to the per-language defaults (may be nil, e.g. for "fr").
  def preprocessing_regexps
    @preprocessing_regexps || @properties[:preprocessing_regexps]
  end

  # Stop-word set; falls back to the per-language defaults.
  def ignore_words
    @ignore_words || @properties[:stop_word]
  end

  def stemming?
    @stemming || false
  end

  # Tokenize +string+. Yields each normalized word when a block is
  # given; returns the array of words. Blank input now yields []
  # (previously nil), so callers can always map/iterate the result.
  def each_word(string)
    string = string.strip
    return [] if string == ''

    words = []

    string.split("\n").each do |line|
      # Apply the per-language preprocessing substitutions in place.
      if preprocessing_regexps
        preprocessing_regexps.each { |regexp, replace_by| line.gsub!(regexp, replace_by) }
      end

      segment(line).each do |w|
        next if w == '' || ignore_words.member?(w.downcase)

        if stemming? and stemable?(w)
          w = @stemmer.stem(w).downcase
          # The stemmed form may itself be a stop word.
          next if ignore_words.member?(w)
        else
          w = w.downcase
        end

        words << (block_given? ? (yield w) : w)
      end
    end

    words
  end

  private

  # Whether +word+ should be passed through the stemmer.
  # Currently everything is considered stemable.
  def stemable?(word)
    true
    #word =~ /^\p{Alpha}+$/
  end

  # Segment +text+ into tokens with RMMSeg.
  def segment(text)
    algor = RMMSeg::Algorithm.new(text)
    result = []
    loop do
      tok = algor.next_token
      break if tok.nil?
      result << tok.text
    end
    result
  end
end
@@ -0,0 +1,81 @@
1
+ # -*- encoding : utf-8 -*-
2
require 'set'
# Per-language tokenizer configuration: preprocessing substitutions and
# stop-word sets. The outer hash is frozen so the shared constant cannot
# be mutated accidentally (the nested Sets are intentionally left as-is).
StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES = {
  "en" => {
    :preprocessing_regexps => {/['`]/ => '', /[_]/ => ' '},
    :stop_word => Set.new([
      # NOTE(review): Chinese particles in the "en" set — this fork
      # targets mixed Chinese/English text; confirm this is intentional.
      '的','个','得',
      'a', 'about', 'above', 'across', 'after', 'afterwards',
      'again', 'against', 'all', 'almost', 'alone', 'along',
      'already', 'also', 'although', 'always', 'am', 'among',
      'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
      'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
      'are', 'around', 'as', 'at', 'back', 'be',
      'became', 'because', 'become', 'becomes', 'becoming', 'been',
      'before', 'beforehand', 'behind', 'being', 'below', 'beside',
      'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
      'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
      'co', 'computer', 'con', 'could', 'couldnt', 'cry',
      'de', 'describe', 'detail', 'do', 'done', 'down',
      'due', 'during', 'each', 'eg', 'eight', 'either',
      'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
      'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
      'fify', 'fill', 'find', 'fire', 'first', 'five',
      'for', 'former', 'formerly', 'forty', 'found', 'four',
      'from', 'front', 'full', 'further', 'get', 'give',
      'go', 'had', 'has', 'hasnt', 'have', 'he',
      'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
      'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
      'how', 'however', 'hundred', 'i', 'ie', 'if',
      'in', 'inc', 'indeed', 'interest', 'into', 'is',
      'it', 'its', 'itself', 'keep', 'last', 'latter',
      'latterly', 'least', 'less', 'ltd', 'made', 'many',
      'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
      'more', 'moreover', 'most', 'mostly', 'move', 'much',
      'must', 'my', 'myself', 'name', 'namely', 'neither',
      'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
      'none', 'noone', 'nor', 'not', 'nothing', 'now',
      'nowhere', 'of', 'off', 'often', 'on', 'once',
      'one', 'only', 'onto', 'or', 'other', 'others',
      'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
      'own', 'part', 'per', 'perhaps', 'please', 'put',
      'rather', 're', 'same', 'see', 'seem', 'seemed',
      'seeming', 'seems', 'serious', 'several', 'she', 'should',
      'show', 'side', 'since', 'sincere', 'six', 'sixty',
      'so', 'some', 'somehow', 'someone', 'something', 'sometime',
      'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
      'ten', 'than', 'that', 'the', 'their', 'them',
      'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
      'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
      'thin', 'third', 'this', 'those', 'though', 'three',
      'through', 'throughout', 'thru', 'thus', 'to', 'together',
      'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
      'two', 'un', 'under', 'until', 'up', 'upon',
      'us', 'very', 'via', 'was', 'we', 'well',
      'were', 'what', 'whatever', 'when', 'whence', 'whenever',
      'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
      'wherever', 'whether', 'which', 'while', 'whither', 'who',
      'whoever', 'whole', 'whom', 'whose', 'why', 'will',
      'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
      'yourself', 'yourselves'
    ])
  },"fr" => {
    # NOTE(review): no :preprocessing_regexps entry for "fr" —
    # Tokenizer#preprocessing_regexps returns nil and skips that step.
    :stop_word => Set.new([
      'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux',
      'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon',
      'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa',
      'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre',
      'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées',
      'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
      'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient',
      'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes',
      'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût',
      'fussions', 'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as',
      'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais',
      'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient',
      'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse',
      # NOTE(review): 'celà ' carries a trailing space (likely a typo);
      # kept byte-for-byte to preserve matching behavior.
      'eusses', 'eût', 'eussions', 'eussiez', 'eussent', 'ceci', 'celà ', 'cet', 'cette', 'ici',
      'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi'
    ])
  }
}.freeze
@@ -0,0 +1,4 @@
1
+ # -*- encoding : utf-8 -*-
2
module StuffClassifier
  # Gem version string, frozen so it cannot be mutated by callers.
  VERSION = '0.51'.freeze
end
@@ -0,0 +1,36 @@
1
+ # -*- encoding: utf-8 -*-
2
$:.push File.expand_path("../lib", __FILE__)
require "stuff-classifier/version"

Gem::Specification.new do |s|
  s.name        = "stuff-classifier-chinese"
  s.version     = StuffClassifier::VERSION
  s.authors     = ["Tim Lang"]
  s.email       = ["langyong135@gmail.com"]
  s.homepage    = "https://github.com/TimLang/stuff-classifier/"
  # Typo fixed: "implemetation" -> "implementation".
  s.summary     = %q{Simple text classifier(s) implementation Chinese version}
  s.description = %q{forked from https://github.com/alexandru/stuff-classifier, 2 methods are provided for now - (1) naive bayes implementation + (2) tf-idf weights}

  # File lists are derived from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.required_ruby_version = '>= 1.9.1'

  s.add_runtime_dependency "ruby-stemmer"
  s.add_runtime_dependency "sequel"
  s.add_runtime_dependency "redis"

  s.add_development_dependency "bundler"
  s.add_development_dependency "rake", ">= 0.9.2"
  s.add_development_dependency "minitest", "~> 4"
  s.add_development_dependency "turn", ">= 0.8.3"
  s.add_development_dependency "simplecov"
  s.add_development_dependency "awesome_print"
  s.add_development_dependency "rmmseg-cpp-huacnlee"
  s.add_development_dependency "debugger"
end
36
+
data/test/helper.rb ADDED
@@ -0,0 +1,50 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'simplecov'
3
+ SimpleCov.start
4
+
5
+ require 'turn'
6
+ require 'minitest/autorun'
7
+ require 'stuff-classifier'
8
+
9
# Configure the Turn test reporter used by this suite.
Turn.config do |c|
  # use one of output formats:
  # :outline - turn's original case/test outline mode [default]
  # :progress - indicates progress with progress bar
  # :dotted - test/unit's traditional dot-progress mode
  # :pretty - new pretty reporter
  # :marshal - dump output as YAML (normal run mode only)
  # :cue - interactive testing
  # NOTE(review): :cue is the interactive reporter — confirm it is
  # intended for automated/CI runs and not just local development.
  c.format = :cue
  # turn on invoke/execute tracing, enable full backtrace
  c.trace = true
  # use humanized test names (works only with :outline format)
  c.natural = true
end
23
+
24
# Shared base class for this gem's test cases: a minimal before-hook
# mechanism plus helpers for training and asserting classifications.
class TestBase < MiniTest::Unit::TestCase
  # With a block: register it to run before each test (stored on the
  # subclass). Without a block: return the registered hook, if any.
  def self.before(&block)
    @on_setup = block if block
    @on_setup
  end

  # Run the subclass's registered before-hook in test-instance context.
  def setup
    hook = self.class.before
    instance_eval(&hook) if hook
  end

  # Install the classifier instance the helpers below operate on.
  def set_classifier(instance)
    @classifier = instance
  end

  # The classifier under test.
  def classifier
    @classifier
  end

  # Train the classifier with one example.
  def train(category, value)
    @classifier.train(category, value)
  end

  # Assert that +value+ is classified as +category+.
  def should_be(category, value)
    assert_equal category, @classifier.classify(value), value
  end
end
@@ -0,0 +1,51 @@
1
+ # -*- encoding : utf-8 -*-
2
+ # -*- coding: utf-8 -*-
3
# `require './helper.rb'` only worked when the tests ran from this
# directory (Ruby >= 1.9 removed "." from the load path);
# require_relative resolves against this file instead.
require_relative 'helper'

# Exercises the tokenizer with the default English setup and a French one.
class Test001Tokenizer < TestBase
  before do
    @en_tokenizer = StuffClassifier::Tokenizer.new
    @fr_tokenizer = StuffClassifier::Tokenizer.new(:language => "fr")
  end

  # Stop words and punctuation are dropped; tokens are downcased.
  def test_simple_tokens
    words = @en_tokenizer.each_word('Hello world! How are you?')
    should_return = ["hello", "world"]

    assert_equal should_return, words
  end

  # Plurals and suffixes are reduced to stems ("really" -> "realli").
  def test_with_stemming
    words = @en_tokenizer.each_word('Lots of dogs, lots of cats! This really is the information highway')
    should_return = ["lot", "dog", "lot", "cat", "realli", "inform", "highway"]

    assert_equal should_return, words
  end

  # Mixed punctuation, identifiers and a multi-line string.
  def test_complicated_tokens
    words = @en_tokenizer.each_word("I don't really get what you want to
      accomplish. There is a class TestEval2, you can do test_eval2 =
      TestEval2.new afterwards. And: class A ... end always yields nil, so
      your output is ok I guess ;-)")

    should_return = [
      "realli", "want", "accomplish", "class",
      "testeval2", "test", "eval2", "testeval2", "new", "class", "end",
      "yield", "nil", "output", "ok", "guess"]

    assert_equal should_return, words
  end

  # Accented input goes through the French stemmer and stop-word list.
  def test_unicode
    words = @fr_tokenizer.each_word("il s'appelle le vilain petit canard : en référence à Hans Christian Andersen, se démarquer négativement")

    should_return = [
      "appel", "vilain", "pet", "canard", "référent",
      "han", "christian", "andersen", "démarqu", "négat"]

    assert_equal should_return, words
  end
end