stuff-classifier-chinese 0.51
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +20 -0
- data/README.md +162 -0
- data/Rakefile +12 -0
- data/lib/stuff-classifier.rb +17 -0
- data/lib/stuff-classifier/base.rb +190 -0
- data/lib/stuff-classifier/bayes.rb +81 -0
- data/lib/stuff-classifier/storage.rb +122 -0
- data/lib/stuff-classifier/tf-idf.rb +45 -0
- data/lib/stuff-classifier/tokenizer.rb +96 -0
- data/lib/stuff-classifier/tokenizer/tokenizer_properties.rb +81 -0
- data/lib/stuff-classifier/version.rb +4 -0
- data/stuff-classifier.gemspec +36 -0
- data/test/helper.rb +50 -0
- data/test/test_001_tokenizer.rb +51 -0
- data/test/test_002_base.rb +39 -0
- data/test/test_003_naive_bayes.rb +57 -0
- data/test/test_004_tf_idf.rb +38 -0
- data/test/test_005_in_memory_storage.rb +32 -0
- data/test/test_006_file_storage.rb +78 -0
- data/test/test_007_redis_storage.rb +82 -0
- metadata +253 -0
@@ -0,0 +1,122 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
module StuffClassifier

  # Base class for the persistence back-ends.
  #
  # State is kept in +@storage+, a Hash keyed by classifier name whose values
  # are Hashes of { instance-variable name (without "@") => value }.
  # Subclasses decide where that Hash is persisted (memory, file, Redis).
  class Storage
    # Class-level mixin (used via +extend+) that lets a classifier class
    # declare which of its instance variables should be persisted.
    module ActAsStorable
      # Records the names of the instance variables to persist.
      def storable(*to_store)
        @to_store = to_store
      end

      # Returns the declared names, or an empty Array if none were declared.
      def to_store
        @to_store || []
      end
    end

    attr_accessor :storage

    # Accepts (and ignores) any arguments so that subclasses can call a bare
    # +super+ from constructors with their own parameters.
    def initialize(*opts)
      @storage = {}
    end

    # Copies every stored value for +classifier.name+ back onto the classifier
    # as instance variables. Does nothing (returns nil) when nothing is stored
    # under that name.
    def storage_to_classifier(classifier)
      return unless @storage.key?(classifier.name)

      @storage[classifier.name].each do |var, value|
        classifier.instance_variable_set("@#{var}", value)
      end
    end

    # Snapshots the storable instance variables of +classifier+ into
    # +@storage+. The list of variables is the one declared on the
    # classifier's class followed by the one declared on its direct
    # superclass (which must therefore also respond to +to_store+).
    def classifier_to_storage(classifier)
      to_store = classifier.class.to_store + classifier.class.superclass.to_store
      @storage[classifier.name] = to_store.each_with_object({}) do |var, snapshot|
        snapshot[var] = classifier.instance_variable_get("@#{var}")
      end
    end

    # Forgets everything stored under +classifier.name+.
    def clear_storage(classifier)
      @storage.delete(classifier.name)
    end

  end

  # Keeps classifier state in process memory only.
  class InMemoryStorage < Storage
    def initialize
      super
    end

    def load_state(classifier)
      storage_to_classifier(classifier)
    end

    def save_state(classifier)
      classifier_to_storage(classifier)
    end

    def purge_state(classifier)
      clear_storage(classifier)
    end

  end

  # Persists the whole storage Hash as a Marshal dump in a single file.
  class FileStorage < Storage
    # path - location of the dump file; it is created on first save.
    def initialize(path)
      super
      @path = path
    end

    # Lazily reads the dump file (only while the in-memory storage is still
    # empty and the file exists), then restores +classifier+ from it.
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects; only point
    # this class at files written by a trusted process.
    def load_state(classifier)
      # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
      if @storage.empty? && File.exist?(@path)
        data = File.open(@path, 'rb') do |fh|
          # Shared lock so a concurrent _write_to_file cannot be observed
          # half-written.
          fh.flock(File::LOCK_SH)
          fh.read
        end
        @storage = Marshal.load(data)
      end
      storage_to_classifier(classifier)
    end

    def save_state(classifier)
      classifier_to_storage(classifier)
      _write_to_file
    end

    def purge_state(classifier)
      clear_storage(classifier)
      _write_to_file
    end

    # Writes the complete storage Hash to +@path+ under an exclusive lock.
    #
    # The file is opened WITHOUT truncation and only truncated once the lock
    # is held; opening with 'wb' would empty the file before the lock is
    # acquired, defeating the purpose of the lock.
    def _write_to_file
      File.open(@path, File::RDWR | File::CREAT | File::BINARY) do |fh|
        fh.flock(File::LOCK_EX)
        fh.truncate(0)
        fh.write(Marshal.dump(@storage))
      end
    end

  end

  # Persists the whole storage Hash as a Marshal dump under one Redis key.
  class RedisStorage < Storage
    # key           - Redis key holding the dump.
    # redis_options - options Hash handed to Redis.new (defaults to {}).
    def initialize(key, redis_options=nil)
      super
      @key = key
      @redis = Redis.new(redis_options || {})
    end

    # Lazily fetches the dump (only while the in-memory storage is still
    # empty), then restores +classifier+ from it.
    #
    # The key is fetched directly and checked for nil rather than asking
    # +exists+ first: depending on the redis-rb version +exists+ returns an
    # Integer, and 0 is truthy in Ruby, which led to Marshal.load(nil).
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects; only use a
    # Redis instance whose contents are trusted.
    def load_state(classifier)
      if @storage.empty?
        data = @redis.get(@key)
        @storage = Marshal.load(data) if data
      end
      storage_to_classifier(classifier)
    end

    def save_state(classifier)
      classifier_to_storage(classifier)
      _write_to_redis
    end

    def purge_state(classifier)
      clear_storage(classifier)
      _write_to_redis
    end

    private

    # Serialises the complete storage Hash into the configured key.
    def _write_to_redis
      data = Marshal.dump(@storage)
      @redis.set(@key, data)
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
# Classifier that scores a text per category by summing a tf * idf weight for
# each of its tokens. The counting helpers it calls (word_count, cat_count,
# total_categories, categories_with_word_count, categories) and @tokenizer are
# not defined here -- presumably they come from StuffClassifier::Base.
class StuffClassifier::TfIdf < StuffClassifier::Base
  extend StuffClassifier::Storage::ActAsStorable

  def initialize(name, opts={})
    super(name, opts)
  end

  # Weight of +word+ within +cat+:
  #   tf  = word_count(word, cat) / cat_count(cat)
  #   idf = log10((total_categories + 2) / (categories_with_word_count(word) + 1))
  def word_prob(word, cat)
    term_frequency = 1.0 * word_count(word, cat) / cat_count(cat)

    numerator = total_categories + 2
    denominator = categories_with_word_count(word) + 1.0

    term_frequency * Math.log10(numerator / denominator)
  end

  # Sum of word_prob over every token the tokenizer produces for +text+.
  def text_prob(text, cat)
    weights = @tokenizer.each_word(text).map { |token| word_prob(token, cat) }
    weights.inject(0) { |total, weight| total + weight }
  end

  # Returns [category, score] pairs ordered from highest to lowest score.
  def cat_scores(text)
    scores = categories.each_with_object({}) do |cat, acc|
      acc[cat] = text_prob(text, cat)
    end
    scores.to_a.sort { |left, right| right[1] <=> left[1] }
  end

  # Debug helper: prints the per-category word_prob and text_prob of +word+
  # using +p+ and +ap+ (+ap+ is not defined in this file).
  def word_classification_detail(word)
    p "tf_idf"
    result = self.categories.each_with_object({}) do |cat, acc|
      acc[cat] = self.word_prob(word, cat)
    end
    ap result

    p "text_prob"
    result = categories.each_with_object({}) do |cat, acc|
      acc[cat] = text_prob(word, cat)
    end
    ap result
  end

end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
require "lingua/stemmer"
|
4
|
+
require 'rmmseg'
|
5
|
+
require 'debugger'
|
6
|
+
|
7
|
+
# Splits text into normalised tokens: lines are pre-processed with
# per-language regexps, segmented with RMMSeg, filtered against a stop-word
# list, optionally stemmed with Lingua::Stemmer, and lower-cased.
class StuffClassifier::Tokenizer
  require "stuff-classifier/tokenizer/tokenizer_properties"

  include RMMSeg
  RMMSeg::Dictionary.load_dictionaries

  # Stop-word fallback for languages without an entry in TOKENIZER_PROPERTIES.
  NO_STOP_WORDS = [].freeze

  attr_reader :language
  attr_writer :preprocessing_regexps, :ignore_words

  # opts - Hash of options:
  #        :language - key into TOKENIZER_PROPERTIES, also handed to the
  #                    stemmer (default "en")
  #        :stemming - whether tokens are stemmed (default true)
  def initialize(opts={})
    @language = opts.fetch(:language, "en")
    # Fall back to an empty property set so that a language without an entry
    # does not make every accessor below fail on nil.
    @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language] || {}

    @stemming = opts.fetch(:stemming, true)
    @stemmer = Lingua::Stemmer.new(:language => @language) if @stemming
  end

  # Regexp => replacement pairs applied to each line before segmentation.
  # An explicitly assigned value wins over the language default; may be nil.
  def preprocessing_regexps
    @preprocessing_regexps || @properties[:preprocessing_regexps]
  end

  # Collection of stop words (anything responding to +member?+). An explicitly
  # assigned value wins over the language default.
  def ignore_words
    @ignore_words || @properties[:stop_word] || NO_STOP_WORDS
  end

  def stemming?
    @stemming || false
  end

  # Tokenises +string+.
  #
  # When a block is given each token is yielded and the block's results are
  # collected; otherwise the tokens themselves are collected.
  #
  # Returns an Array. Blank input yields an empty Array (previously nil,
  # which made callers that chain Enumerable methods on the result crash).
  def each_word(string)
    string = string.strip
    return [] if string.empty?

    words = []
    regexps = preprocessing_regexps
    stop_words = ignore_words

    string.split("\n").each do |line|
      # gsub! is safe here: +line+ is a fresh String produced by split.
      regexps.each { |regexp, replace_by| line.gsub!(regexp, replace_by) } if regexps

      segment(line).each do |w|
        next if w == '' || stop_words.member?(w.downcase)

        if stemming? && stemable?(w)
          w = @stemmer.stem(w).downcase
          # The stem itself may be a stop word even if the original was not.
          next if stop_words.member?(w)
        else
          w = w.downcase
        end

        words << (block_given? ? (yield w) : w)
      end
    end

    words
  end

  private

  # Currently every token is considered stemmable; the alphabetic-only check
  # below was disabled by the original author.
  def stemable?(word)
    true
    #word =~ /^\p{Alpha}+$/
  end

  # Runs RMMSeg over +text+ and returns the token texts in order.
  def segment text
    algor = RMMSeg::Algorithm.new(text)
    result = []
    loop do
      tok = algor.next_token
      break if tok.nil?
      result << tok.text
    end
    result
  end

end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
require 'set'
|
3
|
+
# Per-language tokenizer settings, keyed by language code.
#   :preprocessing_regexps - Regexp => replacement pairs applied to each line
#                            before segmentation (optional)
#   :stop_word             - Set of lower-case words dropped by the tokenizer
#
# Deliberately left unfrozen so that callers can register further languages.
StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES = {
  "en" => {
    :preprocessing_regexps => {/['`]/ => '',/[_]/ => ' '},
    :stop_word => Set.new([
      # Chinese particles, filtered under the default ("en") language.
      '的','个','得',
      'a', 'about', 'above', 'across', 'after', 'afterwards',
      'again', 'against', 'all', 'almost', 'alone', 'along',
      'already', 'also', 'although', 'always', 'am', 'among',
      'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
      'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
      'are', 'around', 'as', 'at', 'back', 'be',
      'became', 'because', 'become', 'becomes', 'becoming', 'been',
      'before', 'beforehand', 'behind', 'being', 'below', 'beside',
      'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
      'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
      'co', 'computer', 'con', 'could', 'couldnt', 'cry',
      'de', 'describe', 'detail', 'do', 'done', 'down',
      'due', 'during', 'each', 'eg', 'eight', 'either',
      'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
      'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
      # 'fify' is a long-standing typo kept for compatibility; 'fifty' is the
      # word that was meant (compare 'forty', 'sixty', 'twenty').
      'fify', 'fifty', 'fill', 'find', 'fire', 'first', 'five',
      'for', 'former', 'formerly', 'forty', 'found', 'four',
      'from', 'front', 'full', 'further', 'get', 'give',
      'go', 'had', 'has', 'hasnt', 'have', 'he',
      'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
      'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
      'how', 'however', 'hundred', 'i', 'ie', 'if',
      'in', 'inc', 'indeed', 'interest', 'into', 'is',
      'it', 'its', 'itself', 'keep', 'last', 'latter',
      'latterly', 'least', 'less', 'ltd', 'made', 'many',
      'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
      'more', 'moreover', 'most', 'mostly', 'move', 'much',
      'must', 'my', 'myself', 'name', 'namely', 'neither',
      'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
      'none', 'noone', 'nor', 'not', 'nothing', 'now',
      'nowhere', 'of', 'off', 'often', 'on', 'once',
      'one', 'only', 'onto', 'or', 'other', 'others',
      'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
      'own', 'part', 'per', 'perhaps', 'please', 'put',
      'rather', 're', 'same', 'see', 'seem', 'seemed',
      'seeming', 'seems', 'serious', 'several', 'she', 'should',
      'show', 'side', 'since', 'sincere', 'six', 'sixty',
      'so', 'some', 'somehow', 'someone', 'something', 'sometime',
      'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
      'ten', 'than', 'that', 'the', 'their', 'them',
      'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
      'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
      'thin', 'third', 'this', 'those', 'though', 'three',
      'through', 'throughout', 'thru', 'thus', 'to', 'together',
      'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
      'two', 'un', 'under', 'until', 'up', 'upon',
      'us', 'very', 'via', 'was', 'we', 'well',
      'were', 'what', 'whatever', 'when', 'whence', 'whenever',
      'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
      'wherever', 'whether', 'which', 'while', 'whither', 'who',
      'whoever', 'whole', 'whom', 'whose', 'why', 'will',
      'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
      'yourself', 'yourselves'
    ])
  },"fr" => {
    :stop_word => Set.new([
      'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux',
      'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon',
      'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa',
      'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre',
      'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées',
      'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
      'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient',
      'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes',
      'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût',
      'fussions', 'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as',
      'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais',
      'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient',
      'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse',
      # The original entry was 'celà ' with a trailing space, which no token
      # can ever equal; both spellings are listed instead.
      'eusses', 'eût', 'eussions', 'eussiez', 'eussent', 'ceci', 'cela', 'celà', 'cet', 'cette', 'ici',
      'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi'
    ])
  }
}
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "stuff-classifier/version"
|
4
|
+
|
5
|
+
# Gem specification for stuff-classifier-chinese.
Gem::Specification.new do |s|
  s.name        = "stuff-classifier-chinese"
  s.version     = StuffClassifier::VERSION
  s.authors     = ["Tim Lang"]
  s.email       = ["langyong135@gmail.com"]
  s.homepage    = "https://github.com/TimLang/stuff-classifier/"
  s.summary     = %q{Simple text classifier(s) implementation Chinese version}
  s.description = %q{forked from https://github.com/alexandru/stuff-classifier, 2 methods are provided for now - (1) naive bayes implementation + (2) tf-idf weights}

  # The packaged file lists are taken from git, so building requires a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.required_ruby_version = '>= 1.9.1'

  s.add_runtime_dependency "ruby-stemmer"
  s.add_runtime_dependency "sequel"
  s.add_runtime_dependency "redis"
  # lib/stuff-classifier/tokenizer.rb requires 'rmmseg' and 'debugger' at load
  # time, so both must be installed with the gem; as development-only
  # dependencies they caused a LoadError for every user of the packaged gem.
  s.add_runtime_dependency "rmmseg-cpp-huacnlee"
  # NOTE(review): 'debugger' is a debugging aid; the better long-term fix is
  # to drop the require from the tokenizer and move this back to development.
  s.add_runtime_dependency "debugger"

  s.add_development_dependency "bundler"
  s.add_development_dependency "rake", ">= 0.9.2"
  s.add_development_dependency "minitest", "~> 4"
  s.add_development_dependency "turn", ">= 0.8.3"
  s.add_development_dependency "simplecov"
  s.add_development_dependency "awesome_print"

end
|
36
|
+
|
data/test/helper.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
require 'simplecov'
|
3
|
+
SimpleCov.start
|
4
|
+
|
5
|
+
require 'turn'
|
6
|
+
require 'minitest/autorun'
|
7
|
+
require 'stuff-classifier'
|
8
|
+
|
9
|
+
# Reporter settings for the `turn` test runner.
Turn.config do |config|
  # Output format; the available choices are:
  #   :outline  - turn's original case/test outline mode [default]
  #   :progress - indicates progress with progress bar
  #   :dotted   - test/unit's traditional dot-progress mode
  #   :pretty   - new pretty reporter
  #   :marshal  - dump output as YAML (normal run mode only)
  #   :cue      - interactive testing
  config.format = :cue

  # Turn on invoke/execute tracing and enable full backtraces.
  config.trace = true

  # Use humanized test names (works only with the :outline format).
  config.natural = true
end
|
23
|
+
|
24
|
+
# Shared base class for the test cases: provides a class-level `before` hook
# and small helpers around a classifier held in @classifier.
class TestBase < MiniTest::Unit::TestCase
  class << self
    # With a block: stores it as the setup hook of this test class.
    # With or without a block: returns the currently stored hook (or nil).
    def before(&block)
      @on_setup = block if block
      @on_setup
    end
  end

  # The classifier under test, as set through set_classifier.
  attr_reader :classifier

  # Evaluates the class's `before` hook, if one was registered, in the
  # context of the test instance.
  def setup
    hook = self.class.before
    instance_eval(&hook) if hook
  end

  # Stores the classifier instance that train/should_be operate on.
  def set_classifier(instance)
    @classifier = instance
  end

  # Trains the classifier with +value+ as an example of +category+.
  def train(category, value)
    @classifier.train(category, value)
  end

  # Asserts that +value+ is classified as +category+; +value+ doubles as the
  # failure message.
  def should_be(category, value)
    assert_equal category, @classifier.classify(value), value
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
require './helper.rb'
|
4
|
+
|
5
|
+
# Tokenizer tests: stop-word removal, stemming and non-ASCII input, using one
# default tokenizer and one French tokenizer.
class Test001Tokenizer < TestBase
  before do
    @en_tokenizer = StuffClassifier::Tokenizer.new
    @fr_tokenizer = StuffClassifier::Tokenizer.new(:language => "fr")
  end

  # Everything but "hello" and "world" is expected to be filtered out.
  def test_simple_tokens
    expected = ["hello", "world"]
    actual = @en_tokenizer.each_word('Hello world! How are you?')

    assert_equal expected, actual
  end

  # Tokens are expected in stemmed, lower-case form.
  def test_with_stemming
    expected = ["lot", "dog", "lot", "cat", "realli" ,"inform", "highway" ]
    actual = @en_tokenizer.each_word('Lots of dogs, lots of cats! This really is the information highway')

    assert_equal expected, actual
  end

  # Multi-line input. The continuation lines of the literal are kept flush
  # left so that the string contents stay exactly as given.
  def test_complicated_tokens
    expected = [
      "realli", "want", "accomplish", "class",
      "testeval2", "test", "eval2","testeval2", "new", "class", "end",
      "yield", "nil", "output", "ok", "guess"]

    actual = @en_tokenizer.each_word("I don't really get what you want to
accomplish. There is a class TestEval2, you can do test_eval2 =
TestEval2.new afterwards. And: class A ... end always yields nil, so
your output is ok I guess ;-)")

    assert_equal expected, actual
  end

  # French input with accented characters.
  def test_unicode
    expected = [
      "appel", "vilain", "pet", "canard", "référent",
      "han", "christian", "andersen", "démarqu", "négat"]

    actual = @fr_tokenizer.each_word("il s'appelle le vilain petit canard : en référence à Hans Christian Andersen, se démarquer négativement")

    assert_equal expected, actual
  end

end
|