langusta 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +32 -0
- data/LICENSE.txt +13 -0
- data/README.rdoc +34 -0
- data/Rakefile +55 -0
- data/VERSION +1 -0
- data/bin/langusta +5 -0
- data/data/messages.properties +128 -0
- data/data/uppercase.bin +0 -0
- data/langusta.gemspec +210 -0
- data/lib/langusta.rb +36 -0
- data/lib/langusta/command.rb +78 -0
- data/lib/langusta/detector.rb +197 -0
- data/lib/langusta/detector_factory.rb +46 -0
- data/lib/langusta/java_property_reader.rb +35 -0
- data/lib/langusta/lang_profile.rb +80 -0
- data/lib/langusta/language.rb +14 -0
- data/lib/langusta/language_detection_facade.rb +24 -0
- data/lib/langusta/n_gram.rb +116 -0
- data/lib/langusta/regex_helper.rb +15 -0
- data/lib/langusta/tag_extractor.rb +39 -0
- data/lib/langusta/ucs2_string.rb +70 -0
- data/lib/langusta/unicode_block.rb +56 -0
- data/profiles/af +1 -0
- data/profiles/ar +1 -0
- data/profiles/bg +1 -0
- data/profiles/bn +1 -0
- data/profiles/cs +1 -0
- data/profiles/da +1 -0
- data/profiles/de +1 -0
- data/profiles/el +1 -0
- data/profiles/en +1 -0
- data/profiles/es +1 -0
- data/profiles/fa +1 -0
- data/profiles/fi +1 -0
- data/profiles/fr +1 -0
- data/profiles/gu +1 -0
- data/profiles/he +1 -0
- data/profiles/hi +1 -0
- data/profiles/hr +1 -0
- data/profiles/hu +1 -0
- data/profiles/id +1 -0
- data/profiles/it +1 -0
- data/profiles/ja +1 -0
- data/profiles/kn +1 -0
- data/profiles/ko +1 -0
- data/profiles/mk +1 -0
- data/profiles/ml +1 -0
- data/profiles/mr +1 -0
- data/profiles/ne +1 -0
- data/profiles/nl +1 -0
- data/profiles/no +1 -0
- data/profiles/pa +1 -0
- data/profiles/pl +1 -0
- data/profiles/pt +1 -0
- data/profiles/ro +1 -0
- data/profiles/ru +1 -0
- data/profiles/sk +1 -0
- data/profiles/so +1 -0
- data/profiles/sq +1 -0
- data/profiles/sv +1 -0
- data/profiles/sw +1 -0
- data/profiles/ta +1 -0
- data/profiles/te +1 -0
- data/profiles/th +1 -0
- data/profiles/tl +1 -0
- data/profiles/tr +1 -0
- data/profiles/uk +1 -0
- data/profiles/ur +1 -0
- data/profiles/vi +1 -0
- data/profiles/zh-cn +1 -0
- data/profiles/zh-tw +1 -0
- data/test/helper.rb +20 -0
- data/test/quality/test_falsified.rb +33 -0
- data/test/test_command.rb +34 -0
- data/test/test_data/af +1 -0
- data/test/test_data/ar +1 -0
- data/test/test_data/bg +32 -0
- data/test/test_data/bn +9 -0
- data/test/test_data/cs +9 -0
- data/test/test_data/da +14 -0
- data/test/test_data/de +4 -0
- data/test/test_data/el +7 -0
- data/test/test_data/en +26 -0
- data/test/test_data/es +4 -0
- data/test/test_data/fa +21 -0
- data/test/test_data/fi +8 -0
- data/test/test_data/fr +13 -0
- data/test/test_data/gu +3 -0
- data/test/test_data/he +20 -0
- data/test/test_data/hi +1 -0
- data/test/test_data/hr +16 -0
- data/test/test_data/hu +6 -0
- data/test/test_data/id +2 -0
- data/test/test_data/it +3 -0
- data/test/test_data/ja +34 -0
- data/test/test_data/kn +14 -0
- data/test/test_data/ko +2 -0
- data/test/test_data/mk +3 -0
- data/test/test_data/ml +1 -0
- data/test/test_data/mr +3 -0
- data/test/test_data/ne +2 -0
- data/test/test_data/nl +1 -0
- data/test/test_data/no +3 -0
- data/test/test_data/pa +1 -0
- data/test/test_data/pl +23 -0
- data/test/test_data/pt +2 -0
- data/test/test_data/ro +2 -0
- data/test/test_data/ru +1 -0
- data/test/test_data/sk +2 -0
- data/test/test_data/so +4 -0
- data/test/test_data/sq +4 -0
- data/test/test_data/sv +3 -0
- data/test/test_data/sw +6 -0
- data/test/test_data/ta +1 -0
- data/test/test_data/te +2 -0
- data/test/test_data/th +3 -0
- data/test/test_data/tl +1 -0
- data/test/test_data/tr +2 -0
- data/test/test_data/uk +3 -0
- data/test/test_data/ur +1 -0
- data/test/test_data/vi +2 -0
- data/test/test_data/zh-tw +3 -0
- data/test/test_detector.rb +52 -0
- data/test/test_detector_factory.rb +16 -0
- data/test/test_java_property_reader.rb +8 -0
- data/test/test_lang_profile.rb +79 -0
- data/test/test_language.rb +15 -0
- data/test/test_language_detection_facade.rb +9 -0
- data/test/test_langusta.rb +25 -0
- data/test/test_n_gram.rb +103 -0
- data/test/test_tag_extractor.rb +71 -0
- data/test/test_ucs2_string.rb +9 -0
- data/test/test_unicode_block.rb +9 -0
- metadata +320 -0
data/lib/langusta.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
$: << File.expand_path(File.dirname(__FILE__))

require 'rubygems'
require 'bundler'
Bundler.setup

require 'optparse'
require 'iconv'

# Required gems
require 'oniguruma'
require 'yajl'

# Langusta: a language-detection library (Ruby port of the Java
# language-detection project). All components are autoloaded on first use.
module Langusta
  VERSION = '0.1.0'

  # NOTE: the original listed :Detector twice; the duplicate was removed.
  autoload :RegexHelper, 'langusta/regex_helper'
  autoload :UCS2String, 'langusta/ucs2_string'
  autoload :Language, 'langusta/language'
  autoload :LangProfile, 'langusta/lang_profile'
  autoload :Detector, 'langusta/detector'
  autoload :JavaPropertyReader, 'langusta/java_property_reader'
  autoload :UnicodeBlock, 'langusta/unicode_block'
  autoload :NGram, 'langusta/n_gram'
  autoload :DetectorFactory, 'langusta/detector_factory'
  autoload :TagExtractor, 'langusta/tag_extractor'
  autoload :Command, 'langusta/command'
  autoload :LanguageDetectionFacade, 'langusta/language_detection_facade'

  # Paths to the data files bundled with the gem, resolved relative to
  # the gem root (one directory above lib/).
  ABSOLUTE_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  PROFILES_PATH = File.join(ABSOLUTE_PATH, 'profiles')
  UPPERCASE_BIN = File.join(ABSOLUTE_PATH, 'data/uppercase.bin')
  MESSAGES_PROPERTIES = File.join(ABSOLUTE_PATH, 'data/messages.properties')
end
|
36
|
+
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module Langusta
  # Command-line entry point: parses the options and dispatches to the
  # language-detection or batch-test operation.
  class Command
    # Parse +argv+ and run the requested operation.
    # @param argv [Array<String>] command-line arguments; consumed
    #   destructively by OptionParser, leaving only the test file names.
    # @return [Fixnum] exit status (always 0).
    def self.run(argv)
      options = {}
      # Renamed the block parameter: the original `opts = OptionParser.new do |opts|`
      # shadowed the outer local (and the outer local was never used anyway).
      OptionParser.new do |parser|
        parser.on("--detectlang", "Detect the language from the given text") do |d|
          options[:operation] = :detectlang if d
        end

        parser.on("--batchtest", "Batch test of language detection") do |b|
          options[:operation] = :batchtest if b
        end

        parser.on("-d [profile directory]") do |pd|
          options[:profile_directory] = pd
        end

        parser.on("-a [alpha]", Float) do |alpha|
          options[:alpha] = alpha
        end
      end.parse!(argv)

      # After parse!, argv holds only the positional arguments (test files).
      arguments = [options[:profile_directory], argv]
      arguments << options[:alpha] if options[:alpha]

      case options[:operation]
      when :detectlang
        self.new.send(:detect_lang, *arguments)
      when :batchtest
        self.new.send(:batch_test, *arguments)
      else
        $stderr.puts <<EOF
Usage:

langusta --detectlang -d [profile directory] -a [alpha] [test file(s)]
langusta --batchtest -d [profile directory] -a [alpha] [test file(s)]
EOF
      end
      0
    end

    def initialize
      @detector_factory = DetectorFactory.new
    end

    # Detects and prints the language of each of +test_files+.
    # @param profile_directory [String] directory containing language profiles.
    # @param test_files [Array<String>] files whose language is to be detected.
    # @param alpha [Float, nil] optional smoothing-parameter override.
    def detect_lang(profile_directory, test_files, alpha=nil)
      initialize_factory(profile_directory)
      test_files.each do |filename|
        language = detect_single_lang(filename, alpha)
        puts "%s: %s" % [filename, language]
      end
    end

    # TODO: not implemented yet (accepted for CLI symmetry with --detectlang).
    def batch_test(profile_directory, test_files, alpha=nil)
    end

    # Detects the language of a single file.
    # @return [String] the detected language code.
    def detect_single_lang(filename, alpha)
      # File.read closes the handle; File.open(filename).read leaked it.
      ucs2_content = UCS2String.from_utf8(File.read(filename))
      detector = @detector_factory.create(alpha)
      detector.append(ucs2_content)
      detector.detect()
    end

    # Loads every profile from +profile_directory+ into the factory.
    def initialize_factory(profile_directory)
      profiles = load_profiles(profile_directory)
      profiles.each_with_index do |profile, index|
        @detector_factory.add_profile(profile, index, profiles.length)
      end
    end

    def load_profiles(directory)
      @profiles = Dir[File.join(directory, '/*')].map do |filename|
        LangProfile.load_from_file(filename)
      end
    end
  end
end
|
@@ -0,0 +1,197 @@
|
|
1
|
+
module Langusta
  # Core language detector. Text is appended, normalized into n-grams,
  # and the language is estimated by repeated naive-Bayes random-sampling
  # trials over per-language n-gram frequencies (port of the Java
  # language-detection library's Detector).
  class Detector
    attr_accessor :verbose, :alpha, :max_text_length

    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05
    ITERATION_LIMIT = 1000
    PROB_THRESHOLD = 0.1
    CONV_THRESHOLD = 0.99999
    BASE_FREQ = 10000
    UNKNOWN_LANG = "unknown"

    # @param factory [DetectorFactory] supplies the n-gram probability map
    #   and the ordered language list.
    def initialize(factory)
      @word_lang_prob_map = factory.word_lang_prob_map
      @lang_list = factory.lang_list
      @text = UCS2String.new('')
      @langprob = nil
      @alpha = ALPHA_DEFAULT
      @n_trial = 7
      @max_text_length = 10000
      @prior_map = nil
      @verbose = false
    end

    # Append more text to be recognized. URLs, mail addresses and runs of
    # whitespace are collapsed to single UCS-2 spaces ("\x00\x20").
    # @param text [UCS2String] text to be recognized
    # @raise [TypeError] when +text+ is not a UCS2String.
    def append(text)
      raise TypeError.new("Expected: UCS2String, got: #{text.class}") unless text.is_a?(UCS2String)
      text.gsub!(RegexHelper::URL_REGEX, "\x00\x20")
      text.gsub!(RegexHelper::MAIL_REGEX, "\x00\x20")
      text = text.map do |c|
        NGram.normalize(c)
      end
      @text = text.gsub!(RegexHelper::SPACE_REGEX, "\x00\x20")
    end

    # Detect the language.
    # @return [String] (usually) two-letter code describing the language.
    def detect
      probabilities = get_probabilities()
      (probabilities.length > 0) ? probabilities.first.lang : UNKNOWN_LANG
    end

    # Normalizes +prob+ in place so its entries sum to 1.
    # @param prob [Array<Float>] mutated in place.
    # @return [Float] the maximum normalized probability.
    def self.normalize_prob(prob)
      maxp = 0.0; sump = 0.0
      prob.each do |p|
        sump += p
      end
      prob.map! do |p|
        q = p / sump
        maxp = q if q > maxp
        q
      end
      maxp
    end

    # Box-Muller transform: a standard normally distributed random number.
    def self.next_gaussian
      s = 0
      while s >= 1 || s == 0
        v1 = 2 * Kernel.rand - 1
        v2 = 2 * Kernel.rand - 1
        s = v1 * v1 + v2 * v2
      end
      multiplier = Math.sqrt(-2 * Math.log(s)/s)
      return v1 * multiplier
    end

    private

    # Runs @n_trial sampling trials and accumulates the averaged language
    # probabilities into @langprob.
    def detect_block
      cleaning_text()
      ngrams = extract_ngrams()
      raise "no features in text" if ngrams.empty?
      @langprob = Array.new(@lang_list.length, 0.0)

      @n_trial.times do
        prob = init_probability()
        alpha = @alpha + Detector.next_gaussian() * ALPHA_WIDTH

        i = 0
        Kernel.loop do
          r = Kernel.rand(ngrams.length)
          update_lang_prob(prob, ngrams[r], alpha)
          # Convergence is only checked every 5th iteration because
          # normalize_prob is comparatively expensive. The original
          # `if i % 5` was always truthy (any Integer is truthy in Ruby)
          # and `i` was never incremented, so the ITERATION_LIMIT guard
          # could never fire.
          if i % 5 == 0
            break if Detector.normalize_prob(prob) > CONV_THRESHOLD || i >= ITERATION_LIMIT
            # verbose
          end
          i += 1
        end
        @langprob.length.times do |j|
          @langprob[j] += prob[j] / @n_trial
        end
        # verbose
      end
    end

    # Sets prior probabilities per language from a lang => probability hash.
    # The original built `Array.new[...]` (nil), read from @prior_map instead
    # of the argument, and could divide nil entries; all fixed here.
    # @param prior_map [Hash{String=>Float}]
    def set_prior_map(prior_map)
      @prior_map = Array.new(@lang_list.length, 0.0)
      sump = 0.0
      @prior_map.length.times do |i|
        lang = @lang_list[i]
        if prior_map.has_key?(lang)
          p = prior_map[lang]
          raise "probability must be non-negative" if p < 0
          @prior_map[i] = p
          sump += p
        end
      end
      raise "more one of prob must be non-zero" if sump <= 0
      @prior_map.map! do |p|
        p / sump
      end
    end

    # Strips Latin characters when the text is predominantly non-Latin,
    # so stray Roman words do not skew detection.
    def cleaning_text
      non_latin_count = latin_count = 0
      @text.each_char do |c|
        if c < "\x00z" && c >= "\x00A"
          latin_count += 1
        elsif c >= "\x03\x00" && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
          non_latin_count += 1
        end
      end
      if latin_count * 2 < non_latin_count
        text_without_latin = UCS2String.new('')
        @text.each_char do |c|
          text_without_latin << c if c > "\x00z" || c < "\x00A"
        end
        @text = text_without_latin
      end
    end

    # Collects every 1..N_GRAM-length n-gram of @text that is known to the
    # probability map.
    def extract_ngrams
      list = []
      ngram = NGram.new
      @text.each_char do |char|
        ngram.add_char(char)
        (1..NGram::N_GRAM).each do |n|
          w = ngram.get(n)
          list << w if w && @word_lang_prob_map.has_key?(w)
        end
      end
      list
    end

    # Lazily runs detection, then returns languages sorted by probability.
    def get_probabilities
      if @langprob.nil?
        detect_block()
      end
      sort_probability(@langprob)
    end

    # Initial per-language probability vector: the prior map if set,
    # uniform otherwise.
    def init_probability
      prob = Array.new(@lang_list.length)
      if @prior_map
        prob = @prior_map.clone
      else
        prob.length.times do |i|
          prob[i] = 1.0 / @lang_list.length
        end
      end
      prob
    end

    # @return [Array<Language>] languages above PROB_THRESHOLD, MOST probable
    #   first. The original sorted ascending, which made #detect return the
    #   LEAST likely language.
    def sort_probability(prob)
      list = []
      prob.each_with_index do |p, index|
        list[index] = Language.new(@lang_list[index], p)
      end
      list.sort_by do |x|
        -x.prob
      end.select do |x|
        x.prob > PROB_THRESHOLD
      end
    end

    # Multiplies each language's probability by the smoothed conditional
    # probability of +word+.
    # @return [Boolean] false when the word is unknown.
    def update_lang_prob(prob, word, alpha)
      return false if word.nil? || ! @word_lang_prob_map.has_key?(word)

      lang_prob_map = @word_lang_prob_map[word]
      # verbose
      weight = alpha / BASE_FREQ
      prob.length.times do |i|
        prob[i] *= weight + lang_prob_map[i]
      end
      true
    end

    # Debug helper: "lang:prob" pairs for every non-negligible probability.
    # The original passed [p, lang] to "%s:%.5f", i.e. the arguments were
    # swapped relative to the format string.
    def word_prob_to_string(prob)
      prob.zip(@lang_list).select do |p, lang|
        p > 0.00001
      end.map do |p, lang|
        "%s:%.5f" % [lang, p]
      end.join(' ')
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Langusta
  # Raised for configuration errors (duplicate or missing profiles).
  class LangDetectException < StandardError; end

  # Builds Detector instances from a preloaded set of language profiles.
  # Holds the shared word => per-language-probability map.
  class DetectorFactory
    attr_reader :word_lang_prob_map, :lang_list

    def initialize
      @word_lang_prob_map = {}
      @lang_list = []
    end

    # Adds a new language profile to this factory.
    # @param profile [LangProfile] language profile to be added.
    # @param index [Fixnum] index at which the language profile is to be added.
    # @param langsize [Fixnum] how many language profiles are to be added to this factory in total.
    # @raise [LangDetectException] when a profile of the same name was already added.
    def add_profile(profile, index, langsize)
      raise LangDetectException.new("duplicate the same language profile") if @lang_list.include?(profile.name)
      @lang_list << profile.name
      profile.freq.keys.each do |word|
        unless @word_lang_prob_map.has_key?(word)
          @word_lang_prob_map[word] = Array.new(langsize, 0.0)
        end
        # Relative frequency of this n-gram among all n-grams of the same length.
        prob = 1.0 * profile.freq[word] / profile.n_words[word.length - 1]
        @word_lang_prob_map[word][index] = prob
      end
    end

    # Creates a new detector object, based on a preconfigured set of language profiles.
    # @param alpha [Float, nil] optional smoothing-parameter override.
    # @return [Detector]
    def create(alpha=nil)
      detector = create_detector()
      detector.alpha = alpha if alpha
      detector
    end

    private

    def create_detector
      raise LangDetectException.new("need to load profiles") if @lang_list.length == 0
      # Return the new detector directly; the original assigned it to an
      # unused local.
      Detector.new(self)
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Langusta
  # Minimal reader for Java .properties files whose values are sequences of
  # 4-digit uppercase-hex UTF-16 code units (e.g. messages.properties).
  # This is a minimal implementation, don't expect this to actually work for
  # the full .properties format (no comments, escapes or line continuations).
  class JavaPropertyReader
    # @param filename [String] path of the property file to load and parse.
    def initialize(filename)
      # File.read closes the handle; File.open(filename).read leaked it.
      @lines = File.read(filename)
      parse()
    end

    # @return [String, nil] the decoded value of +property+.
    def [](property)
      @properties[property]
    end

    # @return [Hash] the raw property-name => decoded-value mapping.
    def underlying_hash
      @properties
    end

    private

    # Splits the file into "name=value" lines and decodes each value.
    def parse
      @properties = {}
      # each_line works on 1.8 and 1.9+; the original `@lines.each` relied
      # on String#each, which was removed in Ruby 1.9.
      @lines.each_line do |line|
        # Limit 2 keeps any further '=' characters inside the value.
        prop_name, value = line.split(/=/, 2)
        next if value.nil? # skip blank/separator-less lines instead of crashing
        @properties[prop_name] = parse_value(value)
      end
    end

    # Decodes concatenated 4-hex-digit UTF-16 code units into a big-endian
    # UCS-2 byte string.
    def parse_value(value)
      codepoints = value.scan(/([0-9A-F]{4})/)
      codepoints.map do |cp|
        int_cp = cp.first.to_i(16)
        # High byte first: big-endian UCS-2.
        [int_cp / 256, int_cp % 256].pack("c*")
      end.join
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'set'

module Langusta
  # Frequency profile of the n-grams of a single language: maps each n-gram
  # (as a UCS2String) to its occurrence count and tracks totals per length.
  class LangProfile
    MINIMUM_FREQ = 2
    LESS_FREQ_RATIO = 100_000
    attr_reader :name, :freq, :n_words

    # Constructs a language profile from a file. Converts all NGrams from UTF-8 to Unicode codepoints.
    # @param filename [String] file name of the language profile.
    # @return [LangProfile]
    def self.load_from_file(filename)
      # Block form of File.open guarantees the handle is closed; the
      # original File.new(filename) leaked it.
      json = File.open(filename) { |f| Yajl::Parser.parse(f) }
      profile = self.new

      name = json['name']
      n_words = json['n_words']
      # Re-key the frequency table from UTF-8 strings to UCS2Strings.
      freq = json['freq'].inject({}) do |acc, kv|
        key, value = kv
        acc[UCS2String.from_utf8(key)] = value
        acc
      end
      profile.populate_json(name, freq, n_words)
      profile
    end

    def initialize(name=nil)
      @name = name
      @freq = {}
      @n_words = Array.new(NGram::N_GRAM, 0)
    end

    # Bulk-initializes this profile from already-parsed JSON data.
    def populate_json(name, freq, n_words)
      @name, @freq, @n_words = name, freq, n_words
    end

    # Adds a given NGram to this language profile. This operation is expected to be invoked multiple times for the same arguments.
    # @param gram [UCS2String]
    # @raise [TypeError] when +gram+ is neither UCS2String nor nil.
    def add(gram)
      raise TypeError.new("UCS2String or NilClass expected, got: #{gram.class}") unless gram.is_a?(UCS2String) or gram.is_a?(NilClass)
      return if @name.nil? or gram.nil?
      length = gram.size
      return if length < 1 or length > NGram::N_GRAM
      @n_words[length - 1] += 1
      @freq[gram] ||= 0
      @freq[gram] += 1
    end

    # Drops n-grams whose frequency is at or below a threshold derived from
    # the unigram total; additionally drops Roman-alphabet n-grams when the
    # profile is predominantly non-Roman.
    def omit_less_freq
      return if @name.nil?
      # Integer division: threshold scales with corpus size, floored at MINIMUM_FREQ.
      threshold = @n_words[0] / LESS_FREQ_RATIO
      threshold = MINIMUM_FREQ if threshold < MINIMUM_FREQ
      # Iterate over a snapshot of the keys so deleting from @freq is safe.
      keys = Set.new(@freq.keys)
      roman = 0
      keys.each do |key|
        count = @freq[key]
        if count <= threshold
          @n_words[key.size - 1] -= count
          @freq.delete(key)
        else
          # temp workaround
          if RegexHelper::ROMAN_REGEX.match(key.underlying)
            roman += count
          end
        end
      end

      # Mostly non-Roman profile: purge n-grams containing Roman letters.
      if roman < @n_words[0] / 3
        keys2 = Set.new(@freq.keys)
        keys2.each do |key|
          # temp workaround
          if RegexHelper::INCL_ROMAN_REGEX.match(key.underlying)
            @n_words[key.size - 1] -= @freq[key]
            @freq.delete(key)
          end
        end
      end
    end
  end
end
|