langusta 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (136) hide show
  1. data/.document +5 -0
  2. data/Gemfile +11 -0
  3. data/Gemfile.lock +32 -0
  4. data/LICENSE.txt +13 -0
  5. data/README.rdoc +34 -0
  6. data/Rakefile +55 -0
  7. data/VERSION +1 -0
  8. data/bin/langusta +5 -0
  9. data/data/messages.properties +128 -0
  10. data/data/uppercase.bin +0 -0
  11. data/langusta.gemspec +210 -0
  12. data/lib/langusta.rb +36 -0
  13. data/lib/langusta/command.rb +78 -0
  14. data/lib/langusta/detector.rb +197 -0
  15. data/lib/langusta/detector_factory.rb +46 -0
  16. data/lib/langusta/java_property_reader.rb +35 -0
  17. data/lib/langusta/lang_profile.rb +80 -0
  18. data/lib/langusta/language.rb +14 -0
  19. data/lib/langusta/language_detection_facade.rb +24 -0
  20. data/lib/langusta/n_gram.rb +116 -0
  21. data/lib/langusta/regex_helper.rb +15 -0
  22. data/lib/langusta/tag_extractor.rb +39 -0
  23. data/lib/langusta/ucs2_string.rb +70 -0
  24. data/lib/langusta/unicode_block.rb +56 -0
  25. data/profiles/af +1 -0
  26. data/profiles/ar +1 -0
  27. data/profiles/bg +1 -0
  28. data/profiles/bn +1 -0
  29. data/profiles/cs +1 -0
  30. data/profiles/da +1 -0
  31. data/profiles/de +1 -0
  32. data/profiles/el +1 -0
  33. data/profiles/en +1 -0
  34. data/profiles/es +1 -0
  35. data/profiles/fa +1 -0
  36. data/profiles/fi +1 -0
  37. data/profiles/fr +1 -0
  38. data/profiles/gu +1 -0
  39. data/profiles/he +1 -0
  40. data/profiles/hi +1 -0
  41. data/profiles/hr +1 -0
  42. data/profiles/hu +1 -0
  43. data/profiles/id +1 -0
  44. data/profiles/it +1 -0
  45. data/profiles/ja +1 -0
  46. data/profiles/kn +1 -0
  47. data/profiles/ko +1 -0
  48. data/profiles/mk +1 -0
  49. data/profiles/ml +1 -0
  50. data/profiles/mr +1 -0
  51. data/profiles/ne +1 -0
  52. data/profiles/nl +1 -0
  53. data/profiles/no +1 -0
  54. data/profiles/pa +1 -0
  55. data/profiles/pl +1 -0
  56. data/profiles/pt +1 -0
  57. data/profiles/ro +1 -0
  58. data/profiles/ru +1 -0
  59. data/profiles/sk +1 -0
  60. data/profiles/so +1 -0
  61. data/profiles/sq +1 -0
  62. data/profiles/sv +1 -0
  63. data/profiles/sw +1 -0
  64. data/profiles/ta +1 -0
  65. data/profiles/te +1 -0
  66. data/profiles/th +1 -0
  67. data/profiles/tl +1 -0
  68. data/profiles/tr +1 -0
  69. data/profiles/uk +1 -0
  70. data/profiles/ur +1 -0
  71. data/profiles/vi +1 -0
  72. data/profiles/zh-cn +1 -0
  73. data/profiles/zh-tw +1 -0
  74. data/test/helper.rb +20 -0
  75. data/test/quality/test_falsified.rb +33 -0
  76. data/test/test_command.rb +34 -0
  77. data/test/test_data/af +1 -0
  78. data/test/test_data/ar +1 -0
  79. data/test/test_data/bg +32 -0
  80. data/test/test_data/bn +9 -0
  81. data/test/test_data/cs +9 -0
  82. data/test/test_data/da +14 -0
  83. data/test/test_data/de +4 -0
  84. data/test/test_data/el +7 -0
  85. data/test/test_data/en +26 -0
  86. data/test/test_data/es +4 -0
  87. data/test/test_data/fa +21 -0
  88. data/test/test_data/fi +8 -0
  89. data/test/test_data/fr +13 -0
  90. data/test/test_data/gu +3 -0
  91. data/test/test_data/he +20 -0
  92. data/test/test_data/hi +1 -0
  93. data/test/test_data/hr +16 -0
  94. data/test/test_data/hu +6 -0
  95. data/test/test_data/id +2 -0
  96. data/test/test_data/it +3 -0
  97. data/test/test_data/ja +34 -0
  98. data/test/test_data/kn +14 -0
  99. data/test/test_data/ko +2 -0
  100. data/test/test_data/mk +3 -0
  101. data/test/test_data/ml +1 -0
  102. data/test/test_data/mr +3 -0
  103. data/test/test_data/ne +2 -0
  104. data/test/test_data/nl +1 -0
  105. data/test/test_data/no +3 -0
  106. data/test/test_data/pa +1 -0
  107. data/test/test_data/pl +23 -0
  108. data/test/test_data/pt +2 -0
  109. data/test/test_data/ro +2 -0
  110. data/test/test_data/ru +1 -0
  111. data/test/test_data/sk +2 -0
  112. data/test/test_data/so +4 -0
  113. data/test/test_data/sq +4 -0
  114. data/test/test_data/sv +3 -0
  115. data/test/test_data/sw +6 -0
  116. data/test/test_data/ta +1 -0
  117. data/test/test_data/te +2 -0
  118. data/test/test_data/th +3 -0
  119. data/test/test_data/tl +1 -0
  120. data/test/test_data/tr +2 -0
  121. data/test/test_data/uk +3 -0
  122. data/test/test_data/ur +1 -0
  123. data/test/test_data/vi +2 -0
  124. data/test/test_data/zh-tw +3 -0
  125. data/test/test_detector.rb +52 -0
  126. data/test/test_detector_factory.rb +16 -0
  127. data/test/test_java_property_reader.rb +8 -0
  128. data/test/test_lang_profile.rb +79 -0
  129. data/test/test_language.rb +15 -0
  130. data/test/test_language_detection_facade.rb +9 -0
  131. data/test/test_langusta.rb +25 -0
  132. data/test/test_n_gram.rb +103 -0
  133. data/test/test_tag_extractor.rb +71 -0
  134. data/test/test_ucs2_string.rb +9 -0
  135. data/test/test_unicode_block.rb +9 -0
  136. metadata +320 -0
@@ -0,0 +1,36 @@
1
$: << File.expand_path(File.dirname(__FILE__))

require 'rubygems'
require 'bundler'
Bundler.setup

require 'optparse'
require 'iconv'

# Required gems
require 'oniguruma'
require 'yajl'

# Top-level namespace for the langusta language-detection library.
# All classes are lazily loaded via autoload on first reference.
module Langusta
  VERSION = '0.1.0'

  autoload :RegexHelper, 'langusta/regex_helper'
  autoload :UCS2String, 'langusta/ucs2_string'
  autoload :Language, 'langusta/language'
  autoload :LangProfile, 'langusta/lang_profile'
  # BUGFIX: :Detector was registered twice; one autoload suffices.
  autoload :Detector, 'langusta/detector'
  autoload :JavaPropertyReader, 'langusta/java_property_reader'
  autoload :UnicodeBlock, 'langusta/unicode_block'
  autoload :NGram, 'langusta/n_gram'
  autoload :DetectorFactory, 'langusta/detector_factory'
  autoload :TagExtractor, 'langusta/tag_extractor'
  autoload :Command, 'langusta/command'
  autoload :LanguageDetectionFacade, 'langusta/language_detection_facade'

  # Filesystem layout of the installed gem (profiles and binary data files).
  ABSOLUTE_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  PROFILES_PATH = File.join(ABSOLUTE_PATH, 'profiles')
  UPPERCASE_BIN = File.join(ABSOLUTE_PATH, 'data/uppercase.bin')
  MESSAGES_PROPERTIES = File.join(ABSOLUTE_PATH, 'data/messages.properties')
end
@@ -0,0 +1,78 @@
1
module Langusta
  # Command-line entry point for the +langusta+ executable.
  class Command
    # Parses argv and dispatches to the requested operation.
    # @param argv [Array<String>] command-line arguments; consumed destructively,
    #   leaving only the test file names after option parsing.
    # @return [Fixnum] process exit code (always 0).
    def self.run(argv)
      options = {}
      # BUGFIX: the block parameter previously shadowed the outer `opts` local,
      # which was in fact assigned OptionParser#parse!'s return value (the
      # remaining argv), not the parser. The local was never used, so drop it.
      OptionParser.new do |parser|
        parser.on("--detectlang", "Detect the language from the given text") do |d|
          options[:operation] = :detectlang if d
        end

        parser.on("--batchtest", "Batch test of language detection") do |b|
          options[:operation] = :batchtest if b
        end

        parser.on("-d [profile directory]") do |pd|
          options[:profile_directory] = pd
        end

        parser.on("-a [alpha]", Float) do |alpha|
          options[:alpha] = alpha
        end
      end.parse!(argv)

      # After parse!, argv holds the remaining (positional) file arguments.
      arguments = [options[:profile_directory], argv]
      arguments << options[:alpha] if options[:alpha]

      case options[:operation]
      when :detectlang
        self.new.send(:detect_lang, *arguments)
      when :batchtest
        self.new.send(:batch_test, *arguments)
      else
        $stderr.puts <<-EOF
Usage:

langusta --detectlang -d [profile directory] -a [alpha] [test file(s)]
langusta --batchtest -d [profile directory] -a [alpha] [test file(s)]
        EOF
      end
      0
    end

    def initialize
      @detector_factory = DetectorFactory.new
    end

    # Detects and prints the language of each given file, one per line.
    # @param profile_directory [String] directory containing language profiles.
    # @param test_files [Array<String>] files whose language should be detected.
    # @param alpha [Float, nil] optional smoothing parameter override.
    def detect_lang(profile_directory, test_files, alpha=nil)
      initialize_factory(profile_directory)
      test_files.each do |filename|
        language = detect_single_lang(filename, alpha)
        puts "%s: %s" % [filename, language]
      end
    end

    # Not yet implemented.
    def batch_test(profile_directory, test_files, alpha=nil)
    end

    # Detects the language of a single file.
    # @return [String] detected language code.
    def detect_single_lang(filename, alpha)
      # BUGFIX: File.open(...).read leaked the file handle; File.read closes it.
      ucs2_content = UCS2String.from_utf8(File.read(filename))
      detector = @detector_factory.create(alpha)
      detector.append(ucs2_content)
      detector.detect
    end

    # Loads every profile in the directory into the detector factory.
    def initialize_factory(profile_directory)
      profiles = load_profiles(profile_directory)
      profiles.each_with_index do |profile, index|
        @detector_factory.add_profile(profile, index, profiles.length)
      end
    end

    # @return [Array<LangProfile>] one profile per file in +directory+.
    def load_profiles(directory)
      @profiles = Dir[File.join(directory, '/*')].map do |filename|
        LangProfile.load_from_file(filename)
      end
    end
  end
end
@@ -0,0 +1,197 @@
1
module Langusta
  # Probabilistic language detector: repeatedly samples n-grams from the
  # appended text and updates per-language probabilities until convergence.
  class Detector
    attr_accessor :verbose, :alpha, :max_text_length

    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05
    ITERATION_LIMIT = 1000
    PROB_THRESHOLD = 0.1
    CONV_THRESHOLD = 0.99999
    BASE_FREQ = 10000
    UNKNOWN_LANG = "unknown"

    # @param factory [DetectorFactory] supplies the word->probability map and
    #   the ordered language list this detector scores against.
    def initialize(factory)
      @word_lang_prob_map = factory.word_lang_prob_map
      @lang_list = factory.lang_list
      @text = UCS2String.new('')
      @langprob = nil
      @alpha = ALPHA_DEFAULT
      @n_trial = 7
      @max_text_length = 10000
      @prior_map = nil
      @verbose = false
    end

    # Append more text to be recognized.
    # @param text [UCS2String] text to be recognized
    def append(text)
      raise TypeError.new("Expected: UCS2String, got: #{text.class}") unless text.is_a?(UCS2String)
      text.gsub!(RegexHelper::URL_REGEX, "\x00\x20")
      text.gsub!(RegexHelper::MAIL_REGEX, "\x00\x20")
      text = text.map do |c|
        NGram.normalize(c)
      end
      # BUGFIX: gsub! returns nil when no substitution occurs, which previously
      # clobbered @text. Substitute in place, then keep the string itself.
      text.gsub!(RegexHelper::SPACE_REGEX, "\x00\x20")
      @text = text
    end

    # Detect the language.
    # @return [String] (usually) two-letter code describing the language.
    def detect
      probabilities = get_probabilities()
      (probabilities.length > 0) ? probabilities.first.lang : UNKNOWN_LANG
    end

    # Normalizes +prob+ in place so the entries sum to 1.
    # @param prob [Array<Float>] probability vector (mutated).
    # @return [Float] the maximum normalized probability.
    def self.normalize_prob(prob)
      maxp = 0.0; sump = 0.0
      prob.each do |p|
        sump += p
      end
      prob.map! do |p|
        q = p / sump
        maxp = q if q > maxp
        q
      end
      maxp
    end

    # Box-Muller transform (polar form); returns one standard-normal sample.
    def self.next_gaussian
      s = 0
      while s >= 1 || s == 0
        v1 = 2 * Kernel.rand - 1
        v2 = 2 * Kernel.rand - 1
        s = v1 * v1 + v2 * v2
      end
      multiplier = Math.sqrt(-2 * Math.log(s) / s)
      v1 * multiplier
    end

    private

    # Runs @n_trial randomized sampling trials and accumulates the averaged
    # language probabilities into @langprob.
    def detect_block
      cleaning_text()
      ngrams = extract_ngrams()
      raise "no features in text" if ngrams.empty?
      @langprob = Array.new(@lang_list.length, 0.0)

      @n_trial.times do
        prob = init_probability()
        alpha = @alpha + Detector.next_gaussian() * ALPHA_WIDTH

        i = 0
        Kernel.loop do
          r = Kernel.rand(ngrams.length)
          update_lang_prob(prob, ngrams[r], alpha)
          # BUGFIX: the convergence check must run on every 5th iteration —
          # `if i % 5` was always truthy in Ruby (even 0 is truthy) — and `i`
          # must actually be incremented, otherwise ITERATION_LIMIT could
          # never be reached and the loop only exited on convergence.
          if i % 5 == 0
            break if Detector.normalize_prob(prob) > CONV_THRESHOLD || i >= ITERATION_LIMIT
            # verbose
          end
          i += 1
        end
        @langprob.length.times do |j|
          @langprob[j] += prob[j] / @n_trial
        end
        # verbose
      end
    end

    # Installs per-language prior probabilities, normalized to sum to 1.
    # @param prior_map [Hash{String=>Float}] language code => prior weight.
    def set_prior_map(prior_map)
      # BUGFIX: the original built `Array.new[@lang_list.length]` (indexing a
      # fresh empty array, i.e. nil) and then queried the *array* with Hash
      # methods. Build the priors array from the argument hash instead.
      @prior_map = Array.new(@lang_list.length, 0.0)
      sump = 0.0
      @prior_map.length.times do |i|
        lang = @lang_list[i]
        if prior_map.has_key?(lang)
          p = prior_map[lang]
          raise "probability must be non-negative" if p < 0
          @prior_map[i] = p
          sump += p
        end
      end
      raise "more one of prob must be non-zero" if sump <= 0
      @prior_map.map! do |p|
        p / sump
      end
    end

    # Drops Latin characters when the text is predominantly non-Latin, so
    # stray romanized fragments do not skew detection.
    def cleaning_text
      non_latin_count = latin_count = 0
      @text.each_char do |c|
        if c < "\x00z" && c >= "\x00A"
          latin_count += 1
        elsif c >= "\x03\x00" && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
          non_latin_count += 1
        end
      end
      if latin_count * 2 < non_latin_count
        text_without_latin = UCS2String.new('')
        @text.each_char do |c|
          text_without_latin << c if c > "\x00z" || c < "\x00A"
        end
        @text = text_without_latin
      end
    end

    # @return [Array<UCS2String>] every 1..N gram of @text that appears in the
    #   word/language probability map.
    def extract_ngrams
      list = []
      ngram = NGram.new
      @text.each_char do |char|
        ngram.add_char(char)
        (1..NGram::N_GRAM).each do |n|
          w = ngram.get(n)
          list << w if w && @word_lang_prob_map.has_key?(w)
        end
      end
      list
    end

    # Lazily runs detection, then returns candidates sorted by probability.
    def get_probabilities
      if @langprob.nil?
        detect_block()
      end
      sort_probability(@langprob)
    end

    # @return [Array<Float>] initial probability vector: the prior map if set,
    #   otherwise uniform across all languages.
    def init_probability
      prob = Array.new(@lang_list.length)
      if @prior_map
        prob = @prior_map.clone
      else
        prob.length.times do |i|
          prob[i] = 1.0 / @lang_list.length
        end
      end
      prob
    end

    # Pairs probabilities with their languages, keeps those above threshold.
    # BUGFIX: candidates must be ordered by *descending* probability —
    # `detect` takes the first element as the winner, but the original sorted
    # ascending and so returned the LEAST likely language. Also renamed the
    # block parameter, which shadowed the `prob` argument.
    def sort_probability(prob)
      list = []
      prob.each_with_index do |p, index|
        list[index] = Language.new(@lang_list[index], p)
      end
      list.sort_by do |x|
        -x.prob
      end.select do |x|
        x.prob > PROB_THRESHOLD
      end
    end

    # Multiplies +prob+ (in place) by the smoothed per-language frequencies
    # of +word+. Returns false when the word carries no information.
    def update_lang_prob(prob, word, alpha)
      return false if word.nil? || ! @word_lang_prob_map.has_key?(word)

      lang_prob_map = @word_lang_prob_map[word]
      # verbose
      weight = alpha / BASE_FREQ
      prob.length.times do |i|
        prob[i] *= weight + lang_prob_map[i]
      end
      true
    end

    # Debug helper: "lang:prob" pairs for every non-negligible probability.
    def word_prob_to_string(prob)
      prob.zip(@lang_list).select do |p, lang|
        p > 0.00001
      end.map do |p, lang|
        # BUGFIX: the arguments were swapped — "%.5f" applied to the language
        # String would raise; %s is the language, %.5f the probability.
        "%s:%.5f" % [lang, p]
      end.join(' ')
    end
  end
end
@@ -0,0 +1,46 @@
1
module Langusta
  # Raised for language-detection configuration errors (duplicate or missing
  # profiles).
  class LangDetectException < StandardError; end

  # Accumulates language profiles and builds Detector instances from them.
  class DetectorFactory
    attr_reader :word_lang_prob_map, :lang_list

    def initialize
      @word_lang_prob_map = {}
      @lang_list = []
    end

    # Adds a new language profile to this factory.
    # @param profile [LangProfile] language profile to be added.
    # @param index [Fixnum] index at which the language profile is to be added.
    # @param langsize [Fixnum] how many language profiles are to be added to this factory in total.
    # @raise [LangDetectException] if a profile with the same name was already added.
    def add_profile(profile, index, langsize)
      raise LangDetectException.new("duplicate the same language profile") if @lang_list.include?(profile.name)
      @lang_list << profile.name
      profile.freq.keys.each do |word|
        if not @word_lang_prob_map.has_key?(word)
          @word_lang_prob_map[word] = Array.new(langsize, 0.0)
        end
        # Relative frequency of this n-gram among all n-grams of its length.
        prob = 1.0 * profile.freq[word] / profile.n_words[word.length - 1]
        @word_lang_prob_map[word][index] = prob
      end
    end

    # Creates a new detector object, based on a preconfigured set of language profiles.
    # @param alpha [Float, nil] optional smoothing parameter for the detector.
    # @return [Detector]
    def create(alpha=nil)
      # Simplified: build once, optionally override alpha (was duplicated
      # across both branches of an if/else).
      detector = create_detector()
      detector.alpha = alpha if alpha
      detector
    end

    private

    # @raise [LangDetectException] when no profiles have been loaded yet.
    def create_detector
      raise LangDetectException.new("need to load profiles") if @lang_list.length == 0
      Detector.new(self)
    end
  end
end
@@ -0,0 +1,35 @@
1
module Langusta
  # Reads Java .properties files whose values are \uXXXX-escaped, decoding
  # each value into a UCS-2 big-endian byte string.
  class JavaPropertyReader
    # This is a minimal implementation, don't expect this to actually work.

    # @param filename [String] path to the .properties file.
    def initialize(filename)
      # BUGFIX: File.open(...).read leaked the file handle; File.read closes it.
      @lines = File.read(filename)
      parse()
    end

    # Looks up a decoded property value by name.
    def [](property)
      @properties[property]
    end

    # @return [Hash] the raw property-name => decoded-value mapping.
    def underlying_hash
      @properties
    end

    private

    # Splits each line on '=' and decodes the right-hand side.
    def parse
      @properties = {}
      # BUGFIX: String#each was removed in Ruby 1.9; iterate with each_line.
      @lines.each_line do |line|
        prop_name, value = line.split(/\=/)
        # Skip blank/malformed lines instead of crashing on a nil value.
        next if value.nil?
        @properties[prop_name] = parse_value(value)
      end
    end

    # Decodes every 4-digit uppercase hex escape in +value+ into two bytes
    # (high, low) and concatenates them — i.e. UCS-2 big-endian.
    def parse_value(value)
      codepoints = value.scan(/([0-9A-F]{4})/)
      codepoints.map do |cp|
        int_cp = cp.first.to_i(16)
        [int_cp / 256, int_cp % 256].pack("c*")
      end.join
    end
  end
end
@@ -0,0 +1,80 @@
1
require 'set'

module Langusta
  # Per-language n-gram frequency statistics, loadable from a JSON profile
  # file and prunable of rare/noisy grams.
  class LangProfile
    MINIMUM_FREQ = 2
    LESS_FREQ_RATIO = 100_000
    attr_reader :name, :freq, :n_words

    # Constructs a language profile from a file. Converts all NGrams from UTF-8 to Unicode codepoints.
    # @param filename [String] file name of the language profile.
    # @return [LangProfile]
    def self.load_from_file(filename)
      json = Yajl::Parser.parse(File.new(filename))
      profile = self.new

      converted_freq = {}
      json['freq'].each do |key, value|
        converted_freq[UCS2String.from_utf8(key)] = value
      end
      profile.populate_json(json['name'], converted_freq, json['n_words'])
      profile
    end

    def initialize(name=nil)
      @name = name
      @freq = {}
      @n_words = Array.new(NGram::N_GRAM, 0)
    end

    # Bulk-populates this profile from already-parsed JSON data.
    def populate_json(name, freq, n_words)
      @name = name
      @freq = freq
      @n_words = n_words
    end

    # Adds a given NGram to this language profile. This operation is expected to be invoked multiple times for the same arguments.
    # @param gram [UCS2String]
    def add(gram)
      unless gram.is_a?(UCS2String) or gram.is_a?(NilClass)
        raise TypeError.new("UCS2String or NilClass expected, got: #{gram.class}")
      end
      return if @name.nil? or gram.nil?
      length = gram.size
      return if length < 1 or length > NGram::N_GRAM
      @n_words[length - 1] += 1
      @freq[gram] = @freq.fetch(gram, 0) + 1
    end

    # Prunes grams whose count falls at or below a corpus-size-dependent
    # threshold; additionally drops Roman-alphabet grams when they make up
    # less than a third of the corpus.
    def omit_less_freq
      return if @name.nil?
      threshold = @n_words[0] / LESS_FREQ_RATIO
      threshold = MINIMUM_FREQ if threshold < MINIMUM_FREQ
      roman = 0
      # Iterate over a snapshot of the keys so deletion is safe.
      Set.new(@freq.keys).each do |key|
        count = @freq[key]
        if count <= threshold
          @n_words[key.size - 1] -= count
          @freq.delete(key)
        elsif RegexHelper::ROMAN_REGEX.match(key.underlying) # temp workaround
          roman += count
        end
      end

      if roman < @n_words[0] / 3
        Set.new(@freq.keys).each do |key|
          # temp workaround
          if RegexHelper::INCL_ROMAN_REGEX.match(key.underlying)
            @n_words[key.size - 1] -= @freq[key]
            @freq.delete(key)
          end
        end
      end
    end
  end
end