langusta 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. data/.document +5 -0
  2. data/Gemfile +11 -0
  3. data/Gemfile.lock +32 -0
  4. data/LICENSE.txt +13 -0
  5. data/README.rdoc +34 -0
  6. data/Rakefile +55 -0
  7. data/VERSION +1 -0
  8. data/bin/langusta +5 -0
  9. data/data/messages.properties +128 -0
  10. data/data/uppercase.bin +0 -0
  11. data/langusta.gemspec +210 -0
  12. data/lib/langusta.rb +36 -0
  13. data/lib/langusta/command.rb +78 -0
  14. data/lib/langusta/detector.rb +197 -0
  15. data/lib/langusta/detector_factory.rb +46 -0
  16. data/lib/langusta/java_property_reader.rb +35 -0
  17. data/lib/langusta/lang_profile.rb +80 -0
  18. data/lib/langusta/language.rb +14 -0
  19. data/lib/langusta/language_detection_facade.rb +24 -0
  20. data/lib/langusta/n_gram.rb +116 -0
  21. data/lib/langusta/regex_helper.rb +15 -0
  22. data/lib/langusta/tag_extractor.rb +39 -0
  23. data/lib/langusta/ucs2_string.rb +70 -0
  24. data/lib/langusta/unicode_block.rb +56 -0
  25. data/profiles/af +1 -0
  26. data/profiles/ar +1 -0
  27. data/profiles/bg +1 -0
  28. data/profiles/bn +1 -0
  29. data/profiles/cs +1 -0
  30. data/profiles/da +1 -0
  31. data/profiles/de +1 -0
  32. data/profiles/el +1 -0
  33. data/profiles/en +1 -0
  34. data/profiles/es +1 -0
  35. data/profiles/fa +1 -0
  36. data/profiles/fi +1 -0
  37. data/profiles/fr +1 -0
  38. data/profiles/gu +1 -0
  39. data/profiles/he +1 -0
  40. data/profiles/hi +1 -0
  41. data/profiles/hr +1 -0
  42. data/profiles/hu +1 -0
  43. data/profiles/id +1 -0
  44. data/profiles/it +1 -0
  45. data/profiles/ja +1 -0
  46. data/profiles/kn +1 -0
  47. data/profiles/ko +1 -0
  48. data/profiles/mk +1 -0
  49. data/profiles/ml +1 -0
  50. data/profiles/mr +1 -0
  51. data/profiles/ne +1 -0
  52. data/profiles/nl +1 -0
  53. data/profiles/no +1 -0
  54. data/profiles/pa +1 -0
  55. data/profiles/pl +1 -0
  56. data/profiles/pt +1 -0
  57. data/profiles/ro +1 -0
  58. data/profiles/ru +1 -0
  59. data/profiles/sk +1 -0
  60. data/profiles/so +1 -0
  61. data/profiles/sq +1 -0
  62. data/profiles/sv +1 -0
  63. data/profiles/sw +1 -0
  64. data/profiles/ta +1 -0
  65. data/profiles/te +1 -0
  66. data/profiles/th +1 -0
  67. data/profiles/tl +1 -0
  68. data/profiles/tr +1 -0
  69. data/profiles/uk +1 -0
  70. data/profiles/ur +1 -0
  71. data/profiles/vi +1 -0
  72. data/profiles/zh-cn +1 -0
  73. data/profiles/zh-tw +1 -0
  74. data/test/helper.rb +20 -0
  75. data/test/quality/test_falsified.rb +33 -0
  76. data/test/test_command.rb +34 -0
  77. data/test/test_data/af +1 -0
  78. data/test/test_data/ar +1 -0
  79. data/test/test_data/bg +32 -0
  80. data/test/test_data/bn +9 -0
  81. data/test/test_data/cs +9 -0
  82. data/test/test_data/da +14 -0
  83. data/test/test_data/de +4 -0
  84. data/test/test_data/el +7 -0
  85. data/test/test_data/en +26 -0
  86. data/test/test_data/es +4 -0
  87. data/test/test_data/fa +21 -0
  88. data/test/test_data/fi +8 -0
  89. data/test/test_data/fr +13 -0
  90. data/test/test_data/gu +3 -0
  91. data/test/test_data/he +20 -0
  92. data/test/test_data/hi +1 -0
  93. data/test/test_data/hr +16 -0
  94. data/test/test_data/hu +6 -0
  95. data/test/test_data/id +2 -0
  96. data/test/test_data/it +3 -0
  97. data/test/test_data/ja +34 -0
  98. data/test/test_data/kn +14 -0
  99. data/test/test_data/ko +2 -0
  100. data/test/test_data/mk +3 -0
  101. data/test/test_data/ml +1 -0
  102. data/test/test_data/mr +3 -0
  103. data/test/test_data/ne +2 -0
  104. data/test/test_data/nl +1 -0
  105. data/test/test_data/no +3 -0
  106. data/test/test_data/pa +1 -0
  107. data/test/test_data/pl +23 -0
  108. data/test/test_data/pt +2 -0
  109. data/test/test_data/ro +2 -0
  110. data/test/test_data/ru +1 -0
  111. data/test/test_data/sk +2 -0
  112. data/test/test_data/so +4 -0
  113. data/test/test_data/sq +4 -0
  114. data/test/test_data/sv +3 -0
  115. data/test/test_data/sw +6 -0
  116. data/test/test_data/ta +1 -0
  117. data/test/test_data/te +2 -0
  118. data/test/test_data/th +3 -0
  119. data/test/test_data/tl +1 -0
  120. data/test/test_data/tr +2 -0
  121. data/test/test_data/uk +3 -0
  122. data/test/test_data/ur +1 -0
  123. data/test/test_data/vi +2 -0
  124. data/test/test_data/zh-tw +3 -0
  125. data/test/test_detector.rb +52 -0
  126. data/test/test_detector_factory.rb +16 -0
  127. data/test/test_java_property_reader.rb +8 -0
  128. data/test/test_lang_profile.rb +79 -0
  129. data/test/test_language.rb +15 -0
  130. data/test/test_language_detection_facade.rb +9 -0
  131. data/test/test_langusta.rb +25 -0
  132. data/test/test_n_gram.rb +103 -0
  133. data/test/test_tag_extractor.rb +71 -0
  134. data/test/test_ucs2_string.rb +9 -0
  135. data/test/test_unicode_block.rb +9 -0
  136. metadata +320 -0
@@ -0,0 +1,36 @@
# Library entry point: adjusts the load path, pulls in runtime
# dependencies and lazily registers every Langusta component.
$: << File.expand_path(File.dirname(__FILE__))

require 'rubygems'
require 'bundler'
Bundler.setup

require 'optparse'
require 'iconv'

# Required gems
require 'oniguruma'
require 'yajl'

module Langusta
  VERSION = '0.1.0'

  # Components are autoloaded so requiring 'langusta' stays cheap.
  # NOTE: the original registered :Detector twice; the duplicate is removed.
  autoload :RegexHelper, 'langusta/regex_helper'
  autoload :UCS2String, 'langusta/ucs2_string'
  autoload :Language, 'langusta/language'
  autoload :LangProfile, 'langusta/lang_profile'
  autoload :Detector, 'langusta/detector'
  autoload :JavaPropertyReader, 'langusta/java_property_reader'
  autoload :UnicodeBlock, 'langusta/unicode_block'
  autoload :NGram, 'langusta/n_gram'
  autoload :DetectorFactory, 'langusta/detector_factory'
  autoload :TagExtractor, 'langusta/tag_extractor'
  autoload :Command, 'langusta/command'
  autoload :LanguageDetectionFacade, 'langusta/language_detection_facade'

  # Filesystem layout of the bundled data files, relative to the gem root.
  ABSOLUTE_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  PROFILES_PATH = File.join(ABSOLUTE_PATH, 'profiles')
  UPPERCASE_BIN = File.join(ABSOLUTE_PATH, 'data/uppercase.bin')
  MESSAGES_PROPERTIES = File.join(ABSOLUTE_PATH, 'data/messages.properties')
end
@@ -0,0 +1,78 @@
1
module Langusta
  # Command-line front-end: parses ARGV and dispatches to the requested
  # language-detection operation.
  class Command
    # Parses command-line arguments and runs the requested operation.
    # @param argv [Array<String>] command-line arguments (consumed by the parser)
    # @return [Fixnum] process exit status (always 0)
    def self.run(argv)
      options = {}
      # Renamed block param (was `opts`, shadowing an outer local whose
      # assigned parse! result was never used).
      OptionParser.new do |parser|
        parser.on("--detectlang", "Detect the language from the given text") do |d|
          options[:operation] = :detectlang if d
        end

        parser.on("--batchtest", "Batch test of language detection") do |b|
          options[:operation] = :batchtest if b
        end

        parser.on("-d [profile directory]") do |pd|
          options[:profile_directory] = pd
        end

        parser.on("-a [alpha]", Float) do |alpha|
          options[:alpha] = alpha
        end
      end.parse!(argv)

      # After parse!, argv holds only the positional test-file arguments.
      arguments = [options[:profile_directory]] + [argv]
      arguments << options[:alpha] if options[:alpha]

      case options[:operation]
      when :detectlang
        self.new.send(:detect_lang, *arguments)
      when :batchtest
        self.new.send(:batch_test, *arguments)
      else
        $stderr.puts <<EOF
Usage:

langusta --detectlang -d [profile directory] -a [alpha] [test file(s)]
langusta --batchtest -d [profile directory] -a [alpha] [test file(s)]
EOF
      end
      0
    end

    def initialize
      @detector_factory = DetectorFactory.new
    end

    # Detects and prints the language of each test file.
    # @param profile_directory [String] directory of language profiles
    # @param test_files [Array<String>] files to classify
    # @param alpha [Float, nil] optional smoothing parameter
    def detect_lang(profile_directory, test_files, alpha=nil)
      initialize_factory(profile_directory)
      test_files.each do |filename|
        language = detect_single_lang(filename, alpha)
        puts "%s: %s" % [filename, language]
      end
    end

    # Not implemented yet.
    def batch_test(profile_directory, test_files, alpha=nil)
    end

    # Detects the language of a single UTF-8 encoded file.
    # @return [String] detected language code
    def detect_single_lang(filename, alpha)
      # File.read closes the handle; File.open(...).read leaked it.
      ucs2_content = UCS2String.from_utf8(File.read(filename))
      detector = @detector_factory.create(alpha)
      detector.append(ucs2_content)

      detector.detect()
    end

    # Loads every profile from the directory into the detector factory.
    def initialize_factory(profile_directory)
      profiles = load_profiles(profile_directory)
      profiles.each_with_index do |profile, index|
        @detector_factory.add_profile(profile, index, profiles.length)
      end
    end

    # Reads all profile files found directly under +directory+.
    def load_profiles(directory)
      @profiles = Dir[File.join(directory, '/*')].map do |filename|
        LangProfile.load_from_file(filename)
      end
    end
  end
end
@@ -0,0 +1,197 @@
1
module Langusta
  # Core language detector (port of the Java language-detection library).
  # Text is appended, normalized into n-grams, then per-language
  # probabilities are estimated with @n_trial randomized trials.
  class Detector
    attr_accessor :verbose, :alpha, :max_text_length

    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05
    ITERATION_LIMIT = 1000
    PROB_THRESHOLD = 0.1
    CONV_THRESHOLD = 0.99999
    BASE_FREQ = 10000
    UNKNOWN_LANG = "unknown"

    # @param factory [DetectorFactory] supplies the n-gram probability map
    #   and the ordered language list.
    def initialize(factory)
      @word_lang_prob_map = factory.word_lang_prob_map
      @lang_list = factory.lang_list
      @text = UCS2String.new('')
      @langprob = nil
      @alpha = ALPHA_DEFAULT
      @n_trial = 7
      @max_text_length = 10000
      @prior_map = nil
      @verbose = false
    end

    # Append more text to be recognized.
    # @param text [UCS2String] text to be recognized
    def append(text)
      raise TypeError.new("Expected: UCS2String, got: #{text.class}") unless text.is_a?(UCS2String)
      text.gsub!(RegexHelper::URL_REGEX, "\x00\x20")
      text.gsub!(RegexHelper::MAIL_REGEX, "\x00\x20")
      text = text.map do |c|
        NGram.normalize(c)
      end
      # Keep the normalized string itself, not gsub!'s return value:
      # a bang gsub conventionally returns nil when nothing matched,
      # which would have left @text nil.
      text.gsub!(RegexHelper::SPACE_REGEX, "\x00\x20")
      @text = text
    end

    # Detect the language.
    # @return [String] (usually) two-letter code describing the language.
    def detect
      probabilities = get_probabilities()
      (probabilities.length > 0) ? probabilities.first.lang : UNKNOWN_LANG
    end

    private
    # Runs the randomized trials and accumulates averaged language
    # probabilities into @langprob.
    # @raise [RuntimeError] when no known n-grams are present in the text.
    def detect_block
      cleaning_text()
      ngrams = extract_ngrams()
      raise "no features in text" if ngrams.empty?
      @langprob = Array.new(@lang_list.length, 0.0)

      @n_trial.times do
        prob = init_probability()
        # Jitter alpha per trial (Box-Muller gaussian).
        alpha = @alpha + Detector.next_gaussian() * ALPHA_WIDTH

        i = 0
        Kernel.loop do
          r = Kernel.rand(ngrams.length)
          update_lang_prob(prob, ngrams[r], alpha)
          # Test convergence only every 5th iteration. The original
          # `if i % 5` was always truthy (0 is truthy in Ruby) and `i`
          # was never incremented, so ITERATION_LIMIT never triggered.
          if i % 5 == 0
            break if Detector.normalize_prob(prob) > CONV_THRESHOLD || i >= ITERATION_LIMIT
            # verbose
          end
          i += 1
        end
        @langprob.length.times do |j|
          @langprob[j] += prob[j] / @n_trial
        end
        # verbose
      end
    end

    # Installs normalized language priors from a language => weight hash.
    # @param prior_map [Hash{String => Numeric}]
    # @raise [RuntimeError] on negative weights or an all-zero map.
    def set_prior_map(prior_map)
      # Original used `Array.new[...]` (indexing a fresh empty array -> nil).
      @prior_map = Array.new(@lang_list.length, 0.0)
      sump = 0.0
      @prior_map.length.times do |i|
        lang = @lang_list[i]
        # Look up in the supplied hash, not in the array being built.
        if prior_map.has_key?(lang)
          p = prior_map[lang]
          raise "probability must be non-negative" if p < 0
          @prior_map[i] = p
          sump += p
        end
      end
      raise "more one of prob must be non-zero" if sump <= 0
      @prior_map.map! do |p|
        p / sump
      end
    end

    # Normalizes +prob+ in place so it sums to 1.
    # @return [Float] the maximum normalized probability.
    def self.normalize_prob(prob)
      maxp = 0.0; sump = 0.0
      prob.each do |p|
        sump += p
      end
      prob.map! do |p|
        q = p / sump
        maxp = q if q > maxp
        q
      end
      maxp
    end

    private
    # Strips Latin characters when the text is predominantly non-Latin
    # (comparisons are on big-endian UCS-2 two-byte characters).
    def cleaning_text
      non_latin_count = latin_count = 0
      @text.each_char do |c|
        if c < "\x00z" && c >= "\x00A"
          latin_count += 1
        elsif c >= "\x03\x00" && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
          non_latin_count += 1
        end
      end
      if latin_count * 2 < non_latin_count
        text_without_latin = UCS2String.new('')
        @text.each_char do |c|
          text_without_latin << c if c > "\x00z" || c < "\x00A"
        end
        @text = text_without_latin
      end
    end

    # Collects every 1..N-gram of @text that appears in the probability map.
    def extract_ngrams
      list = []
      ngram = NGram.new
      @text.each_char do |char|
        ngram.add_char(char)
        (1..NGram::N_GRAM).each do |n|
          w = ngram.get(n)
          list << w if w && @word_lang_prob_map.has_key?(w)
        end
      end
      list
    end

    # Lazily runs detection, then returns candidates above threshold.
    def get_probabilities
      if @langprob.nil?
        detect_block()
      end
      sort_probability(@langprob)
    end

    # Initial probability vector: the prior map if set, else uniform.
    def init_probability
      prob = Array.new(@lang_list.length)
      if @prior_map
        prob = @prior_map.clone
      else
        prob.length.times do |i|
          prob[i] = 1.0 / @lang_list.length
        end
      end
      prob
    end

    # Maps the probability vector onto Language objects, keeps those above
    # PROB_THRESHOLD and orders them most-probable FIRST — detect() takes
    # .first, so the original ascending sort returned the least likely
    # language.
    def sort_probability(prob)
      prob.each_with_index.map do |p, index|
        Language.new(@lang_list[index], p)
      end.select do |x|
        x.prob > PROB_THRESHOLD
      end.sort_by do |x|
        -x.prob
      end
    end

    # Multiplies per-language probabilities by the smoothed likelihood of
    # +word+. Returns false when the word is unknown.
    def update_lang_prob(prob, word, alpha)
      return false if word.nil? || ! @word_lang_prob_map.has_key?(word)

      lang_prob_map = @word_lang_prob_map[word]
      # verbose
      weight = alpha / BASE_FREQ
      prob.length.times do |i|
        prob[i] *= weight + lang_prob_map[i]
      end
      true
    end

    # Debug rendering of a probability vector, omitting near-zero entries.
    def word_prob_to_string(prob)
      prob.zip(@lang_list).select do |p, lang|
        p > 0.00001
      end.map do |p, lang|
        "%s:%.5f" % [p, lang]
      end.join(' ')
    end

    # Box-Muller transform.
    # @return [Float] a sample from the standard normal distribution.
    def self.next_gaussian
      s = 0
      while s >= 1 || s == 0
        v1 = 2 * Kernel.rand - 1
        v2 = 2 * Kernel.rand - 1
        s = v1 * v1 + v2 * v2
      end
      multiplier = Math.sqrt(-2 * Math.log(s)/s)
      return v1 * multiplier
    end
  end
end
@@ -0,0 +1,46 @@
1
module Langusta
  # Raised on language-detection setup/usage errors.
  class LangDetectException < StandardError; end

  # Accumulates language profiles and builds configured Detector instances.
  class DetectorFactory
    attr_reader :word_lang_prob_map, :lang_list

    def initialize
      # n-gram => per-language probability vector (indexed like @lang_list).
      @word_lang_prob_map = {}
      @lang_list = []
    end

    # Adds a new language profile to this factory.
    # @param profile [LangProfile] language profile to be added.
    # @param index [Fixnum] index at which the language profile is to be added.
    # @param langsize [Fixnum] how many language profiles are to be added to this factory in total.
    # @raise [LangDetectException] if a profile with the same name was already added.
    def add_profile(profile, index, langsize)
      raise LangDetectException.new("duplicate the same language profile") if @lang_list.include?(profile.name)
      @lang_list << profile.name
      profile.freq.keys.each do |word|
        @word_lang_prob_map[word] ||= Array.new(langsize, 0.0)
        # Relative frequency of this n-gram within the profile,
        # normalized by the total count of same-length n-grams.
        prob = 1.0 * profile.freq[word] / profile.n_words[word.length - 1]
        @word_lang_prob_map[word][index] = prob
      end
    end

    # Creates a new detector object, based on a preconfigured set of language profiles.
    # @param alpha [Float, nil] optional additive-smoothing parameter.
    # @return [Detector]
    def create(alpha=nil)
      detector = create_detector()
      detector.alpha = alpha if alpha
      detector
    end

    private
    # @raise [LangDetectException] when no profiles have been loaded yet.
    def create_detector
      raise LangDetectException.new("need to load profiles") if @lang_list.length == 0
      Detector.new(self)
    end
  end
end
@@ -0,0 +1,35 @@
1
module Langusta
  # Reads Java .properties files whose values are runs of 4-digit
  # uppercase-hex codepoints, decoding them to big-endian UCS-2 strings.
  # This is a minimal implementation, don't expect this to actually work.
  class JavaPropertyReader
    # @param filename [String] path to the .properties file
    def initialize(filename)
      # File.read closes the handle; File.open(...).read leaked it.
      @lines = File.read(filename)
      parse()
    end

    # Fetches the decoded value of a property.
    def [](property)
      @properties[property]
    end

    # @return [Hash] the full property-name => decoded-value hash.
    def underlying_hash
      @properties
    end

    private
    # Splits the file into "name=value" lines and decodes each value.
    def parse
      @properties = {}
      # String#each was removed in Ruby 1.9; each_line works on 1.8 too.
      @lines.each_line do |line|
        prop_name, value = line.split(/\=/)
        # Skip blank lines and lines without an '=' separator, which
        # previously crashed on a nil value.
        next if prop_name.nil? || value.nil?
        @properties[prop_name] = parse_value(value)
      end
    end

    # Decodes a run of 4-digit hex codepoints into a big-endian
    # UCS-2 byte string.
    def parse_value(value)
      codepoints = value.scan(/([0-9A-F]{4})/)
      codepoints.map do |cp|
        int_cp = cp.first.to_i(16)
        [int_cp / 256, int_cp % 256].pack("c*")
      end.join
    end
  end
end
@@ -0,0 +1,80 @@
1
+ require 'set'
2
+
3
+ module Langusta
4
+ class LangProfile
5
+ MINIMUM_FREQ = 2
6
+ LESS_FREQ_RATIO = 100_000
7
+ attr_reader :name, :freq, :n_words
8
+
9
+ # Constructs a language profile from a file. Converts all NGrams from UTF-8 to Unicode codepoints.
10
+ # @param [String] file name of the language profile.
11
+ # @return [LangProfile]
12
+ def self.load_from_file(filename)
13
+ json = Yajl::Parser.parse(File.new(filename))
14
+ profile = self.new
15
+
16
+ name = json['name']
17
+ n_words = json['n_words']
18
+ freq = json['freq'].inject({}) do |acc, kv|
19
+ key, value = kv
20
+ acc[UCS2String.from_utf8(key)] = value
21
+ acc
22
+ end
23
+ profile.populate_json(name, freq, n_words)
24
+ profile
25
+ end
26
+
27
+ def initialize(name=nil)
28
+ @name = name
29
+ @freq = {}
30
+ @n_words = Array.new(NGram::N_GRAM, 0)
31
+ end
32
+
33
+ def populate_json(name, freq, n_words)
34
+ @name, @freq, @n_words = name, freq, n_words
35
+ end
36
+
37
+ # Adds a given NGram to this language profile. This operation is expected to be invoked multiple times for the same arguments.
38
+ # @param gram [UCS2String]
39
+ def add(gram)
40
+ raise TypeError.new("UCS2String or NilClass expected, got: #{gram.class}") unless gram.is_a?(UCS2String) or gram.is_a?(NilClass)
41
+ return if @name.nil? or gram.nil?
42
+ length = gram.size
43
+ return if length < 1 or length > NGram::N_GRAM
44
+ @n_words[length - 1] += 1
45
+ @freq[gram] ||= 0
46
+ @freq[gram] += 1
47
+ end
48
+
49
+ def omit_less_freq
50
+ return if @name.nil?
51
+ threshold = @n_words[0] / LESS_FREQ_RATIO
52
+ threshold = MINIMUM_FREQ if threshold < MINIMUM_FREQ
53
+ keys = Set.new(@freq.keys)
54
+ roman = 0
55
+ keys.each do |key|
56
+ count = @freq[key]
57
+ if count <= threshold
58
+ @n_words[key.size - 1] -= count
59
+ @freq.delete(key)
60
+ else
61
+ # temp workaround
62
+ if RegexHelper::ROMAN_REGEX.match(key.underlying)
63
+ roman += count
64
+ end
65
+ end
66
+ end
67
+
68
+ if roman < @n_words[0] / 3
69
+ keys2 = Set.new(@freq.keys)
70
+ keys2.each do |key|
71
+ # temp workaround
72
+ if RegexHelper::INCL_ROMAN_REGEX.match(key.underlying)
73
+ @n_words[key.size - 1] -= @freq[key]
74
+ @freq.delete(key)
75
+ end
76
+ end
77
+ end
78
+ end
79
+ end
80
+ end