langusta 0.1.1 → 0.2.0

@@ -1,5 +1,7 @@
 module Langusta
   class DetectorFactory
+    include Inspector
+
     attr_reader :word_lang_prob_map, :lang_list
 
     def initialize
@@ -11,15 +13,15 @@ module Langusta
     # @param [LangProfile] language profile to be added.
     # @param [Fixnum] index at which the language profile is to be added.
     # @param [Fixnum] counts how many language profiles are to be added to this factory in total.
-    def add_profile(profile, index, langsize)
+    def add_profile(profile)
       raise DuplicateProfilesError.new(profile.name) if @lang_list.include?(profile.name)
       @lang_list << profile.name
+      last_lang_index = @lang_list.size - 1
+
       profile.freq.keys.each do |word|
-        if not @word_lang_prob_map.has_key?(word)
-          @word_lang_prob_map[word] = Array.new(langsize, 0.0)
-        end
+        @word_lang_prob_map[word] ||= []
         prob = 1.0 * profile.freq[word] / profile.n_words[word.length - 1]
-        @word_lang_prob_map[word][index] = prob
+        @word_lang_prob_map[word][last_lang_index] = prob
       end
     end
 
@@ -35,6 +37,10 @@ module Langusta
       end
     end
 
+    def inspect
+      "#<#{self.class.name}:0x#{object_ptr} (#{@lang_list.size} profile(s))"
+    end
+
     private
     def create_detector
       raise NoProfilesLoadedError if @lang_list.empty?
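
With the index/langsize arguments gone, the factory derives each profile's slot from @lang_list itself. A minimal usage sketch (the profile filenames are made up; LangProfile.load_from_file is shown further down):

    factory = Langusta::DetectorFactory.new
    %w(en.json de.json).each do |path|
      # raises DuplicateProfilesError if the same language name is added twice
      factory.add_profile(Langusta::LangProfile.load_from_file(path))
    end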
@@ -0,0 +1,22 @@
+module Langusta
+  module Guard
+
+    def self.klass(argument, klass, _method)
+      return unless $debug
+      raise TypeError.new("#{_method}: expected #{klass} got: #{argument.class}") unless argument.is_a?(klass)
+    end
+
+    def self.codepoint(codepoint, _method)
+      return unless $debug
+      raise ArgumentError.new([_method, ':', codepoint.to_s(16)].join) unless (0x0000..0xffff).include?(codepoint)
+    end
+
+    def self.codepoint_array(array, _method)
+      return unless $debug
+      raise TypeError.new("#{_method}: expected Array, got: #{array.class}") unless array.is_a?(Array)
+      cp = array.find do |cp|
+        ! (0x0000..0xffff).include?(cp)
+      end && (raise ArgumentError.new("#{_method}: bad codepoint: #{cp}"))
+    end
+  end
+end
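
All three Guard checks are no-ops unless $debug is truthy, so they cost nothing outside debug runs. A rough illustration (the method symbols are only labels used in the error messages):

    $debug = true
    Langusta::Guard.klass("en", String, :initialize)       # passes silently
    Langusta::Guard.codepoint(0x3042, :add_char)           # passes: inside 0x0000..0xffff
    Langusta::Guard.codepoint_array([0x61, 0x62], :add)    # passes: every element is a valid codepoint
    Langusta::Guard.klass(42, String, :initialize)         # raises TypeError while $debug is set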
@@ -0,0 +1,7 @@
+module Langusta
+  module Inspector
+    def object_ptr
+      (object_id * 2).to_s(16)
+    end
+  end
+end
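
Inspector#object_ptr merely reproduces the hex address MRI prints in a default #inspect (object_id * 2), so including classes such as DetectorFactory can build compact inspect strings. A small sketch:

    class Thing
      include Langusta::Inspector
    end

    Thing.new.object_ptr   # => a hex string such as "3fe1c8a2b4"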
@@ -3,7 +3,7 @@ module Langusta
     # This is a minimal implementation, don't expect this to actually work.
 
     def initialize(filename)
-      @lines = File.open(filename).read
+      @lines = File.open(filename).readlines
       parse()
     end
 
@@ -28,8 +28,7 @@ module Langusta
      codepoints = value.scan(/([0-9A-F]{4})/)
      codepoints.map do |cp|
        int_cp = cp.first.to_i(16)
-        [int_cp / 256, int_cp % 256].pack("c*")
-      end.join
+      end
    end
  end
 end
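
With the pack/join dropped, the property parser now yields plain integer codepoints rather than a packed UCS-2 string. An illustration of the new mapping (the sample value is invented):

    value = "3042 30A2"   # hypothetical property value
    value.scan(/([0-9A-F]{4})/).map { |cp| cp.first.to_i(16) }
    # => [0x3042, 0x30a2]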
@@ -11,34 +11,29 @@ module Langusta
     # @return [LangProfile]
     def self.load_from_file(filename)
       json = Yajl::Parser.parse(File.new(filename))
-      profile = self.new
 
-      name = json['name']
-      n_words = json['n_words']
       freq = json['freq'].inject({}) do |acc, kv|
         key, value = kv
-        acc[UCS2String.from_utf8(key)] = value
+        acc[Langusta.utf82cp(key)] = value
         acc
       end
-      profile.populate_json(name, freq, n_words)
-      profile
-    end
 
-    def initialize(name=nil)
-      @name = name
-      @freq = {}
-      @n_words = Array.new(NGram::N_GRAM, 0)
+      self.new(json['name'] || (raise CorruptProfileError.new("Missing profile name")),
+               freq,
+               json['n_words'] || (raise CorruptProfileError.new("Missing number of words value")))
     end
 
-    def populate_json(name, freq, n_words)
+    def initialize(name, freq={}, n_words = Array.new(NGram::N_GRAM, 0))
+      Guard.klass(name, String, __method__)
       @name, @freq, @n_words = name, freq, n_words
     end
 
     # Adds a given NGram to this language profile. This operation is expected to be invoked multiple times for the same arguments.
-    # @param gram [UCS2String]
+    # @param gram [Array<Fixnum>]
     def add(gram)
-      raise TypeError.new("UCS2String or NilClass expected, got: #{gram.class}") unless gram.is_a?(UCS2String) or gram.is_a?(NilClass)
-      return if @name.nil? or gram.nil?
+      return if gram.nil?
+      Guard.klass(gram, Array, __method__)
+
       length = gram.size
       return if length < 1 or length > NGram::N_GRAM
       @n_words[length - 1] += 1
@@ -47,7 +42,6 @@ module Langusta
     end
 
     def omit_less_freq
-      return if @name.nil?
       threshold = @n_words[0] / LESS_FREQ_RATIO
       threshold = MINIMUM_FREQ if threshold < MINIMUM_FREQ
       keys = Set.new(@freq.keys)
@@ -59,7 +53,7 @@ module Langusta
           @freq.delete(key)
         else
           # temp workaround
-          if RegexHelper::ROMAN_REGEX.match(key.underlying)
+          if RegexHelper::ROMAN_REGEX.match(Langusta.cp2utf8(key))
            roman += count
          end
        end
@@ -69,7 +63,7 @@ module Langusta
       keys2 = Set.new(@freq.keys)
       keys2.each do |key|
         # temp workaround
-        if RegexHelper::INCL_ROMAN_REGEX.match(key.underlying)
+        if RegexHelper::INCL_ROMAN_REGEX.match(Langusta.cp2utf8(key))
           @n_words[key.size - 1] -= @freq[key]
           @freq.delete(key)
         end
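
A sketch of the reworked LangProfile API: the name is now mandatory (Guard-checked when $debug is on) and n-grams are plain codepoint arrays. Langusta.utf82cp is assumed here to return an Array of codepoints, as its use in load_from_file above suggests:

    profile = Langusta::LangProfile.new("en")
    profile.add(Langusta.utf82cp("ab"))   # a 2-gram given as [0x61, 0x62]
    profile.add(nil)                      # still tolerated and simply ignored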
@@ -3,8 +3,8 @@ module Langusta
     def initialize
       @factory = DetectorFactory.new
       profiles = load_profiles()
-      profiles.each_with_index do |profile, index|
-        @factory.add_profile(profile, index, profiles.length)
+      profiles.each do |profile|
+        @factory.add_profile(profile)
       end
     end
 
@@ -3,17 +3,10 @@ module Langusta
   # constructed on a character by character basis.
   class NGram
     N_GRAM = 3
-    UCS2_SPACE = "\x00\x20"
+    UCS2_SPACE = 0x0020
 
     def self.calculate_latin1_excluded
-      internal_hash = JavaPropertyReader.new(MESSAGES_PROPERTIES).underlying_hash
-      _, value = internal_hash.find do |k, v|
-        k == "NGram.LATIN1_EXCLUDE"
-      end
-
-      (0..(value.length - 2)).step(2).map do |index|
-        value[index, 2]
-      end
+      JavaPropertyReader.new(MESSAGES_PROPERTIES)["NGram.LATIN1_EXCLUDE"]
     end
 
     LATIN1_EXCLUDED = self.calculate_latin1_excluded()
@@ -28,10 +21,9 @@ module Langusta
       internal_hash.select do |key, _|
        /KANJI_[0-9]{1}/ =~ key
      end.each do |_, chars|
-        key = chars[0..1]
-        m[key] = key
-        (2..(chars.length - 2)).step(2) do |n|
-          m[chars[n, 2]] = key
+        key = chars.first
+        chars.each do |cp|
+          m[cp] = key
        end
      end
      m
@@ -44,27 +36,27 @@ module Langusta
      block = UnicodeBlock.of(ch)
      case block
      when UnicodeBlock::BASIC_LATIN
-        (ch < "\x00A" || (ch < "\x00a" && ch > "\x00Z") || ch > "\x00z") ? UCS2_SPACE : ch
+        (ch < 0x0041 || (ch < 0x0061 && ch > 0x005a) || ch > 0x007a) ? UCS2_SPACE : ch
      when UnicodeBlock::LATIN_1_SUPPLEMENT
        LATIN1_EXCLUDED.include?(ch) ? UCS2_SPACE : ch
      when UnicodeBlock::GENERAL_PUNCTUATION
        UCS2_SPACE
      when UnicodeBlock::ARABIC
-        (ch == "\x06\xcc") ? "\x06\x4a" : ch
+        (ch == 0x06cc) ? 0x064a : ch
      when UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
-        (ch >= "\x1e\xa0") ? "\x1e\xc3" : ch
+        (ch >= 0x1ea0) ? 0x1ec3 : ch
      when UnicodeBlock::HIRAGANA
-        "\x30\x42"
+        0x3042
      when UnicodeBlock::KATAKANA
-        "\x30\xa2"
+        0x30a2
      when UnicodeBlock::BOPOMOFO
-        "\x31\x05"
+        0x3105
      when UnicodeBlock::BOPOMOFO_EXTENDED
-        "\x31\x05"
+        0x3105
      when UnicodeBlock::CJK_UNIFIED_IDEOGRAPHS
        cjk_map.has_key?(ch) ? cjk_map[ch] : ch
      when UnicodeBlock::HANGUL_SYLLABES
-        "\xac\x00"
+        0xac00
      else
        ch
      end
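
normalize now maps integer codepoints to integer codepoints. A few illustrative calls, assuming it remains the class-level method that add_char below invokes:

    Langusta::NGram.normalize(0x0041)   # => 0x0041  (Latin letters pass through)
    Langusta::NGram.normalize(0x0021)   # => 0x0020  ('!' collapses to the space codepoint)
    Langusta::NGram.normalize(0x30AB)   # => 0x30A2  (any katakana folds to one representative)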
@@ -77,22 +69,25 @@ module Langusta
 
     # Retrieves an n-sized NGram from the current sequence.
     # @param n [Integer] length of NGram.
-    # @return [UCS2String] n-sized NGram.
+    # @return [Array<Integer>] n-sized NGram.
     def get(n)
       return nil if @capitalword
       len = @grams.length
       return nil if n < 1 || n > 3 || len < n
       if n == 1
         ch = @grams[len - 1]
-        return (ch == UCS2_SPACE) ? nil : UCS2String.new(ch)
+        return (ch == UCS2_SPACE) ? nil : [ch]
       else
-        return UCS2String.new(@grams[len - n, len].join)
+        return @grams[len - n, len]
       end
     end
 
     # Adds a single character to an NGram sequence.
-    # @param character [String[2]] Two-byte Unicode codepoint.
+    # @param character [Fixnum] Two-byte Unicode codepoint.
     def add_char(character)
+      Guard.klass(character, Fixnum, __method__)
+      Guard.codepoint(character, __method__)
+
       character = NGram.normalize(character)
       lastchar = @grams[-1]
       if lastchar == UCS2_SPACE
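
Taken together, add_char and get now speak integer codepoints end to end. A rough usage sketch (return values are indicative; capitalized words and space-terminated grams still yield nil):

    gram = Langusta::NGram.new
    Langusta.utf82cp("abc").each { |cp| gram.add_char(cp) }   # feeds 0x61, 0x62, 0x63
    gram.get(1)   # => [0x63]
    gram.get(2)   # => [0x62, 0x63]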
@@ -1,15 +1,20 @@
 module Langusta
   module RegexHelper
-    include Oniguruma
-
-    def self._u16(string)
-      string.unpack("U*").pack("n*")
+    if RUBY_VERSION < "1.9"
+      include Oniguruma
+
+      ROMAN_REGEX = ORegexp.new("^[a-z]$", :options => OPTION_IGNORECASE)
+      INCL_ROMAN_REGEX = ORegexp.new(".*[a-z].*", :options => OPTION_IGNORECASE)
+      URL_REGEX = ORegexp.new("https?://[-_.?&~;+=/#0-9a-z]+", :options => OPTION_IGNORECASE)
+      MAIL_REGEX = ORegexp.new("[-_.0-9a-z]+@[-_0-9a-z]+[-_.0-9a-z]+", :options => OPTION_IGNORECASE)
+      SPACE_REGEX = ORegexp.new(" +")
+    else
+      # /ui stands for UTF-8 case-insensitive regexp.
+      ROMAN_REGEX = /^[a-z]$/ui
+      INCL_ROMAN_REGEX = /.*[a-z].*/ui
+      URL_REGEX = Regexp.new("https?://[-_.?&~;+=/#a-z0-9]+")
+      MAIL_REGEX = /[-_.a-z0-9]+@[-_a-z0-9]+[-_.a-z0-9]+/ui
+      SPACE_REGEX = / +/
     end
-
-    ROMAN_REGEX = ORegexp.new(_u16("^[A-Za-z]$"), "", "UTF16_BE", "java")
-    INCL_ROMAN_REGEX = ORegexp.new(_u16(".*[A-Za-z].*"), "", "UTF16_BE", "java")
-    URL_REGEX = ORegexp.new(_u16("https?://[-_.?&~;+=/#0-9A-Za-z]+"), "", "UTF16_BE", "java")
-    MAIL_REGEX = ORegexp.new(_u16("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+"), "", "UTF_16BE", "java")
-    SPACE_REGEX = ORegexp.new(_u16(" +"), "", "UTF16_BE", "java")
   end
 end
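
On Ruby 1.9+ the helpers are plain Regexp objects; the 1.8 branch keeps Oniguruma's ORegexp, whose #match is call-compatible for these uses. A quick sanity sketch against the UTF-8 branch:

    Langusta::RegexHelper::ROMAN_REGEX.match("a")                        # single roman letter => MatchData
    Langusta::RegexHelper::INCL_ROMAN_REGEX.match("日本a")                # contains a roman letter => MatchData
    Langusta::RegexHelper::URL_REGEX.match("see http://example.com/x")   # => MatchData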
@@ -7,26 +7,26 @@ module Langusta
     @target = tag
     @threshold = threshold
     @count = 0
-    @buffer = UCS2String.new("")
+    @buffer = []
     @tag = nil
   end
 
   def add(line)
     if @target == @tag && line
-      @buffer << line
+      @buffer += line
     end
   end
 
   def clear
     @tag = nil
-    @buffer = UCS2String.new("")
+    @buffer = []
   end
 
   def close_tag(profile)
     if profile && @tag == @target && @buffer.length > @threshold
       gram = NGram.new
-      @buffer.each_char do |char|
-        gram.add_char(char)
+      @buffer.each do |codepoint|
+        gram.add_char(codepoint)
         (1..NGram::N_GRAM).each do |n|
           profile.add(gram.get(n))
         end
@@ -2,44 +2,44 @@ module Langusta
   module UnicodeBlock
     # Half-baked implementation of Java's UnicodeBlock.
 
-    OTHER = 0
-    BASIC_LATIN = 1
-    LATIN_1_SUPPLEMENT = 2
-    GENERAL_PUNCTUATION = 3
-    ARABIC = 4
-    LATIN_EXTENDED_ADDITIONAL = 5
-    HIRAGANA = 6
-    KATAKANA = 7
-    BOPOMOFO = 8
-    BOPOMOFO_EXTENDED = 9
-    CJK_UNIFIED_IDEOGRAPHS = 10
-    HANGUL_SYLLABES = 11
+    OTHER                     = 0
+    BASIC_LATIN               = 1
+    LATIN_1_SUPPLEMENT        = 2
+    GENERAL_PUNCTUATION       = 3
+    ARABIC                    = 4
+    LATIN_EXTENDED_ADDITIONAL = 5
+    HIRAGANA                  = 6
+    KATAKANA                  = 7
+    BOPOMOFO                  = 8
+    BOPOMOFO_EXTENDED         = 9
+    CJK_UNIFIED_IDEOGRAPHS    = 10
+    HANGUL_SYLLABES           = 11
 
-    BASIC_LATIN_RANGE = "\x00\x00".."\x00\x7f"
-    LATIN_1_SUPPLEMENT_RANGE = "\x00\x80".."\x00\xff"
-    GENERAL_PUNCTUATION_RANGE = "\x20\x00".."\x20\x6f"
-    ARABIC_RANGE = "\x06\x00".."\x06\xff"
-    LATIN_EXTENDED_ADDITIONAL_RANGE = "\x1e\x00".."\x1e\xff"
-    HIRAGANA_RANGE = "\x30\x40".."\x30\x9f"
-    KATAKANA_RANGE = "\x30\xa0".."\x30\xff"
-    BOPOMOFO_RANGE = "\x31\x00".."\x31\xbf"
-    BOPOMOFO_EXTENDED_RANGE = "\x31\xa0".."\x31\xbf"
-    CJK_UNIFIED_IDEOGRAPHS_RANGE = "\x4e\x00".."\x9f\xff"
-    HANGUL_SYLLABES_RANGE = "\xac\x00".."\xd7\xaf"
+    BASIC_LATIN_RANGE               = 0x0000..0x007f
+    LATIN_1_SUPPLEMENT_RANGE        = 0x0080..0x00ff
+    GENERAL_PUNCTUATION_RANGE       = 0x2000..0x206f
+    ARABIC_RANGE                    = 0x0600..0x06ff
+    LATIN_EXTENDED_ADDITIONAL_RANGE = 0x1e00..0x1eff
+    HIRAGANA_RANGE                  = 0x3040..0x309f
+    KATAKANA_RANGE                  = 0x30a0..0x30ff
+    BOPOMOFO_RANGE                  = 0x3100..0x31bf
+    BOPOMOFO_EXTENDED_RANGE         = 0x31a0..0x31bf
+    CJK_UNIFIED_IDEOGRAPHS_RANGE    = 0x4e00..0x9fff
+    HANGUL_SYLLABES_RANGE           = 0xac00..0xd7af
 
     def self.of(character)
       case character
-      when BASIC_LATIN_RANGE then return BASIC_LATIN
-      when LATIN_1_SUPPLEMENT_RANGE then return LATIN_1_SUPPLEMENT
-      when GENERAL_PUNCTUATION_RANGE then return GENERAL_PUNCTUATION
-      when ARABIC_RANGE then return ARABIC
+      when BASIC_LATIN_RANGE               then return BASIC_LATIN
+      when LATIN_1_SUPPLEMENT_RANGE        then return LATIN_1_SUPPLEMENT
+      when GENERAL_PUNCTUATION_RANGE       then return GENERAL_PUNCTUATION
+      when ARABIC_RANGE                    then return ARABIC
       when LATIN_EXTENDED_ADDITIONAL_RANGE then return LATIN_EXTENDED_ADDITIONAL
-      when HIRAGANA_RANGE then return HIRAGANA
-      when KATAKANA_RANGE then return KATAKANA
-      when BOPOMOFO_RANGE then return BOPOMOFO
-      when BOPOMOFO_EXTENDED_RANGE then return BOPOMOFO_EXTENDED
-      when CJK_UNIFIED_IDEOGRAPHS_RANGE then return CJK_UNIFIED_IDEOGRAPHS
-      when HANGUL_SYLLABES_RANGE then return HANGUL_SYLLABES
+      when HIRAGANA_RANGE                  then return HIRAGANA
+      when KATAKANA_RANGE                  then return KATAKANA
+      when BOPOMOFO_RANGE                  then return BOPOMOFO
+      when BOPOMOFO_EXTENDED_RANGE         then return BOPOMOFO_EXTENDED
+      when CJK_UNIFIED_IDEOGRAPHS_RANGE    then return CJK_UNIFIED_IDEOGRAPHS
+      when HANGUL_SYLLABES_RANGE           then return HANGUL_SYLLABES
       else
         return OTHER
       end
@@ -50,7 +50,7 @@ module Langusta
     end
 
     def self.compute_upper_case_table
-      File.open(UPPERCASE_BIN).read
+      File.open(UPPERCASE_BIN).read.unpack('n*')
    end
  end
 end
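
With the ranges expressed as integer Ranges, UnicodeBlock.of classifies bare codepoints directly. An illustrative sketch:

    Langusta::UnicodeBlock.of(0x0041)   # => UnicodeBlock::BASIC_LATIN
    Langusta::UnicodeBlock.of(0x3042)   # => UnicodeBlock::HIRAGANA
    Langusta::UnicodeBlock.of(0x0391)   # => UnicodeBlock::OTHER (Greek is outside the listed ranges)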
data/test/helper.rb CHANGED
@@ -1,20 +1,29 @@
 require 'rubygems'
 require 'bundler'
 begin
-  Bundler.setup(:default, :development)
+  Bundler.setup(:default, :test)
 rescue Bundler::BundlerError => e
   $stderr.puts e.message
   $stderr.puts "Run `bundle install` to install missing gems"
   exit e.status_code
 end
 require 'test/unit'
+require 'mocha'
 
 $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 $LOAD_PATH.unshift(File.dirname(__FILE__))
 require 'langusta'
-require 'ruby-debug'
-require 'mocha'
 
 class Test::Unit::TestCase
   include Langusta
+
+  def str2cp(ascii_string)
+    Langusta.utf82cp(ascii_string)
+  end
+
+  def utf82cp(utf8_string)
+    Langusta.utf82cp(utf8_string)
+  end
 end
+
+$debug = true
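
The new test helpers just delegate to Langusta.utf82cp, and the trailing $debug = true switches the Guard checks on for the whole suite. A hypothetical test using them (it assumes LangProfile exposes n_words, as DetectorFactory's use of profile.n_words implies):

    def test_add_accepts_codepoint_arrays
      profile = LangProfile.new("en")
      profile.add(utf82cp("ab"))       # helper defined above
      assert_equal 1, profile.n_words[1]
    end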