langusta 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,7 @@
1
1
  module Langusta
2
2
  class DetectorFactory
3
+ include Inspector
4
+
3
5
  attr_reader :word_lang_prob_map, :lang_list
4
6
 
5
7
  def initialize
@@ -11,15 +13,15 @@ module Langusta
11
13
  # @param [LangProfile] language profile to be added.
12
14
  # @param [Fixnum] index at which the language profile is to be added.
13
15
  # @param [Fixnum] counts how many language profiles are to be added to this factory in total.
14
- def add_profile(profile, index, langsize)
16
+ def add_profile(profile)
15
17
  raise DuplicateProfilesError.new(profile.name) if @lang_list.include?(profile.name)
16
18
  @lang_list << profile.name
19
+ last_lang_index = @lang_list.size - 1
20
+
17
21
  profile.freq.keys.each do |word|
18
- if not @word_lang_prob_map.has_key?(word)
19
- @word_lang_prob_map[word] = Array.new(langsize, 0.0)
20
- end
22
+ @word_lang_prob_map[word] ||= []
21
23
  prob = 1.0 * profile.freq[word] / profile.n_words[word.length - 1]
22
- @word_lang_prob_map[word][index] = prob
24
+ @word_lang_prob_map[word][last_lang_index] = prob
23
25
  end
24
26
  end
25
27
 
@@ -35,6 +37,10 @@ module Langusta
35
37
  end
36
38
  end
37
39
 
40
+ def inspect
41
+ "#<#{self.class.name}:0x#{object_ptr} (#{@lang_list.size} profile(s))"
42
+ end
43
+
38
44
  private
39
45
  def create_detector
40
46
  raise NoProfilesLoadedError if @lang_list.empty?
@@ -0,0 +1,22 @@
1
+ module Langusta
2
+ module Guard
3
+
4
+ def self.klass(argument, klass, _method)
5
+ return unless $debug
6
+ raise TypeError.new("#{_method}: expected #{klass} got: #{argument.class}") unless argument.is_a?(klass)
7
+ end
8
+
9
+ def self.codepoint(codepoint, _method)
10
+ return unless $debug
11
+ raise ArgumentError.new([_method, ':', codepoint.to_s(16)].join) unless (0x0000..0xffff).include?(codepoint)
12
+ end
13
+
14
+ def self.codepoint_array(array, _method)
15
+ return unless $debug
16
+ raise TypeError.new("#{_method}: expected Array, got: #{array.class}") unless array.is_a?(Array)
17
+ cp = array.find do |cp|
18
+ ! (0x0000..0xffff).include?(cp)
19
+ end && (raise ArgumentError.new("#{_method}: bad codepoint: #{cp}"))
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,7 @@
1
+ module Langusta
2
+ module Inspector
3
+ def object_ptr
4
+ (object_id * 2).to_s(16)
5
+ end
6
+ end
7
+ end
@@ -3,7 +3,7 @@ module Langusta
3
3
  # This is a minimal implementation, don't expect this to actually work.
4
4
 
5
5
  def initialize(filename)
6
- @lines = File.open(filename).read
6
+ @lines = File.open(filename).readlines
7
7
  parse()
8
8
  end
9
9
 
@@ -28,8 +28,7 @@ module Langusta
28
28
  codepoints = value.scan(/([0-9A-F]{4})/)
29
29
  codepoints.map do |cp|
30
30
  int_cp = cp.first.to_i(16)
31
- [int_cp / 256, int_cp % 256].pack("c*")
32
- end.join
31
+ end
33
32
  end
34
33
  end
35
34
  end
@@ -11,34 +11,29 @@ module Langusta
11
11
  # @return [LangProfile]
12
12
  def self.load_from_file(filename)
13
13
  json = Yajl::Parser.parse(File.new(filename))
14
- profile = self.new
15
14
 
16
- name = json['name']
17
- n_words = json['n_words']
18
15
  freq = json['freq'].inject({}) do |acc, kv|
19
16
  key, value = kv
20
- acc[UCS2String.from_utf8(key)] = value
17
+ acc[Langusta.utf82cp(key)] = value
21
18
  acc
22
19
  end
23
- profile.populate_json(name, freq, n_words)
24
- profile
25
- end
26
20
 
27
- def initialize(name=nil)
28
- @name = name
29
- @freq = {}
30
- @n_words = Array.new(NGram::N_GRAM, 0)
21
+ self.new(json['name'] || (raise CorruptProfileError.new("Missing profile name")),
22
+ freq,
23
+ json['n_words'] || (raise CorruptProfileError.new("Missing number of words value")))
31
24
  end
32
25
 
33
- def populate_json(name, freq, n_words)
26
+ def initialize(name, freq={}, n_words = Array.new(NGram::N_GRAM, 0))
27
+ Guard.klass(name, String, __method__)
34
28
  @name, @freq, @n_words = name, freq, n_words
35
29
  end
36
30
 
37
31
  # Adds a given NGram to this language profile. This operation is expected to be invoked multiple times for the same arguments.
38
- # @param gram [UCS2String]
32
+ # @param gram [Array<Fixnum>]
39
33
  def add(gram)
40
- raise TypeError.new("UCS2String or NilClass expected, got: #{gram.class}") unless gram.is_a?(UCS2String) or gram.is_a?(NilClass)
41
- return if @name.nil? or gram.nil?
34
+ return if gram.nil?
35
+ Guard.klass(gram, Array, __method__)
36
+
42
37
  length = gram.size
43
38
  return if length < 1 or length > NGram::N_GRAM
44
39
  @n_words[length - 1] += 1
@@ -47,7 +42,6 @@ module Langusta
47
42
  end
48
43
 
49
44
  def omit_less_freq
50
- return if @name.nil?
51
45
  threshold = @n_words[0] / LESS_FREQ_RATIO
52
46
  threshold = MINIMUM_FREQ if threshold < MINIMUM_FREQ
53
47
  keys = Set.new(@freq.keys)
@@ -59,7 +53,7 @@ module Langusta
59
53
  @freq.delete(key)
60
54
  else
61
55
  # temp workaround
62
- if RegexHelper::ROMAN_REGEX.match(key.underlying)
56
+ if RegexHelper::ROMAN_REGEX.match(Langusta.cp2utf8(key))
63
57
  roman += count
64
58
  end
65
59
  end
@@ -69,7 +63,7 @@ module Langusta
69
63
  keys2 = Set.new(@freq.keys)
70
64
  keys2.each do |key|
71
65
  # temp workaround
72
- if RegexHelper::INCL_ROMAN_REGEX.match(key.underlying)
66
+ if RegexHelper::INCL_ROMAN_REGEX.match(Langusta.cp2utf8(key))
73
67
  @n_words[key.size - 1] -= @freq[key]
74
68
  @freq.delete(key)
75
69
  end
@@ -3,8 +3,8 @@ module Langusta
3
3
  def initialize
4
4
  @factory = DetectorFactory.new
5
5
  profiles = load_profiles()
6
- profiles.each_with_index do |profile, index|
7
- @factory.add_profile(profile, index, profiles.length)
6
+ profiles.each do |profile|
7
+ @factory.add_profile(profile)
8
8
  end
9
9
  end
10
10
 
@@ -3,17 +3,10 @@ module Langusta
3
3
  # constructed on a character by character basis.
4
4
  class NGram
5
5
  N_GRAM = 3
6
- UCS2_SPACE = "\x00\x20"
6
+ UCS2_SPACE = 0x0020
7
7
 
8
8
  def self.calculate_latin1_excluded
9
- internal_hash = JavaPropertyReader.new(MESSAGES_PROPERTIES).underlying_hash
10
- _, value = internal_hash.find do |k, v|
11
- k == "NGram.LATIN1_EXCLUDE"
12
- end
13
-
14
- (0..(value.length - 2)).step(2).map do |index|
15
- value[index, 2]
16
- end
9
+ JavaPropertyReader.new(MESSAGES_PROPERTIES)["NGram.LATIN1_EXCLUDE"]
17
10
  end
18
11
 
19
12
  LATIN1_EXCLUDED = self.calculate_latin1_excluded()
@@ -28,10 +21,9 @@ module Langusta
28
21
  internal_hash.select do |key, _|
29
22
  /KANJI_[0-9]{1}/ =~ key
30
23
  end.each do |_, chars|
31
- key = chars[0..1]
32
- m[key] = key
33
- (2..(chars.length - 2)).step(2) do |n|
34
- m[chars[n, 2]] = key
24
+ key = chars.first
25
+ chars.each do |cp|
26
+ m[cp] = key
35
27
  end
36
28
  end
37
29
  m
@@ -44,27 +36,27 @@ module Langusta
44
36
  block = UnicodeBlock.of(ch)
45
37
  case block
46
38
  when UnicodeBlock::BASIC_LATIN
47
- (ch < "\x00A" || (ch < "\x00a" && ch > "\x00Z") || ch > "\x00z") ? UCS2_SPACE : ch
39
+ (ch < 0x0041 || (ch < 0x0061 && ch > 0x005a) || ch > 0x007a) ? UCS2_SPACE : ch
48
40
  when UnicodeBlock::LATIN_1_SUPPLEMENT
49
41
  LATIN1_EXCLUDED.include?(ch) ? UCS2_SPACE : ch
50
42
  when UnicodeBlock::GENERAL_PUNCTUATION
51
43
  UCS2_SPACE
52
44
  when UnicodeBlock::ARABIC
53
- (ch == "\x06\xcc") ? "\x06\x4a" : ch
45
+ (ch == 0x06cc) ? 0x064a : ch
54
46
  when UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
55
- (ch >= "\x1e\xa0") ? "\x1e\xc3" : ch
47
+ (ch >= 0x1ea0) ? 0x1ec3 : ch
56
48
  when UnicodeBlock::HIRAGANA
57
- "\x30\x42"
49
+ 0x3042
58
50
  when UnicodeBlock::KATAKANA
59
- "\x30\xa2"
51
+ 0x30a2
60
52
  when UnicodeBlock::BOPOMOFO
61
- "\x31\x05"
53
+ 0x3105
62
54
  when UnicodeBlock::BOPOMOFO_EXTENDED
63
- "\x31\x05"
55
+ 0x3105
64
56
  when UnicodeBlock::CJK_UNIFIED_IDEOGRAPHS
65
57
  cjk_map.has_key?(ch) ? cjk_map[ch] : ch
66
58
  when UnicodeBlock::HANGUL_SYLLABES
67
- "\xac\x00"
59
+ 0xac00
68
60
  else
69
61
  ch
70
62
  end
@@ -77,22 +69,25 @@ module Langusta
77
69
 
78
70
  # Retrieves an n-sized NGram from the current sequence.
79
71
  # @param n [Integer] length of NGram.
80
- # @return [UCS2String] n-sized NGram.
72
+ # @return [Array<Integer>] n-sized NGram.
81
73
  def get(n)
82
74
  return nil if @capitalword
83
75
  len = @grams.length
84
76
  return nil if n < 1 || n > 3 || len < n
85
77
  if n == 1
86
78
  ch = @grams[len - 1]
87
- return (ch == UCS2_SPACE) ? nil : UCS2String.new(ch)
79
+ return (ch == UCS2_SPACE) ? nil : [ch]
88
80
  else
89
- return UCS2String.new(@grams[len - n, len].join)
81
+ return @grams[len - n, len]
90
82
  end
91
83
  end
92
84
 
93
85
  # Adds a single character to an NGram sequence.
94
- # @param character [String[2]] Two-byte Unicode codepoint.
86
+ # @param character [Fixnum] Two-byte Unicode codepoint.
95
87
  def add_char(character)
88
+ Guard.klass(character, Fixnum, __method__)
89
+ Guard.codepoint(character, __method__)
90
+
96
91
  character = NGram.normalize(character)
97
92
  lastchar = @grams[-1]
98
93
  if lastchar == UCS2_SPACE
@@ -1,15 +1,20 @@
1
1
  module Langusta
2
2
  module RegexHelper
3
- include Oniguruma
4
-
5
- def self._u16(string)
6
- string.unpack("U*").pack("n*")
3
+ if RUBY_VERSION < "1.9"
4
+ include Oniguruma
5
+
6
+ ROMAN_REGEX = ORegexp.new("^[a-z]$", :options => OPTION_IGNORECASE)
7
+ INCL_ROMAN_REGEX = ORegexp.new(".*[a-z].*", :options => OPTION_IGNORECASE)
8
+ URL_REGEX = ORegexp.new("https?://[-_.?&~;+=/#0-9a-z]+", :options => OPTION_IGNORECASE)
9
+ MAIL_REGEX = ORegexp.new("[-_.0-9a-z]+@[-_0-9a-z]+[-_.0-9a-z]+", :options => OPTION_IGNORECASE)
10
+ SPACE_REGEX = ORegexp.new(" +")
11
+ else
12
+ # /ui stands for UTF-8 case-insensitive regexp.
13
+ ROMAN_REGEX = /^[a-z]$/ui
14
+ INCL_ROMAN_REGEX = /.*[a-z].*/ui
15
+ URL_REGEX = Regexp.new("https?://[-_.?&~;+=/#a-z0-9]+")
16
+ MAIL_REGEX = /[-_.a-z0-9]+@[-_a-z0-9]+[-_.a-z0-9]+/ui
17
+ SPACE_REGEX = / +/
7
18
  end
8
-
9
- ROMAN_REGEX = ORegexp.new(_u16("^[A-Za-z]$"), "", "UTF16_BE", "java")
10
- INCL_ROMAN_REGEX = ORegexp.new(_u16(".*[A-Za-z].*"), "", "UTF16_BE", "java")
11
- URL_REGEX = ORegexp.new(_u16("https?://[-_.?&~;+=/#0-9A-Za-z]+"), "", "UTF16_BE", "java")
12
- MAIL_REGEX = ORegexp.new(_u16("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+"), "", "UTF_16BE", "java")
13
- SPACE_REGEX = ORegexp.new(_u16(" +"), "", "UTF16_BE", "java")
14
19
  end
15
20
  end
@@ -7,26 +7,26 @@ module Langusta
7
7
  @target = tag
8
8
  @threshold = threshold
9
9
  @count = 0
10
- @buffer = UCS2String.new("")
10
+ @buffer = []
11
11
  @tag = nil
12
12
  end
13
13
 
14
14
  def add(line)
15
15
  if @target == @tag && line
16
- @buffer << line
16
+ @buffer += line
17
17
  end
18
18
  end
19
19
 
20
20
  def clear
21
21
  @tag = nil
22
- @buffer = UCS2String.new("")
22
+ @buffer = []
23
23
  end
24
24
 
25
25
  def close_tag(profile)
26
26
  if profile && @tag == @target && @buffer.length > @threshold
27
27
  gram = NGram.new
28
- @buffer.each_char do |char|
29
- gram.add_char(char)
28
+ @buffer.each do |codepoint|
29
+ gram.add_char(codepoint)
30
30
  (1..NGram::N_GRAM).each do |n|
31
31
  profile.add(gram.get(n))
32
32
  end
@@ -2,44 +2,44 @@ module Langusta
2
2
  module UnicodeBlock
3
3
  # Half-baked implementation of Java's UnicodeBlock.
4
4
 
5
- OTHER = 0
6
- BASIC_LATIN = 1
7
- LATIN_1_SUPPLEMENT = 2
8
- GENERAL_PUNCTUATION = 3
9
- ARABIC = 4
10
- LATIN_EXTENDED_ADDITIONAL = 5
11
- HIRAGANA = 6
12
- KATAKANA = 7
13
- BOPOMOFO = 8
14
- BOPOMOFO_EXTENDED = 9
15
- CJK_UNIFIED_IDEOGRAPHS = 10
16
- HANGUL_SYLLABES = 11
5
+ OTHER = 0
6
+ BASIC_LATIN = 1
7
+ LATIN_1_SUPPLEMENT = 2
8
+ GENERAL_PUNCTUATION = 3
9
+ ARABIC = 4
10
+ LATIN_EXTENDED_ADDITIONAL = 5
11
+ HIRAGANA = 6
12
+ KATAKANA = 7
13
+ BOPOMOFO = 8
14
+ BOPOMOFO_EXTENDED = 9
15
+ CJK_UNIFIED_IDEOGRAPHS = 10
16
+ HANGUL_SYLLABES = 11
17
17
 
18
- BASIC_LATIN_RANGE = "\x00\x00".."\x00\x7f"
19
- LATIN_1_SUPPLEMENT_RANGE = "\x00\x80".."\x00\xff"
20
- GENERAL_PUNCTUATION_RANGE = "\x20\x00".."\x20\x6f"
21
- ARABIC_RANGE = "\x06\x00".."\x06\xff"
22
- LATIN_EXTENDED_ADDITIONAL_RANGE = "\x1e\x00".."\x1e\xff"
23
- HIRAGANA_RANGE = "\x30\x40".."\x30\x9f"
24
- KATAKANA_RANGE = "\x30\xa0".."\x30\xff"
25
- BOPOMOFO_RANGE = "\x31\x00".."\x31\xbf"
26
- BOPOMOFO_EXTENDED_RANGE = "\x31\xa0".."\x31\xbf"
27
- CJK_UNIFIED_IDEOGRAPHS_RANGE = "\x4e\x00".."\x9f\xff"
28
- HANGUL_SYLLABES_RANGE = "\xac\x00".."\xd7\xaf"
18
+ BASIC_LATIN_RANGE = 0x0000..0x007f
19
+ LATIN_1_SUPPLEMENT_RANGE = 0x0080..0x00ff
20
+ GENERAL_PUNCTUATION_RANGE = 0x2000..0x206f
21
+ ARABIC_RANGE = 0x0600..0x06ff
22
+ LATIN_EXTENDED_ADDITIONAL_RANGE = 0x1e00..0x1eff
23
+ HIRAGANA_RANGE = 0x3040..0x309f
24
+ KATAKANA_RANGE = 0x30a0..0x30ff
25
+ BOPOMOFO_RANGE = 0x3100..0x31bf
26
+ BOPOMOFO_EXTENDED_RANGE = 0x31a0..0x31bf
27
+ CJK_UNIFIED_IDEOGRAPHS_RANGE = 0x4e00..0x9fff
28
+ HANGUL_SYLLABES_RANGE = 0xac00..0xd7af
29
29
 
30
30
  def self.of(character)
31
31
  case character
32
- when BASIC_LATIN_RANGE then return BASIC_LATIN
33
- when LATIN_1_SUPPLEMENT_RANGE then return LATIN_1_SUPPLEMENT
34
- when GENERAL_PUNCTUATION_RANGE then return GENERAL_PUNCTUATION
35
- when ARABIC_RANGE then return ARABIC
32
+ when BASIC_LATIN_RANGE then return BASIC_LATIN
33
+ when LATIN_1_SUPPLEMENT_RANGE then return LATIN_1_SUPPLEMENT
34
+ when GENERAL_PUNCTUATION_RANGE then return GENERAL_PUNCTUATION
35
+ when ARABIC_RANGE then return ARABIC
36
36
  when LATIN_EXTENDED_ADDITIONAL_RANGE then return LATIN_EXTENDED_ADDITIONAL
37
- when HIRAGANA_RANGE then return HIRAGANA
38
- when KATAKANA_RANGE then return KATAKANA
39
- when BOPOMOFO_RANGE then return BOPOMOFO
40
- when BOPOMOFO_EXTENDED_RANGE then return BOPOMOFO_EXTENDED
41
- when CJK_UNIFIED_IDEOGRAPHS_RANGE then return CJK_UNIFIED_IDEOGRAPHS
42
- when HANGUL_SYLLABES_RANGE then return HANGUL_SYLLABES
37
+ when HIRAGANA_RANGE then return HIRAGANA
38
+ when KATAKANA_RANGE then return KATAKANA
39
+ when BOPOMOFO_RANGE then return BOPOMOFO
40
+ when BOPOMOFO_EXTENDED_RANGE then return BOPOMOFO_EXTENDED
41
+ when CJK_UNIFIED_IDEOGRAPHS_RANGE then return CJK_UNIFIED_IDEOGRAPHS
42
+ when HANGUL_SYLLABES_RANGE then return HANGUL_SYLLABES
43
43
  else
44
44
  return OTHER
45
45
  end
@@ -50,7 +50,7 @@ module Langusta
50
50
  end
51
51
 
52
52
  def self.compute_upper_case_table
53
- File.open(UPPERCASE_BIN).read
53
+ File.open(UPPERCASE_BIN).read.unpack('n*')
54
54
  end
55
55
  end
56
56
  end
data/test/helper.rb CHANGED
@@ -1,20 +1,29 @@
1
1
  require 'rubygems'
2
2
  require 'bundler'
3
3
  begin
4
- Bundler.setup(:default, :development)
4
+ Bundler.setup(:default, :test)
5
5
  rescue Bundler::BundlerError => e
6
6
  $stderr.puts e.message
7
7
  $stderr.puts "Run `bundle install` to install missing gems"
8
8
  exit e.status_code
9
9
  end
10
10
  require 'test/unit'
11
+ require 'mocha'
11
12
 
12
13
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
13
14
  $LOAD_PATH.unshift(File.dirname(__FILE__))
14
15
  require 'langusta'
15
- require 'ruby-debug'
16
- require 'mocha'
17
16
 
18
17
  class Test::Unit::TestCase
19
18
  include Langusta
19
+
20
+ def str2cp(ascii_string)
21
+ Langusta.utf82cp(ascii_string)
22
+ end
23
+
24
+ def utf82cp(utf8_string)
25
+ Langusta.utf82cp(utf8_string)
26
+ end
20
27
  end
28
+
29
+ $debug = true