analects 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +21 -0
  3. data/.rvmrc +1 -0
  4. data/.travis.yml +14 -0
  5. data/Gemfile +17 -0
  6. data/Gemfile.devtools +71 -0
  7. data/Gemfile.lock +236 -0
  8. data/LICENSE.txt +674 -0
  9. data/README.md +81 -0
  10. data/Rakefile +26 -0
  11. data/SOURCES.md +17 -0
  12. data/analects.gemspec +29 -0
  13. data/bin/wp_hsk_filter +36 -0
  14. data/config/devtools.yml +2 -0
  15. data/config/flay.yml +3 -0
  16. data/config/flog.yml +2 -0
  17. data/config/mutant.yml +3 -0
  18. data/config/reek.yml +103 -0
  19. data/config/rubocop.yml +58 -0
  20. data/config/yardstick.yml +2 -0
  21. data/data/.gitkeep +0 -0
  22. data/lib/analects.rb +37 -0
  23. data/lib/analects/cedict_loader.rb +44 -0
  24. data/lib/analects/chise_ids_loader.rb +34 -0
  25. data/lib/analects/cli/progress.rb +37 -0
  26. data/lib/analects/encoding.rb +61 -0
  27. data/lib/analects/library.rb +68 -0
  28. data/lib/analects/models/kangxi_radical.rb +14 -0
  29. data/lib/analects/models/zi.rb +64 -0
  30. data/lib/analects/rake_tasks.rb +49 -0
  31. data/lib/analects/source.rb +70 -0
  32. data/lib/analects/tokenizer.rb +54 -0
  33. data/lib/analects/version.rb +3 -0
  34. data/lib/cjk_string.rb +56 -0
  35. data/lib/generators/analects.rb +20 -0
  36. data/lib/generators/analects/cedict/cedict_generator.rb +22 -0
  37. data/lib/generators/analects/cedict/templates/create_cedict_table.rb +12 -0
  38. data/lib/generators/analects/cedict/templates/model.rb +3 -0
  39. data/lib/generators/analects/cedict/templates/populate_cedict_table.rb +41 -0
  40. data/spec/analects/cedict_loader_spec.rb +48 -0
  41. data/spec/analects/chise_ids_loader_spec.rb +50 -0
  42. data/spec/analects/library_spec.rb +50 -0
  43. data/spec/analects/source_spec.rb +18 -0
  44. data/spec/spec_helper.rb +19 -0
  45. data/spec/test_data/chise_ids/IDS-foo.txt +10 -0
  46. metadata +221 -0
@@ -0,0 +1,34 @@
1
module Analects
  # Loads character decomposition entries from a CHISE IDS checkout:
  # every IDS-*.txt file in the given directory is read as one stream
  # of tab-separated lines.
  class ChiseIdsLoader
    include Enumerable

    # When true (the default), only entries whose name starts with "U"
    # (Unicode codepoints) are yielded.
    attr_accessor :only_unicode

    # Presents a list of file-like objects as one continuous line stream.
    class MultiFile < Struct.new(:files)
      def each_line(&blk)
        return to_enum(__method__) unless block_given?
        files.each { |file| file.each_line(&blk) }
        self
      end
    end

    # pathname - Pathname of the directory containing IDS-*.txt files
    def initialize(pathname, only_unicode = true)
      ids_files = pathname.children.select { |child| child.to_s =~ /IDS-.*\.txt/ }
      @contents = MultiFile.new(ids_files)
      @only_unicode = only_unicode
    end

    def field_names
      [:name, :representation, :ids]
    end

    # Yields [name, representation, ids] triples; entries are parsed once
    # and memoized.
    def each(&blk)
      return to_enum(__method__) unless block_given?
      @entries ||= begin
        # Keep only data lines (they contain a tab); optionally restrict
        # to Unicode ("U...") entries.
        kept = @contents.each_line.select do |line|
          line =~ /\t/ && !(only_unicode && line !~ /^U/)
        end
        kept.map { |line| line.strip.split("\t")[0..2] }
      end
      @entries.each(&blk)
    end
  end
end
@@ -0,0 +1,37 @@
1
module Analects
  module CLI
    # Command line progress bar.
    #
    # Renders an ANSI-colored bar of +length+ characters, redrawn in place
    # by rewinding the cursor with escape codes.
    class Progress
      attr_accessor :length, :count

      # total    - total number of steps the task will take
      # accuracy - roughly how many redraws over the whole run
      # prefix   - text printed before the bar
      def initialize(total, accuracy = 1000, prefix = '')
        @total    = total
        @current  = 0
        @length   = 60    # bar width in characters
        @count    = 100   # scale of the numeric readout (100 => percent)
        @accuracy = accuracy
        @prefix   = prefix
      end

      # Advance one step; redraw every (total/accuracy) steps and at the end.
      def next
        @current += 1
        draw if (@current % (Float(@total) / @accuracy).ceil) == 0 || @current == @total
      end

      def draw
        # BUG FIX: the original began with `return unless` followed on the
        # next line by `x = pos(@length).floor` — a line continuation that
        # used the assignment (always truthy, Integer#floor) as the guard,
        # i.e. a dead guard obscuring a plain assignment. Keep the assignment.
        x = pos(@length).floor
        total_count = @count == 100 ? '%' : "/#{@count}"
        print "\e[%dD\e[32m%s[\e[31m%s%s\e[32m]\e[34m %d%s\e[0m" % [@length + 10 + @prefix.length, @prefix, '=' * x, ' ' * (@length - x), pos(@count), total_count]
      end

      # Current position scaled to +scale+; returns exactly +scale+ when done
      # so the bar always ends full despite float rounding.
      def pos(scale)
        if @current == @total
          scale
        else
          Float(@current) / @total * scale
        end
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
# -*- coding: utf-8 -*-

module Analects
  # Helpers for converting legacy CJK encodings (GB*, Big5) to UTF-8 and for
  # guessing which of the two a given byte string most likely is.
  module Encoding
    extend self

    # GB18030 is a superset of GBK/GB2312; BIG5_UAO covers common Big5 variants.
    GB = ::Encoding::GB18030
    BIG5 = ::Encoding::BIG5_UAO

    # Reinterpret +str+'s bytes as +enc+ and transcode to UTF-8.
    # NOTE(review): force_encoding mutates the receiver's encoding tag
    # (not its bytes) — callers share that side effect.
    def recode(enc, str)
      str.force_encoding(enc).encode('UTF-8')
    end

    def from_gb(str)
      recode(GB, str)
    end

    def from_big5(str)
      recode(BIG5, str)
    end

    # Subset of [GB, BIG5] under which +str+ converts cleanly to UTF-8.
    def valid_cjk(str)
      [GB, BIG5].map do |enc|
        begin
          recode(enc, str)
          enc
        rescue ::Encoding::UndefinedConversionError
        rescue ::Encoding::InvalidByteSequenceError
        end
      end.compact
    end

    # Crude way to guess which encoding it is: score each candidate by how
    # the decoded codepoints fall in the CJK blocks (earlier blocks weigh
    # more). Returns [[encoding, score], ...] sorted best-first.
    def ratings(str)
      # BUG FIX: originally called the undefined method `all_valid_cjk`,
      # which raised NoMethodError; the candidate list comes from valid_cjk.
      valid_cjk(str).map do |enc|
        [
          enc,
          recode(enc, str).codepoints.map do |point|
            Analects::Models::Zi.codepoint_ranges.map.with_index do |range, idx|
              next 6 - idx if range.include?(point)
              0
            end.inject(:+)
          end.inject(:+)
        ]
      end.sort_by(&:last).reverse
    end

  end
end
50
+
51
+ # For info on Taiwanese Big5 variants + Ruby
52
+ # * https://bugs.ruby-lang.org/issues/1784
53
+ # * http://lists.gnu.org/archive/html/bug-gnu-libiconv/2010-11/msg00007.html
54
+
55
+ # Wikipedia pages of GB (国家标准) encodings (chronological?)
56
+ # * http://en.wikipedia.org/wiki/GB_2312
57
+ # * http://en.wikipedia.org/wiki/GBK
58
+ # * http://en.wikipedia.org/wiki/GB18030
59
+
60
+ # Ruby also knows about this one, but can't convert it to UTF-8
61
+ # * http://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-TW
@@ -0,0 +1,68 @@
1
module Analects
  # Download URLs for the supported data sources. UNIHAN_URL and HSK_URL are
  # empty placeholders — those sources are not wired up yet.
  CEDICT_URL = 'http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz'
  CHISE_IDS_URL = 'http://git.chise.org/git/chise/ids.git'
  UNIHAN_URL = ''
  HSK_URL = ''

  # Central registry of data sources: builds Analects::Source instances,
  # merging per-source defaults with any user-supplied options.
  class Library
    attr_reader :options

    # options - Hash; :data_dir overrides the storage directory, and any
    #           source-name key (e.g. :cedict) is merged into that source's
    #           options. The hash is frozen to keep the registry immutable.
    def initialize(options = {})
      @options = options.freeze
    end

    # Directory where downloaded data lives; created on first access if
    # missing. Defaults to ~/.analects.
    def data_dir
      if options[:data_dir]
        Dir.mkdir(options[:data_dir]) unless File.exist?(options[:data_dir])
        return options[:data_dir]
      end
      File.join(Dir.home, '.analects').tap do |dir|
        unless File.exist? dir
          Dir.mkdir dir
        end
      end
    end

    # CC-CEDICT dictionary: downloaded over HTTP, gunzipped, saved locally.
    def cedict
      @cedict ||= Source.new(
        {
          data_file: 'cedict_1_0_ts_utf-8_mdbg.txt',
          retrieval: [ :http, :gunzip, :save ]
        }.merge(options_for :cedict)
      )
    end

    # CHISE IDS character-decomposition data: cloned via git.
    def chise_ids
      @chise_ids ||= Source.new(
        {
          retrieval: :git
        }.merge(options_for :chise_ids)
      )
    end

    def unihan
      # FIXME(review): `options_for :chise_ids` looks like a copy-paste slip —
      # presumably this should be `options_for :unihan`. Left unchanged because
      # no UnihanLoader class exists in this gem, so options_for(:unihan) would
      # raise NameError on the loader lookup; confirm intent before fixing.
      @unihan ||= Source.new(
        {
          data_file: ''
        }.merge(options_for :chise_ids)
      )
    end
    # def hsk
    #   @hsk ||= Source.new(

    #   ).merge(options_for :hsk)
    # end

    private

    # Default options shared by every source: its name, the matching *_URL
    # constant, the matching *Loader class (name derived via the Inflecto
    # gem), and the data directory — overridden by any user options stored
    # under the source's name.
    def options_for(name)
      {
        name: name,
        url: Analects.const_get("#{name.to_s.upcase}_URL"),
        loader: Analects.const_get("#{Inflecto.camelize name}Loader"),
        data_dir: data_dir
      }.merge(options.fetch(name, {}))
    end

  end
end
@@ -0,0 +1,14 @@
1
# -*- coding: utf-8 -*-
module Analects
  module Models
    class KangxiRadical
      # Mapping of Kangxi radical (Unicode Symbol) to the compatibility character that is
      # composed of that radical (Unicode Letter). The former are not full characters
      # and not normally used in text unless explicitly referring to only their
      # usage as radicals.
      #
      # PERF FIX: hoisted out of .compat into a frozen constant — the original
      # rebuilt this ~214-entry hash literal on every call.
      COMPAT = {"⼀"=>"一", "⼁"=>"丨", "⼂"=>"丶", "⼃"=>"丿", "⼄"=>"乙", "⼅"=>"亅", "⼆"=>"二", "⼇"=>"亠", "⼈"=>"人", "⼉"=>"儿", "⼊"=>"入", "⼋"=>"八", "⼌"=>"冂", "⼍"=>"冖", "⼎"=>"冫", "⼏"=>"几", "⼐"=>"凵", "⼑"=>"刀", "⼒"=>"力", "⼓"=>"勹", "⼔"=>"匕", "⼕"=>"匚", "⼖"=>"匸", "⼗"=>"十", "⼘"=>"卜", "⼙"=>"卩", "⼚"=>"厂", "⼛"=>"厶", "⼜"=>"又", "⼝"=>"口", "⼞"=>"囗", "⼟"=>"土", "⼠"=>"士", "⼡"=>"夂", "⼢"=>"夊", "⼣"=>"夕", "⼤"=>"大", "⼥"=>"女", "⼦"=>"子", "⼧"=>"宀", "⼨"=>"寸", "⼩"=>"小", "⼪"=>"尢", "⼫"=>"尸", "⼬"=>"屮", "⼭"=>"山", "⼮"=>"巛", "⼯"=>"工", "⼰"=>"己", "⼱"=>"巾", "⼲"=>"干", "⼳"=>"幺", "⼴"=>"广", "⼵"=>"廴", "⼶"=>"廾", "⼷"=>"弋", "⼸"=>"弓", "⼹"=>"彐", "⼺"=>"彡", "⼻"=>"彳", "⼼"=>"心", "⼽"=>"戈", "⼾"=>"戶", "⼿"=>"手", "⽀"=>"支", "⽁"=>"攴", "⽂"=>"文", "⽃"=>"斗", "⽄"=>"斤", "⽅"=>"方", "⽆"=>"无", "⽇"=>"日", "⽈"=>"曰", "⽉"=>"月", "⽊"=>"木", "⽋"=>"欠", "⽌"=>"止", "⽍"=>"歹", "⽎"=>"殳", "⽏"=>"毋", "⽐"=>"比", "⽑"=>"毛", "⽒"=>"氏", "⽓"=>"气", "⽔"=>"水", "⽕"=>"火", "⽖"=>"爪", "⽗"=>"父", "⽘"=>"爻", "⽙"=>"爿", "⽚"=>"片", "⽛"=>"牙", "⽜"=>"牛", "⽝"=>"犬", "⽞"=>"玄", "⽟"=>"玉", "⽠"=>"瓜", "⽡"=>"瓦", "⽢"=>"甘", "⽣"=>"生", "⽤"=>"用", "⽥"=>"田", "⽦"=>"疋", "⽧"=>"疒", "⽨"=>"癶", "⽩"=>"白", "⽪"=>"皮", "⽫"=>"皿", "⽬"=>"目", "⽭"=>"矛", "⽮"=>"矢", "⽯"=>"石", "⽰"=>"示", "⽱"=>"禸", "⽲"=>"禾", "⽳"=>"穴", "⽴"=>"立", "⽵"=>"竹", "⽶"=>"米", "⽷"=>"糸", "⽸"=>"缶", "⽹"=>"网", "⽺"=>"羊", "⽻"=>"羽", "⽼"=>"老", "⽽"=>"而", "⽾"=>"耒", "⽿"=>"耳", "⾀"=>"聿", "⾁"=>"肉", "⾂"=>"臣", "⾃"=>"自", "⾄"=>"至", "⾅"=>"臼", "⾆"=>"舌", "⾇"=>"舛", "⾈"=>"舟", "⾉"=>"艮", "⾊"=>"色", "⾋"=>"艸", "⾌"=>"虍", "⾍"=>"虫", "⾎"=>"血", "⾏"=>"行", "⾐"=>"衣", "⾑"=>"襾", "⾒"=>"見", "⾓"=>"角", "⾔"=>"言", "⾕"=>"谷", "⾖"=>"豆", "⾗"=>"豕", "⾘"=>"豸", "⾙"=>"貝", "⾚"=>"赤", "⾛"=>"走", "⾜"=>"足", "⾝"=>"身", "⾞"=>"車", "⾟"=>"辛", "⾠"=>"辰", "⾡"=>"辵", "⾢"=>"邑", "⾣"=>"酉", "⾤"=>"釆", "⾥"=>"里", "⾦"=>"金", "⾧"=>"長", "⾨"=>"門", "⾩"=>"阜", "⾪"=>"隶", "⾫"=>"隹", "⾬"=>"雨", "⾭"=>"靑", "⾮"=>"非", "⾯"=>"面", "⾰"=>"革", "⾱"=>"韋", "⾲"=>"韭", "⾳"=>"音", "⾴"=>"頁", "⾵"=>"風", "⾶"=>"飛", "⾷"=>"食", "⾸"=>"首", "⾹"=>"香", "⾺"=>"馬", "⾻"=>"骨", "⾼"=>"高", "⾽"=>"髟", "⾾"=>"鬥", "⾿"=>"鬯", "⿀"=>"鬲", "⿁"=>"鬼", "⿂"=>"魚", "⿃"=>"鳥", "⿄"=>"鹵", "⿅"=>"鹿", "⿆"=>"麥", "⿇"=>"麻", "⿈"=>"黃", "⿉"=>"黍", "⿊"=>"黑", "⿋"=>"黹", "⿌"=>"黽", "⿍"=>"鼎", "⿎"=>"鼓", "⿏"=>"鼠", "⿐"=>"鼻", "⿑"=>"齊", "⿒"=>"齒", "⿓"=>"龍", "⿔"=>"龜", "⿕"=>"龠"}.freeze

      # Returns the compatibility letter for +radical+, or nil when the
      # argument is not a Kangxi radical symbol.
      def self.compat(radical)
        COMPAT[radical]
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
module Analects
  module Models
    # Unicode code point ranges relevant to Han characters, plus regexes
    # that match (or exclude) a single CJK character.
    class Zi
      # Deep-frozen (via the ice_nine gem) map of block name => metadata.
      # :sort_diff offsets make codepoints from different blocks comparable.
      RANGES = IceNine.deep_freeze(
        unified: {
          name: "CJK Unified Ideographs",
          range: 0x4E00..0x9FFF, # Ox9FC4..9FFF have no Unihan data
          sort_diff: -0x4E00
        },
        extension_A: {
          name: "CJK Unified Ideographs Extension A",
          range: 0x3400..0x4DBF, # 0x4DB6..4DBF have no Unihan data
          sort_diff: 0x1E00
        },
        extension_B: {
          name: "CJK Unified Ideographs Extension B",
          range: 0x20000..0x2A6DF, # 0x2A6D7..2A6DF have no Unihan data
          sort_diff: -0x19400
        },
        compatibility: {
          name: "CJK Compatibility Ideographs",
          range: 0xF900..0xFAFF, # 0xFADA..FAFF; 0xFA2E..0xFA2F; 0xFA6B..0xFA6F have no Unihan data
          sort_diff: 0xFD00
        },
        supplement: {
          name: "CJK Compatibility Ideographs Supplement",
          range: 0x2F800..0x2FA1F, # 0x2FA1E..0x2FA1F have no Unihan data
          sort_diff: -0x10000
        },
        radicals_supplement: {
          name: "CJK Radicals supplement",
          range: 0x2E80..0x2EFF
        },
        kangxi_radicals: {
          name: "Kangxi Radicals",
          range: 0x2F00..0x2FDF
        }
      )

      # All block ranges, in declaration order.
      def self.codepoint_ranges
        RANGES.values.map { |entry| entry[:range] }
      end

      # Regexp that matches a single CJK character
      REGEXP = Regexp.union(
        codepoint_ranges.map do |range|
          Regexp.new(format('[\u{%s}-\u{%s}]', range.begin.to_s(16), range.end.to_s(16)))
        end
      )

      # Regexp that matches any single character which is NOT CJK.
      ANTIREGEXP = Regexp.new(
        format(
          '[^%s]',
          codepoint_ranges.map { |range| format('\u{%s}-\u{%s}', range.begin.to_s(16), range.end.to_s(16)) }.join
        )
      )

      # Yields every Kangxi radical as a one-character UTF-8 string.
      def self.each_radical(&block)
        RANGES[:kangxi_radicals][:range].each do |codepoint|
          block.call([codepoint].pack('U'))
        end
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
require 'rake/tasklib'

module Analects
  # Defines rake tasks for downloading the gem's data sources, e.g.:
  #
  #   Analects::RakeTasks.new do |tasks|
  #     tasks.data_dir 'data'
  #   end
  #
  # creates analects:download:{cedict,chise_ids,all}.
  class RakeTasks < Rake::TaskLib
    # name - top-level namespace for the tasks (default :analects)
    # blk  - optional configuration block; a zero-arity block is
    #        instance_eval'd (DSL style), otherwise self is yielded.
    def initialize(name = :analects, &blk)
      @name = name
      if block_given?
        blk.arity == 0 ? instance_eval(&blk) : yield(self)
      end
      define
    end

    # Library built lazily from the collected options.
    def library
      @library ||= Analects::Library.new(options)
    end

    def options
      @options ||= {}
    end

    # Configuration DSL: set the directory data files are stored in.
    def data_dir(dir)
      options[:data_dir] = dir
    end

    # Register the download tasks under the configured namespace.
    def define
      namespace @name do
        namespace :download do
          desc 'download CC-CEDICT'
          task(:cedict) { library.cedict.retrieve! }

          desc 'download Chise-IDS'
          task(:chise_ids) { library.chise_ids.retrieve! }

          desc 'download all sources'
          task all: [:cedict, :chise_ids]
        end
      end
    end

  end
end
@@ -0,0 +1,70 @@
1
module Analects
  # A single data source (CC-CEDICT, CHISE-IDS, ...): where to fetch it from,
  # where it lives on disk, and which loader parses it.
  #
  # options:
  #   :name      - Symbol identifying the source
  #   :url       - where to download it from
  #   :retrieval - chain of retrieve_* steps, e.g. [:http, :gunzip, :save]
  #   :loader    - class that parses the local data (instantiated with a Pathname)
  #   :data_dir  - directory the data file/checkout lives in
  #   :data_file - file name inside data_dir (defaults to the source name)
  class Source
    include Enumerable
    attr_reader :options

    def initialize(options = {})
      @options = options
    end

    def name      ; options[:name]             ; end
    def url       ; options[:url]              ; end
    def retrieval ; Array(options[:retrieval]) ; end

    def loader
      @loader ||= options[:loader].new(Pathname(location))
    end

    def data_dir
      options[:data_dir]
    end

    # Full local path of the data file (or checkout directory).
    def location
      options[:data_file] ? File.join( data_dir, options[:data_file] ) : File.join( data_dir, options[:name].to_s )
    end

    def data_file_present?
      File.exist? location
    end

    # Fetch only when the data is not already on disk.
    def retrieve
      retrieve! unless data_file_present?
    end

    # Run the retrieval pipeline: each step gets the previous step's
    # result, starting from the URL.
    def retrieve!
      retrieval.inject( url ) do | result, method |
        self.send( "retrieve_#{method}", result )
      end
    end

    # url -> stream
    def retrieve_http( url )
      require 'open-uri'
      # BUG FIX: bare Kernel#open no longer handles URLs on Ruby 3.0+, and a
      # string argument starting with "|" would spawn a subprocess (command
      # injection). URI.open is the supported, safer entry point.
      URI.open( url )
    end

    # gzipped stream -> uncompressed stream
    def retrieve_gunzip( stream )
      require 'zlib'
      Zlib::GzipReader.new( stream )
    end

    # stream|string -> create data file
    def retrieve_save( data )
      File.open( location, 'w' ) do |f|
        f << ( data.respond_to?(:read) ? data.read : data )
      end
    end

    # url -> clones repo
    # NOTE(review): url/name are interpolated into a shell command — command
    # injection if these values ever come from untrusted input.
    def retrieve_git( url )
      `git clone #{url} #{data_dir}/#{name}` # Admittedly crude
    end

    def each(&block)
      return to_enum unless block_given?
      loader.each(&block)
    end

  end
end
@@ -0,0 +1,54 @@
1
module Analects
  # Chinese word segmentation built on the rmmseg-cpp gem, with dictionaries
  # generated on demand from CC-CEDICT.
  class Tokenizer
    #ALGO = RMMSeg::Algorithm
    ALGO = RMMSeg::SimpleAlgorithm

    # Builds the rmmseg dictionaries from CC-CEDICT if they are not already
    # cached at the given paths, then points RMMSeg at them.
    def initialize(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic')
      unless File.exist?(chars_dic) && File.exist?(words_dic)
        create_dict_from_cedict( chars_dic, words_dic )
      end
      #RMMSeg::Dictionary.dictionaries = [[:chars, chars_dic], [:words, words_dic]]
      RMMSeg::Config.dictionaries = [[chars_dic, true], [words_dic, false]]
    end

    def library
      @library ||= Analects::Library.new
    end

    # CC-CEDICT entries as arrays, cached as JSON at +fn+ to avoid
    # re-downloading/re-parsing on every run.
    def cedict( fn = '/tmp/cedict.json' )
      require 'json'
      unless File.exist?( fn )
        library.cedict.retrieve
        File.write( fn, library.cedict.to_a.to_json )
      end
      @cedict ||= JSON.parse IO.read( fn )
    end

    # Derive the rmmseg word list and character-frequency dictionary
    # from CC-CEDICT and write them to the given paths.
    def create_dict_from_cedict(chars_dic, words_dic)
      require 'set' # ROBUSTNESS FIX: Set was used without a visible require
      words = Set.new
      histo = Hash.new(0)

      cedict.each do |entry|
        words << entry[0] # traditional form
        words << entry[1] # simplified form
        # BUG FIX: the inner block parameter was also named `c`, shadowing
        # the entry variable from the outer block; renamed for clarity.
        (entry[0] + entry[1]).chars.each do |char|
          histo[char] += 1
        end
      end

      File.write(words_dic, words.sort.join("\n"))
      File.write(chars_dic, histo.map {|ch, cnt| "%s %d\n" % [ ch, cnt ]}.join )
    end

    # Segment +str+ into an array of UTF-8 token strings.
    def tokenize( str )
      [].tap do |result|
        ALGO.new( str ).tap do |alg|
          until (tok = alg.next_token).nil?
            result << tok.text.force_encoding('UTF-8')
          end
        end
      end
    end
    alias call tokenize
  end
end