analects 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/.gitignore +21 -0
  3. data/.rvmrc +1 -0
  4. data/.travis.yml +14 -0
  5. data/Gemfile +17 -0
  6. data/Gemfile.devtools +71 -0
  7. data/Gemfile.lock +236 -0
  8. data/LICENSE.txt +674 -0
  9. data/README.md +81 -0
  10. data/Rakefile +26 -0
  11. data/SOURCES.md +17 -0
  12. data/analects.gemspec +29 -0
  13. data/bin/wp_hsk_filter +36 -0
  14. data/config/devtools.yml +2 -0
  15. data/config/flay.yml +3 -0
  16. data/config/flog.yml +2 -0
  17. data/config/mutant.yml +3 -0
  18. data/config/reek.yml +103 -0
  19. data/config/rubocop.yml +58 -0
  20. data/config/yardstick.yml +2 -0
  21. data/data/.gitkeep +0 -0
  22. data/lib/analects.rb +37 -0
  23. data/lib/analects/cedict_loader.rb +44 -0
  24. data/lib/analects/chise_ids_loader.rb +34 -0
  25. data/lib/analects/cli/progress.rb +37 -0
  26. data/lib/analects/encoding.rb +61 -0
  27. data/lib/analects/library.rb +68 -0
  28. data/lib/analects/models/kangxi_radical.rb +14 -0
  29. data/lib/analects/models/zi.rb +64 -0
  30. data/lib/analects/rake_tasks.rb +49 -0
  31. data/lib/analects/source.rb +70 -0
  32. data/lib/analects/tokenizer.rb +54 -0
  33. data/lib/analects/version.rb +3 -0
  34. data/lib/cjk_string.rb +56 -0
  35. data/lib/generators/analects.rb +20 -0
  36. data/lib/generators/analects/cedict/cedict_generator.rb +22 -0
  37. data/lib/generators/analects/cedict/templates/create_cedict_table.rb +12 -0
  38. data/lib/generators/analects/cedict/templates/model.rb +3 -0
  39. data/lib/generators/analects/cedict/templates/populate_cedict_table.rb +41 -0
  40. data/spec/analects/cedict_loader_spec.rb +48 -0
  41. data/spec/analects/chise_ids_loader_spec.rb +50 -0
  42. data/spec/analects/library_spec.rb +50 -0
  43. data/spec/analects/source_spec.rb +18 -0
  44. data/spec/spec_helper.rb +19 -0
  45. data/spec/test_data/chise_ids/IDS-foo.txt +10 -0
  46. metadata +221 -0
data/lib/analects/chise_ids_loader.rb
@@ -0,0 +1,34 @@
+ module Analects
+   class ChiseIdsLoader
+     include Enumerable
+
+     attr_accessor :only_unicode
+
+     class MultiFile < Struct.new(:files)
+       def each_line(&blk)
+         return to_enum(__method__) unless block_given?
+         files.each do |file|
+           file.each_line(&blk)
+         end
+         self
+       end
+     end
+
+     def initialize(pathname, only_unicode = true)
+       @contents = MultiFile.new(pathname.children.select{|ch| ch.to_s =~ /IDS-.*\.txt/})
+       @only_unicode = only_unicode
+     end
+
+     def field_names
+       [:name, :representation, :ids]
+     end
+
+     def each(&blk)
+       return to_enum(__method__) unless block_given?
+       @entries ||= @contents.each_line
+         .reject {|line| line !~ /\t/ || (only_unicode && line !~ /^U/) }
+         .map {|line| line.strip.split("\t")[0..2] }
+       @entries.each(&blk)
+     end
+   end
+ end
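
A minimal usage sketch for the loader above, assuming a local clone of the CHISE IDS repository (the ~/.analects/chise_ids path is an assumption; any directory containing IDS-*.txt files works):

    require 'pathname'
    require 'analects'

    # Hypothetical location of a cloned CHISE IDS repository
    ids_dir = Pathname(File.join(Dir.home, '.analects', 'chise_ids'))

    loader = Analects::ChiseIdsLoader.new(ids_dir)
    # Each entry follows field_names: [:name, :representation, :ids]
    loader.first(3).each do |name, representation, ids|
      puts [name, representation, ids].join("\t")
    end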
data/lib/analects/cli/progress.rb
@@ -0,0 +1,37 @@
+ module Analects
+   module CLI
+     # Command line progress bar
+     class Progress
+       attr_accessor :length, :count
+
+       def initialize(total, accuracy = 1000, prefix = '')
+         @total = total
+         @current = 0
+         @length = 60
+         @count = 100
+         @accuracy = accuracy
+         @prefix = prefix
+       end
+
+       def next
+         @current += 1
+         draw if (@current % (Float(@total)/@accuracy).ceil) == 0 || @current == @total
+       end
+
+       def draw
+         return unless
+         x = pos(@length).floor
+         total_count = @count == 100 ? '%' : "/#{@count}"
+         print "\e[%dD\e[32m%s[\e[31m%s%s\e[32m]\e[34m %d%s\e[0m" % [@length+10+@prefix.length, @prefix, '='*x, ' '*(@length-x), pos(@count), total_count]
+       end
+
+       def pos(scale)
+         if @current == @total
+           scale
+         else
+           Float(@current)/@total * scale
+         end
+       end
+     end
+   end
+ end
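
A rough usage sketch for the progress bar (the item count and prefix are arbitrary assumptions; next redraws roughly every total/accuracy items and once more at the end):

    require 'analects'

    progress = Analects::CLI::Progress.new(500, 100, 'cedict ')
    500.times do
      # ... process one entry ...
      progress.next
    end
    puts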
data/lib/analects/encoding.rb
@@ -0,0 +1,61 @@
+ # -*- coding: utf-8 -*-
+
+ module Analects
+   module Encoding
+     extend self
+
+     GB   = ::Encoding::GB18030
+     BIG5 = ::Encoding::BIG5_UAO
+
+     def recode(enc, str)
+       str.force_encoding(enc).encode('UTF-8')
+     end
+
+     def from_gb(str)
+       recode(GB, str)
+     end
+
+     def from_big5(str)
+       recode(BIG5, str)
+     end
+
+     def valid_cjk(str)
+       [GB, BIG5].map do |enc|
+         begin
+           recode(enc, str)
+           enc
+         rescue ::Encoding::UndefinedConversionError
+         rescue ::Encoding::InvalidByteSequenceError
+         end
+       end.compact
+     end
+
+     # Crude way to guess which encoding it is
+     def ratings(str)
+       all_valid_cjk(str).map do |enc|
+         [
+           enc,
+           recode(enc, str).codepoints.map do |point|
+             Analects::Models::Zi.codepoint_ranges.map.with_index do |range, idx|
+               next 6-idx if range.include?(point)
+               0
+             end.inject(:+)
+           end.inject(:+)
+         ]
+       end.sort_by(&:last).reverse
+     end
+
+   end
+ end
+
+ # For info on Taiwanese Big5 variants + Ruby
+ # * https://bugs.ruby-lang.org/issues/1784
+ # * http://lists.gnu.org/archive/html/bug-gnu-libiconv/2010-11/msg00007.html
+
+ # Wikipedia pages of GB (国家标准) encodings (chronological?)
+ # * http://en.wikipedia.org/wiki/GB_2312
+ # * http://en.wikipedia.org/wiki/GBK
+ # * http://en.wikipedia.org/wiki/GB18030
+
+ # Ruby also knows about this one, but can't convert it to UTF-8
+ # * http://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-TW
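
A small sketch of converting a legacy-encoded byte string with this module (the example bytes are an assumption: they happen to be the GB2312/GB18030 encoding of 中文):

    require 'analects'

    raw = "\xD6\xD0\xCE\xC4".b          # GB18030 bytes for 中文
    Analects::Encoding.valid_cjk(raw)   # => candidate encodings (may include both GB18030 and Big5-UAO)
    Analects::Encoding.from_gb(raw)     # => "中文"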
data/lib/analects/library.rb
@@ -0,0 +1,68 @@
+ module Analects
+   CEDICT_URL    = 'http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz'
+   CHISE_IDS_URL = 'http://git.chise.org/git/chise/ids.git'
+   UNIHAN_URL    = ''
+   HSK_URL       = ''
+
+   class Library
+     attr_reader :options
+
+     def initialize(options = {})
+       @options = options.freeze
+     end
+
+     def data_dir
+       if options[:data_dir]
+         Dir.mkdir(options[:data_dir]) unless File.exist?(options[:data_dir])
+         return options[:data_dir]
+       end
+       File.join(Dir.home, '.analects').tap do |dir|
+         unless File.exist? dir
+           Dir.mkdir dir
+         end
+       end
+     end
+
+     def cedict
+       @cedict ||= Source.new(
+         {
+           data_file: 'cedict_1_0_ts_utf-8_mdbg.txt',
+           retrieval: [ :http, :gunzip, :save ]
+         }.merge(options_for :cedict)
+       )
+     end
+
+     def chise_ids
+       @chise_ids ||= Source.new(
+         {
+           retrieval: :git
+         }.merge(options_for :chise_ids)
+       )
+     end
+
+     def unihan
+       @unihan ||= Source.new(
+         {
+           data_file: ''
+         }.merge(options_for :chise_ids)
+       )
+     end
+     # def hsk
+     #   @hsk ||= Source.new(
+
+     #   ).merge(options_for :hsk)
+     # end
+
+     private
+
+     def options_for(name)
+       {
+         name: name,
+         url: Analects.const_get("#{name.to_s.upcase}_URL"),
+         loader: Analects.const_get("#{Inflecto.camelize name}Loader"),
+         data_dir: data_dir
+       }.merge(options.fetch(name, {}))
+     end
+
+   end
+ end
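
A minimal sketch of using the library (the data_dir override is an assumption; left out, data is stored under ~/.analects):

    require 'analects'

    library = Analects::Library.new(data_dir: '/tmp/analects-data')
    library.cedict.retrieve              # downloads and unpacks CC-CEDICT only if missing
    library.cedict.first(5).each { |entry| p entry }

Per-source options can also be nested under the source name, e.g. Analects::Library.new(cedict: { data_file: 'my_cedict.txt' }), since options_for merges options.fetch(name, {}) last.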
data/lib/analects/models/kangxi_radical.rb
@@ -0,0 +1,14 @@
+ # -*- coding: utf-8 -*-
+ module Analects
+   module Models
+     class KangxiRadical
+       # Mapping of Kangxi radical (Unicode Symbol) to the compatibility character that is
+       # composed of that radical (Unicode Letter). The former are not full characters
+       # and not normally used in text unless explicitly referring to only their
+       # usage as radicals.
+       def self.compat(radical)
+ {"⼀"=>"一", "⼁"=>"丨", "⼂"=>"丶", "⼃"=>"丿", "⼄"=>"乙", "⼅"=>"亅", "⼆"=>"二", "⼇"=>"亠", "⼈"=>"人", "⼉"=>"儿", "⼊"=>"入", "⼋"=>"八", "⼌"=>"冂", "⼍"=>"冖", "⼎"=>"冫", "⼏"=>"几", "⼐"=>"凵", "⼑"=>"刀", "⼒"=>"力", "⼓"=>"勹", "⼔"=>"匕", "⼕"=>"匚", "⼖"=>"匸", "⼗"=>"十", "⼘"=>"卜", "⼙"=>"卩", "⼚"=>"厂", "⼛"=>"厶", "⼜"=>"又", "⼝"=>"口", "⼞"=>"囗", "⼟"=>"土", "⼠"=>"士", "⼡"=>"夂", "⼢"=>"夊", "⼣"=>"夕", "⼤"=>"大", "⼥"=>"女", "⼦"=>"子", "⼧"=>"宀", "⼨"=>"寸", "⼩"=>"小", "⼪"=>"尢", "⼫"=>"尸", "⼬"=>"屮", "⼭"=>"山", "⼮"=>"巛", "⼯"=>"工", "⼰"=>"己", "⼱"=>"巾", "⼲"=>"干", "⼳"=>"幺", "⼴"=>"广", "⼵"=>"廴", "⼶"=>"廾", "⼷"=>"弋", "⼸"=>"弓", "⼹"=>"彐", "⼺"=>"彡", "⼻"=>"彳", "⼼"=>"心", "⼽"=>"戈", "⼾"=>"戶", "⼿"=>"手", "⽀"=>"支", "⽁"=>"攴", "⽂"=>"文", "⽃"=>"斗", "⽄"=>"斤", "⽅"=>"方", "⽆"=>"无", "⽇"=>"日", "⽈"=>"曰", "⽉"=>"月", "⽊"=>"木", "⽋"=>"欠", "⽌"=>"止", "⽍"=>"歹", "⽎"=>"殳", "⽏"=>"毋", "⽐"=>"比", "⽑"=>"毛", "⽒"=>"氏", "⽓"=>"气", "⽔"=>"水", "⽕"=>"火", "⽖"=>"爪", "⽗"=>"父", "⽘"=>"爻", "⽙"=>"爿", "⽚"=>"片", "⽛"=>"牙", "⽜"=>"牛", "⽝"=>"犬", "⽞"=>"玄", "⽟"=>"玉", "⽠"=>"瓜", "⽡"=>"瓦", "⽢"=>"甘", "⽣"=>"生", "⽤"=>"用", "⽥"=>"田", "⽦"=>"疋", "⽧"=>"疒", "⽨"=>"癶", "⽩"=>"白", "⽪"=>"皮", "⽫"=>"皿", "⽬"=>"目", "⽭"=>"矛", "⽮"=>"矢", "⽯"=>"石", "⽰"=>"示", "⽱"=>"禸", "⽲"=>"禾", "⽳"=>"穴", "⽴"=>"立", "⽵"=>"竹", "⽶"=>"米", "⽷"=>"糸", "⽸"=>"缶", "⽹"=>"网", "⽺"=>"羊", "⽻"=>"羽", "⽼"=>"老", "⽽"=>"而", "⽾"=>"耒", "⽿"=>"耳", "⾀"=>"聿", "⾁"=>"肉", "⾂"=>"臣", "⾃"=>"自", "⾄"=>"至", "⾅"=>"臼", "⾆"=>"舌", "⾇"=>"舛", "⾈"=>"舟", "⾉"=>"艮", "⾊"=>"色", "⾋"=>"艸", "⾌"=>"虍", "⾍"=>"虫", "⾎"=>"血", "⾏"=>"行", "⾐"=>"衣", "⾑"=>"襾", "⾒"=>"見", "⾓"=>"角", "⾔"=>"言", "⾕"=>"谷", "⾖"=>"豆", "⾗"=>"豕", "⾘"=>"豸", "⾙"=>"貝", "⾚"=>"赤", "⾛"=>"走", "⾜"=>"足", "⾝"=>"身", "⾞"=>"車", "⾟"=>"辛", "⾠"=>"辰", "⾡"=>"辵", "⾢"=>"邑", "⾣"=>"酉", "⾤"=>"釆", "⾥"=>"里", "⾦"=>"金", "⾧"=>"長", "⾨"=>"門", "⾩"=>"阜", "⾪"=>"隶", "⾫"=>"隹", "⾬"=>"雨", "⾭"=>"靑", "⾮"=>"非", "⾯"=>"面", "⾰"=>"革", "⾱"=>"韋", "⾲"=>"韭", "⾳"=>"音", "⾴"=>"頁", "⾵"=>"風", "⾶"=>"飛", "⾷"=>"食", "⾸"=>"首", "⾹"=>"香", "⾺"=>"馬", "⾻"=>"骨", "⾼"=>"高", "⾽"=>"髟", "⾾"=>"鬥", "⾿"=>"鬯", "⿀"=>"鬲", "⿁"=>"鬼", "⿂"=>"魚", "⿃"=>"鳥", "⿄"=>"鹵", "⿅"=>"鹿", "⿆"=>"麥", "⿇"=>"麻", "⿈"=>"黃", "⿉"=>"黍", "⿊"=>"黑", "⿋"=>"黹", "⿌"=>"黽", "⿍"=>"鼎", "⿎"=>"鼓", "⿏"=>"鼠", "⿐"=>"鼻", "⿑"=>"齊", "⿒"=>"齒", "⿓"=>"龍", "⿔"=>"龜", "⿕"=>"龠"}[radical]
+       end
+     end
+   end
+ end
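
A quick lookup sketch (the arguments are characters from the Kangxi Radicals block, U+2F00..U+2FDF, as iterated by Zi.each_radical below):

    require 'analects'

    Analects::Models::KangxiRadical.compat("⼀")   # => "一"  (U+2F00 -> U+4E00)
    Analects::Models::KangxiRadical.compat("⾦")   # => "金"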
data/lib/analects/models/zi.rb
@@ -0,0 +1,64 @@
+ module Analects
+   module Models
+     class Zi
+       RANGES = IceNine.deep_freeze(
+         unified:
+           { :name => "CJK Unified Ideographs",
+             :range => 0x4E00 .. 0x9FFF, # 0x9FC4..9FFF have no Unihan data
+             :sort_diff => -0x4E00
+           },
+         extension_A:
+           { :name => "CJK Unified Ideographs Extension A",
+             :range => 0x3400 .. 0x4DBF, # 0x4DB6..4DBF have no Unihan data
+             :sort_diff => 0x1E00
+           },
+         extension_B:
+           { :name => "CJK Unified Ideographs Extension B",
+             :range => 0x20000 .. 0x2A6DF, # 0x2A6D7..2A6DF have no Unihan data
+             :sort_diff => -0x19400
+           },
+         compatibility:
+           { :name => "CJK Compatibility Ideographs",
+             :range => 0xF900 .. 0xFAFF, # 0xFADA..FAFF; 0xFA2E..0xFA2F; 0xFA6B..0xFA6F have no Unihan data
+             :sort_diff => 0xFD00
+           },
+         supplement:
+           { :name => "CJK Compatibility Ideographs Supplement",
+             :range => 0x2F800 .. 0x2FA1F, # 0x2FA1E..0x2FA1F have no Unihan data
+             :sort_diff => -0x10000
+           },
+         radicals_supplement:
+           { name: "CJK Radicals supplement",
+             range: 0x2E80 .. 0x2EFF
+           },
+         kangxi_radicals:
+           { name: "Kangxi Radicals",
+             range: 0x2F00 .. 0x2FDF
+           }
+       )
+
+       def self.codepoint_ranges
+         RANGES.values.map{|v| v[:range]}
+       end
+
+       # Regexp that matches a single CJK character
+       REGEXP = Regexp.union(
+         codepoint_ranges.map { |range|
+           Regexp.new('[\u{%s}-\u{%s}]' % [ range.begin.to_s(16), range.end.to_s(16) ])
+         }
+       )
+
+       ANTIREGEXP = Regexp.new(
+         '[^'+
+         codepoint_ranges.map { |range| '\u{%s}-\u{%s}' % [ range.begin.to_s(16), range.end.to_s(16) ] }.join +
+         ']'
+       )
+
+       def self.each_radical(&block)
+         RANGES[:kangxi_radicals][:range].each do |codepoint|
+           block.([codepoint].pack('U'))
+         end
+       end
+     end
+   end
+ end
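
A small sketch using these constants to pull CJK characters out of mixed text (the sample string is arbitrary):

    require 'analects'

    text = "Analects 论语 0.2.0"
    text.scan(Analects::Models::Zi::REGEXP)         # => ["论", "语"]
    text.gsub(Analects::Models::Zi::ANTIREGEXP, '') # => "论语"

    # Iterates every codepoint in the Kangxi Radicals block (U+2F00..U+2FDF)
    Analects::Models::Zi.each_radical { |radical| print radical }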
data/lib/analects/rake_tasks.rb
@@ -0,0 +1,49 @@
+ require 'rake/tasklib'
+
+ module Analects
+   class RakeTasks < Rake::TaskLib
+     def initialize(name = :analects, &blk)
+       @name = name
+       if block_given?
+         if blk.arity == 0
+           self.instance_eval(&blk)
+         else
+           yield self
+         end
+       end
+       define
+     end
+
+     def library
+       @library ||= Analects::Library.new(options)
+     end
+
+     def options
+       @options ||= {}
+     end
+
+     def data_dir(dir)
+       options[:data_dir] = dir
+     end
+
+     def define
+       namespace @name do
+         namespace :download do
+           desc 'download CC-CEDICT'
+           task :cedict do
+             library.cedict.retrieve!
+           end
+
+           desc 'download Chise-IDS'
+           task :chise_ids do
+             library.chise_ids.retrieve!
+           end
+
+           desc 'download all sources'
+           task :all => [:cedict, :chise_ids]
+         end
+       end
+
+     end
+   end
+ end
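
A Rakefile sketch using the task library above (the require paths follow the file layout in this gem; the data directory is an assumption):

    # Rakefile
    require 'analects'
    require 'analects/rake_tasks'

    Analects::RakeTasks.new do
      data_dir '/tmp/analects-data'
    end

    # Defines:
    #   rake analects:download:cedict
    #   rake analects:download:chise_ids
    #   rake analects:download:all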
data/lib/analects/source.rb
@@ -0,0 +1,70 @@
+ module Analects
+   class Source
+     include Enumerable
+     attr_reader :options
+
+     def initialize(options = {})
+       @options = options
+     end
+
+     def name      ; options[:name]             ; end
+     def url       ; options[:url]              ; end
+     def retrieval ; Array(options[:retrieval]) ; end
+
+     def loader
+       @loader ||= options[:loader].new(Pathname(location))
+     end
+
+     def data_dir
+       options[:data_dir]
+     end
+
+     def location
+       options[:data_file] ? File.join( data_dir, options[:data_file] ) : File.join( data_dir, options[:name].to_s )
+     end
+
+     def data_file_present?
+       File.exist? location
+     end
+
+     def retrieve
+       retrieve! unless data_file_present?
+     end
+
+     def retrieve!
+       retrieval.inject( url ) do | result, method |
+         self.send( "retrieve_#{method}", result )
+       end
+     end
+
+     # url -> stream
+     def retrieve_http( url )
+       require 'open-uri'
+       open( url )
+     end
+
+     # gzipped stream -> uncompressed stream
+     def retrieve_gunzip( stream )
+       require 'zlib'
+       Zlib::GzipReader.new( stream )
+     end
+
+     # stream|string -> create data file
+     def retrieve_save( data )
+       File.open( location, 'w' ) do |f|
+         f << ( data.respond_to?(:read) ? data.read : data )
+       end
+     end
+
+     # url -> clones repo
+     def retrieve_git( url )
+       `git clone #{url} #{data_dir}/#{name}` # Admittedly crude
+     end
+
+     def each(&block)
+       return to_enum unless block_given?
+       loader.each(&block)
+     end
+
+   end
+ end
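
A sketch of wiring a Source by hand, mirroring what Library#cedict does (the loader class, URL and file name come from the files above; the data_dir is an assumption):

    require 'analects'

    source = Analects::Source.new(
      name:      :cedict,
      url:       Analects::CEDICT_URL,
      loader:    Analects::CedictLoader,
      data_dir:  '/tmp/analects-data',
      data_file: 'cedict_1_0_ts_utf-8_mdbg.txt',
      retrieval: [:http, :gunzip, :save]
    )

    source.retrieve                          # http -> gunzip -> save, skipped if the file exists
    source.first(3).each { |entry| p entry }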
data/lib/analects/tokenizer.rb
@@ -0,0 +1,54 @@
+ module Analects
+   class Tokenizer
+     #ALGO = RMMSeg::Algorithm
+     ALGO = RMMSeg::SimpleAlgorithm
+
+     def initialize(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic')
+       unless File.exist?(chars_dic) && File.exist?(words_dic)
+         create_dict_from_cedict( chars_dic, words_dic )
+       end
+       #RMMSeg::Dictionary.dictionaries = [[:chars, chars_dic], [:words, words_dic]]
+       RMMSeg::Config.dictionaries = [[chars_dic, true], [words_dic, false]]
+     end
+
+     def library
+       @library ||= Analects::Library.new
+     end
+
+     def cedict( fn = '/tmp/cedict.json' )
+       require 'json'
+       unless File.exist?( fn )
+         library.cedict.retrieve
+         File.write( fn, library.cedict.to_a.to_json )
+       end
+       @cedict ||= JSON.parse IO.read( fn )
+     end
+
+     def create_dict_from_cedict(chars_dic, words_dic)
+       words = Set.new
+       histo = Hash.new(0)
+
+       cedict.each do |c|
+         words << c[0]
+         words << c[1]
+         (c[0] + c[1]).chars.each do |c|
+           histo[c] += 1
+         end
+       end
+
+       File.write(words_dic, words.sort.join("\n"))
+       File.write(chars_dic, histo.map {|ch, cnt| "%s %d\n" % [ ch, cnt ]}.join )
+     end
+
+     def tokenize( str )
+       [].tap do |result|
+         ALGO.new( str ).tap do |alg|
+           until (tok = alg.next_token).nil?
+             result << tok.text.force_encoding('UTF-8')
+           end
+         end
+       end
+     end
+     alias call tokenize
+   end
+ end
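
A rough tokenization sketch (assumes a gem providing the RMMSeg module is installed and that CC-CEDICT can be fetched to build the dictionaries on first use; the sentence and the exact segmentation shown are assumptions):

    require 'rmmseg'     # whichever gem provides RMMSeg
    require 'analects'

    tokenizer = Analects::Tokenizer.new     # builds /tmp/chars.dic and /tmp/words.dic on first run
    tokenizer.call("我在学习中文")          # => e.g. ["我", "在", "学习", "中文"]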