analects 0.2.0
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.rvmrc +1 -0
- data/.travis.yml +14 -0
- data/Gemfile +17 -0
- data/Gemfile.devtools +71 -0
- data/Gemfile.lock +236 -0
- data/LICENSE.txt +674 -0
- data/README.md +81 -0
- data/Rakefile +26 -0
- data/SOURCES.md +17 -0
- data/analects.gemspec +29 -0
- data/bin/wp_hsk_filter +36 -0
- data/config/devtools.yml +2 -0
- data/config/flay.yml +3 -0
- data/config/flog.yml +2 -0
- data/config/mutant.yml +3 -0
- data/config/reek.yml +103 -0
- data/config/rubocop.yml +58 -0
- data/config/yardstick.yml +2 -0
- data/data/.gitkeep +0 -0
- data/lib/analects.rb +37 -0
- data/lib/analects/cedict_loader.rb +44 -0
- data/lib/analects/chise_ids_loader.rb +34 -0
- data/lib/analects/cli/progress.rb +37 -0
- data/lib/analects/encoding.rb +61 -0
- data/lib/analects/library.rb +68 -0
- data/lib/analects/models/kangxi_radical.rb +14 -0
- data/lib/analects/models/zi.rb +64 -0
- data/lib/analects/rake_tasks.rb +49 -0
- data/lib/analects/source.rb +70 -0
- data/lib/analects/tokenizer.rb +54 -0
- data/lib/analects/version.rb +3 -0
- data/lib/cjk_string.rb +56 -0
- data/lib/generators/analects.rb +20 -0
- data/lib/generators/analects/cedict/cedict_generator.rb +22 -0
- data/lib/generators/analects/cedict/templates/create_cedict_table.rb +12 -0
- data/lib/generators/analects/cedict/templates/model.rb +3 -0
- data/lib/generators/analects/cedict/templates/populate_cedict_table.rb +41 -0
- data/spec/analects/cedict_loader_spec.rb +48 -0
- data/spec/analects/chise_ids_loader_spec.rb +50 -0
- data/spec/analects/library_spec.rb +50 -0
- data/spec/analects/source_spec.rb +18 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/test_data/chise_ids/IDS-foo.txt +10 -0
- metadata +221 -0
data/lib/analects/chise_ids_loader.rb
@@ -0,0 +1,34 @@
+module Analects
+  class ChiseIdsLoader
+    include Enumerable
+
+    attr_accessor :only_unicode
+
+    class MultiFile < Struct.new(:files)
+      def each_line(&blk)
+        return to_enum(__method__) unless block_given?
+        files.each do |file|
+          file.each_line(&blk)
+        end
+        self
+      end
+    end
+
+    def initialize(pathname, only_unicode = true)
+      @contents = MultiFile.new(pathname.children.select{|ch| ch.to_s =~ /IDS-.*\.txt/})
+      @only_unicode = only_unicode
+    end
+
+    def field_names
+      [:name, :representation, :ids]
+    end
+
+    def each(&blk)
+      return to_enum(__method__) unless block_given?
+      @entries ||= @contents.each_line
+        .reject {|line| line !~ /\t/ || (only_unicode && line !~ /^U/) }
+        .map {|line| line.strip.split("\t")[0..2] }
+      @entries.each(&blk)
+    end
+  end
+end
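
Each entry the loader yields is the first three tab-separated fields of an IDS-*.txt line, matching field_names (:name, :representation, :ids). A minimal usage sketch, assuming require 'analects' loads this class and that /path/to/ids is a hypothetical local clone of the CHISE IDS repository:

    require 'pathname'
    require 'analects'

    # ChiseIdsLoader includes Enumerable, so first/map/select all work.
    loader = Analects::ChiseIdsLoader.new(Pathname('/path/to/ids'))
    loader.first(3).each do |name, representation, ids|
      puts [name, representation, ids].join(' | ')
    end
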
data/lib/analects/cli/progress.rb
@@ -0,0 +1,37 @@
+module Analects
+  module CLI
+    # Command line progress bar
+    class Progress
+      attr_accessor :length, :count
+
+      def initialize(total, accuracy = 1000, prefix = '')
+        @total = total
+        @current = 0
+        @length = 60
+        @count = 100
+        @accuracy = accuracy
+        @prefix = prefix
+      end
+
+      def next
+        @current += 1
+        draw if (@current % (Float(@total)/@accuracy).ceil) == 0 || @current == @total
+      end
+
+      def draw
+        return unless
+        x = pos(@length).floor
+        total_count = @count == 100 ? '%' : "/#{@count}"
+        print "\e[%dD\e[32m%s[\e[31m%s%s\e[32m]\e[34m %d%s\e[0m" % [@length+10+@prefix.length, @prefix, '='*x, ' '*(@length-x), pos(@count), total_count]
+      end
+
+      def pos(scale)
+        if @current == @total
+          scale
+        else
+          Float(@current)/@total * scale
+        end
+      end
+    end
+  end
+end
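
The bar redraws in place with ANSI escape codes (cursor-back plus colours), so it expects a terminal; with the default accuracy of 1000 it redraws roughly a thousand times regardless of the total. An illustrative sketch, assuming the class is available after requiring the gem:

    require 'analects'

    # Hypothetical loop over 10_000 items; draw fires every
    # (total/accuracy).ceil increments, i.e. every 10 items here.
    progress = Analects::CLI::Progress.new(10_000, 1000, 'cedict ')
    10_000.times { progress.next }
    puts  # finish the line after the final draw
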
data/lib/analects/encoding.rb
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+module Analects
+  module Encoding
+    extend self
+
+    GB = ::Encoding::GB18030
+    BIG5 = ::Encoding::BIG5_UAO
+
+    def recode(enc, str)
+      str.force_encoding(enc).encode('UTF-8')
+    end
+
+    def from_gb(str)
+      recode(GB, str)
+    end
+
+    def from_big5(str)
+      recode(BIG5, str)
+    end
+
+    def valid_cjk(str)
+      [GB, BIG5].map do |enc|
+        begin
+          recode(enc, str)
+          enc
+        rescue ::Encoding::UndefinedConversionError
+        rescue ::Encoding::InvalidByteSequenceError
+        end
+      end.compact
+    end
+
+    # Crude way to guess which encoding it is
+    def ratings(str)
+      valid_cjk(str).map do |enc|
+        [
+          enc,
+          recode(enc, str).codepoints.map do |point|
+            Analects::Models::Zi.codepoint_ranges.map.with_index do |range, idx|
+              next 6-idx if range.include?(point)
+              0
+            end.inject(:+)
+          end.inject(:+)
+        ]
+      end.sort_by(&:last).reverse
+    end
+
+  end
+end
+
+# For info on Taiwanese Big5 variants + Ruby
+# * https://bugs.ruby-lang.org/issues/1784
+# * http://lists.gnu.org/archive/html/bug-gnu-libiconv/2010-11/msg00007.html
+
+# Wikipedia pages of GB (国家标准, "national standard") encodings (chronological?)
+# * http://en.wikipedia.org/wiki/GB_2312
+# * http://en.wikipedia.org/wiki/GBK
+# * http://en.wikipedia.org/wiki/GB18030
+
+# Ruby also knows about this one, but can't convert it to UTF-8
+# * http://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-TW
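
valid_cjk returns every candidate encoding that converts to UTF-8 without errors, and ratings orders those candidates by how many decoded codepoints land in the CJK ranges from Analects::Models::Zi. A small sketch; the byte values are "中文" encoded as GB18030, chosen because two-byte GB and Big5 sequences often overlap, which is exactly the ambiguity this module tries to resolve:

    require 'analects'

    bytes = "\xD6\xD0\xCE\xC4".b          # "中文" in GB18030/GBK
    Analects::Encoding.valid_cjk(bytes)   # => the encodings that decode cleanly
    Analects::Encoding.from_gb(bytes)     # => "中文"
    Analects::Encoding.ratings(bytes)     # => [[encoding, score], ...], best first
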
data/lib/analects/library.rb
@@ -0,0 +1,68 @@
+module Analects
+  CEDICT_URL = 'http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz'
+  CHISE_IDS_URL = 'http://git.chise.org/git/chise/ids.git'
+  UNIHAN_URL = ''
+  HSK_URL = ''
+
+  class Library
+    attr_reader :options
+
+    def initialize(options = {})
+      @options = options.freeze
+    end
+
+    def data_dir
+      if options[:data_dir]
+        Dir.mkdir(options[:data_dir]) unless File.exist?(options[:data_dir])
+        return options[:data_dir]
+      end
+      File.join(Dir.home, '.analects').tap do |dir|
+        unless File.exist? dir
+          Dir.mkdir dir
+        end
+      end
+    end
+
+    def cedict
+      @cedict ||= Source.new(
+        {
+          data_file: 'cedict_1_0_ts_utf-8_mdbg.txt',
+          retrieval: [ :http, :gunzip, :save ]
+        }.merge(options_for :cedict)
+      )
+    end
+
+    def chise_ids
+      @chise_ids ||= Source.new(
+        {
+          retrieval: :git
+        }.merge(options_for :chise_ids)
+      )
+    end
+
+    def unihan
+      @unihan ||= Source.new(
+        {
+          data_file: ''
+        }.merge(options_for :chise_ids)
+      )
+    end
+    # def hsk
+    #   @hsk ||= Source.new(
+
+    #   ).merge(options_for :hsk)
+    # end
+
+    private
+
+    def options_for(name)
+      {
+        name: name,
+        url: Analects.const_get("#{name.to_s.upcase}_URL"),
+        loader: Analects.const_get("#{Inflecto.camelize name}Loader"),
+        data_dir: data_dir
+      }.merge(options.fetch(name, {}))
+    end
+
+  end
+end
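
Library wires each data source together from the *_URL constants, the options_for defaults, and the matching *Loader class (resolved via Inflecto.camelize). A usage sketch, assuming a project-local data directory instead of the default ~/.analects:

    require 'analects'

    library = Analects::Library.new(data_dir: './data')
    library.cedict.retrieve      # http + gunzip + save, skipped if the file exists
    library.cedict.first(3)      # entries parsed by Analects::CedictLoader
    library.chise_ids.retrieve   # clones the CHISE IDS git repository
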
data/lib/analects/models/kangxi_radical.rb
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+module Analects
+  module Models
+    class KangxiRadical
+      # Maps a Kangxi radical codepoint (a Unicode Symbol from the Kangxi
+      # Radicals block) to the equivalent regular character (a Unicode Letter).
+      # The radical codepoints are not full characters and are normally only
+      # used when referring explicitly to a character's role as a radical.
+      def self.compat(radical)
+        {"⼀"=>"一", "⼁"=>"丨", "⼂"=>"丶", "⼃"=>"丿", "⼄"=>"乙", "⼅"=>"亅", "⼆"=>"二", "⼇"=>"亠", "⼈"=>"人", "⼉"=>"儿", "⼊"=>"入", "⼋"=>"八", "⼌"=>"冂", "⼍"=>"冖", "⼎"=>"冫", "⼏"=>"几", "⼐"=>"凵", "⼑"=>"刀", "⼒"=>"力", "⼓"=>"勹", "⼔"=>"匕", "⼕"=>"匚", "⼖"=>"匸", "⼗"=>"十", "⼘"=>"卜", "⼙"=>"卩", "⼚"=>"厂", "⼛"=>"厶", "⼜"=>"又", "⼝"=>"口", "⼞"=>"囗", "⼟"=>"土", "⼠"=>"士", "⼡"=>"夂", "⼢"=>"夊", "⼣"=>"夕", "⼤"=>"大", "⼥"=>"女", "⼦"=>"子", "⼧"=>"宀", "⼨"=>"寸", "⼩"=>"小", "⼪"=>"尢", "⼫"=>"尸", "⼬"=>"屮", "⼭"=>"山", "⼮"=>"巛", "⼯"=>"工", "⼰"=>"己", "⼱"=>"巾", "⼲"=>"干", "⼳"=>"幺", "⼴"=>"广", "⼵"=>"廴", "⼶"=>"廾", "⼷"=>"弋", "⼸"=>"弓", "⼹"=>"彐", "⼺"=>"彡", "⼻"=>"彳", "⼼"=>"心", "⼽"=>"戈", "⼾"=>"戶", "⼿"=>"手", "⽀"=>"支", "⽁"=>"攴", "⽂"=>"文", "⽃"=>"斗", "⽄"=>"斤", "⽅"=>"方", "⽆"=>"无", "⽇"=>"日", "⽈"=>"曰", "⽉"=>"月", "⽊"=>"木", "⽋"=>"欠", "⽌"=>"止", "⽍"=>"歹", "⽎"=>"殳", "⽏"=>"毋", "⽐"=>"比", "⽑"=>"毛", "⽒"=>"氏", "⽓"=>"气", "⽔"=>"水", "⽕"=>"火", "⽖"=>"爪", "⽗"=>"父", "⽘"=>"爻", "⽙"=>"爿", "⽚"=>"片", "⽛"=>"牙", "⽜"=>"牛", "⽝"=>"犬", "⽞"=>"玄", "⽟"=>"玉", "⽠"=>"瓜", "⽡"=>"瓦", "⽢"=>"甘", "⽣"=>"生", "⽤"=>"用", "⽥"=>"田", "⽦"=>"疋", "⽧"=>"疒", "⽨"=>"癶", "⽩"=>"白", "⽪"=>"皮", "⽫"=>"皿", "⽬"=>"目", "⽭"=>"矛", "⽮"=>"矢", "⽯"=>"石", "⽰"=>"示", "⽱"=>"禸", "⽲"=>"禾", "⽳"=>"穴", "⽴"=>"立", "⽵"=>"竹", "⽶"=>"米", "⽷"=>"糸", "⽸"=>"缶", "⽹"=>"网", "⽺"=>"羊", "⽻"=>"羽", "⽼"=>"老", "⽽"=>"而", "⽾"=>"耒", "⽿"=>"耳", "⾀"=>"聿", "⾁"=>"肉", "⾂"=>"臣", "⾃"=>"自", "⾄"=>"至", "⾅"=>"臼", "⾆"=>"舌", "⾇"=>"舛", "⾈"=>"舟", "⾉"=>"艮", "⾊"=>"色", "⾋"=>"艸", "⾌"=>"虍", "⾍"=>"虫", "⾎"=>"血", "⾏"=>"行", "⾐"=>"衣", "⾑"=>"襾", "⾒"=>"見", "⾓"=>"角", "⾔"=>"言", "⾕"=>"谷", "⾖"=>"豆", "⾗"=>"豕", "⾘"=>"豸", "⾙"=>"貝", "⾚"=>"赤", "⾛"=>"走", "⾜"=>"足", "⾝"=>"身", "⾞"=>"車", "⾟"=>"辛", "⾠"=>"辰", "⾡"=>"辵", "⾢"=>"邑", "⾣"=>"酉", "⾤"=>"釆", "⾥"=>"里", "⾦"=>"金", "⾧"=>"長", "⾨"=>"門", "⾩"=>"阜", "⾪"=>"隶", "⾫"=>"隹", "⾬"=>"雨", "⾭"=>"靑", "⾮"=>"非", "⾯"=>"面", "⾰"=>"革", "⾱"=>"韋", "⾲"=>"韭", "⾳"=>"音", "⾴"=>"頁", "⾵"=>"風", "⾶"=>"飛", "⾷"=>"食", "⾸"=>"首", "⾹"=>"香", "⾺"=>"馬", "⾻"=>"骨", "⾼"=>"高", "⾽"=>"髟", "⾾"=>"鬥", "⾿"=>"鬯", "⿀"=>"鬲", "⿁"=>"鬼", "⿂"=>"魚", "⿃"=>"鳥", "⿄"=>"鹵", "⿅"=>"鹿", "⿆"=>"麥", "⿇"=>"麻", "⿈"=>"黃", "⿉"=>"黍", "⿊"=>"黑", "⿋"=>"黹", "⿌"=>"黽", "⿍"=>"鼎", "⿎"=>"鼓", "⿏"=>"鼠", "⿐"=>"鼻", "⿑"=>"齊", "⿒"=>"齒", "⿓"=>"龍", "⿔"=>"龜", "⿕"=>"龠"}[radical]
+      end
+    end
+  end
+end
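
Codepoints from the Kangxi Radicals block render like ordinary ideographs but compare as different characters; compat folds them onto the regular ideograph. For example:

    require 'analects'

    Analects::Models::KangxiRadical.compat("⽔")  # => "水" (U+2F54 mapped to U+6C34)
    "⽔" == "水"                                   # => false, distinct codepoints
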
data/lib/analects/models/zi.rb
@@ -0,0 +1,64 @@
+module Analects
+  module Models
+    class Zi
+      RANGES = IceNine.deep_freeze(
+        unified:
+          { :name => "CJK Unified Ideographs",
+            :range => 0x4E00 .. 0x9FFF, # 0x9FC4..9FFF have no Unihan data
+            :sort_diff => -0x4E00
+          },
+        extension_A:
+          { :name => "CJK Unified Ideographs Extension A",
+            :range => 0x3400 .. 0x4DBF, # 0x4DB6..4DBF have no Unihan data
+            :sort_diff => 0x1E00
+          },
+        extension_B:
+          { :name => "CJK Unified Ideographs Extension B",
+            :range => 0x20000 .. 0x2A6DF, # 0x2A6D7..2A6DF have no Unihan data
+            :sort_diff => -0x19400
+          },
+        compatibility:
+          { :name => "CJK Compatibility Ideographs",
+            :range => 0xF900 .. 0xFAFF, # 0xFADA..FAFF; 0xFA2E..0xFA2F; 0xFA6B..0xFA6F have no Unihan data
+            :sort_diff => 0xFD00
+          },
+        supplement:
+          { :name => "CJK Compatibility Ideographs Supplement",
+            :range => 0x2F800 .. 0x2FA1F, # 0x2FA1E..0x2FA1F have no Unihan data
+            :sort_diff => -0x10000
+          },
+        radicals_supplement:
+          { name: "CJK Radicals supplement",
+            range: 0x2E80 .. 0x2EFF
+          },
+        kangxi_radicals:
+          { name: "Kangxi Radicals",
+            range: 0x2F00 .. 0x2FDF
+          }
+      )
+
+      def self.codepoint_ranges
+        RANGES.values.map{|v| v[:range]}
+      end
+
+      # Regexp that matches a single CJK character
+      REGEXP = Regexp.union(
+        codepoint_ranges.map { |range|
+          Regexp.new('[\u{%s}-\u{%s}]' % [ range.begin.to_s(16), range.end.to_s(16) ])
+        }
+      )
+
+      ANTIREGEXP = Regexp.new(
+        '[^'+
+        codepoint_ranges.map { |range| '\u{%s}-\u{%s}' % [ range.begin.to_s(16), range.end.to_s(16) ] }.join +
+        ']'
+      )
+
+      def self.each_radical(&block)
+        RANGES[:kangxi_radicals][:range].each do |codepoint|
+          block.([codepoint].pack('U'))
+        end
+      end
+    end
+  end
+end
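
The two regexps are built straight from codepoint_ranges, so REGEXP matches exactly one CJK character and ANTIREGEXP matches any character outside those ranges. A short sketch of filtering mixed text:

    require 'analects'

    text = "Confucius 孔子, 551-479 BCE"
    text.scan(Analects::Models::Zi::REGEXP)           # => ["孔", "子"]
    text.gsub(Analects::Models::Zi::ANTIREGEXP, '')   # => "孔子"
    Analects::Models::Zi.each_radical { |r| print r } # prints every codepoint in the Kangxi Radicals block
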
data/lib/analects/rake_tasks.rb
@@ -0,0 +1,49 @@
+require 'rake/tasklib'
+
+module Analects
+  class RakeTasks < Rake::TaskLib
+    def initialize(name = :analects, &blk)
+      @name = name
+      if block_given?
+        if blk.arity == 0
+          self.instance_eval(&blk)
+        else
+          yield self
+        end
+      end
+      define
+    end
+
+    def library
+      @library ||= Analects::Library.new(options)
+    end
+
+    def options
+      @options ||= {}
+    end
+
+    def data_dir(dir)
+      options[:data_dir] = dir
+    end
+
+    def define
+      namespace @name do
+        namespace :download do
+          desc 'download CC-CEDICT'
+          task :cedict do
+            library.cedict.retrieve!
+          end
+
+          desc 'download Chise-IDS'
+          task :chise_ids do
+            library.chise_ids.retrieve!
+          end
+
+          desc 'download all sources'
+          task :all => [:cedict, :chise_ids]
+        end
+      end
+
+    end
+  end
+end
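
In a Rakefile the task lib can be configured with a zero-arity block (instance_eval'd, as below) or with a block that takes the instance as an argument; either way it defines analects:download:cedict, analects:download:chise_ids and analects:download:all under the chosen namespace. A sketch, with the data directory as an example value:

    # Rakefile (sketch)
    require 'analects'
    require 'analects/rake_tasks'

    Analects::RakeTasks.new do
      data_dir 'data'
    end

    # $ rake analects:download:cedict
    # $ rake analects:download:all
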
data/lib/analects/source.rb
@@ -0,0 +1,70 @@
+module Analects
+  class Source
+    include Enumerable
+    attr_reader :options
+
+    def initialize(options = {})
+      @options = options
+    end
+
+    def name ; options[:name] ; end
+    def url ; options[:url] ; end
+    def retrieval ; Array(options[:retrieval]) ; end
+
+    def loader
+      @loader ||= options[:loader].new(Pathname(location))
+    end
+
+    def data_dir
+      options[:data_dir]
+    end
+
+    def location
+      options[:data_file] ? File.join( data_dir, options[:data_file] ) : File.join( data_dir, options[:name].to_s )
+    end
+
+    def data_file_present?
+      File.exist? location
+    end
+
+    def retrieve
+      retrieve! unless data_file_present?
+    end
+
+    def retrieve!
+      retrieval.inject( url ) do | result, method |
+        self.send( "retrieve_#{method}", result )
+      end
+    end
+
+    # url -> stream
+    def retrieve_http( url )
+      require 'open-uri'
+      open( url )
+    end
+
+    # gzipped stream -> uncompressed stream
+    def retrieve_gunzip( stream )
+      require 'zlib'
+      Zlib::GzipReader.new( stream )
+    end
+
+    # stream|string -> create data file
+    def retrieve_save( data )
+      File.open( location, 'w' ) do |f|
+        f << ( data.respond_to?(:read) ? data.read : data )
+      end
+    end
+
+    # url -> clones repo
+    def retrieve_git( url )
+      `git clone #{url} #{data_dir}/#{name}` # Admittedly crude
+    end
+
+    def each(&block)
+      return to_enum unless block_given?
+      loader.each(&block)
+    end
+
+  end
+end
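
retrieve! simply injects the URL through the :retrieval steps, each retrieve_* method taking the previous step's result, so [:http, :gunzip, :save] reads as download, uncompress, write to the data file. A standalone sketch with placeholder URL and paths:

    require 'analects'

    source = Analects::Source.new(
      name:      :example,
      url:       'http://example.com/wordlist.txt.gz',
      data_dir:  '/tmp',
      data_file: 'wordlist.txt',
      retrieval: [:http, :gunzip, :save]
    )
    source.retrieve   # fetches only if /tmp/wordlist.txt is missing
    # Enumerating a source additionally needs a :loader class, as Library sets up.
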
data/lib/analects/tokenizer.rb
@@ -0,0 +1,54 @@
+module Analects
+  class Tokenizer
+    #ALGO = RMMSeg::Algorithm
+    ALGO = RMMSeg::SimpleAlgorithm
+
+    def initialize(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic')
+      unless File.exist?(chars_dic) && File.exist?(words_dic)
+        create_dict_from_cedict( chars_dic, words_dic )
+      end
+      #RMMSeg::Dictionary.dictionaries = [[:chars, chars_dic], [:words, words_dic]]
+      RMMSeg::Config.dictionaries = [[chars_dic, true], [words_dic, false]]
+    end
+
+    def library
+      @library ||= Analects::Library.new
+    end
+
+    def cedict( fn = '/tmp/cedict.json' )
+      require 'json'
+      unless File.exist?( fn )
+        library.cedict.retrieve
+        File.write( fn, library.cedict.to_a.to_json )
+      end
+      @cedict ||= JSON.parse IO.read( fn )
+    end
+
+    def create_dict_from_cedict(chars_dic, words_dic)
+      words = Set.new
+      histo = Hash.new(0)
+
+      cedict.each do |c|
+        words << c[0]
+        words << c[1]
+        (c[0] + c[1]).chars.each do |c|
+          histo[c] += 1
+        end
+      end
+
+      File.write(words_dic, words.sort.join("\n"))
+      File.write(chars_dic, histo.map {|ch, cnt| "%s %d\n" % [ ch, cnt ]}.join )
+    end
+
+    def tokenize( str )
+      [].tap do |result|
+        ALGO.new( str ).tap do |alg|
+          until (tok = alg.next_token).nil?
+            result << tok.text.force_encoding('UTF-8')
+          end
+        end
+      end
+    end
+    alias call tokenize
+  end
+end
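
The tokenizer wraps RMMSeg and builds its character and word dictionaries from CC-CEDICT on first use, writing them to /tmp by default. A hedged sketch; whether require 'analects' pulls in the tokenizer and the rmmseg-cpp gem depends on the gem setup, so both requires are shown explicitly and the sample output is only indicative:

    require 'rmmseg'     # assumption: the rmmseg-cpp gem is installed
    require 'analects'

    tokenizer = Analects::Tokenizer.new   # may download CEDICT on first use
    tokens = tokenizer.('我是中国人')      # #call is an alias for #tokenize
    # tokens is an array of UTF-8 strings, e.g. ["我", "是", "中国", "人"]
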