analects 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.rvmrc +1 -0
- data/.travis.yml +14 -0
- data/Gemfile +17 -0
- data/Gemfile.devtools +71 -0
- data/Gemfile.lock +236 -0
- data/LICENSE.txt +674 -0
- data/README.md +81 -0
- data/Rakefile +26 -0
- data/SOURCES.md +17 -0
- data/analects.gemspec +29 -0
- data/bin/wp_hsk_filter +36 -0
- data/config/devtools.yml +2 -0
- data/config/flay.yml +3 -0
- data/config/flog.yml +2 -0
- data/config/mutant.yml +3 -0
- data/config/reek.yml +103 -0
- data/config/rubocop.yml +58 -0
- data/config/yardstick.yml +2 -0
- data/data/.gitkeep +0 -0
- data/lib/analects.rb +37 -0
- data/lib/analects/cedict_loader.rb +44 -0
- data/lib/analects/chise_ids_loader.rb +34 -0
- data/lib/analects/cli/progress.rb +37 -0
- data/lib/analects/encoding.rb +61 -0
- data/lib/analects/library.rb +68 -0
- data/lib/analects/models/kangxi_radical.rb +14 -0
- data/lib/analects/models/zi.rb +64 -0
- data/lib/analects/rake_tasks.rb +49 -0
- data/lib/analects/source.rb +70 -0
- data/lib/analects/tokenizer.rb +54 -0
- data/lib/analects/version.rb +3 -0
- data/lib/cjk_string.rb +56 -0
- data/lib/generators/analects.rb +20 -0
- data/lib/generators/analects/cedict/cedict_generator.rb +22 -0
- data/lib/generators/analects/cedict/templates/create_cedict_table.rb +12 -0
- data/lib/generators/analects/cedict/templates/model.rb +3 -0
- data/lib/generators/analects/cedict/templates/populate_cedict_table.rb +41 -0
- data/spec/analects/cedict_loader_spec.rb +48 -0
- data/spec/analects/chise_ids_loader_spec.rb +50 -0
- data/spec/analects/library_spec.rb +50 -0
- data/spec/analects/source_spec.rb +18 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/test_data/chise_ids/IDS-foo.txt +10 -0
- metadata +221 -0
@@ -0,0 +1,34 @@
|
|
1
|
+
module Analects
  # Loads CHISE IDS data from the `IDS-*.txt` files found directly inside a
  # directory. Each yielded entry is an Array holding the first three
  # tab-separated columns of a data line (see #field_names).
  class ChiseIdsLoader
    include Enumerable

    # When truthy, only lines starting with "U" are yielded.
    attr_accessor :only_unicode

    # Presents several line-oriented files as one continuous stream of lines.
    MultiFile = Struct.new(:files) do
      # Yields every line of every file, in the order of +files+.
      #
      # @return [Enumerator] when called without a block
      # @return [MultiFile] self, when called with a block
      def each_line(&blk)
        return to_enum(__method__) unless block_given?

        files.each { |file| file.each_line(&blk) }
        self
      end
    end

    # @param pathname [Pathname] directory that contains the IDS-*.txt files
    # @param only_unicode [Boolean] initial value of #only_unicode
    def initialize(pathname, only_unicode = true)
      # Match on the file name only, anchored at both ends: matching the whole
      # path would also pick up backup files ("IDS-x.txt~") or every child of a
      # directory whose own path happens to contain "IDS-….txt".
      ids_files = pathname.children.select do |child|
        child.file? && child.basename.to_s =~ /\AIDS-.*\.txt\z/
      end
      @contents = MultiFile.new(ids_files)
      @only_unicode = only_unicode
    end

    # Names of the three columns contained in every yielded entry.
    def field_names
      [:name, :representation, :ids]
    end

    # Yields one Array of up to three column values per data line.
    #
    # @return [Enumerator] when called without a block
    # @return [Array<Array<String>>] all entries, when called with a block
    def each(&blk)
      return to_enum(__method__) unless block_given?

      rows.each(&blk)
    end

    private

    # Parsed entries, cached separately for each setting of #only_unicode so
    # that flipping the accessor after a first iteration takes effect.
    def rows
      unicode_only = only_unicode ? true : false
      @rows ||= {}
      @rows[unicode_only] ||= begin
        data_lines = @contents.each_line.select do |line|
          line.include?("\t") && (!unicode_only || line.start_with?('U'))
        end
        data_lines.map { |line| line.strip.split("\t")[0..2] }
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Analects
  module CLI
    # Command line progress bar, drawn in place with ANSI escape sequences.
    class Progress
      # length: width of the bar in characters.
      # count:  scale of the printed number; 100 prints a percentage, any
      #         other value prints "<n>/<count>".
      attr_accessor :length, :count

      # @param total [Integer] number of #next calls that make up 100%
      # @param accuracy [Integer] upper bound on the number of redraws
      # @param prefix [String] text printed in front of the bar
      def initialize(total, accuracy = 1000, prefix = '')
        @total = total
        @current = 0
        @length = 60
        @count = 100
        @accuracy = accuracy
        @prefix = prefix
      end

      # Advances by one step; redraws every `step` calls and on the last step.
      def next
        @current += 1
        draw if (@current % step).zero? || @current == @total
      end

      # Prints the bar for the current position to standard output.
      def draw
        filled = pos(@length).floor
        total_count = @count == 100 ? '%' : "/#{@count}"
        print "\e[%dD\e[32m%s[\e[31m%s%s\e[32m]\e[34m %d%s\e[0m" % [@length + 10 + @prefix.length, @prefix, '=' * filled, ' ' * (@length - filled), pos(@count), total_count]
      end

      # Current progress projected onto 0..scale.
      #
      # Returns +scale+ itself once the total has been reached. It is also
      # returned for an overshoot or a non-positive total, so that #draw never
      # asks for a negative number of padding characters or divides by zero.
      def pos(scale)
        return scale if @total <= 0 || @current >= @total

        Float(@current) / @total * scale
      end

      private

      # Number of #next calls between two redraws; never less than 1 so the
      # modulo in #next cannot divide by zero.
      def step
        return 1 if @total <= 0 || @accuracy <= 0

        [(Float(@total) / @accuracy).ceil, 1].max
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module Analects
  # Helpers for turning GB18030 / Big5-UAO encoded byte strings into UTF-8 and
  # for guessing which of the two a given byte string is encoded in.
  module Encoding
    extend self

    GB   = ::Encoding::GB18030
    BIG5 = ::Encoding::BIG5_UAO

    # Interprets the bytes of +str+ as +enc+ and transcodes them to UTF-8.
    # Works on a copy, so the caller's string keeps its original encoding tag
    # (and frozen strings are accepted).
    #
    # @raise [::Encoding::InvalidByteSequenceError, ::Encoding::UndefinedConversionError]
    #   when the bytes cannot be converted from +enc+
    def recode(enc, str)
      str.dup.force_encoding(enc).encode('UTF-8')
    end

    # Bytes read as GB18030, converted to UTF-8.
    def from_gb(str)
      recode(GB, str)
    end

    # Bytes read as Big5-UAO, converted to UTF-8.
    def from_big5(str)
      recode(BIG5, str)
    end

    # @return [Array<::Encoding>] those of GB and BIG5 under which +str+
    #   converts to UTF-8 without an error
    def valid_cjk(str)
      [GB, BIG5].select do |enc|
        begin
          recode(enc, str)
          true
        rescue ::Encoding::UndefinedConversionError, ::Encoding::InvalidByteSequenceError
          false
        end
      end
    end

    # Crude way to guess which encoding it is.
    #
    # Every candidate encoding from #valid_cjk is scored by decoding +str+ and
    # adding, per code point, `6 - index` for each entry of
    # Analects::Models::Zi.codepoint_ranges that contains it, so earlier
    # ranges weigh more.
    #
    # @return [Array<Array(::Encoding, Integer)>] pairs sorted by descending score
    def ratings(str)
      ranges = Analects::Models::Zi.codepoint_ranges
      scored = valid_cjk(str).map do |enc|
        # Seed the folds with 0 so an empty string scores 0 instead of nil.
        score = recode(enc, str).codepoints.inject(0) do |total, point|
          total + ranges.each_with_index.inject(0) do |sum, (range, idx)|
            range.include?(point) ? sum + (6 - idx) : sum
          end
        end
        [enc, score]
      end
      scored.sort_by(&:last).reverse
    end
  end
end
|
50
|
+
|
51
|
+
# For info on Taiwanese Big5 variants + Ruby
|
52
|
+
# * https://bugs.ruby-lang.org/issues/1784
|
53
|
+
# * http://lists.gnu.org/archive/html/bug-gnu-libiconv/2010-11/msg00007.html
|
54
|
+
|
55
|
+
# Wikipedia pages of GB (国家标准) encodings (chronological?)
|
56
|
+
# * http://en.wikipedia.org/wiki/GB_2312
|
57
|
+
# * http://en.wikipedia.org/wiki/GBK
|
58
|
+
# * http://en.wikipedia.org/wiki/GB18030
|
59
|
+
|
60
|
+
# Ruby also knows about this one, but can't convert it to UTF-8
|
61
|
+
# * http://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-TW
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Analects
  CEDICT_URL    = 'http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz'
  CHISE_IDS_URL = 'http://git.chise.org/git/chise/ids.git'
  UNIHAN_URL    = ''
  HSK_URL       = ''

  # Entry point to the data sources: builds (and memoises) one Source per
  # data set, filling in name, URL, loader class and data directory.
  class Library
    # Frozen copy of the options passed to #initialize.
    attr_reader :options

    # @param options [Hash] may contain :data_dir plus per-source override
    #   hashes keyed by source name (e.g. :cedict)
    def initialize(options = {})
      # Freeze a copy: freezing the argument itself would make the caller's
      # hash read-only behind their back.
      @options = options.dup.freeze
    end

    # Directory in which source data is stored: options[:data_dir] when set,
    # otherwise ~/.analects. The directory is created when it does not exist.
    #
    # @return [String]
    def data_dir
      dir = options[:data_dir] || File.join(Dir.home, '.analects')
      Dir.mkdir(dir) unless File.exist?(dir)
      dir
    end

    # Source for the CEDICT dictionary: fetched over HTTP, gunzipped, saved.
    def cedict
      @cedict ||= Source.new(
        {
          data_file: 'cedict_1_0_ts_utf-8_mdbg.txt',
          retrieval: [:http, :gunzip, :save]
        }.merge(options_for(:cedict))
      )
    end

    # Source for the CHISE IDS repository, retrieved with git.
    def chise_ids
      @chise_ids ||= Source.new(
        {
          retrieval: :git
        }.merge(options_for(:chise_ids))
      )
    end

    # FIXME: this passes :chise_ids to options_for, so the resulting Source
    # carries the CHISE name, URL and loader. It looks like a copy-paste slip
    # for :unihan, but UNIHAN_URL is empty and no UnihanLoader is visible, so
    # the behaviour is left untouched until the Unihan source is implemented.
    def unihan
      @unihan ||= Source.new(
        {
          data_file: ''
        }.merge(options_for(:chise_ids))
      )
    end

    # def hsk
    #   @hsk ||= Source.new(
    #
    #   ).merge(options_for :hsk)
    # end

    private

    # Default options for the source called +name+, resolved by naming
    # convention (<NAME>_URL constant, <Name>Loader class) and overridden by
    # options[name] when present.
    def options_for(name)
      {
        name: name,
        url: Analects.const_get("#{name.to_s.upcase}_URL"),
        loader: Analects.const_get("#{Inflecto.camelize name}Loader"),
        data_dir: data_dir
      }.merge(options.fetch(name, {}))
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Analects
  module Models
    class KangxiRadical
      # Mapping of Kangxi radical (Unicode Symbol) to the compatibility character that is
      # composed of that radical (Unicode Letter). The former are not full characters
      # and not normally used in text unless explicitly referring to only their
      # usage as radicals.
      #
      # Built once at load time instead of on every lookup.
      COMPAT = {
        "⼀"=>"一", "⼁"=>"丨", "⼂"=>"丶", "⼃"=>"丿", "⼄"=>"乙", "⼅"=>"亅", "⼆"=>"二", "⼇"=>"亠", "⼈"=>"人", "⼉"=>"儿",
        "⼊"=>"入", "⼋"=>"八", "⼌"=>"冂", "⼍"=>"冖", "⼎"=>"冫", "⼏"=>"几", "⼐"=>"凵", "⼑"=>"刀", "⼒"=>"力", "⼓"=>"勹",
        "⼔"=>"匕", "⼕"=>"匚", "⼖"=>"匸", "⼗"=>"十", "⼘"=>"卜", "⼙"=>"卩", "⼚"=>"厂", "⼛"=>"厶", "⼜"=>"又", "⼝"=>"口",
        "⼞"=>"囗", "⼟"=>"土", "⼠"=>"士", "⼡"=>"夂", "⼢"=>"夊", "⼣"=>"夕", "⼤"=>"大", "⼥"=>"女", "⼦"=>"子", "⼧"=>"宀",
        "⼨"=>"寸", "⼩"=>"小", "⼪"=>"尢", "⼫"=>"尸", "⼬"=>"屮", "⼭"=>"山", "⼮"=>"巛", "⼯"=>"工", "⼰"=>"己", "⼱"=>"巾",
        "⼲"=>"干", "⼳"=>"幺", "⼴"=>"广", "⼵"=>"廴", "⼶"=>"廾", "⼷"=>"弋", "⼸"=>"弓", "⼹"=>"彐", "⼺"=>"彡", "⼻"=>"彳",
        "⼼"=>"心", "⼽"=>"戈", "⼾"=>"戶", "⼿"=>"手", "⽀"=>"支", "⽁"=>"攴", "⽂"=>"文", "⽃"=>"斗", "⽄"=>"斤", "⽅"=>"方",
        "⽆"=>"无", "⽇"=>"日", "⽈"=>"曰", "⽉"=>"月", "⽊"=>"木", "⽋"=>"欠", "⽌"=>"止", "⽍"=>"歹", "⽎"=>"殳", "⽏"=>"毋",
        "⽐"=>"比", "⽑"=>"毛", "⽒"=>"氏", "⽓"=>"气", "⽔"=>"水", "⽕"=>"火", "⽖"=>"爪", "⽗"=>"父", "⽘"=>"爻", "⽙"=>"爿",
        "⽚"=>"片", "⽛"=>"牙", "⽜"=>"牛", "⽝"=>"犬", "⽞"=>"玄", "⽟"=>"玉", "⽠"=>"瓜", "⽡"=>"瓦", "⽢"=>"甘", "⽣"=>"生",
        "⽤"=>"用", "⽥"=>"田", "⽦"=>"疋", "⽧"=>"疒", "⽨"=>"癶", "⽩"=>"白", "⽪"=>"皮", "⽫"=>"皿", "⽬"=>"目", "⽭"=>"矛",
        "⽮"=>"矢", "⽯"=>"石", "⽰"=>"示", "⽱"=>"禸", "⽲"=>"禾", "⽳"=>"穴", "⽴"=>"立", "⽵"=>"竹", "⽶"=>"米", "⽷"=>"糸",
        "⽸"=>"缶", "⽹"=>"网", "⽺"=>"羊", "⽻"=>"羽", "⽼"=>"老", "⽽"=>"而", "⽾"=>"耒", "⽿"=>"耳", "⾀"=>"聿", "⾁"=>"肉",
        "⾂"=>"臣", "⾃"=>"自", "⾄"=>"至", "⾅"=>"臼", "⾆"=>"舌", "⾇"=>"舛", "⾈"=>"舟", "⾉"=>"艮", "⾊"=>"色", "⾋"=>"艸",
        "⾌"=>"虍", "⾍"=>"虫", "⾎"=>"血", "⾏"=>"行", "⾐"=>"衣", "⾑"=>"襾", "⾒"=>"見", "⾓"=>"角", "⾔"=>"言", "⾕"=>"谷",
        "⾖"=>"豆", "⾗"=>"豕", "⾘"=>"豸", "⾙"=>"貝", "⾚"=>"赤", "⾛"=>"走", "⾜"=>"足", "⾝"=>"身", "⾞"=>"車", "⾟"=>"辛",
        "⾠"=>"辰", "⾡"=>"辵", "⾢"=>"邑", "⾣"=>"酉", "⾤"=>"釆", "⾥"=>"里", "⾦"=>"金", "⾧"=>"長", "⾨"=>"門", "⾩"=>"阜",
        "⾪"=>"隶", "⾫"=>"隹", "⾬"=>"雨", "⾭"=>"靑", "⾮"=>"非", "⾯"=>"面", "⾰"=>"革", "⾱"=>"韋", "⾲"=>"韭", "⾳"=>"音",
        "⾴"=>"頁", "⾵"=>"風", "⾶"=>"飛", "⾷"=>"食", "⾸"=>"首", "⾹"=>"香", "⾺"=>"馬", "⾻"=>"骨", "⾼"=>"高", "⾽"=>"髟",
        "⾾"=>"鬥", "⾿"=>"鬯", "⿀"=>"鬲", "⿁"=>"鬼", "⿂"=>"魚", "⿃"=>"鳥", "⿄"=>"鹵", "⿅"=>"鹿", "⿆"=>"麥",
        "⿇"=>"麻", "⿈"=>"黃", "⿉"=>"黍", "⿊"=>"黑", "⿋"=>"黹", "⿌"=>"黽", "⿍"=>"鼎", "⿎"=>"鼓", "⿏"=>"鼠", "⿐"=>"鼻",
        "⿑"=>"齊", "⿒"=>"齒", "⿓"=>"龍", "⿔"=>"龜", "⿕"=>"龠"
      }.freeze

      # Looks up the compatibility character for a Kangxi radical.
      #
      # @param radical [String] a single Kangxi radical character
      # @return [String, nil] a fresh copy of the mapped character, or nil
      #   when +radical+ is not in the table
      def self.compat(radical)
        match = COMPAT[radical]
        # Hand out a copy so callers that mutate the result cannot corrupt
        # the shared table.
        match && match.dup
      end
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module Analects
  module Models
    # Code point ranges and regular expressions describing CJK characters.
    class Zi
      # Unicode blocks treated as CJK, keyed by a short identifier. Every
      # entry carries a :name and a :range; the ideograph blocks also carry a
      # :sort_diff offset.
      RANGES = IceNine.deep_freeze(
        unified: {
          name: "CJK Unified Ideographs",
          range: 0x4E00..0x9FFF, # 0x9FC4..9FFF have no Unihan data
          sort_diff: -0x4E00
        },
        extension_A: {
          name: "CJK Unified Ideographs Extension A",
          range: 0x3400..0x4DBF, # 0x4DB6..4DBF have no Unihan data
          sort_diff: 0x1E00
        },
        extension_B: {
          name: "CJK Unified Ideographs Extension B",
          range: 0x20000..0x2A6DF, # 0x2A6D7..2A6DF have no Unihan data
          sort_diff: -0x19400
        },
        compatibility: {
          name: "CJK Compatibility Ideographs",
          range: 0xF900..0xFAFF, # 0xFADA..FAFF; 0xFA2E..0xFA2F; 0xFA6B..0xFA6F have no Unihan data
          sort_diff: 0xFD00
        },
        supplement: {
          name: "CJK Compatibility Ideographs Supplement",
          range: 0x2F800..0x2FA1F, # 0x2FA1E..0x2FA1F have no Unihan data
          sort_diff: -0x10000
        },
        radicals_supplement: {
          name: "CJK Radicals supplement",
          range: 0x2E80..0x2EFF
        },
        kangxi_radicals: {
          name: "Kangxi Radicals",
          range: 0x2F00..0x2FDF
        }
      )

      # @return [Array<Range>] the :range of every RANGES entry, in
      #   declaration order
      def self.codepoint_ranges
        RANGES.values.map { |block| block[:range] }
      end

      # Regexp that matches a single CJK character
      REGEXP = Regexp.union(
        codepoint_ranges.map do |range|
          Regexp.new('[\u{%s}-\u{%s}]' % [range.begin.to_s(16), range.end.to_s(16)])
        end
      )

      # Regexp that matches a single character outside all of the ranges
      ANTIREGEXP = Regexp.new(
        '[^' +
        codepoint_ranges.map { |range| '\u{%s}-\u{%s}' % [range.begin.to_s(16), range.end.to_s(16)] }.join +
        ']'
      )

      # Yields every code point of the Kangxi Radicals range as a
      # one-character String.
      #
      # @return [Enumerator] when called without a block (previously this
      #   raised NoMethodError on nil)
      # @return [Range] the iterated range, when called with a block
      def self.each_radical(&block)
        return to_enum(__method__) unless block_given?

        RANGES[:kangxi_radicals][:range].each do |codepoint|
          block.call([codepoint].pack('U'))
        end
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'rake/tasklib'
|
2
|
+
|
3
|
+
module Analects
  # Rake task library that defines `<name>:download:*` tasks for fetching the
  # data sources of an Analects::Library.
  #
  # The optional block configures the instance before the tasks are defined:
  # a block without parameters is instance_eval'ed, otherwise the instance is
  # yielded to it.
  class RakeTasks < Rake::TaskLib
    # @param name [Symbol] top-level namespace of the generated tasks
    def initialize(name = :analects, &blk)
      @name = name
      if blk
        blk.arity.zero? ? instance_eval(&blk) : yield(self)
      end
      define
    end

    # Library built lazily from the options collected so far.
    def library
      @library ||= Analects::Library.new(options)
    end

    # Options that will be handed to Analects::Library.
    def options
      @options ||= {}
    end

    # Configuration helper: sets the directory used for downloaded data.
    def data_dir(dir)
      options[:data_dir] = dir
    end

    # Registers the download tasks with Rake.
    def define
      namespace(@name) do
        namespace(:download) do
          desc('download CC-CEDICT')
          task(:cedict) { library.cedict.retrieve! }

          desc('download Chise-IDS')
          task(:chise_ids) { library.chise_ids.retrieve! }

          desc('download all sources')
          task(all: [:cedict, :chise_ids])
        end
      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module Analects
  # A single data source: knows where its data lives on disk, how to
  # retrieve it and which loader class reads it. Enumerating a Source
  # enumerates its loader.
  #
  # Recognised option keys (as read by the methods below): :name, :url,
  # :retrieval, :loader, :data_dir, :data_file.
  class Source
    include Enumerable
    attr_reader :options

    # @param options [Hash] see the class description for the keys in use
    def initialize(options = {})
      @options = options
    end

    def name
      options[:name]
    end

    def url
      options[:url]
    end

    # Retrieval steps as an Array, even when a single step was configured.
    def retrieval
      Array(options[:retrieval])
    end

    # Loader instance, created on first use with the Pathname of #location.
    def loader
      @loader ||= options[:loader].new(Pathname(location))
    end

    def data_dir
      options[:data_dir]
    end

    # Path of the data inside #data_dir: the configured :data_file, or else
    # an entry named after the source.
    def location
      File.join(data_dir, options[:data_file] || options[:name].to_s)
    end

    def data_file_present?
      File.exist?(location)
    end

    # Retrieves the data only when nothing exists at #location yet.
    def retrieve
      retrieve! unless data_file_present?
    end

    # Runs the retrieval pipeline: starting from #url, every step's
    # `retrieve_<step>` method receives the result of the previous one.
    #
    # @return the result of the last step
    def retrieve!
      retrieval.inject(url) do |result, step|
        send("retrieve_#{step}", result)
      end
    end

    # url -> stream
    def retrieve_http(url)
      require 'open-uri'
      # Kernel#open stopped handling URLs in Ruby 3.0 and would run a command
      # for strings starting with "|"; use URI.open wherever open-uri offers it.
      URI.respond_to?(:open) ? URI.open(url) : open(url)
    end

    # gzipped stream -> uncompressed stream
    def retrieve_gunzip(stream)
      require 'zlib'
      Zlib::GzipReader.new(stream)
    end

    # stream|string -> create data file
    def retrieve_save(data)
      File.open(location, 'w') do |file|
        file << (data.respond_to?(:read) ? data.read : data)
      end
    end

    # url -> clones repo
    #
    # The command is passed as an argument vector, so neither the URL nor the
    # target path is interpreted by a shell; "--" keeps a URL that starts
    # with "-" from being read as an option.
    #
    # @return [String] standard output of `git clone`
    def retrieve_git(url)
      IO.popen(['git', 'clone', '--', url.to_s, "#{data_dir}/#{name}"], &:read)
    end

    # Yields every entry produced by the loader.
    #
    # @return [Enumerator] when called without a block
    def each(&block)
      return to_enum unless block_given?

      loader.each(&block)
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module Analects
  # Splits text into tokens with RMMSeg, using a character dictionary and a
  # word dictionary that are generated from CEDICT when they do not exist.
  class Tokenizer
    #ALGO = RMMSeg::Algorithm
    ALGO = RMMSeg::SimpleAlgorithm

    # @param chars_dic [String] path of the character dictionary file
    # @param words_dic [String] path of the word dictionary file
    def initialize(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic')
      unless File.exist?(chars_dic) && File.exist?(words_dic)
        create_dict_from_cedict(chars_dic, words_dic)
      end
      #RMMSeg::Dictionary.dictionaries = [[:chars, chars_dic], [:words, words_dic]]
      RMMSeg::Config.dictionaries = [[chars_dic, true], [words_dic, false]]
    end

    def library
      @library ||= Analects::Library.new
    end

    # CEDICT entries, cached as JSON in +fn+. The cache file is written from
    # the library's cedict source when it does not exist yet.
    #
    # @return the parsed JSON content (memoised after the first call)
    def cedict(fn = '/tmp/cedict.json')
      require 'json'
      unless File.exist?(fn)
        library.cedict.retrieve
        File.write(fn, library.cedict.to_a.to_json)
      end
      @cedict ||= JSON.parse(File.read(fn))
    end

    # Writes both dictionary files from the CEDICT entries:
    # * words_dic: the sorted, de-duplicated first two fields of every entry,
    #   one per line;
    # * chars_dic: "<char> <count>" lines counting each character's
    #   occurrences in those fields.
    def create_dict_from_cedict(chars_dic, words_dic)
      require 'set' # Set is not loaded by default before Ruby 3.2

      words = Set.new
      histo = Hash.new(0)

      cedict.each do |entry|
        words << entry[0]
        words << entry[1]
        (entry[0] + entry[1]).each_char { |char| histo[char] += 1 }
      end

      File.write(words_dic, words.sort.join("\n"))
      File.write(chars_dic, histo.map { |char, cnt| "%s %d\n" % [char, cnt] }.join)
    end

    # @param str [String] text to segment
    # @return [Array<String>] token texts, tagged as UTF-8
    def tokenize(str)
      algorithm = ALGO.new(str)
      tokens = []
      until (token = algorithm.next_token).nil?
        tokens << token.text.force_encoding('UTF-8')
      end
      tokens
    end
    alias call tokenize
  end
end
|