analects 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.rvmrc +1 -0
- data/.travis.yml +14 -0
- data/Gemfile +17 -0
- data/Gemfile.devtools +71 -0
- data/Gemfile.lock +236 -0
- data/LICENSE.txt +674 -0
- data/README.md +81 -0
- data/Rakefile +26 -0
- data/SOURCES.md +17 -0
- data/analects.gemspec +29 -0
- data/bin/wp_hsk_filter +36 -0
- data/config/devtools.yml +2 -0
- data/config/flay.yml +3 -0
- data/config/flog.yml +2 -0
- data/config/mutant.yml +3 -0
- data/config/reek.yml +103 -0
- data/config/rubocop.yml +58 -0
- data/config/yardstick.yml +2 -0
- data/data/.gitkeep +0 -0
- data/lib/analects.rb +37 -0
- data/lib/analects/cedict_loader.rb +44 -0
- data/lib/analects/chise_ids_loader.rb +34 -0
- data/lib/analects/cli/progress.rb +37 -0
- data/lib/analects/encoding.rb +61 -0
- data/lib/analects/library.rb +68 -0
- data/lib/analects/models/kangxi_radical.rb +14 -0
- data/lib/analects/models/zi.rb +64 -0
- data/lib/analects/rake_tasks.rb +49 -0
- data/lib/analects/source.rb +70 -0
- data/lib/analects/tokenizer.rb +54 -0
- data/lib/analects/version.rb +3 -0
- data/lib/cjk_string.rb +56 -0
- data/lib/generators/analects.rb +20 -0
- data/lib/generators/analects/cedict/cedict_generator.rb +22 -0
- data/lib/generators/analects/cedict/templates/create_cedict_table.rb +12 -0
- data/lib/generators/analects/cedict/templates/model.rb +3 -0
- data/lib/generators/analects/cedict/templates/populate_cedict_table.rb +41 -0
- data/spec/analects/cedict_loader_spec.rb +48 -0
- data/spec/analects/chise_ids_loader_spec.rb +50 -0
- data/spec/analects/library_spec.rb +50 -0
- data/spec/analects/source_spec.rb +18 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/test_data/chise_ids/IDS-foo.txt +10 -0
- metadata +221 -0
data/README.md
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
analects.rb
|
2
|
+
===========
|
3
|
+
|
4
|
+
[![Gem Version](https://badge.fury.io/rb/analects.png)][gem]
|
5
|
+
[![Build Status](https://secure.travis-ci.org/plexus/analects.png?branch=master)][travis]
|
6
|
+
[![Dependency Status](https://gemnasium.com/plexus/analects.png)][gemnasium]
|
7
|
+
[![Code Climate](https://codeclimate.com/github/plexus/analects.png)][codeclimate]
|
8
|
+
|
9
|
+
[gem]: https://rubygems.org/gems/analects
|
10
|
+
[travis]: https://travis-ci.org/plexus/analects
|
11
|
+
[gemnasium]: https://gemnasium.com/plexus/analects
|
12
|
+
[codeclimate]: https://codeclimate.com/github/plexus/analects
|
13
|
+
|
14
|
+
Public datasets on the Chinese language, accessible from Ruby
|
15
|
+
|
16
|
+
## Download the data
|
17
|
+
|
18
|
+
With Rake
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
# Rakefile
|
22
|
+
require 'analects/rake_tasks'
|
23
|
+
|
24
|
+
Analects.init_rake_tasks do
|
25
|
+
data_dir '/tmp/analects' # defaults to ~/.analects
|
26
|
+
|
27
|
+
task :import_cedict do
|
28
|
+
library.cedict.each do |entry|
|
29
|
+
# ..
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
```
|
34
|
+
|
35
|
+
```sh
|
36
|
+
rake analects:download:all # download all sources
|
37
|
+
rake analects:download:cedict # download CC-CEDICT
|
38
|
+
rake analects:download:chise_ids # download Chise-IDS
|
39
|
+
```
|
40
|
+
|
41
|
+
Or from Ruby
|
42
|
+
|
43
|
+
```ruby
|
44
|
+
analects = Analects::Library.new(data_dir: '/tmp/analects')
|
45
|
+
analects.cedict.retrieve
|
46
|
+
analects.chise_ids.retrieve
|
47
|
+
```
|
48
|
+
|
49
|
+
## Use the data
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
analects = Analects::Library.new(data_dir: '/tmp/analects')
|
53
|
+
analects.cedict.take(3)
|
54
|
+
# => [["AA制", "AA制", "A A zhi4", "/to split the bill/to go Dutch/"], ["A咖", "A咖", "A ka1", "/class \"A\"/top grade/"], ["A片", "A片", "A pian4", "/adult movie/pornography/"]]
|
55
|
+
|
56
|
+
analects.chise_ids.to_a.sample(3)
|
57
|
+
# => [["U+59BF", "妿", "⿱加女"], ["U-0002441B", "𤐛", "⿰火閙"], ["U+83A1", "莡", "⿱艹足"]]
|
58
|
+
```
|
59
|
+
|
60
|
+
## Other stuff
|
61
|
+
|
62
|
+
Analects wraps RMMSeg for easy segmenting of Chinese text
|
63
|
+
|
64
|
+
```ruby
|
65
|
+
Analects::Tokenizer.new.tokenize("为待那个朋友拿哟出来,咿呀噢哎…")
|
66
|
+
# => ["为", "待", "那个", "朋友", "拿", "哟", "出来", ",", "咿", "呀", "噢", "哎", "…"]
|
67
|
+
```
|
68
|
+
|
69
|
+
If you have Chinese text in GB or BIG5 encoding, you can do stuff like this
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
Analects::Encoding.valid_cjk(str)
|
73
|
+
Analects::Encoding.from_gb(str) # returns UTF-8
|
74
|
+
Analects::Encoding.from_big5(str) # returns UTF-8
|
75
|
+
```
|
76
|
+
|
77
|
+
## License
|
78
|
+
|
79
|
+
Copyright ⓒ Arne Brasseur 2012-2014
|
80
|
+
|
81
|
+
Licensed as GPL-v3
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'rspec/core/rake_task'
|
2
|
+
require 'devtools'
|
3
|
+
require 'rubygems/tasks'
|
4
|
+
require 'rubygems/package_task'
|
5
|
+
|
6
|
+
$LOAD_PATH.unshift File.expand_path('../lib', __FILE__)
|
7
|
+
require 'analects'
|
8
|
+
|
9
|
+
Devtools.init_rake_tasks
|
10
|
+
Analects.init_rake_tasks
|
11
|
+
|
12
|
+
RSpec::Core::RakeTask.new(:spec)
|
13
|
+
task :default => :spec
|
14
|
+
|
15
|
+
Gem::Tasks.new
|
16
|
+
|
17
|
+
spec = Gem::Specification.load(File.expand_path('../analects.gemspec', __FILE__))
|
18
|
+
gem = Gem::PackageTask.new(spec)
|
19
|
+
gem.define
|
20
|
+
|
21
|
+
desc "Push gem to rubygems.org"
|
22
|
+
task :push => :gem do
|
23
|
+
sh "git tag v#{Analects::VERSION}"
|
24
|
+
sh "git push --tags"
|
25
|
+
sh "gem push pkg/analects-#{Analects::VERSION}.gem"
|
26
|
+
end
|
data/SOURCES.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# More sources to add
|
2
|
+
|
3
|
+
* [CJK Decomposition data](http://cjkdecomp.codeplex.com/)
|
4
|
+
* [Jun-Da Character Frequency Lists](http://lingua.mtsu.edu/chinese-computing/)
|
5
|
+
* Unihan
|
6
|
+
  * [Online lookup](http://unicode.org/charts/unihan.html)
|
7
|
+
  * [Raw Data](http://www.unicode.org/Public/UNIDATA/)
|
8
|
+
  * [Single Zip](http://www.unicode.org/Public/UNIDATA/Unihan.zip)
|
9
|
+
* [KanjiVG](https://github.com/kanjivg/kanjivg)
|
10
|
+
* [Wikipedia : Ancient Chinese characters project](http://commons.wikimedia.org/wiki/Commons:Ancient_Chinese_characters_project)
|
11
|
+
* [Hanzim Data](http://interstitiality.net/hanziData.html)
|
12
|
+
|
13
|
+
## Corpora
|
14
|
+
|
15
|
+
* [Leiden Weibo Corpus](http://lwc.daanvanesch.nl/)
|
16
|
+
* [The Lancaster Corpus of Mandarin Chinese](http://www.ota.ox.ac.uk/headers/2474.xml)
|
17
|
+
* [Blog post: Top 5 "Language data consortium" corpora for Mandarin](http://corplinguistics.wordpress.com/2011/10/30/top-five-ldc-corpora/)
|
data/analects.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.expand_path('../lib/analects/version', __FILE__)
|
2
|
+
|
3
|
+
Gem::Specification.new do |gem|
|
4
|
+
gem.name = 'analects'
|
5
|
+
gem.version = Analects::VERSION
|
6
|
+
gem.platform = Gem::Platform::RUBY
|
7
|
+
gem.authors = ['Arne Brasseur']
|
8
|
+
gem.email = ['arne.brasseur@gmail.com']
|
9
|
+
gem.homepage = 'https://github.com/arnebrasseur/analects.rb'
|
10
|
+
gem.license = 'GPL-3.0'
|
11
|
+
gem.summary = 'Toolkit for Mandarin language learning apps'
|
12
|
+
gem.description = gem.summary
|
13
|
+
|
14
|
+
gem.require_paths = %w[lib]
|
15
|
+
gem.files = `git ls-files`.split($/)
|
16
|
+
gem.test_files = `git ls-files -- spec`.split($/)
|
17
|
+
gem.extra_rdoc_files = %w[README.md]
|
18
|
+
|
19
|
+
gem.add_development_dependency 'rspec'
|
20
|
+
gem.add_development_dependency 'simplecov'
|
21
|
+
gem.add_development_dependency 'rake'
|
22
|
+
gem.add_development_dependency 'rubygems-tasks'
|
23
|
+
gem.add_development_dependency 'pry'
|
24
|
+
|
25
|
+
gem.add_runtime_dependency 'inflecto', '~> 0.0.2'
|
26
|
+
gem.add_runtime_dependency 'rmmseg'
|
27
|
+
gem.add_runtime_dependency 'ting', '~> 0.9.0'
|
28
|
+
gem.add_runtime_dependency 'ice_nine'
|
29
|
+
end
|
data/bin/wp_hsk_filter
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require 'analects'
|
5
|
+
|
6
|
+
# CSV builds the output below; require it explicitly instead of relying on
# another library having loaded it already.
require 'csv'

# One "word character": anything Analects::Models::Zi::REGEXP matches, or a
# literal '…' or '.'.
RXP = Regexp.union(Analects::Models::Zi::REGEXP, '…', '.')

hanyu = Ting.writer(:hanyu, :accents)
# Every syllable Ting knows, rendered through the hanyu/accents writer and
# ordered longest first (alternation tries candidates in order).
all_pinyin = Ting.all_syllables.map { |s| hanyu << s }.sort_by(&:length).reverse
# One "pinyin token": a syllable with an 'r' suffix, a plain syllable, 'ng',
# or one of the literal characters '.', ' ' and "'".
PY = Regexp.union(*all_pinyin.map { |p| p + 'r' }, *all_pinyin, 'ng', '.', ' ', '\'')

# Builds a Regexp from a line template.
#
# The template is matched literally (it is passed through Regexp.escape),
# except for three placeholders:
#
# * a run of whitespace matches one or more whitespace characters,
# * `ZZ` becomes a capture group of one or more RXP matches,
# * `PY` becomes a capture group of one or more PY matches.
#
# The resulting Regexp is not anchored.
#
# @param str [String] the template
# @return [Regexp]
def compile(str)
  # Whitespace is swapped for the marker 'WW' before escaping so that the
  # escape step cannot mangle it; the marker is expanded last.
  escaped = Regexp.escape(str.gsub(/\s+/, 'WW'))
  source = escaped
    .gsub('ZZ', "(#{RXP}+)")
    .gsub('PY', "(#{PY}+)")
    .gsub('WW', '\s+')
  Regexp.new(source)
end

# The patterns are built once, up front. Each one embeds the complete pinyin
# alternation, so rebuilding them for every input line is needlessly slow.
template_two_readings = compile('{{l|cmn|ZZ|sc=Hani}} ({{l|cmn|ZZ|sc=Hani}}) {{l|cmn|PY|sc=Hani}} (also: {{l|cmn|PY|sc=Hani}})')
links_two_readings    = compile('[[ZZ]] [[PY]], [[PY]]')
template_two_forms    = compile('{{l|cmn|ZZ|sc=Hani}} ({{l|cmn|ZZ|sc=Hani}}) {{l|cmn|PY|sc=Hani}}')
links_two_forms       = compile('[[ZZ]] ([[ZZ]]) [[PY]]')
template_single       = compile('{{l|cmn|ZZ|sc=Hani}} {{l|cmn|PY|sc=Hani}}')
template_single_padded = compile('{{l|cmn|ZZ|sc=Hani}} {{l|cmn|PY|sc= Hani }}')
links_single          = compile('[[ZZ]] [[PY]]')

# Reads lines from stdin and writes three-column CSV rows to stdout. Lines
# that match none of the known formats are echoed to stderr unchanged.
output = CSV.generate do |csv|
  $stdin.each_line do |line|
    case line
    when template_two_readings
      # Four captures: form, second form, reading, alternative reading.
      zi, variant, reading, alt_reading = $~.captures
      csv << [zi, variant, reading]
      csv << [zi, variant, alt_reading]
    when links_two_readings
      # Only three captures here (form, reading, reading). Sharing the
      # four-capture branch above would put a reading in the second column
      # and nil in the third, so this format gets its own branch and repeats
      # the single form, like the single-form branch below.
      zi, reading, alt_reading = $~.captures
      csv << [zi, zi, reading]
      csv << [zi, zi, alt_reading]
    when template_two_forms, links_two_forms
      zi, variant, reading = $~.captures
      csv << [zi, variant, reading]
    when template_single, template_single_padded, links_single
      zi, reading = $~.captures
      csv << [zi, zi, reading]
    else
      $stderr << line
    end
  end
end

puts output
|
data/config/devtools.yml
ADDED
data/config/flay.yml
ADDED
data/config/flog.yml
ADDED
data/config/mutant.yml
ADDED
data/config/reek.yml
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
---
|
2
|
+
Attribute:
|
3
|
+
enabled: true
|
4
|
+
exclude: []
|
5
|
+
BooleanParameter:
|
6
|
+
enabled: true
|
7
|
+
exclude: []
|
8
|
+
ClassVariable:
|
9
|
+
enabled: true
|
10
|
+
exclude: []
|
11
|
+
ControlParameter:
|
12
|
+
enabled: true
|
13
|
+
exclude: []
|
14
|
+
DataClump:
|
15
|
+
enabled: true
|
16
|
+
exclude: []
|
17
|
+
max_copies: 2
|
18
|
+
min_clump_size: 2
|
19
|
+
DuplicateMethodCall:
|
20
|
+
enabled: true
|
21
|
+
exclude: []
|
22
|
+
max_calls: 1
|
23
|
+
allow_calls: []
|
24
|
+
FeatureEnvy:
|
25
|
+
enabled: true
|
26
|
+
exclude: []
|
27
|
+
IrresponsibleModule:
|
28
|
+
enabled: true
|
29
|
+
exclude: []
|
30
|
+
LongParameterList:
|
31
|
+
enabled: true
|
32
|
+
exclude: []
|
33
|
+
max_params: 2
|
34
|
+
overrides:
|
35
|
+
initialize:
|
36
|
+
max_params: 3
|
37
|
+
LongYieldList:
|
38
|
+
enabled: true
|
39
|
+
exclude: []
|
40
|
+
max_params: 2
|
41
|
+
NestedIterators:
|
42
|
+
enabled: true
|
43
|
+
exclude: []
|
44
|
+
max_allowed_nesting: 1
|
45
|
+
ignore_iterators: []
|
46
|
+
NilCheck:
|
47
|
+
enabled: true
|
48
|
+
exclude: []
|
49
|
+
RepeatedConditional:
|
50
|
+
enabled: true
|
51
|
+
exclude: []
|
52
|
+
max_ifs: 1
|
53
|
+
TooManyInstanceVariables:
|
54
|
+
enabled: true
|
55
|
+
exclude: []
|
56
|
+
max_instance_variables: 3
|
57
|
+
TooManyMethods:
|
58
|
+
enabled: true
|
59
|
+
exclude: []
|
60
|
+
max_methods: 10
|
61
|
+
TooManyStatements:
|
62
|
+
enabled: true
|
63
|
+
exclude:
|
64
|
+
- each
|
65
|
+
max_statements: 2
|
66
|
+
UncommunicativeMethodName:
|
67
|
+
enabled: true
|
68
|
+
exclude: []
|
69
|
+
reject:
|
70
|
+
- !ruby/regexp /^[a-z]$/
|
71
|
+
- !ruby/regexp /[0-9]$/
|
72
|
+
- !ruby/regexp /[A-Z]/
|
73
|
+
accept: []
|
74
|
+
UncommunicativeModuleName:
|
75
|
+
enabled: true
|
76
|
+
exclude: []
|
77
|
+
reject:
|
78
|
+
- !ruby/regexp /^.$/
|
79
|
+
- !ruby/regexp /[0-9]$/
|
80
|
+
accept: []
|
81
|
+
UncommunicativeParameterName:
|
82
|
+
enabled: true
|
83
|
+
exclude: []
|
84
|
+
reject:
|
85
|
+
- !ruby/regexp /^.$/
|
86
|
+
- !ruby/regexp /[0-9]$/
|
87
|
+
- !ruby/regexp /[A-Z]/
|
88
|
+
accept: []
|
89
|
+
UncommunicativeVariableName:
|
90
|
+
enabled: true
|
91
|
+
exclude: []
|
92
|
+
reject:
|
93
|
+
- !ruby/regexp /^.$/
|
94
|
+
- !ruby/regexp /[0-9]$/
|
95
|
+
- !ruby/regexp /[A-Z]/
|
96
|
+
accept: []
|
97
|
+
UnusedParameters:
|
98
|
+
enabled: true
|
99
|
+
exclude: []
|
100
|
+
UtilityFunction:
|
101
|
+
enabled: true
|
102
|
+
exclude: []
|
103
|
+
max_helper_calls: 0
|
data/config/rubocop.yml
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
AllCops:
|
2
|
+
Includes:
|
3
|
+
- '**/*.rake'
|
4
|
+
- 'Gemfile'
|
5
|
+
- 'Gemfile.devtools'
|
6
|
+
Excludes:
|
7
|
+
- '**/vendor/**'
|
8
|
+
- '**/benchmarks/**'
|
9
|
+
|
10
|
+
# Avoid parameter lists longer than three parameters.
|
11
|
+
ParameterLists:
|
12
|
+
Max: 3
|
13
|
+
CountKeywordArgs: true
|
14
|
+
|
15
|
+
# Avoid more than `Max` levels of nesting.
|
16
|
+
BlockNesting:
|
17
|
+
Max: 3
|
18
|
+
|
19
|
+
# Align with the style guide.
|
20
|
+
CollectionMethods:
|
21
|
+
PreferredMethods:
|
22
|
+
collect: 'map'
|
23
|
+
inject: 'reduce'
|
24
|
+
find: 'detect'
|
25
|
+
find_all: 'select'
|
26
|
+
|
27
|
+
# Do not force public/protected/private keyword to be indented at the same
|
28
|
+
# level as the def keyword. My personal preference is to outdent these keywords
|
29
|
+
# because I think when scanning code it makes it easier to identify the
|
30
|
+
# sections of code and visually separate them. When the keyword is at the same
|
31
|
+
# level I think it sort of blends in with the def keywords and makes it harder
|
32
|
+
# to scan the code and see where the sections are.
|
33
|
+
AccessControl:
|
34
|
+
Enabled: false
|
35
|
+
|
36
|
+
# Limit line length
|
37
|
+
LineLength:
|
38
|
+
Max: 79
|
39
|
+
|
40
|
+
# Disable documentation checking until a class needs to be documented once
|
41
|
+
Documentation:
|
42
|
+
Enabled: false
|
43
|
+
|
44
|
+
# Do not favor modifier if/unless usage when you have a single-line body
|
45
|
+
IfUnlessModifier:
|
46
|
+
Enabled: false
|
47
|
+
|
48
|
+
# Allow case equality operator (in limited use within the specs)
|
49
|
+
CaseEquality:
|
50
|
+
Enabled: false
|
51
|
+
|
52
|
+
# Constants do not always have to use SCREAMING_SNAKE_CASE
|
53
|
+
ConstantName:
|
54
|
+
Enabled: false
|
55
|
+
|
56
|
+
# Not all trivial readers/writers can be defined with attr_* methods
|
57
|
+
TrivialAccessors:
|
58
|
+
Enabled: false
|
data/data/.gitkeep
ADDED
File without changes
|
data/lib/analects.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'tmpdir'
|
2
|
+
require 'pathname'
|
3
|
+
require 'delegate'
|
4
|
+
|
5
|
+
require 'inflecto'
|
6
|
+
require 'ice_nine'
|
7
|
+
require 'rmmseg'
|
8
|
+
require 'ting'
|
9
|
+
|
10
|
+
# Namespace of the analects library, with a few module-level helpers.
module Analects
  # Parent of the directory that contains this file.
  ROOT = Pathname(__FILE__).dirname.parent

  class << self
    # Loads the Rake task definitions on demand and instantiates them.
    # Every argument, and the block, is forwarded to Analects::RakeTasks.new.
    def init_rake_tasks(*args, &blk)
      require 'analects/rake_tasks'
      Analects::RakeTasks.new(*args, &blk)
    end

    # Tells whether every codepoint of +str+ lies inside one of the ranges
    # returned by Analects::Models::Zi.codepoint_ranges. An empty string has
    # no offending codepoint, so it yields true.
    def cjk?(str)
      str.codepoints.none? do |codepoint|
        Analects::Models::Zi.codepoint_ranges.none? do |range|
          range.include?(codepoint)
        end
      end
    end
  end
end
|
24
|
+
|
25
|
+
require 'cjk_string'
|
26
|
+
|
27
|
+
require 'analects/version'
|
28
|
+
require 'analects/encoding'
|
29
|
+
require 'analects/cli/progress'
|
30
|
+
require 'analects/cedict_loader'
|
31
|
+
require 'analects/chise_ids_loader'
|
32
|
+
require 'analects/source'
|
33
|
+
require 'analects/library'
|
34
|
+
require 'analects/tokenizer'
|
35
|
+
|
36
|
+
require 'analects/models/zi'
|
37
|
+
require 'analects/models/kangxi_radical'
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Analects
  # Reads a CC-CEDICT dictionary file and enumerates its entries.
  #
  # The IO is read in full by the constructor. Leading comment lines of the
  # form `#! key=value` are collected into #headers; every other non-comment,
  # non-blank line is parsed into a four-element array (see #field_names).
  class CedictLoader
    include Enumerable

    # `#! key=value` metadata line. The key ends at the FIRST `=`, so values
    # that contain `=` themselves (e.g. a URL with a query string) stay intact.
    HEADER_PATTERN = /^#! ([^=]*)=(.*)/

    # `first_form second_form [pin1 yin1] /definition/definition/`
    ENTRY_PATTERN = /^([^\s]*) ([^\s]*) \[([\w\d:,· ]+)\](.*)/

    # @return [Hash{String => String}] metadata from the `#!` header lines
    attr_reader :headers

    # @param io [#read] source of the dictionary text, consumed immediately
    def initialize(io)
      @contents = io.read
      @headers = {}
      @contents.each_line do |line|
        # The header block ends at the first line that is not a comment.
        break unless line.start_with?('#')

        header = HEADER_PATTERN.match(line)
        @headers[header[1].strip] = header[2].strip if header
      end
    end

    # @return [Array<Symbol>] names of the fields of each entry, in order
    def field_names
      [:traditional, :simplified, :pinyin, :definitions]
    end

    # Yields each dictionary entry as an array of four stripped strings.
    # Comment lines and blank lines are skipped.
    #
    # @yieldparam entry [Array<String>]
    # @return [Enumerator] when called without a block
    # @raise [RuntimeError] when a line is not a well-formed entry
    def each
      return enum_for(:each) unless block_given?

      @contents.each_line do |line|
        # A stray empty line (e.g. at the end of the file) carries no entry
        # and should not abort the whole import.
        next if line.start_with?('#') || line.strip.empty?

        yield process_contents(line)
      end
    end

    private

    # Splits one entry line into its four fields.
    def process_contents(line)
      entry = ENTRY_PATTERN.match(line.strip)
      raise "Unexpected contents : #{line.inspect}" unless entry

      entry.captures.map(&:strip)
    end
  end
end
|