analects 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.rvmrc +1 -0
- data/.travis.yml +14 -0
- data/Gemfile +17 -0
- data/Gemfile.devtools +71 -0
- data/Gemfile.lock +236 -0
- data/LICENSE.txt +674 -0
- data/README.md +81 -0
- data/Rakefile +26 -0
- data/SOURCES.md +17 -0
- data/analects.gemspec +29 -0
- data/bin/wp_hsk_filter +36 -0
- data/config/devtools.yml +2 -0
- data/config/flay.yml +3 -0
- data/config/flog.yml +2 -0
- data/config/mutant.yml +3 -0
- data/config/reek.yml +103 -0
- data/config/rubocop.yml +58 -0
- data/config/yardstick.yml +2 -0
- data/data/.gitkeep +0 -0
- data/lib/analects.rb +37 -0
- data/lib/analects/cedict_loader.rb +44 -0
- data/lib/analects/chise_ids_loader.rb +34 -0
- data/lib/analects/cli/progress.rb +37 -0
- data/lib/analects/encoding.rb +61 -0
- data/lib/analects/library.rb +68 -0
- data/lib/analects/models/kangxi_radical.rb +14 -0
- data/lib/analects/models/zi.rb +64 -0
- data/lib/analects/rake_tasks.rb +49 -0
- data/lib/analects/source.rb +70 -0
- data/lib/analects/tokenizer.rb +54 -0
- data/lib/analects/version.rb +3 -0
- data/lib/cjk_string.rb +56 -0
- data/lib/generators/analects.rb +20 -0
- data/lib/generators/analects/cedict/cedict_generator.rb +22 -0
- data/lib/generators/analects/cedict/templates/create_cedict_table.rb +12 -0
- data/lib/generators/analects/cedict/templates/model.rb +3 -0
- data/lib/generators/analects/cedict/templates/populate_cedict_table.rb +41 -0
- data/spec/analects/cedict_loader_spec.rb +48 -0
- data/spec/analects/chise_ids_loader_spec.rb +50 -0
- data/spec/analects/library_spec.rb +50 -0
- data/spec/analects/source_spec.rb +18 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/test_data/chise_ids/IDS-foo.txt +10 -0
- metadata +221 -0
data/README.md
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
analects.rb
|
2
|
+
===========
|
3
|
+
|
4
|
+
[][gem]
|
5
|
+
[][travis]
|
6
|
+
[][gemnasium]
|
7
|
+
[][codeclimate]
|
8
|
+
|
9
|
+
[gem]: https://rubygems.org/gems/analects
|
10
|
+
[travis]: https://travis-ci.org/plexus/analects
|
11
|
+
[gemnasium]: https://gemnasium.com/plexus/analects
|
12
|
+
[codeclimate]: https://codeclimate.com/github/plexus/analects
|
13
|
+
|
14
|
+
Public datasets on the Chinese language, accessible from Ruby
|
15
|
+
|
16
|
+
## Download the data
|
17
|
+
|
18
|
+
With Rake
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
# Rakefile
|
22
|
+
require 'analects/rake_tasks'
|
23
|
+
|
24
|
+
Analects.init_rake_tasks do
|
25
|
+
data_dir '/tmp/analects' # defaults to ~/.analects
|
26
|
+
|
27
|
+
task :import_cedict do
|
28
|
+
library.cedict.each do |entry|
|
29
|
+
# ..
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
```
|
34
|
+
|
35
|
+
```sh
|
36
|
+
rake analects:download:all # download all sources
|
37
|
+
rake analects:download:cedict # download CC-CEDICT
|
38
|
+
rake analects:download:chise_ids # download Chise-IDS
|
39
|
+
```
|
40
|
+
|
41
|
+
Or from Ruby
|
42
|
+
|
43
|
+
```ruby
|
44
|
+
analects = Analects::Library.new(data_dir: '/tmp/analects')
|
45
|
+
analects.cedict.retrieve
|
46
|
+
analects.chise_ids.retrieve
|
47
|
+
```
|
48
|
+
|
49
|
+
## Use the data
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
analects = Analects::Library.new(data_dir: '/tmp/analects')
|
53
|
+
analects.cedict.take(3)
|
54
|
+
# => [["AA制", "AA制", "A A zhi4", "/to split the bill/to go Dutch/"], ["A咖", "A咖", "A ka1", "/class \"A\"/top grade/"], ["A片", "A片", "A pian4", "/adult movie/pornography/"]]
|
55
|
+
|
56
|
+
analects.chise_ids.to_a.sample(3)
|
57
|
+
# => [["U+59BF", "妿", "⿱加女"], ["U-0002441B", "𤐛", "⿰火閙"], ["U+83A1", "莡", "⿱艹足"]]
|
58
|
+
```
|
59
|
+
|
60
|
+
## Other stuff
|
61
|
+
|
62
|
+
Analects wraps RMMSeg for easy segmenting of Chinese text
|
63
|
+
|
64
|
+
```ruby
|
65
|
+
Analects::Tokenizer.new.tokenize("为待那个朋友拿哟出来,咿呀噢哎…")
|
66
|
+
# => ["为", "待", "那个", "朋友", "拿", "哟", "出来", ",", "咿", "呀", "噢", "哎", "…"]
|
67
|
+
```
|
68
|
+
|
69
|
+
If you have Chinese text in GB or BIG5 encoding, you can do stuff like this
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
Analects::Encoding.valid_cjk(str)
|
73
|
+
Analects::Encoding.from_gb(str) # returns UTF-8
|
74
|
+
Analects::Encoding.from_big5(str) # returns UTF-8
|
75
|
+
```
|
76
|
+
|
77
|
+
## License
|
78
|
+
|
79
|
+
Copyright ⓒ Arne Brasseur 2012-2014
|
80
|
+
|
81
|
+
Licensed as GPL-v3
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'rspec/core/rake_task'
|
2
|
+
require 'devtools'
|
3
|
+
require 'rubygems/tasks'
|
4
|
+
require 'rubygems/package_task'
|
5
|
+
|
6
|
+
$LOAD_PATH.unshift File.expand_path('../lib', __FILE__)
|
7
|
+
require 'analects'
|
8
|
+
|
9
|
+
# Register the task sets shipped with devtools and with analects itself.
Devtools.init_rake_tasks
Analects.init_rake_tasks

# `rake` with no arguments runs the specs.
RSpec::Core::RakeTask.new(:spec)
task default: :spec

Gem::Tasks.new

# Define the packaging tasks from the gemspec that sits next to this Rakefile.
gemspec_path = File.expand_path('../analects.gemspec', __FILE__)
Gem::PackageTask.new(Gem::Specification.load(gemspec_path)).define

desc "Push gem to rubygems.org"
task push: :gem do
  version = Analects::VERSION

  # Tag the release, publish the tag, then upload the freshly built package.
  sh "git tag v#{version}"
  sh "git push --tags"
  sh "gem push pkg/analects-#{version}.gem"
end
|
data/SOURCES.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# More sources to add
|
2
|
+
|
3
|
+
* [CJK Decomposition data](http://cjkdecomp.codeplex.com/)
|
4
|
+
* [Jun-Da Character Frequency Lists](http://lingua.mtsu.edu/chinese-computing/)
|
5
|
+
* Unihan
|
6
|
+
** [Online lookup](http://unicode.org/charts/unihan.html)
|
7
|
+
** [Raw Data](http://www.unicode.org/Public/UNIDATA/)
|
8
|
+
** [Single Zip](http://www.unicode.org/Public/UNIDATA/Unihan.zip)
|
9
|
+
* [KanjiVG](https://github.com/kanjivg/kanjivg)
|
10
|
+
* [Wikipedia : Ancient Chinese characters project](http://commons.wikimedia.org/wiki/Commons:Ancient_Chinese_characters_project)
|
11
|
+
* [Hanzim Data](http://interstitiality.net/hanziData.html)
|
12
|
+
|
13
|
+
## Corpora
|
14
|
+
|
15
|
+
* [Leiden Weibo Corpus](http://lwc.daanvanesch.nl/)
|
16
|
+
* [The Lancaster Corpus of Mandarin Chinese](http://www.ota.ox.ac.uk/headers/2474.xml)
|
17
|
+
* [Blog post: Top 5 "Language data consortium" corpora for Mandarin](http://corplinguistics.wordpress.com/2011/10/30/top-five-ldc-corpora/)
|
data/analects.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.expand_path('../lib/analects/version', __FILE__)
|
2
|
+
|
3
|
+
# Package definition for the analects gem.
Gem::Specification.new do |spec|
  # Identity and authorship
  spec.name     = 'analects'
  spec.version  = Analects::VERSION
  spec.platform = Gem::Platform::RUBY
  spec.authors  = ['Arne Brasseur']
  spec.email    = ['arne.brasseur@gmail.com']
  spec.homepage = 'https://github.com/arnebrasseur/analects.rb'
  spec.license  = 'GPL-3.0'

  # The description simply reuses the one-line summary.
  spec.summary     = 'Toolkit for Mandarin language learning apps'
  spec.description = spec.summary

  # Packaged files are whatever git currently tracks.
  spec.require_paths    = %w[lib]
  spec.files            = `git ls-files`.split($/)
  spec.test_files       = `git ls-files -- spec`.split($/)
  spec.extra_rdoc_files = %w[README.md]

  # Needed only while developing the gem
  %w[rspec simplecov rake rubygems-tasks pry].each do |dev_gem|
    spec.add_development_dependency dev_gem
  end

  # Needed by users of the gem
  spec.add_runtime_dependency 'inflecto', '~> 0.0.2'
  spec.add_runtime_dependency 'rmmseg'
  spec.add_runtime_dependency 'ting', '~> 0.9.0'
  spec.add_runtime_dependency 'ice_nine'
end
|
data/bin/wp_hsk_filter
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require 'csv'

require 'analects'
|
5
|
+
|
6
|
+
# Reads wiki-formatted word-list lines from STDIN and prints a three-column
# CSV (written form, second written form, reading) on STDOUT. Lines that match
# none of the known layouts are echoed to STDERR untouched.

# One "written form" character: anything accepted by Zi::REGEXP, or a literal
# ellipsis / full stop (strings given to Regexp.union are matched literally).
RXP = Regexp.union(Analects::Models::Zi::REGEXP, '…', '.')

hanyu = Ting.writer(:hanyu, :accents)

# Longest syllables first, so the alternation prefers the longest spelling.
all_pinyin = Ting.all_syllables.map { |s| hanyu << s }.sort_by(&:length).reverse

# One chunk of a reading: a syllable (optionally with an "r" suffix), "ng",
# or one of the literal separators ".", " " and "'".
PY = Regexp.union(*all_pinyin.map { |p| p + 'r' }, *all_pinyin, 'ng', '.', ' ', '\'')

# Turns a line template into a Regexp.
#
# Everything in +str+ is matched literally, except for three placeholders:
#   ZZ          - a capture group of one or more RXP matches
#   PY          - a capture group of one or more PY matches
#   whitespace  - any run of whitespace in the input (\s+)
#
# Whitespace is swapped for the marker 'WW' before escaping so that
# Regexp.escape does not escape it; the marker is expanded last.
#
# @param str [String] the template
# @return [Regexp] unanchored pattern with one capture per ZZ/PY placeholder
def compile(str)
  Regexp.new(
    Regexp.escape(str.gsub(/\s+/, 'WW'))
      .gsub('ZZ', "(#{RXP}+)")
      .gsub('PY', "(#{PY}+)")
      .gsub('WW', '\s+'))
end

# The patterns are compiled once up front instead of once per input line.
# They are unanchored, so the most specific layouts must be tried first.

# Two written forms followed by two readings (four captures).
TWO_FORMS_TWO_READINGS =
  compile('{{l|cmn|ZZ|sc=Hani}} ({{l|cmn|ZZ|sc=Hani}}) {{l|cmn|PY|sc=Hani}} (also: {{l|cmn|PY|sc=Hani}})')

# One written form followed by two readings (three captures).
ONE_FORM_TWO_READINGS = compile('[[ZZ]] [[PY]], [[PY]]')

# Two written forms followed by one reading (three captures).
TWO_FORMS_ONE_READING = [
  compile('{{l|cmn|ZZ|sc=Hani}} ({{l|cmn|ZZ|sc=Hani}}) {{l|cmn|PY|sc=Hani}}'),
  compile('[[ZZ]] ([[ZZ]]) [[PY]]')
]

# One written form followed by one reading (two captures).
ONE_FORM_ONE_READING = [
  compile('{{l|cmn|ZZ|sc=Hani}} {{l|cmn|PY|sc=Hani}}'),
  compile('{{l|cmn|ZZ|sc=Hani}} {{l|cmn|PY|sc= Hani }}'),
  compile('[[ZZ]] [[PY]]')
]

output = CSV.generate do |csv|
  $stdin.each_line do |line|
    case line
    when TWO_FORMS_TWO_READINGS
      csv << [$1, $2, $3]
      csv << [$1, $2, $4]
    when ONE_FORM_TWO_READINGS
      # Only three captures here: the single written form fills both form
      # columns and each reading gets its own row.
      csv << [$1, $1, $2]
      csv << [$1, $1, $3]
    when *TWO_FORMS_ONE_READING
      csv << [$1, $2, $3]
    when *ONE_FORM_ONE_READING
      csv << [$1, $1, $2]
    else
      $stderr << line
    end
  end
end

puts output
|
data/config/devtools.yml
ADDED
data/config/flay.yml
ADDED
data/config/flog.yml
ADDED
data/config/mutant.yml
ADDED
data/config/reek.yml
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
---
|
2
|
+
Attribute:
|
3
|
+
enabled: true
|
4
|
+
exclude: []
|
5
|
+
BooleanParameter:
|
6
|
+
enabled: true
|
7
|
+
exclude: []
|
8
|
+
ClassVariable:
|
9
|
+
enabled: true
|
10
|
+
exclude: []
|
11
|
+
ControlParameter:
|
12
|
+
enabled: true
|
13
|
+
exclude: []
|
14
|
+
DataClump:
|
15
|
+
enabled: true
|
16
|
+
exclude: []
|
17
|
+
max_copies: 2
|
18
|
+
min_clump_size: 2
|
19
|
+
DuplicateMethodCall:
|
20
|
+
enabled: true
|
21
|
+
exclude: []
|
22
|
+
max_calls: 1
|
23
|
+
allow_calls: []
|
24
|
+
FeatureEnvy:
|
25
|
+
enabled: true
|
26
|
+
exclude: []
|
27
|
+
IrresponsibleModule:
|
28
|
+
enabled: true
|
29
|
+
exclude: []
|
30
|
+
LongParameterList:
|
31
|
+
enabled: true
|
32
|
+
exclude: []
|
33
|
+
max_params: 2
|
34
|
+
overrides:
|
35
|
+
initialize:
|
36
|
+
max_params: 3
|
37
|
+
LongYieldList:
|
38
|
+
enabled: true
|
39
|
+
exclude: []
|
40
|
+
max_params: 2
|
41
|
+
NestedIterators:
|
42
|
+
enabled: true
|
43
|
+
exclude: []
|
44
|
+
max_allowed_nesting: 1
|
45
|
+
ignore_iterators: []
|
46
|
+
NilCheck:
|
47
|
+
enabled: true
|
48
|
+
exclude: []
|
49
|
+
RepeatedConditional:
|
50
|
+
enabled: true
|
51
|
+
exclude: []
|
52
|
+
max_ifs: 1
|
53
|
+
TooManyInstanceVariables:
|
54
|
+
enabled: true
|
55
|
+
exclude: []
|
56
|
+
max_instance_variables: 3
|
57
|
+
TooManyMethods:
|
58
|
+
enabled: true
|
59
|
+
exclude: []
|
60
|
+
max_methods: 10
|
61
|
+
TooManyStatements:
|
62
|
+
enabled: true
|
63
|
+
exclude:
|
64
|
+
- each
|
65
|
+
max_statements: 2
|
66
|
+
UncommunicativeMethodName:
|
67
|
+
enabled: true
|
68
|
+
exclude: []
|
69
|
+
reject:
|
70
|
+
- !ruby/regexp /^[a-z]$/
|
71
|
+
- !ruby/regexp /[0-9]$/
|
72
|
+
- !ruby/regexp /[A-Z]/
|
73
|
+
accept: []
|
74
|
+
UncommunicativeModuleName:
|
75
|
+
enabled: true
|
76
|
+
exclude: []
|
77
|
+
reject:
|
78
|
+
- !ruby/regexp /^.$/
|
79
|
+
- !ruby/regexp /[0-9]$/
|
80
|
+
accept: []
|
81
|
+
UncommunicativeParameterName:
|
82
|
+
enabled: true
|
83
|
+
exclude: []
|
84
|
+
reject:
|
85
|
+
- !ruby/regexp /^.$/
|
86
|
+
- !ruby/regexp /[0-9]$/
|
87
|
+
- !ruby/regexp /[A-Z]/
|
88
|
+
accept: []
|
89
|
+
UncommunicativeVariableName:
|
90
|
+
enabled: true
|
91
|
+
exclude: []
|
92
|
+
reject:
|
93
|
+
- !ruby/regexp /^.$/
|
94
|
+
- !ruby/regexp /[0-9]$/
|
95
|
+
- !ruby/regexp /[A-Z]/
|
96
|
+
accept: []
|
97
|
+
UnusedParameters:
|
98
|
+
enabled: true
|
99
|
+
exclude: []
|
100
|
+
UtilityFunction:
|
101
|
+
enabled: true
|
102
|
+
exclude: []
|
103
|
+
max_helper_calls: 0
|
data/config/rubocop.yml
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
AllCops:
|
2
|
+
Includes:
|
3
|
+
- '**/*.rake'
|
4
|
+
- 'Gemfile'
|
5
|
+
- 'Gemfile.devtools'
|
6
|
+
Excludes:
|
7
|
+
- '**/vendor/**'
|
8
|
+
- '**/benchmarks/**'
|
9
|
+
|
10
|
+
# Avoid parameter lists longer than three parameters.
|
11
|
+
ParameterLists:
|
12
|
+
Max: 3
|
13
|
+
CountKeywordArgs: true
|
14
|
+
|
15
|
+
# Avoid more than `Max` levels of nesting.
|
16
|
+
BlockNesting:
|
17
|
+
Max: 3
|
18
|
+
|
19
|
+
# Align with the style guide.
|
20
|
+
CollectionMethods:
|
21
|
+
PreferredMethods:
|
22
|
+
collect: 'map'
|
23
|
+
inject: 'reduce'
|
24
|
+
find: 'detect'
|
25
|
+
find_all: 'select'
|
26
|
+
|
27
|
+
# Do not force public/protected/private keyword to be indented at the same
|
28
|
+
# level as the def keyword. My personal preference is to outdent these keywords
|
29
|
+
# because I think when scanning code it makes it easier to identify the
|
30
|
+
# sections of code and visually separate them. When the keyword is at the same
|
31
|
+
# level I think it sort of blends in with the def keywords and makes it harder
|
32
|
+
# to scan the code and see where the sections are.
|
33
|
+
AccessControl:
|
34
|
+
Enabled: false
|
35
|
+
|
36
|
+
# Limit line length
|
37
|
+
LineLength:
|
38
|
+
Max: 79
|
39
|
+
|
40
|
+
# Disable documentation checking until a class needs to be documented once
|
41
|
+
Documentation:
|
42
|
+
Enabled: false
|
43
|
+
|
44
|
+
# Do not favor modifier if/unless usage when you have a single-line body
|
45
|
+
IfUnlessModifier:
|
46
|
+
Enabled: false
|
47
|
+
|
48
|
+
# Allow case equality operator (in limited use within the specs)
|
49
|
+
CaseEquality:
|
50
|
+
Enabled: false
|
51
|
+
|
52
|
+
# Constants do not always have to use SCREAMING_SNAKE_CASE
|
53
|
+
ConstantName:
|
54
|
+
Enabled: false
|
55
|
+
|
56
|
+
# Not all trivial readers/writers can be defined with attr_* methods
|
57
|
+
TrivialAccessors:
|
58
|
+
Enabled: false
|
data/data/.gitkeep
ADDED
File without changes
|
data/lib/analects.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'tmpdir'
|
2
|
+
require 'pathname'
|
3
|
+
require 'delegate'
|
4
|
+
|
5
|
+
require 'inflecto'
|
6
|
+
require 'ice_nine'
|
7
|
+
require 'rmmseg'
|
8
|
+
require 'ting'
|
9
|
+
|
10
|
+
# Top-level namespace of the analects gem.
module Analects
  # The directory one level above the directory that contains this file.
  ROOT = Pathname(__FILE__).dirname.parent

  class << self
    # Loads the Rake task definitions on first use and instantiates them.
    # All arguments and the optional block are handed to RakeTasks.new.
    def init_rake_tasks(*args, &blk)
      require 'analects/rake_tasks'
      Analects::RakeTasks.new(*args, &blk)
    end

    # Tells whether every codepoint of +str+ falls inside one of the ranges
    # reported by Models::Zi.codepoint_ranges. An empty string yields true.
    def cjk?(str)
      str.each_codepoint.all? do |codepoint|
        Analects::Models::Zi.codepoint_ranges.any? { |range| range.include?(codepoint) }
      end
    end
  end
end
|
24
|
+
|
25
|
+
require 'cjk_string'
|
26
|
+
|
27
|
+
require 'analects/version'
|
28
|
+
require 'analects/encoding'
|
29
|
+
require 'analects/cli/progress'
|
30
|
+
require 'analects/cedict_loader'
|
31
|
+
require 'analects/chise_ids_loader'
|
32
|
+
require 'analects/source'
|
33
|
+
require 'analects/library'
|
34
|
+
require 'analects/tokenizer'
|
35
|
+
|
36
|
+
require 'analects/models/zi'
|
37
|
+
require 'analects/models/kangxi_radical'
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Analects
  # Reads a CC-CEDICT style dictionary from an IO object.
  #
  # The whole input is read into memory on construction. Leading lines of the
  # form "#! key=value" are collected into #headers; every remaining
  # non-comment, non-blank line is parsed into a four-element array
  # (see #field_names) when the loader is enumerated.
  class CedictLoader
    include Enumerable

    # "#! key=value" metadata line. The key ends at the FIRST "=", so values
    # that themselves contain "=" (e.g. URLs with a query string) stay intact.
    HEADER_PATTERN = /^#! ([^=\n]*)=(.*)/

    # "<traditional> <simplified> [<pinyin>] <definitions>"
    ENTRY_PATTERN = /^([^\s]*) ([^\s]*) \[([\w\d:,· ]+)\](.*)/

    # @return [Hash{String => String}] metadata found in the leading comments
    attr_reader :headers

    # @param io [#read] source of the dictionary text; read completely
    def initialize(io)
      @contents = io.read
      @headers = {}
      @contents.each_line do |line|
        # Headers only live in the leading comment block; stop at its end.
        break unless line.start_with?('#')

        header = HEADER_PATTERN.match(line)
        @headers[header[1].strip] = header[2].strip if header
      end
    end

    # @return [Array<Symbol>] names of the fields of each yielded entry, in order
    def field_names
      [:traditional, :simplified, :pinyin, :definitions]
    end

    # Yields one [traditional, simplified, pinyin, definitions] array per
    # dictionary line. Comment lines (starting with "#") and blank lines are
    # skipped. Without a block an Enumerator is returned.
    #
    # @raise [RuntimeError] when a line is not a valid entry
    def each
      return enum_for(:each) unless block_given?

      @contents.each_line do |line|
        next if line.start_with?('#') || line.strip.empty?

        yield process_contents(line)
      end
    end

    private

    # Splits a single entry line into its four whitespace-trimmed fields.
    def process_contents(line)
      entry = ENTRY_PATTERN.match(line.strip)
      raise "Unexpected contents : #{line.inspect}" unless entry

      entry.captures.map(&:strip)
    end
  end
end
|