analects 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +21 -0
  3. data/.rvmrc +1 -0
  4. data/.travis.yml +14 -0
  5. data/Gemfile +17 -0
  6. data/Gemfile.devtools +71 -0
  7. data/Gemfile.lock +236 -0
  8. data/LICENSE.txt +674 -0
  9. data/README.md +81 -0
  10. data/Rakefile +26 -0
  11. data/SOURCES.md +17 -0
  12. data/analects.gemspec +29 -0
  13. data/bin/wp_hsk_filter +36 -0
  14. data/config/devtools.yml +2 -0
  15. data/config/flay.yml +3 -0
  16. data/config/flog.yml +2 -0
  17. data/config/mutant.yml +3 -0
  18. data/config/reek.yml +103 -0
  19. data/config/rubocop.yml +58 -0
  20. data/config/yardstick.yml +2 -0
  21. data/data/.gitkeep +0 -0
  22. data/lib/analects.rb +37 -0
  23. data/lib/analects/cedict_loader.rb +44 -0
  24. data/lib/analects/chise_ids_loader.rb +34 -0
  25. data/lib/analects/cli/progress.rb +37 -0
  26. data/lib/analects/encoding.rb +61 -0
  27. data/lib/analects/library.rb +68 -0
  28. data/lib/analects/models/kangxi_radical.rb +14 -0
  29. data/lib/analects/models/zi.rb +64 -0
  30. data/lib/analects/rake_tasks.rb +49 -0
  31. data/lib/analects/source.rb +70 -0
  32. data/lib/analects/tokenizer.rb +54 -0
  33. data/lib/analects/version.rb +3 -0
  34. data/lib/cjk_string.rb +56 -0
  35. data/lib/generators/analects.rb +20 -0
  36. data/lib/generators/analects/cedict/cedict_generator.rb +22 -0
  37. data/lib/generators/analects/cedict/templates/create_cedict_table.rb +12 -0
  38. data/lib/generators/analects/cedict/templates/model.rb +3 -0
  39. data/lib/generators/analects/cedict/templates/populate_cedict_table.rb +41 -0
  40. data/spec/analects/cedict_loader_spec.rb +48 -0
  41. data/spec/analects/chise_ids_loader_spec.rb +50 -0
  42. data/spec/analects/library_spec.rb +50 -0
  43. data/spec/analects/source_spec.rb +18 -0
  44. data/spec/spec_helper.rb +19 -0
  45. data/spec/test_data/chise_ids/IDS-foo.txt +10 -0
  46. metadata +221 -0
data/README.md ADDED
@@ -0,0 +1,81 @@
1
+ analects.rb
2
+ ===========
3
+
4
+ [![Gem Version](https://badge.fury.io/rb/analects.png)][gem]
5
+ [![Build Status](https://secure.travis-ci.org/plexus/analects.png?branch=master)][travis]
6
+ [![Dependency Status](https://gemnasium.com/plexus/analects.png)][gemnasium]
7
+ [![Code Climate](https://codeclimate.com/github/plexus/analects.png)][codeclimate]
8
+
9
+ [gem]: https://rubygems.org/gems/analects
10
+ [travis]: https://travis-ci.org/plexus/analects
11
+ [gemnasium]: https://gemnasium.com/plexus/analects
12
+ [codeclimate]: https://codeclimate.com/github/plexus/analects
13
+
14
+ Public datasets on the Chinese language, accessible from Ruby
15
+
16
+ ## Download the data
17
+
18
+ With Rake
19
+
20
+ ```ruby
21
+ # Rakefile
22
+ require 'analects/rake_tasks'
23
+
24
+ Analects.init_rake_tasks do
25
+ data_dir '/tmp/analects' # defaults to ~/.analects
26
+
27
+ task :import_cedict do
28
+ library.cedict.each do |entry|
29
+ # ..
30
+ end
31
+ end
32
+ end
33
+ ```
34
+
35
+ ```sh
36
+ rake analects:download:all # download all sources
37
+ rake analects:download:cedict # download CC-CEDICT
38
+ rake analects:download:chise_ids # download Chise-IDS
39
+ ```
40
+
41
+ Or from Ruby
42
+
43
+ ```ruby
44
+ analects = Analects::Library.new(data_dir: '/tmp/analects')
45
+ analects.cedict.retrieve
46
+ analects.chise_ids.retrieve
47
+ ```
48
+
49
+ ## Use the data
50
+
51
+ ```ruby
52
+ analects = Analects::Library.new(data_dir: '/tmp/analects')
53
+ analects.cedict.take(3)
54
+ # => [["AA制", "AA制", "A A zhi4", "/to split the bill/to go Dutch/"], ["A咖", "A咖", "A ka1", "/class \"A\"/top grade/"], ["A片", "A片", "A pian4", "/adult movie/pornography/"]]
55
+
56
+ analects.chise_ids.to_a.sample(3)
57
+ # => [["U+59BF", "妿", "⿱加女"], ["U-0002441B", "𤐛", "⿰火閙"], ["U+83A1", "莡", "⿱艹足"]]
58
+ ```
59
+
60
+ ## Other stuff
61
+
62
+ Analects wraps RMMSeg for easy segmenting of Chinese text
63
+
64
+ ```ruby
65
+ Analects::Tokenizer.new.tokenize("为待那个朋友拿哟出来,咿呀噢哎…")
66
+ # => ["为", "待", "那个", "朋友", "拿", "哟", "出来", ",", "咿", "呀", "噢", "哎", "…"]
67
+ ```
68
+
69
+ If you have Chinese text in GB or BIG5 encoding, you can do stuff like this
70
+
71
+ ```ruby
72
+ Analects::Encoding.valid_cjk(str)
73
+ Analects::Encoding.from_gb(str) # returns UTF-8
74
+ Analects::Encoding.from_big5(str) # returns UTF-8
75
+ ```
76
+
77
+ ## License
78
+
79
+ Copyright ⓒ Arne Brasseur 2012-2014
80
+
81
+ Licensed as GPL-v3
data/Rakefile ADDED
@@ -0,0 +1,26 @@
1
+ require 'rspec/core/rake_task'
2
+ require 'devtools'
3
+ require 'rubygems/tasks'
4
+ require 'rubygems/package_task'
5
+
6
+ $LOAD_PATH.unshift File.expand_path('../lib', __FILE__)
7
+ require 'analects'
8
+
9
+ Devtools.init_rake_tasks
10
+ Analects.init_rake_tasks
11
+
12
+ RSpec::Core::RakeTask.new(:spec)
13
+ task :default => :spec
14
+
15
+ Gem::Tasks.new
16
+
17
+ spec = Gem::Specification.load(File.expand_path('../analects.gemspec', __FILE__))
18
+ gem = Gem::PackageTask.new(spec)
19
+ gem.define
20
+
21
+ desc "Push gem to rubygems.org"
22
+ task :push => :gem do
23
+ sh "git tag v#{Analects::VERSION}"
24
+ sh "git push --tags"
25
+ sh "gem push pkg/analects-#{Analects::VERSION}.gem"
26
+ end
data/SOURCES.md ADDED
@@ -0,0 +1,17 @@
1
+ # More sources to add
2
+
3
+ * [CJK Decomposition data](http://cjkdecomp.codeplex.com/)
4
+ * [Jun-Da Character Frequency Lists](http://lingua.mtsu.edu/chinese-computing/)
5
+ * Unihan
6
+ ** [On line lookup](http://unicode.org/charts/unihan.html)
7
+ ** [Raw Data](http://www.unicode.org/Public/UNIDATA/)
8
+ ** [Single Zip](http://www.unicode.org/Public/UNIDATA/Unihan.zip)
9
+ * [KanjiVG](https://github.com/kanjivg/kanjivg)
10
+ * [Wikipedia : Ancient Chinese characters project](http://commons.wikimedia.org/wiki/Commons:Ancient_Chinese_characters_project)
11
+ * [Hanzim Data](http://interstitiality.net/hanziData.html)
12
+
13
+ ## Corpora
14
+
15
+ * [Leiden Weibo Corpus](http://lwc.daanvanesch.nl/)
16
+ * [The Lancaster Corpus of Mandarin Chinese](http://www.ota.ox.ac.uk/headers/2474.xml)
17
+ * [Blog post: Top 5 "Language data consortium" corpora for Mandarin](http://corplinguistics.wordpress.com/2011/10/30/top-five-ldc-corpora/)
data/analects.gemspec ADDED
@@ -0,0 +1,29 @@
1
+ require File.expand_path('../lib/analects/version', __FILE__)
2
+
3
+ Gem::Specification.new do |gem|
4
+ gem.name = 'analects'
5
+ gem.version = Analects::VERSION
6
+ gem.platform = Gem::Platform::RUBY
7
+ gem.authors = ['Arne Brasseur']
8
+ gem.email = ['arne.brasseur@gmail.com']
9
+ gem.homepage = 'https://github.com/arnebrasseur/analects.rb'
10
+ gem.license = 'GPL-3.0'
11
+ gem.summary = 'Toolkit for Mandarin language learning apps'
12
+ gem.description = gem.summary
13
+
14
+ gem.require_paths = %w[lib]
15
+ gem.files = `git ls-files`.split($/)
16
+ gem.test_files = `git ls-files -- spec`.split($/)
17
+ gem.extra_rdoc_files = %w[README.md]
18
+
19
+ gem.add_development_dependency 'rspec'
20
+ gem.add_development_dependency 'simplecov'
21
+ gem.add_development_dependency 'rake'
22
+ gem.add_development_dependency 'rubygems-tasks'
23
+ gem.add_development_dependency 'pry'
24
+
25
+ gem.add_runtime_dependency 'inflecto', '~> 0.0.2'
26
+ gem.add_runtime_dependency 'rmmseg'
27
+ gem.add_runtime_dependency 'ting', '~> 0.9.0'
28
+ gem.add_runtime_dependency 'ice_nine'
29
+ end
data/bin/wp_hsk_filter ADDED
@@ -0,0 +1,36 @@
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
#
# Filters wiki-markup lines read from stdin into CSV on stdout.
#
# Each recognised line yields one or two rows of three columns: the first
# character group, the second character group (or the first again when the
# line has only one), and a pinyin reading. Lines that match none of the
# patterns are copied to stderr unchanged.

# CSV is used below; require it explicitly instead of relying on another
# library having loaded it.
require 'csv'
require 'analects'

# One "character" of a word: anything Zi::REGEXP accepts, an ellipsis or a dot.
RXP = Regexp.union(Analects::Models::Zi::REGEXP, '…', '.')

hanyu = Ting.writer(:hanyu, :accents)
# Longest syllables first so the alternation prefers the longest match.
all_pinyin = Ting.all_syllables.map { |s| hanyu << s }.sort_by(&:length).reverse
PY = Regexp.union(*all_pinyin.map { |p| p + 'r' }, *all_pinyin, 'ng', '.', ' ', '\'')

# Turns a line template into a Regexp.
#
# The template is matched literally, except that:
# * every run of whitespace matches one or more whitespace characters,
# * +ZZ+ becomes a capture group of one or more RXP matches,
# * +PY+ becomes a capture group of one or more PY matches.
#
# @param str [String] the template
# @return [Regexp]
def compile(str)
  Regexp.new(
    Regexp.escape(str.gsub(/\s+/, 'WW'))
      .gsub('ZZ', "(#{RXP}+)")
      .gsub('PY', "(#{PY}+)")
      .gsub('WW', '\s+'))
end

# The templates are compiled once up front; building them per input line
# repeats the same expensive Regexp construction for every line.

# Captures: first form, second form, reading, alternative reading.
TWO_FORMS_TWO_READINGS = [
  compile('{{l|cmn|ZZ|sc=Hani}} ({{l|cmn|ZZ|sc=Hani}}) {{l|cmn|PY|sc=Hani}} (also: {{l|cmn|PY|sc=Hani}})')
].freeze

# Captures: form, reading, alternative reading.
ONE_FORM_TWO_READINGS = [
  compile('[[ZZ]] [[PY]], [[PY]]')
].freeze

# Captures: first form, second form, reading.
TWO_FORMS_ONE_READING = [
  compile('{{l|cmn|ZZ|sc=Hani}} ({{l|cmn|ZZ|sc=Hani}}) {{l|cmn|PY|sc=Hani}}'),
  compile('[[ZZ]] ([[ZZ]]) [[PY]]')
].freeze

# Captures: form, reading.
ONE_FORM_ONE_READING = [
  compile('{{l|cmn|ZZ|sc=Hani}} {{l|cmn|PY|sc=Hani}}'),
  compile('{{l|cmn|ZZ|sc=Hani}} {{l|cmn|PY|sc= Hani }}'),
  compile('[[ZZ]] [[PY]]')
].freeze

output = CSV.generate do |rows|
  $stdin.each_line do |line|
    # Pattern order matters: the patterns are unanchored, so the longer
    # templates must be tried before the shorter ones they contain.
    case line
    when *TWO_FORMS_TWO_READINGS
      first, second, reading, alternate = Regexp.last_match.captures
      rows << [first, second, reading]
      rows << [first, second, alternate]
    when *ONE_FORM_TWO_READINGS
      # This template has only three captures, so it cannot share the
      # four-capture branch above: the form fills both character columns.
      form, reading, alternate = Regexp.last_match.captures
      rows << [form, form, reading]
      rows << [form, form, alternate]
    when *TWO_FORMS_ONE_READING
      first, second, reading = Regexp.last_match.captures
      rows << [first, second, reading]
    when *ONE_FORM_ONE_READING
      form, reading = Regexp.last_match.captures
      rows << [form, form, reading]
    else
      $stderr << line
    end
  end
end

puts output
@@ -0,0 +1,2 @@
1
+ ---
2
+ unit_test_timeout: 0.1
data/config/flay.yml ADDED
@@ -0,0 +1,3 @@
1
+ ---
2
+ threshold: 11
3
+ total_score: 184
data/config/flog.yml ADDED
@@ -0,0 +1,2 @@
1
+ ---
2
+ threshold: 0
data/config/mutant.yml ADDED
@@ -0,0 +1,3 @@
---
# Library to load and top-level namespace to mutate; the template
# placeholders (your_lib / YourLib) do not exist in this gem.
name: analects
namespace: Analects
data/config/reek.yml ADDED
@@ -0,0 +1,103 @@
1
+ ---
2
+ Attribute:
3
+ enabled: true
4
+ exclude: []
5
+ BooleanParameter:
6
+ enabled: true
7
+ exclude: []
8
+ ClassVariable:
9
+ enabled: true
10
+ exclude: []
11
+ ControlParameter:
12
+ enabled: true
13
+ exclude: []
14
+ DataClump:
15
+ enabled: true
16
+ exclude: []
17
+ max_copies: 2
18
+ min_clump_size: 2
19
+ DuplicateMethodCall:
20
+ enabled: true
21
+ exclude: []
22
+ max_calls: 1
23
+ allow_calls: []
24
+ FeatureEnvy:
25
+ enabled: true
26
+ exclude: []
27
+ IrresponsibleModule:
28
+ enabled: true
29
+ exclude: []
30
+ LongParameterList:
31
+ enabled: true
32
+ exclude: []
33
+ max_params: 2
34
+ overrides:
35
+ initialize:
36
+ max_params: 3
37
+ LongYieldList:
38
+ enabled: true
39
+ exclude: []
40
+ max_params: 2
41
+ NestedIterators:
42
+ enabled: true
43
+ exclude: []
44
+ max_allowed_nesting: 1
45
+ ignore_iterators: []
46
+ NilCheck:
47
+ enabled: true
48
+ exclude: []
49
+ RepeatedConditional:
50
+ enabled: true
51
+ exclude: []
52
+ max_ifs: 1
53
+ TooManyInstanceVariables:
54
+ enabled: true
55
+ exclude: []
56
+ max_instance_variables: 3
57
+ TooManyMethods:
58
+ enabled: true
59
+ exclude: []
60
+ max_methods: 10
61
+ TooManyStatements:
62
+ enabled: true
63
+ exclude:
64
+ - each
65
+ max_statements: 2
66
+ UncommunicativeMethodName:
67
+ enabled: true
68
+ exclude: []
69
+ reject:
70
+ - !ruby/regexp /^[a-z]$/
71
+ - !ruby/regexp /[0-9]$/
72
+ - !ruby/regexp /[A-Z]/
73
+ accept: []
74
+ UncommunicativeModuleName:
75
+ enabled: true
76
+ exclude: []
77
+ reject:
78
+ - !ruby/regexp /^.$/
79
+ - !ruby/regexp /[0-9]$/
80
+ accept: []
81
+ UncommunicativeParameterName:
82
+ enabled: true
83
+ exclude: []
84
+ reject:
85
+ - !ruby/regexp /^.$/
86
+ - !ruby/regexp /[0-9]$/
87
+ - !ruby/regexp /[A-Z]/
88
+ accept: []
89
+ UncommunicativeVariableName:
90
+ enabled: true
91
+ exclude: []
92
+ reject:
93
+ - !ruby/regexp /^.$/
94
+ - !ruby/regexp /[0-9]$/
95
+ - !ruby/regexp /[A-Z]/
96
+ accept: []
97
+ UnusedParameters:
98
+ enabled: true
99
+ exclude: []
100
+ UtilityFunction:
101
+ enabled: true
102
+ exclude: []
103
+ max_helper_calls: 0
@@ -0,0 +1,58 @@
1
+ AllCops:
2
+ Includes:
3
+ - '**/*.rake'
4
+ - 'Gemfile'
5
+ - 'Gemfile.devtools'
6
+ Excludes:
7
+ - '**/vendor/**'
8
+ - '**/benchmarks/**'
9
+
10
+ # Avoid parameter lists longer than three parameters.
11
+ ParameterLists:
12
+ Max: 3
13
+ CountKeywordArgs: true
14
+
15
+ # Avoid more than `Max` levels of nesting.
16
+ BlockNesting:
17
+ Max: 3
18
+
19
+ # Align with the style guide.
20
+ CollectionMethods:
21
+ PreferredMethods:
22
+ collect: 'map'
23
+ inject: 'reduce'
24
+ find: 'detect'
25
+ find_all: 'select'
26
+
27
+ # Do not force public/protected/private keyword to be indented at the same
28
+ # level as the def keyword. My personal preference is to outdent these keywords
29
+ # because I think when scanning code it makes it easier to identify the
30
+ # sections of code and visually separate them. When the keyword is at the same
31
+ # level I think it sort of blends in with the def keywords and makes it harder
32
+ # to scan the code and see where the sections are.
33
+ AccessControl:
34
+ Enabled: false
35
+
36
+ # Limit line length
37
+ LineLength:
38
+ Max: 79
39
+
40
+ # Disable documentation checking until a class needs to be documented once
41
+ Documentation:
42
+ Enabled: false
43
+
44
+ # Do not favor modifier if/unless usage when you have a single-line body
45
+ IfUnlessModifier:
46
+ Enabled: false
47
+
48
+ # Allow case equality operator (in limited use within the specs)
49
+ CaseEquality:
50
+ Enabled: false
51
+
52
+ # Constants do not always have to use SCREAMING_SNAKE_CASE
53
+ ConstantName:
54
+ Enabled: false
55
+
56
+ # Not all trivial readers/writers can be defined with attr_* methods
57
+ TrivialAccessors:
58
+ Enabled: false
@@ -0,0 +1,2 @@
1
+ ---
2
+ threshold: 53.3
data/data/.gitkeep ADDED
File without changes
data/lib/analects.rb ADDED
@@ -0,0 +1,37 @@
1
+ require 'tmpdir'
2
+ require 'pathname'
3
+ require 'delegate'
4
+
5
+ require 'inflecto'
6
+ require 'ice_nine'
7
+ require 'rmmseg'
8
+ require 'ting'
9
+
10
+ module Analects
11
+ ROOT = Pathname(__FILE__).dirname.parent
12
+
13
+ def self.init_rake_tasks(*args, &blk)
14
+ require 'analects/rake_tasks'
15
+ Analects::RakeTasks.new(*args, &blk)
16
+ end
17
+
18
+ def self.cjk?(str)
19
+ str.codepoints.all? do |cp|
20
+ Analects::Models::Zi.codepoint_ranges.any? {|range| range.include?(cp)}
21
+ end
22
+ end
23
+ end
24
+
25
+ require 'cjk_string'
26
+
27
+ require 'analects/version'
28
+ require 'analects/encoding'
29
+ require 'analects/cli/progress'
30
+ require 'analects/cedict_loader'
31
+ require 'analects/chise_ids_loader'
32
+ require 'analects/source'
33
+ require 'analects/library'
34
+ require 'analects/tokenizer'
35
+
36
+ require 'analects/models/zi'
37
+ require 'analects/models/kangxi_radical'
@@ -0,0 +1,44 @@
# encoding: UTF-8

module Analects
  # Reads CC-CEDICT formatted text and enumerates its entries.
  #
  # The whole IO is read once on construction. Leading comment lines of the
  # form "#! key=value" are collected into #headers; every other
  # non-comment, non-blank line is parsed into a four-element array (see
  # #field_names).
  class CedictLoader
    include Enumerable

    # "#! key=value"; the key is matched non-greedily so that a value which
    # itself contains "=" is kept intact.
    HEADER = /^#! (.*?)=(.*)/

    # "<traditional> <simplified> [<pinyin>] <definitions>"
    ENTRY = /^([^\s]*) ([^\s]*) \[([\w\d:,· ]+)\](.*)/

    # @return [Hash{String => String}] the "#! key=value" pairs found in the
    #   leading comment block
    attr_reader :headers

    # @param io [#read] source of the dictionary text
    def initialize(io)
      @contents = io.read
      @headers = {}
      @contents.each_line do |line|
        header = HEADER.match(line)
        @headers[header[1].strip] = header[2].strip if header
        # Headers only live in the leading comment block.
        break unless line.start_with?('#')
      end
    end

    # @return [Array<Symbol>] names for the positions of each yielded entry
    def field_names
      [:traditional, :simplified, :pinyin, :definitions]
    end

    # Yields every dictionary entry as an array of four stripped strings.
    #
    # Comment lines and blank lines are skipped.
    #
    # @yieldparam entry [Array<String>]
    # @return [Enumerator] when no block is given
    # @raise [RuntimeError] when a non-blank, non-comment line is malformed
    def each
      return enum_for(:each) unless block_given?

      @contents.each_line do |line|
        next if line.start_with?('#') || line.strip.empty?
        yield process_contents(line)
      end
    end

    private

    # Splits one entry line into its four fields.
    def process_contents(line)
      entry = ENTRY.match(line.strip)
      raise "Unexpected contents : #{line.inspect}" unless entry
      entry.captures.map { |field| field.strip }
    end
  end
end