analects 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +21 -0
  3. data/.rvmrc +1 -0
  4. data/.travis.yml +14 -0
  5. data/Gemfile +17 -0
  6. data/Gemfile.devtools +71 -0
  7. data/Gemfile.lock +236 -0
  8. data/LICENSE.txt +674 -0
  9. data/README.md +81 -0
  10. data/Rakefile +26 -0
  11. data/SOURCES.md +17 -0
  12. data/analects.gemspec +29 -0
  13. data/bin/wp_hsk_filter +36 -0
  14. data/config/devtools.yml +2 -0
  15. data/config/flay.yml +3 -0
  16. data/config/flog.yml +2 -0
  17. data/config/mutant.yml +3 -0
  18. data/config/reek.yml +103 -0
  19. data/config/rubocop.yml +58 -0
  20. data/config/yardstick.yml +2 -0
  21. data/data/.gitkeep +0 -0
  22. data/lib/analects.rb +37 -0
  23. data/lib/analects/cedict_loader.rb +44 -0
  24. data/lib/analects/chise_ids_loader.rb +34 -0
  25. data/lib/analects/cli/progress.rb +37 -0
  26. data/lib/analects/encoding.rb +61 -0
  27. data/lib/analects/library.rb +68 -0
  28. data/lib/analects/models/kangxi_radical.rb +14 -0
  29. data/lib/analects/models/zi.rb +64 -0
  30. data/lib/analects/rake_tasks.rb +49 -0
  31. data/lib/analects/source.rb +70 -0
  32. data/lib/analects/tokenizer.rb +54 -0
  33. data/lib/analects/version.rb +3 -0
  34. data/lib/cjk_string.rb +56 -0
  35. data/lib/generators/analects.rb +20 -0
  36. data/lib/generators/analects/cedict/cedict_generator.rb +22 -0
  37. data/lib/generators/analects/cedict/templates/create_cedict_table.rb +12 -0
  38. data/lib/generators/analects/cedict/templates/model.rb +3 -0
  39. data/lib/generators/analects/cedict/templates/populate_cedict_table.rb +41 -0
  40. data/spec/analects/cedict_loader_spec.rb +48 -0
  41. data/spec/analects/chise_ids_loader_spec.rb +50 -0
  42. data/spec/analects/library_spec.rb +50 -0
  43. data/spec/analects/source_spec.rb +18 -0
  44. data/spec/spec_helper.rb +19 -0
  45. data/spec/test_data/chise_ids/IDS-foo.txt +10 -0
  46. metadata +221 -0
data/README.md ADDED
@@ -0,0 +1,81 @@
1
+ analects.rb
2
+ ===========
3
+
4
+ [![Gem Version](https://badge.fury.io/rb/analects.png)][gem]
5
+ [![Build Status](https://secure.travis-ci.org/plexus/analects.png?branch=master)][travis]
6
+ [![Dependency Status](https://gemnasium.com/plexus/analects.png)][gemnasium]
7
+ [![Code Climate](https://codeclimate.com/github/plexus/analects.png)][codeclimate]
8
+
9
+ [gem]: https://rubygems.org/gems/analects
10
+ [travis]: https://travis-ci.org/plexus/analects
11
+ [gemnasium]: https://gemnasium.com/plexus/analects
12
+ [codeclimate]: https://codeclimate.com/github/plexus/analects
13
+
14
+ Public datasets on the Chinese language, accessible from Ruby
15
+
16
+ ## Download the data
17
+
18
+ With Rake
19
+
20
+ ```ruby
21
+ # Rakefile
22
+ require 'analects/rake_tasks'
23
+
24
+ Analects.init_rake_tasks do
25
+ data_dir '/tmp/analects' # defaults to ~/.analects
26
+
27
+ task :import_cedict do
28
+ library.cedict.each do |entry|
29
+ # ..
30
+ end
31
+ end
32
+ end
33
+ ```
34
+
35
+ ```sh
36
+ rake analects:download:all # download all sources
37
+ rake analects:download:cedict # download CC-CEDICT
38
+ rake analects:download:chise_ids # download Chise-IDS
39
+ ```
40
+
41
+ Or from Ruby
42
+
43
+ ```ruby
44
+ analects = Analects::Library.new(data_dir: '/tmp/analects')
45
+ analects.cedict.retrieve
46
+ analects.chise_ids.retrieve
47
+ ```
48
+
49
+ ## Use the data
50
+
51
+ ```ruby
52
+ analects = Analects::Library.new(data_dir: '/tmp/analects')
53
+ analects.cedict.take(3)
54
+ # => [["AA制", "AA制", "A A zhi4", "/to split the bill/to go Dutch/"], ["A咖", "A咖", "A ka1", "/class \"A\"/top grade/"], ["A片", "A片", "A pian4", "/adult movie/pornography/"]]
55
+
56
+ analects.chise_ids.to_a.sample(3)
57
+ # => [["U+59BF", "妿", "⿱加女"], ["U-0002441B", "𤐛", "⿰火閙"], ["U+83A1", "莡", "⿱艹足"]]
58
+ ```
59
+
60
+ ## Other stuff
61
+
62
+ Analects wraps RMMSeg for easy segmenting of Chinese text
63
+
64
+ ```ruby
65
+ Analects::Tokenizer.new.tokenize("为待那个朋友拿哟出来,咿呀噢哎…")
66
+ # => ["为", "待", "那个", "朋友", "拿", "哟", "出来", ",", "咿", "呀", "噢", "哎", "…"]
67
+ ```
68
+
69
+ If you have Chinese text in GB or BIG5 encoding, you can do stuff like this
70
+
71
+ ```ruby
72
+ Analects::Encoding.valid_cjk(str)
73
+ Analects::Encoding.from_gb(str) # returns UTF-8
74
+ Analects::Encoding.from_big5(str) # returns UTF-8
75
+ ```
76
+
77
+ ## License
78
+
79
+ Copyright ⓒ Arne Brasseur 2012-2014
80
+
81
+ Licensed as GPL-v3
data/Rakefile ADDED
@@ -0,0 +1,26 @@
1
# Development Rakefile: wires up the devtools and analects task sets, the
# spec runner, gem packaging, and a `push` task for releasing.

require 'rspec/core/rake_task'
require 'devtools'
require 'rubygems/tasks'
require 'rubygems/package_task'

# Make the library under development loadable without installing the gem.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir)
require 'analects'

Devtools.init_rake_tasks
Analects.init_rake_tasks

RSpec::Core::RakeTask.new(:spec)
task(:default => :spec)

Gem::Tasks.new

# Gem::PackageTask only defines its tasks on its own when given a block,
# so `define` is invoked explicitly here.
gemspec_path = File.expand_path('../analects.gemspec', __FILE__)
package_task = Gem::PackageTask.new(Gem::Specification.load(gemspec_path))
package_task.define

desc 'Push gem to rubygems.org'
task(:push => :gem) do
  version = Analects::VERSION

  # Tag the release, publish the tag, then upload the freshly built package.
  sh "git tag v#{version}"
  sh 'git push --tags'
  sh "gem push pkg/analects-#{version}.gem"
end
data/SOURCES.md ADDED
@@ -0,0 +1,17 @@
1
+ # More sources to add
2
+
3
+ * [CJK Decomposition data](http://cjkdecomp.codeplex.com/)
4
+ * [Jun-Da Character Frequency Lists](http://lingua.mtsu.edu/chinese-computing/)
5
+ * Unihan
6
+ ** [Online lookup](http://unicode.org/charts/unihan.html)
7
+ ** [Raw Data](http://www.unicode.org/Public/UNIDATA/)
8
+ ** [Single Zip](http://www.unicode.org/Public/UNIDATA/Unihan.zip)
9
+ * [KanjiVG](https://github.com/kanjivg/kanjivg)
10
+ * [Wikipedia : Ancient Chinese characters project](http://commons.wikimedia.org/wiki/Commons:Ancient_Chinese_characters_project)
11
+ * [Hanzim Data](http://interstitiality.net/hanziData.html)
12
+
13
+ ## Corpora
14
+
15
+ * [Leiden Weibo Corpus](http://lwc.daanvanesch.nl/)
16
+ * [The Lancaster Corpus of Mandarin Chinese](http://www.ota.ox.ac.uk/headers/2474.xml)
17
+ * [Blog post: Top 5 "Language data consortium" corpora for Mandarin](http://corplinguistics.wordpress.com/2011/10/30/top-five-ldc-corpora/)
data/analects.gemspec ADDED
@@ -0,0 +1,29 @@
1
# Gem specification for analects. The version number lives in
# lib/analects/version.rb and is loaded from there.
require File.expand_path('../lib/analects/version', __FILE__)

Gem::Specification.new do |spec|
  # Identity and metadata
  spec.name        = 'analects'
  spec.version     = Analects::VERSION
  spec.platform    = Gem::Platform::RUBY
  spec.authors     = ['Arne Brasseur']
  spec.email       = ['arne.brasseur@gmail.com']
  spec.homepage    = 'https://github.com/arnebrasseur/analects.rb'
  spec.license     = 'GPL-3.0'
  spec.summary     = 'Toolkit for Mandarin language learning apps'
  spec.description = spec.summary

  # Packaged files are whatever git tracks; the specs double as test files.
  spec.require_paths    = %w[lib]
  spec.files            = `git ls-files`.split($/)
  spec.test_files       = `git ls-files -- spec`.split($/)
  spec.extra_rdoc_files = %w[README.md]

  # Development-only dependencies (no version constraints).
  %w[rspec simplecov rake rubygems-tasks pry].each do |dev_gem|
    spec.add_development_dependency dev_gem
  end

  # Runtime dependencies, in declaration order; entries without a
  # constraint accept any version.
  [
    ['inflecto', '~> 0.0.2'],
    ['rmmseg'],
    ['ting', '~> 0.9.0'],
    ['ice_nine']
  ].each do |requirement|
    spec.add_runtime_dependency(*requirement)
  end
end
data/bin/wp_hsk_filter ADDED
@@ -0,0 +1,36 @@
1
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
#
# Filter for wiki-markup word lists. Reads lines from STDIN, recognises a
# handful of known line layouts and prints CSV rows of
# (first written form, second written form, pinyin) to STDOUT, one row per
# pinyin reading. Lines matching none of the layouts are copied to STDERR.

require 'csv' # CSV.generate is used below; nothing else visible here loads it
require 'analects'

# A "ZZ" placeholder matches one or more of these.
RXP = Regexp.union(Analects::Models::Zi::REGEXP, '…', '.')

hanyu = Ting.writer(:hanyu, :accents)
# Syllables ordered longest first (presumably so the alternation below
# prefers the longest possible syllable).
all_pinyin = Ting.all_syllables.map { |s| hanyu << s }.sort_by(&:length).reverse
# A "PY" placeholder matches one or more of these.
PY = Regexp.union(*all_pinyin.map { |p| p + 'r' }, *all_pinyin, 'ng', '.', ' ', '\'')

# Turns a line template into a Regexp.
#
# The template is regexp-escaped; every run of whitespace then matches
# `\s+`, every `ZZ` becomes a capture group of one or more RXP matches and
# every `PY` a capture group of one or more PY matches.
#
# @param str [String] template containing ZZ / PY placeholders
# @return [Regexp]
def compile(str)
  Regexp.new(
    Regexp.escape(str.gsub(/\s+/, 'WW'))
      .gsub('ZZ', "(#{RXP}+)")
      .gsub('PY', "(#{PY}+)")
      .gsub('WW', '\s+'))
end

# The layouts are compiled once up front instead of once per input line.

# Two written forms, two readings: captures are form, form, pinyin, pinyin.
two_forms_two_readings = [
  compile('{{l|cmn|ZZ|sc=Hani}} ({{l|cmn|ZZ|sc=Hani}}) {{l|cmn|PY|sc=Hani}} (also: {{l|cmn|PY|sc=Hani}})')
]

# One written form, two readings: captures are form, pinyin, pinyin.
one_form_two_readings = [
  compile('[[ZZ]] [[PY]], [[PY]]')
]

# Two written forms, one reading: captures are form, form, pinyin.
two_forms_one_reading = [
  compile('{{l|cmn|ZZ|sc=Hani}} ({{l|cmn|ZZ|sc=Hani}}) {{l|cmn|PY|sc=Hani}}'),
  compile('[[ZZ]] ([[ZZ]]) [[PY]]')
]

# One written form, one reading: captures are form, pinyin.
one_form_one_reading = [
  compile('{{l|cmn|ZZ|sc=Hani}} {{l|cmn|PY|sc=Hani}}'),
  compile('{{l|cmn|ZZ|sc=Hani}} {{l|cmn|PY|sc= Hani }}'),
  compile('[[ZZ]] [[PY]]')
]

# The patterns are unanchored, so the longer layouts must be tried before
# the shorter ones they contain.
output = CSV.generate do |rows|
  $stdin.each_line do |line|
    case line
    when *two_forms_two_readings
      rows << [$1, $2, $3]
      rows << [$1, $2, $4]
    when *one_form_two_readings
      # This layout has only three captures; the single form is repeated in
      # both form columns, as for the one-form/one-reading layouts.
      rows << [$1, $1, $2]
      rows << [$1, $1, $3]
    when *two_forms_one_reading
      rows << [$1, $2, $3]
    when *one_form_one_reading
      rows << [$1, $1, $2]
    else
      $stderr << line
    end
  end
end

puts output
@@ -0,0 +1,2 @@
1
+ ---
2
+ unit_test_timeout: 0.1
data/config/flay.yml ADDED
@@ -0,0 +1,3 @@
1
+ ---
2
+ threshold: 11
3
+ total_score: 184
data/config/flog.yml ADDED
@@ -0,0 +1,2 @@
1
+ ---
2
+ threshold: 0
data/config/mutant.yml ADDED
@@ -0,0 +1,3 @@
1
+ ---
2
+ name: your_lib
3
+ namespace: YourLib
data/config/reek.yml ADDED
@@ -0,0 +1,103 @@
1
+ ---
2
+ Attribute:
3
+ enabled: true
4
+ exclude: []
5
+ BooleanParameter:
6
+ enabled: true
7
+ exclude: []
8
+ ClassVariable:
9
+ enabled: true
10
+ exclude: []
11
+ ControlParameter:
12
+ enabled: true
13
+ exclude: []
14
+ DataClump:
15
+ enabled: true
16
+ exclude: []
17
+ max_copies: 2
18
+ min_clump_size: 2
19
+ DuplicateMethodCall:
20
+ enabled: true
21
+ exclude: []
22
+ max_calls: 1
23
+ allow_calls: []
24
+ FeatureEnvy:
25
+ enabled: true
26
+ exclude: []
27
+ IrresponsibleModule:
28
+ enabled: true
29
+ exclude: []
30
+ LongParameterList:
31
+ enabled: true
32
+ exclude: []
33
+ max_params: 2
34
+ overrides:
35
+ initialize:
36
+ max_params: 3
37
+ LongYieldList:
38
+ enabled: true
39
+ exclude: []
40
+ max_params: 2
41
+ NestedIterators:
42
+ enabled: true
43
+ exclude: []
44
+ max_allowed_nesting: 1
45
+ ignore_iterators: []
46
+ NilCheck:
47
+ enabled: true
48
+ exclude: []
49
+ RepeatedConditional:
50
+ enabled: true
51
+ exclude: []
52
+ max_ifs: 1
53
+ TooManyInstanceVariables:
54
+ enabled: true
55
+ exclude: []
56
+ max_instance_variables: 3
57
+ TooManyMethods:
58
+ enabled: true
59
+ exclude: []
60
+ max_methods: 10
61
+ TooManyStatements:
62
+ enabled: true
63
+ exclude:
64
+ - each
65
+ max_statements: 2
66
+ UncommunicativeMethodName:
67
+ enabled: true
68
+ exclude: []
69
+ reject:
70
+ - !ruby/regexp /^[a-z]$/
71
+ - !ruby/regexp /[0-9]$/
72
+ - !ruby/regexp /[A-Z]/
73
+ accept: []
74
+ UncommunicativeModuleName:
75
+ enabled: true
76
+ exclude: []
77
+ reject:
78
+ - !ruby/regexp /^.$/
79
+ - !ruby/regexp /[0-9]$/
80
+ accept: []
81
+ UncommunicativeParameterName:
82
+ enabled: true
83
+ exclude: []
84
+ reject:
85
+ - !ruby/regexp /^.$/
86
+ - !ruby/regexp /[0-9]$/
87
+ - !ruby/regexp /[A-Z]/
88
+ accept: []
89
+ UncommunicativeVariableName:
90
+ enabled: true
91
+ exclude: []
92
+ reject:
93
+ - !ruby/regexp /^.$/
94
+ - !ruby/regexp /[0-9]$/
95
+ - !ruby/regexp /[A-Z]/
96
+ accept: []
97
+ UnusedParameters:
98
+ enabled: true
99
+ exclude: []
100
+ UtilityFunction:
101
+ enabled: true
102
+ exclude: []
103
+ max_helper_calls: 0
@@ -0,0 +1,58 @@
1
+ AllCops:
2
+ Includes:
3
+ - '**/*.rake'
4
+ - 'Gemfile'
5
+ - 'Gemfile.devtools'
6
+ Excludes:
7
+ - '**/vendor/**'
8
+ - '**/benchmarks/**'
9
+
10
+ # Avoid parameter lists longer than three parameters.
11
+ ParameterLists:
12
+ Max: 3
13
+ CountKeywordArgs: true
14
+
15
+ # Avoid more than `Max` levels of nesting.
16
+ BlockNesting:
17
+ Max: 3
18
+
19
+ # Align with the style guide.
20
+ CollectionMethods:
21
+ PreferredMethods:
22
+ collect: 'map'
23
+ inject: 'reduce'
24
+ find: 'detect'
25
+ find_all: 'select'
26
+
27
+ # Do not force public/protected/private keyword to be indented at the same
28
+ # level as the def keyword. My personal preference is to outdent these keywords
29
+ # because I think when scanning code it makes it easier to identify the
30
+ # sections of code and visually separate them. When the keyword is at the same
31
+ # level I think it sort of blends in with the def keywords and makes it harder
32
+ # to scan the code and see where the sections are.
33
+ AccessControl:
34
+ Enabled: false
35
+
36
+ # Limit line length
37
+ LineLength:
38
+ Max: 79
39
+
40
+ # Disable documentation checking until a class needs to be documented once
41
+ Documentation:
42
+ Enabled: false
43
+
44
+ # Do not favor modifier if/unless usage when you have a single-line body
45
+ IfUnlessModifier:
46
+ Enabled: false
47
+
48
+ # Allow case equality operator (in limited use within the specs)
49
+ CaseEquality:
50
+ Enabled: false
51
+
52
+ # Constants do not always have to use SCREAMING_SNAKE_CASE
53
+ ConstantName:
54
+ Enabled: false
55
+
56
+ # Not all trivial readers/writers can be defined with attr_* methods
57
+ TrivialAccessors:
58
+ Enabled: false
@@ -0,0 +1,2 @@
1
+ ---
2
+ threshold: 53.3
data/data/.gitkeep ADDED
File without changes
data/lib/analects.rb ADDED
@@ -0,0 +1,37 @@
1
+ require 'tmpdir'
2
+ require 'pathname'
3
+ require 'delegate'
4
+
5
+ require 'inflecto'
6
+ require 'ice_nine'
7
+ require 'rmmseg'
8
+ require 'ting'
9
+
10
# Top-level namespace of the analects library.
module Analects
  # Parent of the directory that contains this file.
  ROOT = Pathname(__FILE__).dirname.parent

  class << self
    # Loads the rake task definitions on demand and instantiates them.
    #
    # All arguments and the optional block are forwarded unchanged to
    # Analects::RakeTasks.new.
    #
    # @return [Analects::RakeTasks]
    def init_rake_tasks(*args, &blk)
      require 'analects/rake_tasks'
      Analects::RakeTasks.new(*args, &blk)
    end

    # Tells whether every codepoint of +str+ lies inside one of the ranges
    # reported by Analects::Models::Zi.codepoint_ranges.
    #
    # An empty string has no codepoints to reject, so it yields true.
    #
    # @param str [#codepoints]
    # @return [Boolean]
    def cjk?(str)
      str.codepoints.all? { |codepoint|
        Analects::Models::Zi.codepoint_ranges.any? { |range| range.include?(codepoint) }
      }
    end
  end
end
24
+
25
+ require 'cjk_string'
26
+
27
+ require 'analects/version'
28
+ require 'analects/encoding'
29
+ require 'analects/cli/progress'
30
+ require 'analects/cedict_loader'
31
+ require 'analects/chise_ids_loader'
32
+ require 'analects/source'
33
+ require 'analects/library'
34
+ require 'analects/tokenizer'
35
+
36
+ require 'analects/models/zi'
37
+ require 'analects/models/kangxi_radical'
@@ -0,0 +1,44 @@
1
+ # encoding: UTF-8
2
+
3
module Analects
  # Reads dictionary data in the CC-CEDICT text layout from an IO-like
  # object.
  #
  # Leading lines starting with `#` form the preamble; preamble lines of the
  # form `#! key=value` are collected into #headers. Every other non-comment,
  # non-blank line is parsed into a four element array (see #field_names).
  #
  # The class is Enumerable, so `take`, `to_a`, `map`, ... are available.
  class CedictLoader
    include Enumerable

    # Preamble metadata line: `#! key=value`.
    HEADER_LINE = /^#! (.*)=(.*)/

    # Entry line: `TRADITIONAL SIMPLIFIED [pin1 yin1] /definitions/`.
    ENTRY_LINE = /^([^\s]*) ([^\s]*) \[([\w\d:,· ]+)\](.*)/

    # @return [Hash{String => String}] `#! key=value` pairs from the preamble
    attr_reader :headers

    # Reads +io+ completely and extracts the preamble headers.
    #
    # @param io [#read] source of the dictionary text
    def initialize(io)
      @contents = io.read
      @headers = {}
      @contents.each_line do |line|
        # The preamble ends at the first line that is not a comment.
        break unless line.start_with?('#')

        header = HEADER_LINE.match(line)
        @headers[header[1].strip] = header[2].strip if header
      end
    end

    # Names of the values in each entry yielded by #each, in order.
    #
    # @return [Array<Symbol>]
    def field_names
      [:traditional, :simplified, :pinyin, :definitions]
    end

    # Yields every dictionary entry as an array of four stripped strings.
    #
    # Comment lines (starting with `#`) and blank or whitespace-only lines
    # are skipped; the latter used to abort iteration with an
    # "Unexpected contents" error.
    #
    # @yieldparam entry [Array<String>] values in #field_names order
    # @return [Enumerator] when called without a block
    # @raise [RuntimeError] when a line is not a well-formed entry
    def each
      return enum_for(:each) unless block_given?

      @contents.each_line do |line|
        next if line.start_with?('#') || line.strip.empty?

        yield process_contents(line)
      end
    end

    private

    # Splits a single entry line into its four fields.
    #
    # @param line [String] raw line, surrounding whitespace allowed
    # @return [Array<String>] traditional, simplified, pinyin, definitions
    # @raise [RuntimeError] when the line does not look like an entry
    def process_contents(line)
      entry = ENTRY_LINE.match(line.strip)
      raise "Unexpected contents : #{line.inspect}" unless entry

      entry.captures.map(&:strip)
    end
  end
end