analects 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +21 -0
  3. data/.rvmrc +1 -0
  4. data/.travis.yml +14 -0
  5. data/Gemfile +17 -0
  6. data/Gemfile.devtools +71 -0
  7. data/Gemfile.lock +236 -0
  8. data/LICENSE.txt +674 -0
  9. data/README.md +81 -0
  10. data/Rakefile +26 -0
  11. data/SOURCES.md +17 -0
  12. data/analects.gemspec +29 -0
  13. data/bin/wp_hsk_filter +36 -0
  14. data/config/devtools.yml +2 -0
  15. data/config/flay.yml +3 -0
  16. data/config/flog.yml +2 -0
  17. data/config/mutant.yml +3 -0
  18. data/config/reek.yml +103 -0
  19. data/config/rubocop.yml +58 -0
  20. data/config/yardstick.yml +2 -0
  21. data/data/.gitkeep +0 -0
  22. data/lib/analects.rb +37 -0
  23. data/lib/analects/cedict_loader.rb +44 -0
  24. data/lib/analects/chise_ids_loader.rb +34 -0
  25. data/lib/analects/cli/progress.rb +37 -0
  26. data/lib/analects/encoding.rb +61 -0
  27. data/lib/analects/library.rb +68 -0
  28. data/lib/analects/models/kangxi_radical.rb +14 -0
  29. data/lib/analects/models/zi.rb +64 -0
  30. data/lib/analects/rake_tasks.rb +49 -0
  31. data/lib/analects/source.rb +70 -0
  32. data/lib/analects/tokenizer.rb +54 -0
  33. data/lib/analects/version.rb +3 -0
  34. data/lib/cjk_string.rb +56 -0
  35. data/lib/generators/analects.rb +20 -0
  36. data/lib/generators/analects/cedict/cedict_generator.rb +22 -0
  37. data/lib/generators/analects/cedict/templates/create_cedict_table.rb +12 -0
  38. data/lib/generators/analects/cedict/templates/model.rb +3 -0
  39. data/lib/generators/analects/cedict/templates/populate_cedict_table.rb +41 -0
  40. data/spec/analects/cedict_loader_spec.rb +48 -0
  41. data/spec/analects/chise_ids_loader_spec.rb +50 -0
  42. data/spec/analects/library_spec.rb +50 -0
  43. data/spec/analects/source_spec.rb +18 -0
  44. data/spec/spec_helper.rb +19 -0
  45. data/spec/test_data/chise_ids/IDS-foo.txt +10 -0
  46. metadata +221 -0
@@ -0,0 +1,3 @@
1
# Top-level namespace of the analects gem.
module Analects
  # Version of the gem. Frozen so that code holding a reference to the
  # constant cannot mutate the shared string in place.
  VERSION = '0.2.0'.freeze
end
data/lib/cjk_string.rb ADDED
@@ -0,0 +1,56 @@
1
# String wrapper that answers questions about the CJK characters it holds.
#
# All other messages are delegated to the wrapped String. "CJK character"
# here means a match of Analects::Models::Zi::REGEXP.
class CJKString < DelegateClass(String)
  # Returns the result of scanning the wrapped string with Zi::REGEXP.
  #
  # The result is memoized on first use, so it will not reflect later
  # mutations of the wrapped string.
  def cjk_chars
    @cjk_chars ||= scan(Analects::Models::Zi::REGEXP)
  end

  # True when the string contains exactly one CJK character.
  def one_cjk?
    cjk_chars.length == 1
  end

  # True when the number of CJK matches equals the string length.
  # Note that this is also true for the empty string (0 == 0).
  def all_cjk?
    length == cjk_chars.length
  end

  # True when the string contains at least one CJK character.
  #
  # The previous comparison (`> 1`) reported false for strings holding
  # exactly one CJK character, contradicting the "any" in the name.
  def any_cjk?
    !cjk_chars.empty?
  end
end
18
+
19
# String wrapper for a single character, with lookups into the Unicode
# range table Analects::Models::Zi::RANGES.
#
# RANGES is iterated as (key, info) pairs where info[:range] and
# info[:name] are read; its full definition lives outside this file.
class CJKChar < DelegateClass(String)
  # Returns the key of the first RANGES entry whose :range includes this
  # character's codepoint, or nil when no entry matches.
  #
  # The explicit nil matters: without it the method would return the
  # value of `each` (the RANGES collection itself) on a miss.
  def unicode_range
    point = codepoint
    Analects::Models::Zi::RANGES.each do |name, info|
      return name if info[:range].include? point
    end
    nil
  end

  # Returns the :name of the matching RANGES entry, or nil when the
  # character falls in none of the known ranges.
  def unicode_range_name
    key = unicode_range
    key && Analects::Models::Zi::RANGES[key][:name]
  end

  # Returns the codepoint of the first character of the wrapped string
  # (nil for an empty string).
  def codepoint
    codepoints.first
  end
end
34
+
35
# Conversion function: coerces +str+ into a CJKChar.
#
# * A CJKChar is returned unchanged.
# * A string of length 0 or 1 is wrapped as-is.
# * A longer string must be a hexadecimal codepoint, optionally prefixed
#   with "U+" (e.g. "U+4E2D" or "4e2d"); it is converted to the character
#   with that codepoint. Note that any multi-character string made up
#   only of hex digits (e.g. "ab") is read as a codepoint.
#
# Raises ArgumentError for any other multi-character string.
def CJKChar(str)
  return str if str.is_a? CJKChar

  if str.length > 1
    # \A...\z anchors the whole string so trailing garbage is rejected.
    hex_digits = str[/\A(?:U\+)?([0-9A-Fa-f]+)\z/, 1]
    raise ArgumentError, 'CJKChar must have length one' unless hex_digits

    # pack('U') requires an Integer codepoint; passing the hex digits as
    # a String raises TypeError.
    str = [hex_digits.hex].pack('U')
  end

  CJKChar.new(str)
end
48
+
49
# Conversion function: coerces +str+ into a CJKString.
#
# A CJKString is returned unchanged. Anything responding to +to_cjk+ is
# converted through it first. The value that ends up wrapped is frozen
# in place (the caller's object is frozen, not a copy).
def CJKString(str)
  return str if str.is_a?(CJKString)

  wrapped = str.respond_to?(:to_cjk) ? str.to_cjk : str
  CJKString.new(wrapped.freeze)
end
@@ -0,0 +1,20 @@
1
+ require 'rails/generators/named_base'
2
+ require 'rails/generators/migration'
3
+ require 'rails/generators/active_record'
4
+
5
module Analects
  module Generators
    # Shared base class for the analects Rails generators. Adds migration
    # support and resolves each generator's template directory.
    class Base < ::Rails::Generators::Base
      include Rails::Generators::Migration

      class << self
        # Delegates migration numbering to ActiveRecord's generator base.
        def next_migration_number(dirname)
          ActiveRecord::Generators::Base.next_migration_number(dirname)
        end

        # Absolute path of "analects/<generator_name>/templates" next to
        # this file, memoized per generator class.
        def source_root
          @_analects_source_root ||= File.expand_path(
            File.join('analects', generator_name, 'templates'),
            File.dirname(__FILE__)
          )
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ require 'generators/analects'
2
+
3
module Analects
  module Generators
    # Generator that installs the CC-CEDICT migrations and model into an
    # application.
    class CedictGenerator < Analects::Generators::Base
      desc "Description:\n Copy analects CC-CEDICT files to your application.\n"

      # Installs the migration that creates the cedict table.
      def analects_create_cedict_table
        migration_template('create_cedict_table.rb', 'db/migrate/create_cedict_table.rb')
      end

      # Installs the migration that fills the cedict table.
      def analects_populate_cedict_table
        migration_template('populate_cedict_table.rb', 'db/migrate/populate_cedict_table.rb')
      end

      # Installs the Cedict model.
      def analects_cedict_model
        template('model.rb', 'app/models/cedict.rb')
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
# Migration creating the +cedicts+ table: one string column per CC-CEDICT
# field, plus timestamps.
class CreateCedictTable < ActiveRecord::Migration
  def change
    create_table :cedicts do |table|
      [:simplified, :traditional, :pinyin, :english].each do |column|
        table.string column
      end

      table.timestamps
    end
  end
end
@@ -0,0 +1,3 @@
1
# ActiveRecord model for one CC-CEDICT entry; all four dictionary fields
# are whitelisted for mass assignment.
class Cedict < ActiveRecord::Base
  attr_accessible(:simplified, :traditional, :pinyin, :english)
end
@@ -0,0 +1,41 @@
1
+ require 'analects/cedict'
2
+ require 'analects/cli/progress'
3
+
4
# Data migration that loads every CC-CEDICT entry into the Cedict model.
#
# The dictionary file is read from ENV['CEDICT_PATH'] when set, otherwise
# from Analects::CedictLoader::LOCAL. A download is attempted when the
# file is missing.
class PopulateCedictTable < ActiveRecord::Migration
  # Inserts all entries inside a single transaction.
  #
  # Raises RuntimeError when the file is still missing after the download
  # attempt.
  def up
    path = ENV['CEDICT_PATH'] || Analects::CedictLoader::LOCAL
    unless File.exist? path
      puts "-- cedict file not found, downloading"
      Analects::CedictLoader.download!
    end
    raise "CC-Cedict file not found and failed to download" unless File.exist? path

    # Block form guarantees the handle is closed even when an insert raises.
    File.open(path) do |file|
      loader = Analects::CedictLoader.new(file)
      puts "-- Inserting CC-CEDICT"
      loader.headers.each do |key, value|
        puts " #{key}=#{value}"
      end
      # Named `progress` rather than `p` to avoid shadowing Kernel#p.
      progress = Analects::CLI::Progress.new(Integer(loader.headers['entries']) - 1, 5000, ' ')
      Cedict.transaction do
        loader.each do |traditional, simplified, pinyin, english|
          progress.next
          Cedict.create!(
            :traditional => traditional,
            :simplified => simplified,
            :pinyin => pinyin,
            :english => english
          )
        end
      end
    end
    puts
  end

  # Removes every row from the Cedict table.
  def down
    Cedict.delete_all
  end
end
@@ -0,0 +1,48 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+
4
# Specs for Analects::CedictLoader, driven by a small in-memory CC-CEDICT
# sample (two header lines, three entries).
describe Analects::CedictLoader do
  let(:contents) do
    '# CC-CEDICT
# Community maintained free Chinese-English dictionary.
#
#! charset=UTF-8
#! entries=104941
佰 佰 [bai3] /hundred (banker\'s anti-fraud numeral)/
佱 佱 [fa3] /old variant of 法[fa3]/law/
佳 佳 [jia1] /beautiful/fine/good/
'
  end

  let(:cedict_loader) { Analects::CedictLoader.new(StringIO.new(contents)) }

  it "should parse headers" do
    cedict_loader.headers.should == { 'charset' => 'UTF-8', 'entries' => '104941' }
  end

  it "should parse entries" do
    cedict_loader.take(1).should == [['佰', '佰', 'bai3', '/hundred (banker\'s anti-fraud numeral)/']]
  end

  it "can be enumerated multiple times" do
    2.times do
      cedict_loader.each do |entry|
        entry.should == ['佰', '佰', 'bai3', '/hundred (banker\'s anti-fraud numeral)/']
        break
      end
    end
  end

  it "is an enumerable" do
    cedict_loader.count.should == 3
    cedict_loader.to_a.should be_instance_of(Array)
    cedict_loader.each { |entry| entry.size.should == 4 }
  end

  # Only defined on Ruby >= 1.9, matching the original version guard.
  if RUBY_VERSION.split('.').take(2).join('.').to_f >= 1.9
    it "returns an enumerator when each is called without block" do
      # Previously the result of is_a? was discarded, so this example
      # could never fail.
      cedict_loader.each.should be_a(Enumerator)
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+
4
# Specs for Analects::ChiseIdsLoader, run against the fixture directory
# spec/test_data/chise_ids.
describe Analects::ChiseIdsLoader do
  subject(:loader) do
    fixture_dir = Analects::ROOT.join('spec/test_data/chise_ids')
    Analects::ChiseIdsLoader.new(fixture_dir, only_unicode)
  end

  let(:only_unicode) { false }
  let(:entries) { loader.each.to_a }

  describe '#field_names' do
    it 'should return names for the fields in an IDS record' do
      expect(loader.field_names).to eq([:name, :representation, :ids])
    end
  end

  context 'with a loader that only returns data for unicode characters' do
    let(:only_unicode) { true }

    it 'should still return the unicode entries' do
      expect(entries.first).to eq(['U+4E0D', '不', '不'])
    end

    it 'should filter out the non-unicode entries' do
      entries.each do |entry|
        expect(entry.first).to match(/^U\+[0-9A-F]{4}/)
      end
    end
  end

  describe '#each' do
    it 'should return an enumerator when no block is given' do
      expect(loader.each).to be_instance_of(Enumerator)
    end

    it 'should loop over all entries' do
      expect(entries.first).to eq(['U+4E0D', '不', '不'])
      expect(entries.last).to eq(['CB00003', '&CB00003;', '⿱㓛&GT-47348;'])
    end

    it 'should filter out entries without a tab' do
      entry_no_tab = entries.find { |entry| entry.join(' ') =~ /Entry without a tab/ }
      expect(entry_no_tab).to be_nil
    end
  end
end
@@ -0,0 +1,50 @@
1
+ require 'tempfile'
2
+ require 'spec_helper'
3
+ require 'securerandom'
4
+
5
# Specs for Analects::Library: option handling and the cedict and
# chise_ids sources it exposes.
describe Analects::Library do
  subject(:library) { described_class.new(options) }

  let(:options) { {} }

  context 'with a data_dir specified' do
    let(:data_dir) { File.join(Dir.tmpdir, 'analects-' + SecureRandom.hex(16)) }
    let(:options) { { data_dir: data_dir } }

    it 'should set that data dir on the sources' do
      subject.cedict.data_dir.should == data_dir
    end
  end

  describe "#cedict" do
    subject(:cedict) { library.cedict }

    its(:name) { should == :cedict }
    its(:location) { should == File.join(Dir.home, '.analects/cedict_1_0_ts_utf-8_mdbg.txt') }

    it "should download and unpack the CEDICT archive" do
      # Both retrieval steps are stubbed, so nothing is actually fetched.
      cedict.should_receive(:retrieve_http).once.with(Analects::CEDICT_URL).and_return(:a_stream)
      cedict.should_receive(:retrieve_gunzip).once.with(:a_stream).and_return(:an_unzipped_stream)

      cedict.retrieve!
    end
  end

  it "should have a CHISE IDS source" do
    library.chise_ids.name.should == :chise_ids
  end

  describe "#chise_ids" do
    subject(:chise_ids) { library.chise_ids }

    its(:name) { should == :chise_ids }
    its(:retrieval) { should == [:git] }
    its(:location) { should == File.join(Dir.home, '.analects/chise_ids') }
    its(:url) { should == Analects::CHISE_IDS_URL }
  end
end
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Analects::Source: retrieval steps are chained, each step
# receiving the previous step's result.
describe Analects::Source do
  let(:url) { 'a_url' }
  let(:source) { Analects::Source.new(retrieval: [:step1, :step2, :step3], url: url) }

  it "should do retrieve by pipelining the retrieve methods" do
    source.should_receive(:retrieve_step1).with(url).once.and_return(:intermediary_result_1)
    source.should_receive(:retrieve_step2).with(:intermediary_result_1).once.and_return(:intermediary_result_2)
    source.should_receive(:retrieve_step3).with(:intermediary_result_2).once.and_return(:result)
    source.retrieve!
  end

  it "should accept both arrays or single values as retrieval methods" do
    Analects::Source.new(retrieval: :step1).retrieval.should == [:step1]
    Analects::Source.new(retrieval: [:step1]).retrieval.should == [:step1]
  end
end
@@ -0,0 +1,19 @@
1
+ # encoding: utf-8
2
+
3
# Coverage reporting is opt-in: it only runs when COVERAGE=true is set in
# the environment.
if ENV['COVERAGE'] == 'true'
  require 'simplecov'
  require 'coveralls'

  # Report both as local HTML and to Coveralls.
  coverage_formatters = [
    SimpleCov::Formatter::HTMLFormatter,
    Coveralls::SimpleCov::Formatter
  ]
  SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[*coverage_formatters]

  SimpleCov.start do
    command_name 'spec:unit'
    minimum_coverage 72.64
  end
end
17
+
18
+ require 'devtools/spec_helper'
19
+ require 'analects'
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+ U+4E0D 不 不
3
+ U+4E0E 与 ⿹&CDP-8BBF;一
4
+ U+4E12 丒 ⿱刃一
5
+ U+4E15 丕 ⿱不一
6
+ Entry without a tab
7
+ U+4E19 丙 ⿱一内
8
+ CB00001 &I-CB00001; ⿰𠤕欠
9
+ CB00002 &CB00002; ⿰⿱匕示頁
10
+ CB00003 &CB00003; ⿱㓛&GT-47348;
metadata ADDED
@@ -0,0 +1,221 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: analects
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Arne Brasseur
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-03-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: simplecov
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rubygems-tasks
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: inflecto
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: 0.0.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 0.0.2
97
+ - !ruby/object:Gem::Dependency
98
+ name: rmmseg
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: ting
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: 0.9.0
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ~>
123
+ - !ruby/object:Gem::Version
124
+ version: 0.9.0
125
+ - !ruby/object:Gem::Dependency
126
+ name: ice_nine
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - '>='
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - '>='
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ description: Toolkit for Mandarin language learning apps
140
+ email:
141
+ - arne.brasseur@gmail.com
142
+ executables: []
143
+ extensions: []
144
+ extra_rdoc_files:
145
+ - README.md
146
+ files:
147
+ - .gitignore
148
+ - .rvmrc
149
+ - .travis.yml
150
+ - Gemfile
151
+ - Gemfile.devtools
152
+ - Gemfile.lock
153
+ - LICENSE.txt
154
+ - README.md
155
+ - Rakefile
156
+ - SOURCES.md
157
+ - analects.gemspec
158
+ - bin/wp_hsk_filter
159
+ - config/devtools.yml
160
+ - config/flay.yml
161
+ - config/flog.yml
162
+ - config/mutant.yml
163
+ - config/reek.yml
164
+ - config/rubocop.yml
165
+ - config/yardstick.yml
166
+ - data/.gitkeep
167
+ - lib/analects.rb
168
+ - lib/analects/cedict_loader.rb
169
+ - lib/analects/chise_ids_loader.rb
170
+ - lib/analects/cli/progress.rb
171
+ - lib/analects/encoding.rb
172
+ - lib/analects/library.rb
173
+ - lib/analects/models/kangxi_radical.rb
174
+ - lib/analects/models/zi.rb
175
+ - lib/analects/rake_tasks.rb
176
+ - lib/analects/source.rb
177
+ - lib/analects/tokenizer.rb
178
+ - lib/analects/version.rb
179
+ - lib/cjk_string.rb
180
+ - lib/generators/analects.rb
181
+ - lib/generators/analects/cedict/cedict_generator.rb
182
+ - lib/generators/analects/cedict/templates/create_cedict_table.rb
183
+ - lib/generators/analects/cedict/templates/model.rb
184
+ - lib/generators/analects/cedict/templates/populate_cedict_table.rb
185
+ - spec/analects/cedict_loader_spec.rb
186
+ - spec/analects/chise_ids_loader_spec.rb
187
+ - spec/analects/library_spec.rb
188
+ - spec/analects/source_spec.rb
189
+ - spec/spec_helper.rb
190
+ - spec/test_data/chise_ids/IDS-foo.txt
191
+ homepage: https://github.com/arnebrasseur/analects.rb
192
+ licenses:
193
+ - GPL-3.0
194
+ metadata: {}
195
+ post_install_message:
196
+ rdoc_options: []
197
+ require_paths:
198
+ - lib
199
+ required_ruby_version: !ruby/object:Gem::Requirement
200
+ requirements:
201
+ - - '>='
202
+ - !ruby/object:Gem::Version
203
+ version: '0'
204
+ required_rubygems_version: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - '>='
207
+ - !ruby/object:Gem::Version
208
+ version: '0'
209
+ requirements: []
210
+ rubyforge_project:
211
+ rubygems_version: 2.2.1
212
+ signing_key:
213
+ specification_version: 4
214
+ summary: Toolkit for Mandarin language learning apps
215
+ test_files:
216
+ - spec/analects/cedict_loader_spec.rb
217
+ - spec/analects/chise_ids_loader_spec.rb
218
+ - spec/analects/library_spec.rb
219
+ - spec/analects/source_spec.rb
220
+ - spec/spec_helper.rb
221
+ - spec/test_data/chise_ids/IDS-foo.txt