analects 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +21 -0
  3. data/.rvmrc +1 -0
  4. data/.travis.yml +14 -0
  5. data/Gemfile +17 -0
  6. data/Gemfile.devtools +71 -0
  7. data/Gemfile.lock +236 -0
  8. data/LICENSE.txt +674 -0
  9. data/README.md +81 -0
  10. data/Rakefile +26 -0
  11. data/SOURCES.md +17 -0
  12. data/analects.gemspec +29 -0
  13. data/bin/wp_hsk_filter +36 -0
  14. data/config/devtools.yml +2 -0
  15. data/config/flay.yml +3 -0
  16. data/config/flog.yml +2 -0
  17. data/config/mutant.yml +3 -0
  18. data/config/reek.yml +103 -0
  19. data/config/rubocop.yml +58 -0
  20. data/config/yardstick.yml +2 -0
  21. data/data/.gitkeep +0 -0
  22. data/lib/analects.rb +37 -0
  23. data/lib/analects/cedict_loader.rb +44 -0
  24. data/lib/analects/chise_ids_loader.rb +34 -0
  25. data/lib/analects/cli/progress.rb +37 -0
  26. data/lib/analects/encoding.rb +61 -0
  27. data/lib/analects/library.rb +68 -0
  28. data/lib/analects/models/kangxi_radical.rb +14 -0
  29. data/lib/analects/models/zi.rb +64 -0
  30. data/lib/analects/rake_tasks.rb +49 -0
  31. data/lib/analects/source.rb +70 -0
  32. data/lib/analects/tokenizer.rb +54 -0
  33. data/lib/analects/version.rb +3 -0
  34. data/lib/cjk_string.rb +56 -0
  35. data/lib/generators/analects.rb +20 -0
  36. data/lib/generators/analects/cedict/cedict_generator.rb +22 -0
  37. data/lib/generators/analects/cedict/templates/create_cedict_table.rb +12 -0
  38. data/lib/generators/analects/cedict/templates/model.rb +3 -0
  39. data/lib/generators/analects/cedict/templates/populate_cedict_table.rb +41 -0
  40. data/spec/analects/cedict_loader_spec.rb +48 -0
  41. data/spec/analects/chise_ids_loader_spec.rb +50 -0
  42. data/spec/analects/library_spec.rb +50 -0
  43. data/spec/analects/source_spec.rb +18 -0
  44. data/spec/spec_helper.rb +19 -0
  45. data/spec/test_data/chise_ids/IDS-foo.txt +10 -0
  46. metadata +221 -0
@@ -0,0 +1,3 @@
1
+ module Analects
2
+ VERSION = '0.2.0'
3
+ end
data/lib/cjk_string.rb ADDED
@@ -0,0 +1,56 @@
1
# String wrapper that adds helpers for inspecting the CJK characters it
# contains. Every regular String method is forwarded to the wrapped string
# through DelegateClass(String).
class CJKString < DelegateClass(String)
  # Substrings of the wrapped string that match Analects::Models::Zi::REGEXP.
  # The result is computed once and memoized.
  #
  # NOTE(review): the predicates below assume REGEXP matches exactly one
  # character per match - confirm against Analects::Models::Zi.
  #
  # @return [Array<String>]
  def cjk_chars
    @cjk_chars ||= scan(Analects::Models::Zi::REGEXP)
  end

  # @return [Boolean] true when exactly one match was found
  def one_cjk?
    cjk_chars.length == 1
  end

  # @return [Boolean] true when the number of matches equals the string length
  def all_cjk?
    length == cjk_chars.length
  end

  # @return [Boolean] true when at least one match was found
  def any_cjk?
    # A single CJK character already counts as "any"; the previous `> 1`
    # comparison wrongly reported false for exactly one match.
    !cjk_chars.empty?
  end
end
18
+
19
# String wrapper for a single character, with helpers for looking the
# character up in Analects::Models::Zi::RANGES. Every regular String method
# is forwarded to the wrapped string through DelegateClass(String).
class CJKChar < DelegateClass(String)
  # Key of the first RANGES entry whose :range includes this character's
  # code point.
  #
  # @return [Object, nil] the matching key, or nil when no range matches
  #   (previously the whole RANGES hash leaked out as the return value of
  #   `each` in that case)
  def unicode_range
    entry = Analects::Models::Zi::RANGES.find do |_key, info|
      info[:range].include?(codepoint)
    end
    entry && entry.first
  end

  # The :name stored in RANGES for this character's range.
  #
  # @return [Object, nil] the :name value, or nil when no range matches
  def unicode_range_name
    key = unicode_range
    key && Analects::Models::Zi::RANGES[key][:name]
  end

  # @return [Integer, nil] code point of the first character, nil when the
  #   wrapped string is empty
  def codepoint
    codepoints.first
  end
end
34
+
35
# Coerces +str+ into a CJKChar.
#
# Accepted inputs:
# * a CJKChar, which is returned unchanged;
# * a value whose length is at most one, which is wrapped as-is;
# * a hexadecimal code point, optionally prefixed with "U+" (for example
#   "U+4E0D" or "4e0d"), which is converted to the character it denotes.
#
# @param str [CJKChar, String]
# @return [CJKChar]
# @raise [ArgumentError] when +str+ is longer than one character and is not
#   a hexadecimal code point
def CJKChar(str)
  return str if str.is_a? CJKChar

  if str.length > 1
    # \A and \z anchor the whole input, so text that merely starts with hex
    # digits (e.g. "U+4E0Dxyz") is rejected instead of silently truncated.
    code_point = /\A(?:U\+)?([0-9A-Fa-f]+)\z/.match(str.to_s)
    raise ArgumentError, 'CJKChar must have length one' unless code_point

    # pack('U') needs an Integer; the captured hex digits were previously
    # passed as a String, which raised TypeError for every code point.
    str = [code_point[1].hex].pack('U')
  end

  CJKChar.new(str)
end
48
+
49
# Coerces +str+ into a CJKString.
#
# * A CJKString is returned unchanged.
# * An object responding to +to_cjk+ is converted with it first.
# * Anything else is wrapped directly.
#
# The wrapped value is frozen. A value that is not frozen yet is duplicated
# first, so the caller's own object is no longer frozen as a side effect.
#
# @param str [CJKString, String, #to_cjk]
# @return [CJKString]
def CJKString(str)
  return str if str.is_a? CJKString

  str = str.to_cjk if str.respond_to? :to_cjk
  wrapped = str.frozen? ? str : str.dup.freeze
  CJKString.new(wrapped)
end
@@ -0,0 +1,20 @@
1
+ require 'rails/generators/named_base'
2
+ require 'rails/generators/migration'
3
+ require 'rails/generators/active_record'
4
+
5
module Analects
  module Generators
    # Shared parent for the analects Rails generators; mixes in migration
    # support on top of Rails::Generators::Base.
    class Base < ::Rails::Generators::Base
      include Rails::Generators::Migration

      class << self
        # Delegates migration numbering to ActiveRecord's generator base.
        def next_migration_number(dirname)
          ActiveRecord::Generators::Base.next_migration_number(dirname)
        end

        # Absolute path of analects/<generator_name>/templates next to this
        # file, memoized on the class that receives the call.
        def source_root
          @_analects_source_root ||= begin
            relative = File.join('analects', generator_name, 'templates')
            File.expand_path(relative, File.dirname(__FILE__))
          end
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ require 'generators/analects'
2
+
3
+ module Analects
4
+ module Generators
5
+ class CedictGenerator < Analects::Generators::Base
6
+
7
+ desc %{Description:\n Copy analects CC-CEDICT files to your application.\n}
8
+
9
+ def analects_create_cedict_table
10
+ migration_template "create_cedict_table.rb", "db/migrate/create_cedict_table.rb"
11
+ end
12
+
13
+ def analects_populate_cedict_table
14
+ migration_template "populate_cedict_table.rb", "db/migrate/populate_cedict_table.rb"
15
+ end
16
+
17
+ def analects_cedict_model
18
+ template "model.rb", "app/models/cedict.rb"
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,12 @@
1
# Creates the table that stores CC-CEDICT entries, one row per entry.
class CreateCedictTable < ActiveRecord::Migration
  def change
    create_table :cedicts do |t|
      t.string :simplified
      t.string :traditional
      t.string :pinyin
      # The populate migration stores the whole "/def 1/def 2/.../" list in
      # this column. A string column is commonly limited to 255 characters,
      # so text is used to avoid truncating or rejecting long entries.
      t.text :english

      t.timestamps
    end
  end
end
@@ -0,0 +1,3 @@
1
+ class Cedict < ActiveRecord::Base
2
+ attr_accessible :simplified, :traditional, :pinyin, :english
3
+ end
@@ -0,0 +1,41 @@
1
+ require 'analects/cedict'
2
+ require 'analects/cli/progress'
3
+
4
# Fills the cedicts table from a CC-CEDICT file.
#
# The file is read from ENV['CEDICT_PATH'] when set, otherwise from
# Analects::CedictLoader::LOCAL.
class PopulateCedictTable < ActiveRecord::Migration
  # Loads every entry yielded by Analects::CedictLoader into Cedict inside a
  # single transaction.
  #
  # Raises RuntimeError when the file is still missing after the download
  # attempt.
  def up
    path = ENV['CEDICT_PATH'] || Analects::CedictLoader::LOCAL
    unless File.exist? path
      puts "-- cedict file not found, downloading"
      Analects::CedictLoader.download!
    end
    raise "CC-Cedict file not found and failed to download" unless File.exist? path

    # Block form closes the file even when an insert raises; the previous
    # explicit close was skipped on any exception.
    File.open(path) do |file|
      loader = Analects::CedictLoader.new(file)
      puts "-- Inserting CC-CEDICT"
      loader.headers.each do |key, value|
        puts " #{key}=#{value}"
      end
      # NOTE(review): the meaning of the 5000 and ' ' arguments is defined by
      # Analects::CLI::Progress, which is not visible here - confirm there.
      progress = Analects::CLI::Progress.new(Integer(loader.headers['entries'])-1, 5000, ' ')
      Cedict.transaction do
        loader.each do |traditional, simplified, pinyin, english|
          progress.next
          Cedict.create!(
            :traditional => traditional,
            :simplified => simplified,
            :pinyin => pinyin,
            :english => english
          )
        end
      end
    end
    puts
  end

  # Removes every row from the cedicts table.
  def down
    Cedict.delete_all
  end
end
@@ -0,0 +1,48 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+
4
+ describe Analects::CedictLoader do
5
+ let ( :contents ) do
6
+ '# CC-CEDICT
7
+ # Community maintained free Chinese-English dictionary.
8
+ #
9
+ #! charset=UTF-8
10
+ #! entries=104941
11
+ 佰 佰 [bai3] /hundred (banker\'s anti-fraud numeral)/
12
+ 佱 佱 [fa3] /old variant of 法[fa3]/law/
13
+ 佳 佳 [jia1] /beautiful/fine/good/
14
+ '
15
+ end
16
+
17
+ let ( :cedict_loader ) { Analects::CedictLoader.new( StringIO.new(contents) ) }
18
+
19
+ it "should parse headers" do
20
+ cedict_loader.headers.should == { 'charset' => 'UTF-8', 'entries' => '104941' }
21
+ end
22
+
23
+ it "should parse entries" do
24
+ cedict_loader.take(1).should == [ ['佰', '佰', 'bai3', '/hundred (banker\'s anti-fraud numeral)/' ] ]
25
+ end
26
+
27
+ it "can be enumerated multiple times" do
28
+ 2.times do
29
+ cedict_loader.each do |x|
30
+ x.should == ['佰', '佰', 'bai3', '/hundred (banker\'s anti-fraud numeral)/' ]
31
+ break;
32
+ end
33
+ end
34
+ end
35
+
36
+ it "is an enumerable" do
37
+ cedict_loader.count.should === 3
38
+ cedict_loader.to_a.should be_instance_of( Array )
39
+ cedict_loader.map {|x| x.size.should === 4 }
40
+ end
41
+
42
# Only runs on Ruby 1.9 and later, as selected by the version check below.
if RUBY_VERSION.split('.').take(2).join('.').to_f >= 1.9
  it "returns an enumerator when each is called without block" do
    # The bare `is_a?` call used before discarded its result, so the example
    # could never fail; an expectation makes it a real check.
    cedict_loader.each.should be_kind_of(Enumerator)
  end
end
47
+
48
+ end
@@ -0,0 +1,50 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+
4
+ describe Analects::ChiseIdsLoader do
5
+ subject(:loader) do
6
+ Analects::ChiseIdsLoader.new(Analects::ROOT.join('spec/test_data/chise_ids'), only_unicode)
7
+ end
8
+
9
+ let(:only_unicode) { false }
10
+ let(:entries) { loader.each.to_a }
11
+
12
+ describe '#field_names' do
13
+ it 'should return names for the fields in an IDS record' do
14
+ expect(loader.field_names).to eq([:name, :representation, :ids])
15
+ end
16
+ end
17
+
18
+ context 'with a loader that only returns data for unicode characters' do
19
+ let(:only_unicode) { true }
20
+
21
+ it 'should still return the unicode entries' do
22
+ expect(entries.first).to eq(['U+4E0D', '不', '不'])
23
+ end
24
+
25
+ it 'should filter out the non-unicode entries' do
26
+ entries.each do |entry|
27
+ expect(entry.first).to match /^U\+[0-9A-F]{4}/
28
+ end
29
+ end
30
+ end
31
+
32
+ describe '#each' do
33
+ it 'should return an enumerator when no block is given' do
34
+ expect(loader.each).to be_instance_of(Enumerator)
35
+ end
36
+
37
+ it 'should loop over all entries' do
38
+ expect(entries.first).to eq(['U+4E0D', '不', '不'])
39
+ expect(entries.last).to eq(['CB00003', '&CB00003;', '⿱㓛&GT-47348;'])
40
+ end
41
+
42
+ it 'should filter out entries without a tab' do
43
+ entry_no_tab = entries.detect do |entry|
44
+ entry.join(' ') =~ /Entry without a tab/
45
+ end
46
+ expect(entry_no_tab).to be_nil
47
+ end
48
+ end
49
+
50
+ end
@@ -0,0 +1,50 @@
1
+ require 'tempfile'
2
+ require 'spec_helper'
3
+ require 'securerandom'
4
+
5
+ describe Analects::Library do
6
+ subject(:library) {
7
+ described_class.new(options)
8
+ }
9
+ let(:options) { {} }
10
+
11
+ context 'with a data_dir specified' do
12
+ let(:data_dir) { File.join(Dir.tmpdir, 'analects-' + SecureRandom.hex(16)) }
13
+ let(:options) {
14
+ { data_dir: data_dir }
15
+ }
16
+
17
+ it 'should set that data dir on the sources' do
18
+ subject.cedict.data_dir.should == data_dir
19
+ end
20
+ end
21
+
22
+ describe "#cedict" do
23
+ subject(:cedict) { library.cedict }
24
+
25
+ its(:name) { should == :cedict }
26
+ its(:location) { should == File.join(Dir.home, '.analects/cedict_1_0_ts_utf-8_mdbg.txt') }
27
+
28
+ it "should download and unpack the CEDICT archive" do
29
+ cedict.should_receive(:retrieve_http).once.with(Analects::CEDICT_URL).and_return(:a_stream)
30
+ cedict.should_receive(:retrieve_gunzip).once.with(:a_stream).and_return(:an_unzipped_stream)
31
+
32
+ cedict.retrieve!
33
+ end
34
+ end
35
+
36
+ it "should have a CHISE IDS source" do
37
+ library.chise_ids.name.should == :chise_ids
38
+ end
39
+
40
+ describe "#chise_ids" do
41
+ subject (:chise_ids) { library.chise_ids }
42
+
43
+ its( :name ) { should == :chise_ids }
44
+ its( :retrieval ) { should == [ :git ] }
45
+ its(:location ) { should == File.join(Dir.home, '.analects/chise_ids') }
46
+ its( :url ) { should == Analects::CHISE_IDS_URL}
47
+ end
48
+
49
+
50
+ end
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
+ describe Analects::Source do
4
+ let( :url ) { 'a_url' }
5
+ let( :source ) { Analects::Source.new( :retrieval => [ :step1, :step2, :step3 ], :url => url ) }
6
+
7
+ it "should do retrieve by pipelining the retrieve methods" do
8
+ source.should_receive(:retrieve_step1).with(url).once.and_return(:intermediary_result_1)
9
+ source.should_receive(:retrieve_step2).with(:intermediary_result_1).once.and_return(:intermediary_result_2)
10
+ source.should_receive(:retrieve_step3).with(:intermediary_result_2).once.and_return(:result)
11
+ source.retrieve!
12
+ end
13
+
14
+ it "should accept both arrays or single values as retrieval methods" do
15
+ Analects::Source.new( :retrieval => :step1 ).retrieval.should == [ :step1 ]
16
+ Analects::Source.new( :retrieval => [ :step1 ] ).retrieval.should == [ :step1 ]
17
+ end
18
+ end
@@ -0,0 +1,19 @@
1
+ # encoding: utf-8
2
+
3
+ if ENV['COVERAGE'] == 'true'
4
+ require 'simplecov'
5
+ require 'coveralls'
6
+
7
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
8
+ SimpleCov::Formatter::HTMLFormatter,
9
+ Coveralls::SimpleCov::Formatter
10
+ ]
11
+
12
+ SimpleCov.start do
13
+ command_name 'spec:unit'
14
+ minimum_coverage 72.64
15
+ end
16
+ end
17
+
18
+ require 'devtools/spec_helper'
19
+ require 'analects'
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+ U+4E0D 不 不
3
+ U+4E0E 与 ⿹&CDP-8BBF;一
4
+ U+4E12 丒 ⿱刃一
5
+ U+4E15 丕 ⿱不一
6
+ Entry without a tab
7
+ U+4E19 丙 ⿱一内
8
+ CB00001 &I-CB00001; ⿰𠤕欠
9
+ CB00002 &CB00002; ⿰⿱匕示頁
10
+ CB00003 &CB00003; ⿱㓛&GT-47348;
metadata ADDED
@@ -0,0 +1,221 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: analects
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Arne Brasseur
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-03-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: simplecov
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rubygems-tasks
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: inflecto
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: 0.0.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 0.0.2
97
+ - !ruby/object:Gem::Dependency
98
+ name: rmmseg
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: ting
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: 0.9.0
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ~>
123
+ - !ruby/object:Gem::Version
124
+ version: 0.9.0
125
+ - !ruby/object:Gem::Dependency
126
+ name: ice_nine
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - '>='
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - '>='
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ description: Toolkit for Mandarin language learning apps
140
+ email:
141
+ - arne.brasseur@gmail.com
142
+ executables: []
143
+ extensions: []
144
+ extra_rdoc_files:
145
+ - README.md
146
+ files:
147
+ - .gitignore
148
+ - .rvmrc
149
+ - .travis.yml
150
+ - Gemfile
151
+ - Gemfile.devtools
152
+ - Gemfile.lock
153
+ - LICENSE.txt
154
+ - README.md
155
+ - Rakefile
156
+ - SOURCES.md
157
+ - analects.gemspec
158
+ - bin/wp_hsk_filter
159
+ - config/devtools.yml
160
+ - config/flay.yml
161
+ - config/flog.yml
162
+ - config/mutant.yml
163
+ - config/reek.yml
164
+ - config/rubocop.yml
165
+ - config/yardstick.yml
166
+ - data/.gitkeep
167
+ - lib/analects.rb
168
+ - lib/analects/cedict_loader.rb
169
+ - lib/analects/chise_ids_loader.rb
170
+ - lib/analects/cli/progress.rb
171
+ - lib/analects/encoding.rb
172
+ - lib/analects/library.rb
173
+ - lib/analects/models/kangxi_radical.rb
174
+ - lib/analects/models/zi.rb
175
+ - lib/analects/rake_tasks.rb
176
+ - lib/analects/source.rb
177
+ - lib/analects/tokenizer.rb
178
+ - lib/analects/version.rb
179
+ - lib/cjk_string.rb
180
+ - lib/generators/analects.rb
181
+ - lib/generators/analects/cedict/cedict_generator.rb
182
+ - lib/generators/analects/cedict/templates/create_cedict_table.rb
183
+ - lib/generators/analects/cedict/templates/model.rb
184
+ - lib/generators/analects/cedict/templates/populate_cedict_table.rb
185
+ - spec/analects/cedict_loader_spec.rb
186
+ - spec/analects/chise_ids_loader_spec.rb
187
+ - spec/analects/library_spec.rb
188
+ - spec/analects/source_spec.rb
189
+ - spec/spec_helper.rb
190
+ - spec/test_data/chise_ids/IDS-foo.txt
191
+ homepage: https://github.com/arnebrasseur/analects.rb
192
+ licenses:
193
+ - GPL-3.0
194
+ metadata: {}
195
+ post_install_message:
196
+ rdoc_options: []
197
+ require_paths:
198
+ - lib
199
+ required_ruby_version: !ruby/object:Gem::Requirement
200
+ requirements:
201
+ - - '>='
202
+ - !ruby/object:Gem::Version
203
+ version: '0'
204
+ required_rubygems_version: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - '>='
207
+ - !ruby/object:Gem::Version
208
+ version: '0'
209
+ requirements: []
210
+ rubyforge_project:
211
+ rubygems_version: 2.2.1
212
+ signing_key:
213
+ specification_version: 4
214
+ summary: Toolkit for Mandarin language learning apps
215
+ test_files:
216
+ - spec/analects/cedict_loader_spec.rb
217
+ - spec/analects/chise_ids_loader_spec.rb
218
+ - spec/analects/library_spec.rb
219
+ - spec/analects/source_spec.rb
220
+ - spec/spec_helper.rb
221
+ - spec/test_data/chise_ids/IDS-foo.txt