mac_japanese 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
# Namespace of the mac_japanese gem.
module MacJapanese
  # Gem version string; the gemspec reads it as MacJapanese::VERSION.
  # NOTE(review): left unfrozen on purpose -- RubyGems of this era may mutate
  # the version string it is given; confirm before adding `.freeze`.
  VERSION = "0.0.1"
end
@@ -0,0 +1,20 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/mac_japanese/version', __FILE__)
3
+
4
# Gem specification for mac_japanese.
Gem::Specification.new do |gem|
  gem.authors       = ["labocho"]
  gem.email         = ["labocho@penguinlab.jp"]
  gem.description   = %q{Convert MacJapanese string to UTF-8 and vice versa.}
  gem.summary       = %q{Convert MacJapanese string to UTF-8 and vice versa.}
  gem.homepage      = "https://github.com/labocho/mac_japanese"

  # Split on the INPUT record separator ($/, i.e. "\n"). The previous `$\` is
  # the OUTPUT record separator, which is nil by default; String#split(nil)
  # then splits on any whitespace and breaks file names that contain spaces.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "mac_japanese"
  gem.require_paths = ["lib"]
  gem.version       = MacJapanese::VERSION

  gem.add_development_dependency "rspec", "~>2.11.0"
  gem.add_development_dependency "guard-rspec", "~>0.7.0"
  # NOTE(review): ruby-debug19 presumably only builds on Ruby 1.9 -- confirm
  # before installing development dependencies on a newer Ruby.
  gem.add_development_dependency "ruby-debug19"
end
@@ -0,0 +1,134 @@
1
+ require "spec_helper"
2
+ require "ruby-debug"
3
+
4
# Specs for MacJapanese.to_utf8 / MacJapanese.to_mac_japanese.
# Each example stores its input in @src; the lazily evaluated `subject`
# converts it when `should` is called.
describe MacJapanese do
  describe ".to_utf8" do
    # These conversions must give the same result for both use_pua settings.
    [true, false].each do |use_pua|
      context "use_pua: #{use_pua}" do
        let(:options) { {} }
        subject { MacJapanese.to_utf8(@src, options.merge(use_pua: use_pua)) }
        it "should convert us-ascii chars to utf8" do
          @src = "foo\n".force_encoding("macjapan")
          should == "foo\n"
        end
        it "should convert additional backslash to utf8" do
          @src = "\x80".force_encoding("macjapan")
          should == "\\"
        end
        it "should convert halfwidth katakana to utf8" do
          @src = "\xA7".force_encoding("macjapan")
          should == "\u{FF67}"
        end
        it "should convert hiragana to utf8" do
          @src = "\x82\x9F".force_encoding("macjapan")
          should == "\u{3041}"
        end
        it "should convert apple additions to utf8" do
          @src = "\x85\x5E".force_encoding("macjapan")
          should == "\u{2474}"
        end
      end
    end
    context "default" do
      subject { MacJapanese.to_utf8(@src) }
      it "should expand composed char with pua" do
        @src = "\x85\xAB".force_encoding("macjapan")
        should == "\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      end
    end
    context "use_pua: true" do
      subject { MacJapanese.to_utf8(@src, use_pua: true) }
      it "should expand composed char with pua" do
        @src = "\x85\xAB".force_encoding("macjapan")
        should == "\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      end
    end
    context "use_pua: false" do
      subject { MacJapanese.to_utf8(@src, use_pua: false) }
      it "should expand composed char without pua" do
        @src = "\x85\xAB".force_encoding("macjapan")
        should == "\u{0058}\u{0049}\u{0049}\u{0049}"
      end
    end
    context "pass another encoding string to .to_utf8" do
      it "should encode just like passing mac japanese string" do
        # Force the encoding explicitly: a bare "\x82\x9F" literal is only
        # ASCII-8BIT when the source encoding is US-ASCII (the Ruby 1.9
        # default); with a UTF-8 source encoding the check below would fail.
        @src = "\x82\x9F".force_encoding("ASCII-8BIT")
        @src.encoding.should == Encoding::ASCII_8BIT
        MacJapanese.to_utf8(@src).should == "\u{3041}"
      end
    end
  end

  describe ".to_mac_japanese" do
    let(:options) { {} }
    subject { MacJapanese.to_mac_japanese(@src, options) }
    it "should convert us-ascii chars to mac_japanese" do
      @src = "foo\n"
      should == "foo\n".force_encoding("macjapan")
    end
    it "should convert additional backslash to mac_japanese" do
      @src = "\\"
      should == "\x80".force_encoding("macjapan")
    end
    it "should convert halfwidth katakana to mac_japanese" do
      @src = "\u{FF67}"
      should == "\xA7".force_encoding("macjapan")
    end
    it "should convert hiragana to mac_japanese" do
      @src = "\u{3041}"
      should == "\x82\x9F".force_encoding("macjapan")
    end
    it "should convert hiragana followed by composed characters to mac_japanese" do
      @src = "\u{3041}\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      should == "\x82\x9F\x85\xAB".force_encoding("macjapan")
    end
    it "should convert apple additions to mac_japanese" do
      @src = "\u{2474}"
      should == "\x85\x5E".force_encoding("macjapan")
    end
    it "should compose characters with pua" do
      @src = "\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      should == "\x85\xAB".force_encoding("macjapan")
    end
    it "should not compose characters without pua" do
      @src = "\u{0058}\u{0049}\u{0049}\u{0049}"
      should == "XIII".force_encoding("macjapan")
    end
    context "pass another encoding string to .to_mac_japanese" do
      it "should encode to mac japanese string (via utf8)" do
        @src = "\u{3041}".encode("euc-jp")
        MacJapanese.to_mac_japanese(@src).should == "\x82\x9F".force_encoding("macjapan")
      end
    end
  end

  # 0xFCFC (MacJapanese) and U+FA11 (UTF-8) are used below as inputs that
  # have no counterpart in the other encoding.
  context "undef: :replace" do
    it "should replace undefined mac japanese char" do
      @src = "foo\xFC\xFCbar".force_encoding("macjapan")
      MacJapanese.to_utf8(@src, undef: :replace).should == "foo\u{fffd}bar"
    end
    it "should replace undefined utf-8 char" do
      @src = "foo\u{FA11}bar"
      MacJapanese.to_mac_japanese(@src, undef: :replace).should == "foo?bar"
    end
    it "should replace with replace option" do
      @src = "foo\xFC\xFCbar".force_encoding("macjapan")
      MacJapanese.to_utf8(@src, undef: :replace, replace: "*").should == "foo*bar"
    end
  end

  context "undef: (none)" do
    it "should raise Encoding::UndefinedConversionError for undefined mac japanese char" do
      @src = "foo\xFC\xFCbar".force_encoding("macjapan")
      expect {
        MacJapanese.to_utf8(@src)
      }.to raise_error(Encoding::UndefinedConversionError)
    end
    it "should raise Encoding::UndefinedConversionError for undefined utf-8 japanese char" do
      @src = "foo\u{FA11}bar"
      expect {
        MacJapanese.to_mac_japanese(@src)
      }.to raise_error(Encoding::UndefinedConversionError)
    end
  end
end
@@ -0,0 +1,20 @@
1
+ $LOAD_PATH.unshift "#{File.dirname(__FILE__)}/../lib"
2
+ require "mac_japanese"
3
+
4
+ # This file was generated by the `rspec --init` command. Conventionally, all
5
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
6
+ # Require this file using `require "spec_helper"` to ensure that it is only
7
+ # loaded once.
8
+ #
9
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
10
# RSpec settings shared by every spec file that requires spec_helper.
RSpec.configure do |config|
  config.treat_symbols_as_metadata_keys_with_true_values = true
  config.run_all_when_everything_filtered = true
  config.filter_run :focus

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = 'random'
end
@@ -0,0 +1,133 @@
1
+ require "csv"
2
+ require "open-uri"
3
+
4
# Absolute path of the project root: the parent of the directory that
# contains this script.
ROOT_DIR = File.expand_path("..", File.dirname(__FILE__))
5
+
6
# Convert a string to the text of an escaped byte literal like "\x81\x40".
#
# Every byte of +string+ becomes a backslash, "x" and two upper-case hex
# digits; an empty string yields an empty string.
def hex_literal(string)
  string.each_byte.map { |byte| format("\\x%02X", byte) }.join
end
12
+
13
# Convert a string to the text of an escaped code point literal like "\u{3041}".
#
# Every code point of +string+ becomes "\u{XXXX}" with upper-case hex digits,
# zero-padded to at least four digits; an empty string yields an empty string.
def unicode_literal(string)
  string.each_codepoint.map { |codepoint| format("\\u{%04X}", codepoint) }.join
end
19
+
20
# Tell whether the argument is one of the private-use code points that this
# script treats as PUA: U+F860, U+F861, U+F862, U+F87A, U+F87E or U+F87F.
#
# Despite the parameter name, the value compared is a one-character string
# (make_pairs passes the result of Integer#chr), not a "0x...." hex token.
# Returns true or false.
def pua?(four_hex_with_0x)
  [
    "\u{F860}", "\u{F861}", "\u{F862}",
    "\u{F87A}", "\u{F87E}", "\u{F87F}"
  ].include?(four_hex_with_0x)
end
28
+
29
# Make pairs of raw strings [[macjapanese, utf8], ...] from src/JAPANESE.txt.
#
# The result starts with the control characters 0x00..0x1F mapped to
# themselves, followed by one pair per mapping line of the file, in file
# order. The MacJapanese side is built with Array#pack("C*"); the UTF-8 side
# is the concatenation of every code point listed in the "0x....+0x...."
# column. Callers turn the pairs into literals with hex_literal /
# unicode_literal.
#
# use_pua - when false, code points for which pua? is true are dropped from
#           the UTF-8 side (default: true).
def make_pairs(use_pua = true)
  # Control characters
  pairs = (0x00..0x1f).map do |i|
    c = [i].pack("C*")
    [c, c]
  end

  # each_line instead of IO#lines, which was removed in Ruby 3.0.
  File.open("#{ROOT_DIR}/src/JAPANESE.txt") do |f|
    f.each_line do |line|
      next if line =~ /^#/ # ignore comment
      mapping = /^(0x.+)\t(0x.+)\t/.match(line)
      next unless mapping # not a mapping line
      macjp_hex, unicode_hex = mapping.captures

      # Drop the "0x" prefix, then read the remaining hex digits two at a time.
      macjp = macjp_hex[2..-1].chars.each_slice(2).map { |h| h.join.to_i(16) }.pack("C*")

      unicode = unicode_hex.split("+").map { |hex|
        char = hex.to_i(16).chr("utf-8") # String#to_i(16) accepts the "0x" prefix
        !use_pua && pua?(char) ? "" : char # skip pua
      }.join

      pairs.push [macjp, unicode]
    end
  end
  pairs
end
57
+
58
# Download the MacJapanese mapping table once; later runs reuse the saved copy.
unless File.exist?("#{ROOT_DIR}/src/JAPANESE.txt")
  # URI#open (provided by open-uri) is used because Kernel#open no longer
  # accepts URLs on Ruby 3.0+. The download also finishes before the target
  # file is created, so a failed download cannot leave behind an empty
  # JAPANESE.txt that would make the next run skip this step.
  mapping = URI.parse("http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/JAPANESE.TXT").open { |remote| remote.read }
  File.open("#{ROOT_DIR}/src/JAPANESE.txt", "w") do |f|
    f.print mapping
  end
end
65
+
66
# Make MacJapanese to UTF-8 table (with PUA)
#
# Writes a Ruby source file defining MacJapanese::MAC_JAPANESE_TO_UTF8_WITH_PUA,
# a Hash built from [mac_japanese, utf8] literal pairs whose keys are tagged
# with Encoding::MacJapanese when the generated file is loaded.
path = "#{ROOT_DIR}/lib/mac_japanese/mac_japanese_to_utf8_with_pua.rb"
puts path
literal_pairs = make_pairs.map { |m, u| [hex_literal(m), unicode_literal(u)] }
# One `["\x..", "\u{....}"]` row per pair, joined outside the heredoc for readability.
rows = literal_pairs.map { |m, u| %{ ["#{m}", "#{u}"]} }.join(",\n")
File.open(path, "w") do |f|
  f.puts <<-EOS
# This file was automatically generated by `rake tables`.
# Cannot modify directly.
module MacJapanese
MAC_JAPANESE_TO_UTF8_WITH_PUA = Hash[
[
#{rows}
].each{|m, u| m.force_encoding(Encoding::MacJapanese)}
]
end
  EOS
end
83
+
84
# Make MacJapanese to UTF-8 table (without PUA)
#
# Same layout as the with-PUA table, but built from make_pairs(false), so the
# code points recognised by pua? are left out of the UTF-8 values. Defines
# MacJapanese::MAC_JAPANESE_TO_UTF8_WITHOUT_PUA in the generated file.
path = "#{ROOT_DIR}/lib/mac_japanese/mac_japanese_to_utf8_without_pua.rb"
puts path
literal_pairs = make_pairs(false).map { |m, u| [hex_literal(m), unicode_literal(u)] }
# One `["\x..", "\u{....}"]` row per pair, joined outside the heredoc for readability.
rows = literal_pairs.map { |m, u| %{ ["#{m}", "#{u}"]} }.join(",\n")
File.open(path, "w") do |f|
  f.puts <<-EOS
# This file was automatically generated by `rake tables`.
# Cannot modify directly.
module MacJapanese
MAC_JAPANESE_TO_UTF8_WITHOUT_PUA = Hash[
[
#{rows}
].each{|m, u| m.force_encoding(Encoding::MacJapanese)}
]
end
  EOS
end
101
+
102
# Make UTF-8 to MacJapanese table
#
# Writes a Ruby source file defining MacJapanese::UTF8_TO_MAC_JAPANESE. The
# pairs come from make_pairs (PUA included) with the columns swapped, so the
# UTF-8 sequence is the key and the MacJapanese bytes are the value; the
# values are tagged with Encoding::MacJapanese when the generated file is
# loaded.
path = "#{ROOT_DIR}/lib/mac_japanese/utf8_to_mac_japanese.rb"
puts path
literal_pairs = make_pairs.map { |m, u| [hex_literal(m), unicode_literal(u)] }
# One `["\u{....}", "\x.."]` row per pair, joined outside the heredoc for readability.
rows = literal_pairs.map { |m, u| %{ ["#{u}", "#{m}"]} }.join(",\n")
File.open(path, "w") do |f|
  f.puts <<-EOS
# This file was automatically generated by `rake tables`.
# Cannot modify directly.
module MacJapanese
UTF8_TO_MAC_JAPANESE = Hash[
[
#{rows}
].each{|u, m| m.force_encoding(Encoding::MacJapanese)}
]
end
  EOS
end
119
+
120
# Make UTF-8 single character or decomposed characters regexp
#
# Writes a Ruby source file defining
# MacJapanese::DECOMPOSED_OR_NORMAL_CHARACTER_REGEXP: an alternation of every
# multi-character UTF-8 sequence from make_pairs (in table order), followed by
# `.` as the single-character fallback, with the /m flag so `.` also matches
# newlines.
path = "#{ROOT_DIR}/lib/mac_japanese/decomposed_or_normal_character_regexp.rb"
puts path
# Pairs are [mac_japanese, utf8]; only the UTF-8 side is needed here.
sequences = make_pairs.map(&:last).select { |u| u.size > 1 }
decomposed_or_single_character_regexp =
  "/(" + sequences.map { |u| unicode_literal(u) }.join("|") + "|.)/m"
File.open(path, "w") do |f|
  f.puts <<-EOS
# This file was automatically generated by `rake tables`.
# Cannot modify directly.
module MacJapanese
DECOMPOSED_OR_NORMAL_CHARACTER_REGEXP = #{decomposed_or_single_character_regexp}
end
  EOS
end
metadata ADDED
@@ -0,0 +1,112 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mac_japanese
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - labocho
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-22 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 2.11.0
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 2.11.0
30
+ - !ruby/object:Gem::Dependency
31
+ name: guard-rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 0.7.0
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 0.7.0
46
+ - !ruby/object:Gem::Dependency
47
+ name: ruby-debug19
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: Convert MacJapanese string to UTF-8 and vice versa.
63
+ email:
64
+ - labocho@penguinlab.jp
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - .gitignore
70
+ - .rspec
71
+ - Gemfile
72
+ - Guardfile
73
+ - LICENSE
74
+ - README.md
75
+ - Rakefile
76
+ - lib/mac_japanese.rb
77
+ - lib/mac_japanese/decomposed_or_normal_character_regexp.rb
78
+ - lib/mac_japanese/mac_japanese_to_utf8_with_pua.rb
79
+ - lib/mac_japanese/mac_japanese_to_utf8_without_pua.rb
80
+ - lib/mac_japanese/utf8_to_mac_japanese.rb
81
+ - lib/mac_japanese/version.rb
82
+ - mac_japanese.gemspec
83
+ - spec/mac_japanese_spec.rb
84
+ - spec/spec_helper.rb
85
+ - src/generate_conversion_tables.rb
86
+ homepage: https://github.com/labocho/mac_japanese
87
+ licenses: []
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ none: false
94
+ requirements:
95
+ - - ! '>='
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ requirements: []
105
+ rubyforge_project:
106
+ rubygems_version: 1.8.23
107
+ signing_key:
108
+ specification_version: 3
109
+ summary: Convert MacJapanese string to UTF-8 and vice versa.
110
+ test_files:
111
+ - spec/mac_japanese_spec.rb
112
+ - spec/spec_helper.rb