mac_japanese 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Guardfile +9 -0
- data/LICENSE +22 -0
- data/README.md +45 -0
- data/Rakefile +8 -0
- data/lib/mac_japanese.rb +70 -0
- data/lib/mac_japanese/decomposed_or_normal_character_regexp.rb +5 -0
- data/lib/mac_japanese/mac_japanese_to_utf8_with_pua.rb +7395 -0
- data/lib/mac_japanese/mac_japanese_to_utf8_without_pua.rb +7395 -0
- data/lib/mac_japanese/utf8_to_mac_japanese.rb +7395 -0
- data/lib/mac_japanese/version.rb +3 -0
- data/mac_japanese.gemspec +20 -0
- data/spec/mac_japanese_spec.rb +134 -0
- data/spec/spec_helper.rb +20 -0
- data/src/generate_conversion_tables.rb +133 -0
- metadata +112 -0
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/mac_japanese/version', __FILE__)

# Gem specification for mac_japanese. The packaged file list is taken from
# git, so this must be evaluated inside a git checkout of the project.
Gem::Specification.new do |gem|
  gem.authors       = ["labocho"]
  gem.email         = ["labocho@penguinlab.jp"]
  gem.description   = %q{Convert MacJapanese string to UTF-8 and vice versa.}
  gem.summary       = %q{Convert MacJapanese string to UTF-8 and vice versa.}
  gem.homepage      = "https://github.com/labocho/mac_japanese"

  # `git ls-files` prints one path per line. Split on newlines explicitly:
  # splitting on $\ (nil unless someone set it) falls back to splitting on any
  # whitespace, which would cut a path containing a space into pieces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "mac_japanese"
  gem.require_paths = ["lib"]
  gem.version       = MacJapanese::VERSION

  gem.add_development_dependency "rspec", "~>2.11.0"
  gem.add_development_dependency "guard-rspec", "~>0.7.0"
  gem.add_development_dependency "ruby-debug19"
end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
require "spec_helper"

# Specs for MacJapanese.to_utf8 / MacJapanese.to_mac_japanese.
# Each example stores its input in @src; `subject` is evaluated lazily, so it
# sees the @src assigned inside the example body.
describe MacJapanese do
  describe ".to_utf8" do
    # Characters with a one-to-one mapping must convert identically whether
    # or not private-use (PUA) code points are emitted.
    [true, false].each do |use_pua|
      context "use_pua: #{use_pua}" do
        let(:options) { {} }
        subject { MacJapanese.to_utf8(@src, options.merge(use_pua: use_pua)) }

        it "should convert us-ascii chars to utf8" do
          @src = "foo\n".force_encoding("macjapan")
          should == "foo\n"
        end

        it "should convert additional backslash to utf8" do
          @src = "\x80".force_encoding("macjapan")
          should == "\\"
        end

        it "should convert halfwidth katakana to utf8" do
          @src = "\xA7".force_encoding("macjapan")
          should == "\u{FF67}"
        end

        it "should convert hiragana to utf8" do
          @src = "\x82\x9F".force_encoding("macjapan")
          should == "\u{3041}"
        end

        it "should convert apple additions to utf8" do
          @src = "\x85\x5E".force_encoding("macjapan")
          should == "\u{2474}"
        end
      end
    end

    # 0x85AB expands to the sequence X I I I, prefixed by U+F862 when PUA
    # output is enabled (which is also the default).
    context "default" do
      subject { MacJapanese.to_utf8(@src) }

      it "should expand composed char with pua" do
        @src = "\x85\xAB".force_encoding("macjapan")
        should == "\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      end
    end

    context "use_pua: true" do
      subject { MacJapanese.to_utf8(@src, use_pua: true) }

      it "should expand composed char with pua" do
        @src = "\x85\xAB".force_encoding("macjapan")
        should == "\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      end
    end

    context "use_pua: false" do
      subject { MacJapanese.to_utf8(@src, use_pua: false) }

      it "should expand composed char without pua" do
        @src = "\x85\xAB".force_encoding("macjapan")
        should == "\u{0058}\u{0049}\u{0049}\u{0049}"
      end
    end

    context "pass another encoding string to .to_utf8" do
      it "should encode just like passing mac japanese string" do
        @src = "\x82\x9F"
        @src.encoding.should == Encoding::ASCII_8BIT
        MacJapanese.to_utf8(@src).should == "\u{3041}"
      end
    end
  end

  describe ".to_mac_japanese" do
    let(:options) { {} }
    subject { MacJapanese.to_mac_japanese(@src, options) }

    it "should convert us-ascii chars to mac_japanese" do
      @src = "foo\n"
      should == "foo\n".force_encoding("macjapan")
    end

    it "should convert additional backslash to mac_japanese" do
      @src = "\\"
      should == "\x80".force_encoding("macjapan")
    end

    it "should convert halfwidth katakana to mac_japanese" do
      @src = "\u{FF67}"
      should == "\xA7".force_encoding("macjapan")
    end

    it "should convert hiragana to mac_japanese" do
      @src = "\u{3041}"
      should == "\x82\x9F".force_encoding("macjapan")
    end

    it "should convert hiragana followed by composed characters to mac_japanese" do
      @src = "\u{3041}\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      should == "\x82\x9F\x85\xAB".force_encoding("macjapan")
    end

    it "should convert apple additions to mac_japanese" do
      @src = "\u{2474}"
      should == "\x85\x5E".force_encoding("macjapan")
    end

    it "should compose characters with pua" do
      @src = "\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      should == "\x85\xAB".force_encoding("macjapan")
    end

    # Without the U+F862 prefix the same letters stay four separate characters.
    it "should not compose characters without pua" do
      @src = "\u{0058}\u{0049}\u{0049}\u{0049}"
      should == "XIII".force_encoding("macjapan")
    end

    context "pass another encoding string to .to_mac_japanese" do
      it "should encode to mac japanese string (via utf8)" do
        @src = "\u{3041}".encode("euc-jp")
        MacJapanese.to_mac_japanese(@src).should == "\x82\x9F".force_encoding("macjapan")
      end
    end
  end

  context "undef: :replace" do
    it "should replace undefined mac japanese char" do
      @src = "foo\xFC\xFCbar".force_encoding("macjapan")
      MacJapanese.to_utf8(@src, undef: :replace).should == "foo\u{fffd}bar"
    end

    it "should replace undefined utf-8 char" do
      @src = "foo\u{FA11}bar"
      MacJapanese.to_mac_japanese(@src, undef: :replace).should == "foo?bar"
    end

    it "should replace with replace option" do
      @src = "foo\xFC\xFCbar".force_encoding("macjapan")
      MacJapanese.to_utf8(@src, undef: :replace, replace: "*").should == "foo*bar"
    end
  end

  context "undef: (none)" do
    it "should raise Encoding::UndefinedConversionError for undefined mac japanese char" do
      @src = "foo\xFC\xFCbar".force_encoding("macjapan")
      expect{
        MacJapanese.to_utf8(@src)
      }.to raise_error(Encoding::UndefinedConversionError)
    end

    it "should raise Encoding::UndefinedConversionError for undefined utf-8 japanese char" do
      @src = "foo\u{FA11}bar"
      expect{
        MacJapanese.to_mac_japanese(@src)
      }.to raise_error(Encoding::UndefinedConversionError)
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Put the project's lib directory at the front of the load path so the specs
# exercise the working copy of the library.
$LOAD_PATH.unshift "#{File.dirname(__FILE__)}/../lib"
require "mac_japanese"

# The configuration below came from `rspec --init`. By convention specs live
# in `spec/`, which RSpec itself adds to `$LOAD_PATH`; load this file with
# `require "spec_helper"` so that it is evaluated only once.
#
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |c|
  # Random ordering exposes examples that depend on each other. To reproduce
  # a particular ordering, rerun with the seed printed at the end of the run:
  #     --seed 1234
  c.order = 'random'

  c.filter_run :focus
  c.run_all_when_everything_filtered = true
  c.treat_symbols_as_metadata_keys_with_true_values = true
end
|
@@ -0,0 +1,133 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "open-uri"
|
3
|
+
|
4
|
+
# Absolute path of the directory one level above the one containing this script.
ROOT_DIR = File.expand_path("..", File.dirname(__FILE__))
|
5
|
+
|
6
|
+
# Render every byte of +string+ as a Ruby "\xNN" escape (two uppercase hex
# digits), e.g. the two bytes 0x81 0x40 become the eight characters \x81\x40.
#
# string  - String whose raw bytes are rendered; its encoding is ignored.
# Returns the escape text as a String ("" for empty input).
def hex_literal(string)
  string.bytes.map { |b| format("\\x%02X", b) }.join
end
|
12
|
+
|
13
|
+
# Render every code point of +string+ as a Ruby "\u{NNNN}" escape: uppercase
# hex, zero-padded to at least four digits (longer code points keep all
# their digits), e.g. U+3041 becomes the eight characters \u{3041}.
#
# string  - String in an encoding that supports #codepoints (e.g. UTF-8).
# Returns the escape text as a String ("" for empty input).
def unicode_literal(string)
  string.codepoints.map { |c| format("\\u{%04X}", c) }.join
end
|
19
|
+
|
20
|
+
# Private-use code points that this script strips when building the
# "without PUA" table.
# NOTE(review): these appear to be the transcoding hints / variant tags
# described in the header of Apple's JAPANESE.TXT - confirm against that file.
PUA_CHARACTERS = [
  "\u{F860}", "\u{F861}", "\u{F862}", "\u{F87A}", "\u{F87E}", "\u{F87F}"
].freeze

# Tell whether the argument is one of the private-use characters above.
#
# four_hex_with_0x - despite its name, the value is compared against
#                    one-character strings (make_pairs passes a single UTF-8
#                    character), not against "0x...." hex text.
# Returns true for a listed character, false for anything else.
def pua?(four_hex_with_0x)
  PUA_CHARACTERS.include?(four_hex_with_0x)
end
|
28
|
+
|
29
|
+
# Build the conversion pairs [[macjapanese, utf8], ...] from
# "#{ROOT_DIR}/src/JAPANESE.txt".
#
# The result starts with the 32 control characters 0x00-0x1F mapped to
# themselves, followed by one pair per mapping line of the file. A mapping
# line has the form "0xMMMM<TAB>0xUUUU[+0xUUUU...]<TAB>..."; comment lines
# (starting with "#") and anything not matching that form are skipped.
#
# use_pua - when false, code points for which pua? is true are dropped from
#           the UTF-8 side of each pair (default: true).
#
# Returns an Array of [String, String] pairs: raw MacJapanese bytes and the
# corresponding UTF-8 string.
def make_pairs(use_pua = true)
  # Control characters map to themselves.
  pairs = (0x00..0x1f).map do |i|
    c = [i].pack("C*")
    [c, c]
  end

  File.open("#{ROOT_DIR}/src/JAPANESE.txt") do |f|
    # each_line rather than IO#lines, which no longer exists on Ruby 3.0+.
    f.each_line do |line|
      next if line =~ /^#/ # ignore comment
      next unless line =~ /^(0x.+)\t(0x.+)\t/ # capture
      macjp_hex, unicode_hex = Regexp.last_match.captures

      # "0x829F" -> the two bytes 0x82 0x9F
      macjp = macjp_hex[2..-1].chars.each_slice(2).map { |h| h.join.to_i(16) }.pack("C*")

      # "0xF862+0x0058" -> one UTF-8 character per "+"-separated code point
      # (String#to_i(16) accepts the "0x" prefix).
      chars = unicode_hex.split("+").map { |hex| hex.to_i(16).chr("utf-8") }
      chars = chars.reject { |c| pua?(c) } unless use_pua

      pairs.push [macjp, chars.join]
    end
  end

  pairs
end
|
57
|
+
|
58
|
+
# Fetch Apple's MacJapanese mapping table into src/JAPANESE.txt unless a
# local copy already exists.
japanese_txt = "#{ROOT_DIR}/src/JAPANESE.txt"
unless File.exist?(japanese_txt)
  # Download completely before creating the local file: if the request
  # fails, no empty JAPANESE.txt is left behind to be mistaken for a cached
  # copy on the next run.
  # URI#open (from open-uri) is used because Kernel#open no longer accepts
  # URLs on Ruby 3.0+.
  mapping = URI.parse("http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/JAPANESE.TXT").open { |g| g.read }
  File.open(japanese_txt, "w") { |f| f.print mapping }
end
|
65
|
+
|
66
|
+
# Write one generated Ruby file that defines a Hash constant inside
# module MacJapanese, and print the file's path.
#
# path          - destination file; overwritten if it exists.
# constant_name - name of the constant to define, e.g. "UTF8_TO_MAC_JAPANESE".
# pairs         - [[macjapanese, utf8], ...] as returned by make_pairs.
# key_side      - :mac_japanese to key the Hash by the MacJapanese bytes,
#                 :utf8 to key it by the UTF-8 string.
#
# The generated code builds the Hash from an array of string-literal pairs
# and tags the MacJapanese member of each pair with Encoding::MacJapanese.
#
# Raises ArgumentError for an unknown key_side.
def write_conversion_table(path, constant_name, pairs, key_side)
  unless [:mac_japanese, :utf8].include?(key_side)
    raise ArgumentError, "unknown key_side: #{key_side.inspect}"
  end

  puts path
  mac_keys = (key_side == :mac_japanese)
  rows = pairs.map do |m, u|
    m_literal = hex_literal(m)
    u_literal = unicode_literal(u)
    key, value = mac_keys ? [m_literal, u_literal] : [u_literal, m_literal]
    %{ ["#{key}", "#{value}"]}
  end
  # The generated block always calls force_encoding on `m`, so the block
  # parameters must be listed in the same order as the row members.
  block_params = mac_keys ? "m, u" : "u, m"

  File.open(path, "w") do |f|
    f.puts <<-EOS
# This file was automatically generated by `rake tables`.
# Cannot modify directly.
module MacJapanese
#{constant_name} = Hash[
[
#{rows.join(",\n")}
].each{|#{block_params}| m.force_encoding(Encoding::MacJapanese)}
]
end
    EOS
  end
end

# The with-PUA pairs feed two tables, so parse the mapping file only once.
pairs_with_pua = make_pairs

# Make MacJapanese to UTF-8 table (with PUA)
write_conversion_table(
  "#{ROOT_DIR}/lib/mac_japanese/mac_japanese_to_utf8_with_pua.rb",
  "MAC_JAPANESE_TO_UTF8_WITH_PUA",
  pairs_with_pua,
  :mac_japanese
)

# Make MacJapanese to UTF-8 table (without PUA)
write_conversion_table(
  "#{ROOT_DIR}/lib/mac_japanese/mac_japanese_to_utf8_without_pua.rb",
  "MAC_JAPANESE_TO_UTF8_WITHOUT_PUA",
  make_pairs(false),
  :mac_japanese
)

# Make UTF-8 to MacJapanese table
write_conversion_table(
  "#{ROOT_DIR}/lib/mac_japanese/utf8_to_mac_japanese.rb",
  "UTF8_TO_MAC_JAPANESE",
  pairs_with_pua,
  :utf8
)
|
119
|
+
|
120
|
+
# Generate the file defining DECOMPOSED_OR_NORMAL_CHARACTER_REGEXP: a regexp
# literal whose alternatives are every multi-character UTF-8 string from the
# mapping table, followed by "." for any single character (with /m, so "."
# also matches a newline).
regexp_file = "#{ROOT_DIR}/lib/mac_japanese/decomposed_or_normal_character_regexp.rb"
puts regexp_file

sequences = make_pairs.map(&:last).select { |u| u.size > 1 }
alternatives = sequences.map { |u| unicode_literal(u) }.join("|")
regexp_source = "/(" + alternatives + "|.)/m"

File.open(regexp_file, "w") do |out|
  out.puts <<-EOS
# This file was automatically generated by `rake tables`.
# Cannot modify directly.
module MacJapanese
DECOMPOSED_OR_NORMAL_CHARACTER_REGEXP = #{regexp_source}
end
  EOS
end
|
metadata
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mac_japanese
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- labocho
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-22 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 2.11.0
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 2.11.0
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: guard-rspec
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 0.7.0
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 0.7.0
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: ruby-debug19
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: Convert MacJapanese string to UTF-8 and vice versa.
|
63
|
+
email:
|
64
|
+
- labocho@penguinlab.jp
|
65
|
+
executables: []
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- .gitignore
|
70
|
+
- .rspec
|
71
|
+
- Gemfile
|
72
|
+
- Guardfile
|
73
|
+
- LICENSE
|
74
|
+
- README.md
|
75
|
+
- Rakefile
|
76
|
+
- lib/mac_japanese.rb
|
77
|
+
- lib/mac_japanese/decomposed_or_normal_character_regexp.rb
|
78
|
+
- lib/mac_japanese/mac_japanese_to_utf8_with_pua.rb
|
79
|
+
- lib/mac_japanese/mac_japanese_to_utf8_without_pua.rb
|
80
|
+
- lib/mac_japanese/utf8_to_mac_japanese.rb
|
81
|
+
- lib/mac_japanese/version.rb
|
82
|
+
- mac_japanese.gemspec
|
83
|
+
- spec/mac_japanese_spec.rb
|
84
|
+
- spec/spec_helper.rb
|
85
|
+
- src/generate_conversion_tables.rb
|
86
|
+
homepage: https://github.com/labocho/mac_japanese
|
87
|
+
licenses: []
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project:
|
106
|
+
rubygems_version: 1.8.23
|
107
|
+
signing_key:
|
108
|
+
specification_version: 3
|
109
|
+
summary: Convert MacJapanese string to UTF-8 and vice versa.
|
110
|
+
test_files:
|
111
|
+
- spec/mac_japanese_spec.rb
|
112
|
+
- spec/spec_helper.rb
|