mac_japanese 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Guardfile +9 -0
- data/LICENSE +22 -0
- data/README.md +45 -0
- data/Rakefile +8 -0
- data/lib/mac_japanese.rb +70 -0
- data/lib/mac_japanese/decomposed_or_normal_character_regexp.rb +5 -0
- data/lib/mac_japanese/mac_japanese_to_utf8_with_pua.rb +7395 -0
- data/lib/mac_japanese/mac_japanese_to_utf8_without_pua.rb +7395 -0
- data/lib/mac_japanese/utf8_to_mac_japanese.rb +7395 -0
- data/lib/mac_japanese/version.rb +3 -0
- data/mac_japanese.gemspec +20 -0
- data/spec/mac_japanese_spec.rb +134 -0
- data/spec/spec_helper.rb +20 -0
- data/src/generate_conversion_tables.rb +133 -0
- metadata +112 -0
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/mac_japanese/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for mac_japanese.
Gem::Specification.new do |gem|
  gem.authors       = ["labocho"]
  gem.email         = ["labocho@penguinlab.jp"]
  gem.description   = %q{Convert MacJapanese string to UTF-8 and vice versa.}
  gem.summary       = %q{Convert MacJapanese string to UTF-8 and vice versa.}
  gem.homepage      = "https://github.com/labocho/mac_japanese"

  # Split on the input record separator ($/, newline by default). The output
  # record separator ($\) is nil by default, which makes String#split fall
  # back to whitespace splitting and breaks paths that contain spaces.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "mac_japanese"
  gem.require_paths = ["lib"]
  gem.version       = MacJapanese::VERSION

  gem.add_development_dependency "rspec", "~>2.11.0"
  gem.add_development_dependency "guard-rspec", "~>0.7.0"
  gem.add_development_dependency "ruby-debug19"
end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "ruby-debug"
|
3
|
+
|
4
|
+
# Specs for MacJapanese.to_utf8 and MacJapanese.to_mac_japanese.
# Each example assigns its input to @src; `subject` is evaluated lazily, so it
# sees the value assigned inside the example body.
describe MacJapanese do
  describe ".to_utf8" do
    # These conversions are expected to give the same result with and without PUA.
    [true, false].each do |use_pua|
      context "use_pua: #{use_pua}" do
        let(:options) { {} }
        subject { MacJapanese.to_utf8(@src, options.merge(use_pua: use_pua)) }
        it "should convert us-ascii chars to utf8" do
          @src = "foo\n".force_encoding("macjapan")
          should == "foo\n"
        end
        it "should convert additional backslash to utf8" do
          @src = "\x80".force_encoding("macjapan")
          should == "\\"
        end
        it "should convert halfwidth katakana to utf8" do
          @src = "\xA7".force_encoding("macjapan")
          should == "\u{FF67}"
        end
        it "should convert hiragana to utf8" do
          @src = "\x82\x9F".force_encoding("macjapan")
          should == "\u{3041}"
        end
        it "should convert apple additions to utf8" do
          @src = "\x85\x5E".force_encoding("macjapan")
          should == "\u{2474}"
        end
      end
    end
    context "default" do
      subject { MacJapanese.to_utf8(@src) }
      it "should expand composed char with pua" do
        @src = "\x85\xAB".force_encoding("macjapan")
        should == "\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      end
    end
    context "use_pua: true" do
      subject { MacJapanese.to_utf8(@src, use_pua: true) }
      it "should expand composed char with pua" do
        @src = "\x85\xAB".force_encoding("macjapan")
        should == "\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      end
    end
    context "use_pua: false" do
      subject { MacJapanese.to_utf8(@src, use_pua: false) }
      it "should expand composed char without pua" do
        @src = "\x85\xAB".force_encoding("macjapan")
        should == "\u{0058}\u{0049}\u{0049}\u{0049}"
      end
    end
    context "pass another encoding string to .to_utf8" do
      it "should encode just like passing mac japanese string" do
        # Tag the bytes as binary explicitly: a bare "\x82\x9F" literal is only
        # ASCII-8BIT when the source encoding is US-ASCII; with a UTF-8 source
        # encoding it would be an (invalid) UTF-8 string.
        @src = "\x82\x9F".force_encoding(Encoding::ASCII_8BIT)
        @src.encoding.should == Encoding::ASCII_8BIT
        MacJapanese.to_utf8(@src).should == "\u{3041}"
      end
    end
  end

  describe ".to_mac_japanese" do
    let(:options) { {} }
    subject { MacJapanese.to_mac_japanese(@src, options) }
    it "should convert us-ascii chars to mac_japanese" do
      @src = "foo\n"
      should == "foo\n".force_encoding("macjapan")
    end
    it "should convert additional backslash to mac_japanese" do
      @src = "\\"
      should == "\x80".force_encoding("macjapan")
    end
    it "should convert halfwidth katakana to mac_japanese" do
      @src = "\u{FF67}"
      should == "\xA7".force_encoding("macjapan")
    end
    it "should convert hiragana to mac_japanese" do
      @src = "\u{3041}"
      should == "\x82\x9F".force_encoding("macjapan")
    end
    it "should convert hiragana followed by composed characters to mac_japanese" do
      @src = "\u{3041}\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      should == "\x82\x9F\x85\xAB".force_encoding("macjapan")
    end
    it "should convert apple additions to mac_japanese" do
      @src = "\u{2474}"
      should == "\x85\x5E".force_encoding("macjapan")
    end
    it "should compose characters with pua" do
      @src = "\u{F862}\u{0058}\u{0049}\u{0049}\u{0049}"
      should == "\x85\xAB".force_encoding("macjapan")
    end
    it "should not compose characters without pua" do
      @src = "\u{0058}\u{0049}\u{0049}\u{0049}"
      should == "XIII".force_encoding("macjapan")
    end
    context "pass another encoding string to .to_mac_japanese" do
      it "should encode to mac japanese string (via utf8)" do
        @src = "\u{3041}".encode("euc-jp")
        MacJapanese.to_mac_japanese(@src).should == "\x82\x9F".force_encoding("macjapan")
      end
    end
  end

  context "undef: :replace" do
    it "should replace undefined mac japanese char" do
      @src = "foo\xFC\xFCbar".force_encoding("macjapan")
      MacJapanese.to_utf8(@src, undef: :replace).should == "foo\u{fffd}bar"
    end
    it "should replace undefined utf-8 char" do
      @src = "foo\u{FA11}bar"
      MacJapanese.to_mac_japanese(@src, undef: :replace).should == "foo?bar"
    end
    it "should replace with replace option" do
      @src = "foo\xFC\xFCbar".force_encoding("macjapan")
      MacJapanese.to_utf8(@src, undef: :replace, replace: "*").should == "foo*bar"
    end
  end

  context "undef: (none)" do
    it "should raise Encoding::UndefinedConversionError for undefined mac japanese char" do
      @src = "foo\xFC\xFCbar".force_encoding("macjapan")
      expect{
        MacJapanese.to_utf8(@src)
      }.to raise_error(Encoding::UndefinedConversionError)
    end
    it "should raise Encoding::UndefinedConversionError for undefined utf-8 japanese char" do
      @src = "foo\u{FA11}bar"
      expect{
        MacJapanese.to_mac_japanese(@src)
      }.to raise_error(Encoding::UndefinedConversionError)
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
$LOAD_PATH.unshift "#{File.dirname(__FILE__)}/../lib"
|
2
|
+
require "mac_japanese"
|
3
|
+
|
4
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
5
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
6
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
7
|
+
# loaded once.
|
8
|
+
#
|
9
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
10
|
+
# Suite-wide RSpec settings. The statement order is kept as-is on purpose.
RSpec.configure do |rspec|
  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus

  # Randomize the order in which specs run so that hidden order dependencies
  # show up. To reproduce a particular ordering while debugging, pass the seed
  # printed after the run:
  #     --seed 1234
  rspec.order = 'random'
end
|
@@ -0,0 +1,133 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "open-uri"
|
3
|
+
|
4
|
+
ROOT_DIR = File.expand_path "#{File.dirname(__FILE__)}/../"
|
5
|
+
|
6
|
+
# Render every byte of a string as a Ruby hex escape, e.g. "\x81\x40".
#
# @param string [String] any string; it is processed byte by byte
# @return [String] one "\xHH" escape (uppercase, zero-padded to two digits)
#   per byte, or "" for an empty string
def hex_literal(string)
  string.bytes.map { |byte| format("\\x%02X", byte) }.join
end
|
12
|
+
|
13
|
+
# Render every code point of a string as a Ruby Unicode escape, e.g. "\u{3041}".
#
# @param string [String] a string whose encoding supports String#codepoints
# @return [String] one "\u{HHHH}" escape (uppercase hex, zero-padded to at
#   least four digits) per code point, or "" for an empty string
def unicode_literal(string)
  string.codepoints.map { |codepoint| format("\\u{%04X}", codepoint) }.join
end
|
19
|
+
|
20
|
+
# Characters that this script treats as private-use (PUA) markers.
PUA_CHARACTERS = [
  "\u{F860}", "\u{F861}", "\u{F862}", "\u{F87A}", "\u{F87E}", "\u{F87F}"
].freeze

# Report whether the argument is one of PUA_CHARACTERS.
#
# NOTE(review): despite the parameter name, the comparison is against
# single-character strings, and the call in make_pairs passes an already
# decoded character rather than a "0xHHHH" string.
#
# @param four_hex_with_0x [String] the character to test
# @return [Boolean] true only for an exact match with a listed character
def pua?(four_hex_with_0x)
  PUA_CHARACTERS.include?(four_hex_with_0x)
end
|
28
|
+
|
29
|
+
# Build the [mac_japanese, utf8] string pairs that the generated tables are
# made from.
#
# The result starts with the control characters 0x00-0x1F mapped to
# themselves, followed by one pair per mapping line of src/JAPANESE.txt.
# A mapping line has the shape "0x85AB<TAB>0xF862+0x0058<TAB>..."; comment
# lines (starting with "#") and lines of any other shape are ignored.
#
# @param use_pua [Boolean] when false, characters for which pua? is true are
#   dropped from the UTF-8 side of each pair
# @return [Array<Array<String>>] pairs of [MacJapanese bytes, UTF-8 string]
def make_pairs(use_pua = true)
  # Control characters
  control_pairs = (0x00..0x1f).map do |code|
    char = [code].pack("C*")
    [char, char]
  end

  # File.foreach yields each line and closes the file when done
  # (IO#lines is not available on Ruby 3.0 and later).
  mapped_pairs = []
  File.foreach("#{ROOT_DIR}/src/JAPANESE.txt") do |line|
    next if line =~ /^#/ # ignore comment
    next unless line =~ /^(0x.+)\t(0x.+)\t/ # capture
    macjp_hex, unicode_hex = $~.captures

    # Drop the "0x" prefix, then turn each pair of hex digits into one byte.
    macjp = macjp_hex[2..-1].chars.each_slice(2).map { |digits| digits.join.to_i(16) }.pack("C*")

    # "0xAAAA+0xBBBB" becomes the two characters U+AAAA U+BBBB
    # (String#to_i(16) accepts the "0x" prefix).
    unicode = unicode_hex.split("+").map { |hex|
      char = hex.to_i(16).chr("utf-8")
      !use_pua && pua?(char) ? "" : char # skip pua
    }.join

    mapped_pairs.push [macjp, unicode]
  end

  control_pairs + mapped_pairs
end
|
57
|
+
|
58
|
+
# Fetch the MacJapanese mapping table from unicode.org once and cache it as
# src/JAPANESE.txt. The download completes before the cache file is opened,
# so a failed request does not leave behind an empty file that later runs
# would mistake for a valid cache.
japanese_txt_path = "#{ROOT_DIR}/src/JAPANESE.txt"
unless File.exist?(japanese_txt_path)
  # URI#read comes from open-uri; opening a URL through Kernel#open no longer
  # works on Ruby 3.0 and later.
  mapping_source = URI.parse("http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/JAPANESE.TXT").read
  File.open(japanese_txt_path, "w") { |f| f.print mapping_source }
end
|
65
|
+
|
66
|
+
# Generate lib/mac_japanese/mac_japanese_to_utf8_with_pua.rb:
# the MacJapanese -> UTF-8 table, built from make_pairs with PUA characters kept.
with_pua_path = "#{ROOT_DIR}/lib/mac_japanese/mac_japanese_to_utf8_with_pua.rb"
puts with_pua_path
with_pua_rows = make_pairs.map { |macjp, utf8| %{ ["#{hex_literal(macjp)}", "#{unicode_literal(utf8)}"]} }
# The heredoc text is written verbatim into the generated file.
with_pua_source = <<-EOS
# This file was automatically generated by `rake tables`.
# Cannot modify directly.
module MacJapanese
MAC_JAPANESE_TO_UTF8_WITH_PUA = Hash[
[
#{with_pua_rows.join(",\n")}
].each{|m, u| m.force_encoding(Encoding::MacJapanese)}
]
end
EOS
File.open(with_pua_path, "w") { |f| f.puts with_pua_source }
|
83
|
+
|
84
|
+
# Generate lib/mac_japanese/mac_japanese_to_utf8_without_pua.rb:
# the MacJapanese -> UTF-8 table, built from make_pairs(false) so PUA
# characters are left out of the UTF-8 side.
without_pua_path = "#{ROOT_DIR}/lib/mac_japanese/mac_japanese_to_utf8_without_pua.rb"
puts without_pua_path
without_pua_rows = make_pairs(false).map { |macjp, utf8| %{ ["#{hex_literal(macjp)}", "#{unicode_literal(utf8)}"]} }
# The heredoc text is written verbatim into the generated file.
without_pua_source = <<-EOS
# This file was automatically generated by `rake tables`.
# Cannot modify directly.
module MacJapanese
MAC_JAPANESE_TO_UTF8_WITHOUT_PUA = Hash[
[
#{without_pua_rows.join(",\n")}
].each{|m, u| m.force_encoding(Encoding::MacJapanese)}
]
end
EOS
File.open(without_pua_path, "w") { |f| f.puts without_pua_source }
|
101
|
+
|
102
|
+
# Generate lib/mac_japanese/utf8_to_mac_japanese.rb:
# the UTF-8 -> MacJapanese table, built from make_pairs (PUA kept) with each
# row written as [utf8, mac_japanese].
reverse_table_path = "#{ROOT_DIR}/lib/mac_japanese/utf8_to_mac_japanese.rb"
puts reverse_table_path
reverse_table_rows = make_pairs.map { |macjp, utf8| %{ ["#{unicode_literal(utf8)}", "#{hex_literal(macjp)}"]} }
# The heredoc text is written verbatim into the generated file.
reverse_table_source = <<-EOS
# This file was automatically generated by `rake tables`.
# Cannot modify directly.
module MacJapanese
UTF8_TO_MAC_JAPANESE = Hash[
[
#{reverse_table_rows.join(",\n")}
].each{|u, m| m.force_encoding(Encoding::MacJapanese)}
]
end
EOS
File.open(reverse_table_path, "w") { |f| f.puts reverse_table_source }
|
119
|
+
|
120
|
+
# Generate lib/mac_japanese/decomposed_or_normal_character_regexp.rb:
# a regexp literal whose alternatives are every multi-character UTF-8 value
# from make_pairs, followed by "." as the single-character fallback.
regexp_path = "#{ROOT_DIR}/lib/mac_japanese/decomposed_or_normal_character_regexp.rb"
puts regexp_path
multi_char_literals = make_pairs.map(&:last).select { |utf8| utf8.size > 1 }.map { |utf8| unicode_literal(utf8) }
regexp_literal = "/(" + multi_char_literals.join("|") + "|.)/m"
# The heredoc text is written verbatim into the generated file.
regexp_source = <<-EOS
# This file was automatically generated by `rake tables`.
# Cannot modify directly.
module MacJapanese
DECOMPOSED_OR_NORMAL_CHARACTER_REGEXP = #{regexp_literal}
end
EOS
File.open(regexp_path, "w") { |f| f.puts regexp_source }
|
metadata
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mac_japanese
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- labocho
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-22 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 2.11.0
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 2.11.0
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: guard-rspec
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 0.7.0
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 0.7.0
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: ruby-debug19
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: Convert MacJapanese string to UTF-8 and vice versa.
|
63
|
+
email:
|
64
|
+
- labocho@penguinlab.jp
|
65
|
+
executables: []
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- .gitignore
|
70
|
+
- .rspec
|
71
|
+
- Gemfile
|
72
|
+
- Guardfile
|
73
|
+
- LICENSE
|
74
|
+
- README.md
|
75
|
+
- Rakefile
|
76
|
+
- lib/mac_japanese.rb
|
77
|
+
- lib/mac_japanese/decomposed_or_normal_character_regexp.rb
|
78
|
+
- lib/mac_japanese/mac_japanese_to_utf8_with_pua.rb
|
79
|
+
- lib/mac_japanese/mac_japanese_to_utf8_without_pua.rb
|
80
|
+
- lib/mac_japanese/utf8_to_mac_japanese.rb
|
81
|
+
- lib/mac_japanese/version.rb
|
82
|
+
- mac_japanese.gemspec
|
83
|
+
- spec/mac_japanese_spec.rb
|
84
|
+
- spec/spec_helper.rb
|
85
|
+
- src/generate_conversion_tables.rb
|
86
|
+
homepage: https://github.com/labocho/mac_japanese
|
87
|
+
licenses: []
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project:
|
106
|
+
rubygems_version: 1.8.23
|
107
|
+
signing_key:
|
108
|
+
specification_version: 3
|
109
|
+
summary: Convert MacJapanese string to UTF-8 and vice versa.
|
110
|
+
test_files:
|
111
|
+
- spec/mac_japanese_spec.rb
|
112
|
+
- spec/spec_helper.rb
|