twitter_cldr 1.3.6 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +47 -2
- data/lib/twitter_cldr/core_ext/calendars/datetime.rb +2 -2
- data/lib/twitter_cldr/core_ext/calendars/timespan.rb +11 -13
- data/lib/twitter_cldr/normalizers.rb +3 -0
- data/lib/twitter_cldr/normalizers/base.rb +34 -0
- data/lib/twitter_cldr/normalizers/nfc.rb +24 -0
- data/lib/twitter_cldr/normalizers/nfd.rb +1 -1
- data/lib/twitter_cldr/normalizers/nfkc.rb +126 -0
- data/lib/twitter_cldr/normalizers/nfkd.rb +9 -17
- data/lib/twitter_cldr/shared.rb +1 -1
- data/lib/twitter_cldr/shared/code_point.rb +116 -0
- data/lib/twitter_cldr/tokenizers/base.rb +2 -2
- data/lib/twitter_cldr/utils.rb +8 -0
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/unicode_data/blocks_hangul.yml +46 -0
- data/resources/unicode_data/composition_exclusions.yml +293 -0
- data/resources/unicode_data/decomposition_map.yml +4565 -0
- data/spec/normalizers/NormalizationTestShort.txt +66 -66
- data/spec/normalizers/base_spec.rb +17 -0
- data/spec/normalizers/normalization_spec.rb +10 -0
- data/spec/readme_spec.rb +26 -1
- data/spec/shared/code_point_spec.rb +152 -0
- data/spec/tokenizers/base_spec.rb +0 -10
- data/spec/utils/{code_point_spec.rb → code_points_spec.rb} +0 -0
- data/spec/utils_spec.rb +10 -0
- metadata +16 -10
- data/lib/twitter_cldr/shared/unicode_data.rb +0 -64
- data/spec/normalizers/nfd_spec.rb +0 -21
- data/spec/shared/unicode_data_spec.rb +0 -51
@@ -1,64 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
module TwitterCldr
  module Shared
    # Looks up attribute records for individual code points in the
    # :unicode_data resources obtained through TwitterCldr.get_resource.
    module UnicodeData

      class << self

        # Returns an Attributes struct for the given code point, or nil when
        # no block contains it or the containing block has no usable record.
        #
        # code_point - String of hexadecimal digits (e.g. '0301'). The block
        #              data is keyed by `code_point.to_sym`, so the spelling
        #              must match the resource keys exactly.
        def for_code_point(code_point)
          blocks = TwitterCldr.get_resource(:unicode_data, :blocks)

          # Parse the hex string once instead of once per block.
          numeric_code_point = code_point.to_i(16)

          # Find the target block
          target = blocks.find { |_block_name, range| range.include?(numeric_code_point) }
          return unless target

          block_data = TwitterCldr.get_resource(:unicode_data, target.first)
          code_point_data = block_data.fetch(code_point.to_sym) do |code_point_sym|
            # No explicit record: fall back to the block's range-start record.
            get_range_start(code_point_sym, block_data)
          end

          Attributes.new(*code_point_data) if code_point_data
        end

        private

        # Builds a record for a code point that has no entry of its own by
        # copying the block's lowest-numbered record, provided that record
        # begins a range. The code point beginning a range has a name
        # enclosed in <> and ending with ', First',
        # eg: <CJK Ideograph Extension A, First>
        # http://unicode.org/reports/tr44/#Code_Point_Ranges
        #
        # Returns the adjusted copy (code point replaced, ', First' stripped
        # from the name), or nil when the block is empty or its first record
        # does not begin a range. The stored record is never modified.
        def get_range_start(code_point, block_data)
          start_code_point = block_data.keys.min_by { |key| key.to_s.to_i(16) }
          start_data = block_data[start_code_point]

          # An empty block yields no start record; treat it as "no data".
          return unless start_data && start_data[1] =~ /<.*, First>/

          range_data = start_data.clone
          range_data[0] = code_point.to_s
          range_data[1] = range_data[1].sub(', First', '')
          range_data
        end

      end

      # One record of per-code-point fields, in the order the resource
      # arrays supply them.
      Attributes = Struct.new(
        :code_point,
        :name,
        :category,
        :combining_class,
        :bidi_class,
        :decomposition,
        :digit_value,
        :non_decimal_digit_value,
        :numeric_value,
        :bidi_mirrored,
        :unicode1_name,
        :iso_comment,
        :simple_uppercase_map,
        :simple_lowercase_map,
        :simple_titlecase_map
      )

    end
  end
end
|
@@ -1,21 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
require 'spec_helper'
|
7
|
-
|
8
|
-
include TwitterCldr::Normalizers
|
9
|
-
|
10
|
-
describe NFD do

  describe "#normalize" do
    # The expectations are wrapped in an example so that they execute as part
    # of the spec run and are reported as a pass/failure, rather than being
    # evaluated while the describe block itself is being defined.
    it "returns these sample strings unchanged" do
      NFD.normalize("庠摪饢鼢豦樄澸脧鱵礩翜艰").should == "庠摪饢鼢豦樄澸脧鱵礩翜艰"
      NFD.normalize("䷙䷿").should == "䷙䷿"
      NFD.normalize("ᎿᎲᎪᏨᎨᏪᎧᎵᏥ").should == "ᎿᎲᎪᏨᎨᏪᎧᎵᏥ"
      NFD.normalize("ᆙᅓᆼᄋᇶ").should == "ᆙᅓᆼᄋᇶ"
      # NOTE(review): this literal contains a line break exactly as it appears
      # in the source listing; it may originally have been a Unicode line
      # separator character - confirm against the original file.
      NFD.normalize("…‾⁋
⁒‒′‾⁖").should == "…‾⁋
⁒‒′‾⁖"
      NFD.normalize("ⶾⷕⶱⷀ").should == "ⶾⷕⶱⷀ"
    end
  end

end
|
@@ -1,51 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
require 'spec_helper'
|
7
|
-
|
8
|
-
include TwitterCldr::Shared
|
9
|
-
|
10
|
-
describe UnicodeData do
  describe "#for_code_point" do
    it "should retrieve information for any valid code point" do
      attributes = UnicodeData.for_code_point('0301')
      attributes.should be_a(Struct)
      attributes.length.should == 15
    end

    it "should return nil for invalid code points" do
      ['abcd', 'FFFFFFF', 'uytukhil123'].each do |invalid_code_point|
        UnicodeData.for_code_point(invalid_code_point).should be_nil
      end
    end

    it "fetches valid information for the specified code point" do
      # Each row is the full expected record; its first field is the code
      # point that is looked up.
      expected_rows = [
        ['17D1','KHMER SIGN VIRIAM','Mn','0','NSM',"","","","",'N',"","","","",""],
        ['FE91','ARABIC LETTER BEH INITIAL FORM','Lo','0','AL','<initial> 0628',"","","",'N','GLYPH FOR INITIAL ARABIC BAA',"","","",""],
        ['24B5','PARENTHESIZED LATIN SMALL LETTER Z','So','0','L','<compat> 0028 007A 0029',"","","",'N',"","","","",""],
        ['2128','BLACK-LETTER CAPITAL Z','Lu','0','L','<font> 005A',"","","",'N','BLACK-LETTER Z',"","","",""],
        ['1F241','TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-4E09','So','0','L','<compat> 3014 4E09 3015',"","","",'N',"","","","",""]
      ]

      expected_rows.each do |expected|
        UnicodeData.for_code_point(expected.first).values.should == expected
      end
    end

    it "fetches valid information for a code point within a range" do
      # Code points without a record of their own; the expected name is the
      # range name with the ', First' suffix removed.
      expected_rows = [
        ["4E11","<CJK Ideograph>","Lo","0","L","","","","","N","","","","",""],
        ["AC55","<Hangul Syllable>","Lo","0","L","","","","","N","","","","",""],
        ["D7A1","<Hangul Syllable>","Lo","0","L","","","","","N","","","","",""],
        ["DAAA","<Non Private Use High Surrogate>","Cs","0","L","","","","","N","","","","",""],
        ["F8FE","<Private Use>","Co","0","L","","","","","N","","","","",""]
      ]

      expected_rows.each do |expected|
        UnicodeData.for_code_point(expected.first).values.should == expected
      end
    end
  end
end
|