unicode_utils 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +5 -5
- data/cdata/casefold_c_map +1 -0
- data/cdata/casefold_f_map +1 -0
- data/cdata/casefold_s_map +1 -0
- data/lib/unicode_utils.rb +7 -2
- data/lib/unicode_utils/canonical_decomposition.rb +1 -1
- data/lib/unicode_utils/case_ignorable_char_q.rb +1 -1
- data/lib/unicode_utils/casefold.rb +33 -0
- data/lib/unicode_utils/{name.rb → char_name.rb} +15 -8
- data/lib/unicode_utils/codepoint.rb +66 -0
- data/lib/unicode_utils/compatibility_decomposition.rb +1 -1
- data/lib/unicode_utils/downcase.rb +1 -1
- data/lib/unicode_utils/grep.rb +21 -0
- data/lib/unicode_utils/hangul_syllable_decomposition.rb +3 -2
- data/lib/unicode_utils/jamo_short_name.rb +1 -1
- data/lib/unicode_utils/lowercase_char_q.rb +1 -1
- data/lib/unicode_utils/nfc.rb +1 -1
- data/lib/unicode_utils/read_cdata.rb +71 -0
- data/lib/unicode_utils/simple_casefold.rb +32 -0
- data/lib/unicode_utils/simple_downcase.rb +1 -1
- data/lib/unicode_utils/simple_upcase.rb +1 -1
- data/lib/unicode_utils/soft_dotted_char_q.rb +1 -1
- data/lib/unicode_utils/titlecase_char_q.rb +1 -1
- data/lib/unicode_utils/u.rb +12 -0
- data/lib/unicode_utils/upcase.rb +1 -1
- data/lib/unicode_utils/uppercase_char_q.rb +1 -1
- data/lib/unicode_utils/version.rb +1 -1
- data/test/test_unicode_utils.rb +25 -8
- metadata +12 -9
- data/lib/unicode_utils/read_codepoint_map.rb +0 -22
- data/lib/unicode_utils/read_codepoint_set.rb +0 -22
- data/lib/unicode_utils/read_multivalued_map.rb +0 -27
- data/lib/unicode_utils/read_names.rb +0 -22
- data/test/test_normalization.rb +0 -95
data/README.txt
CHANGED
@@ -6,16 +6,13 @@ Install with RubyGems:
|
|
6
6
|
|
7
7
|
Or get the source from GitHub: http://github.com/lang/unicode_utils
|
8
8
|
|
9
|
-
UnicodeUtils works with Ruby 1.9.1-
|
10
|
-
(http://redmine.ruby-lang.org/issues/show/692) in
|
11
|
-
1.9.1-preview1 prevents UnicodeUtils from loading when
|
12
|
-
Encoding.default_internal is set (e.g. with -U or -E).
|
9
|
+
UnicodeUtils works with Ruby 1.9.1-preview2 or later.
|
13
10
|
|
14
11
|
== Synopsis
|
15
12
|
|
16
13
|
require "unicode_utils"
|
17
14
|
|
18
|
-
UnicodeUtils.
|
15
|
+
UnicodeUtils.char_name("æ") => "LATIN SMALL LETTER AE"
|
19
16
|
|
20
17
|
UnicodeUtils.upcase("weiß") => "WEISS"
|
21
18
|
|
@@ -37,6 +34,9 @@ startup time. Methods that end in a ? are in a file suffixed with
|
|
37
34
|
+_q+, e.g. <tt>lowercase_char?</tt> can be required with
|
38
35
|
<tt>unicode_utils/lowercase_char_q</tt>.
|
39
36
|
|
37
|
+
There is also a shortcut for IRB usage. See
|
38
|
+
U[link:files/lib/unicode_utils/u_rb.html].
|
39
|
+
|
40
40
|
== License
|
41
41
|
|
42
42
|
unicode_utils is licensed under the BSD license. Read the file
|
@@ -0,0 +1 @@
|
|
1
|
+
00004100006100004200006200004300006300004400006400004500006500004600006600004700006700004800006800004900006900004a00006a00004b00006b00004c00006c00004d00006d00004e00006e00004f00006f00005000007000005100007100005200007200005300007300005400007400005500007500005600007600005700007700005800007800005900007900005a00007a0000b50003bc0000c00000e00000c10000e10000c20000e20000c30000e30000c40000e40000c50000e50000c60000e60000c70000e70000c80000e80000c90000e90000ca0000ea0000cb0000eb0000cc0000ec0000cd0000ed0000ce0000ee0000cf0000ef0000d00000f00000d10000f10000d20000f20000d30000f30000d40000f40000d50000f50000d60000f60000d80000f80000d90000f90000da0000fa0000db0000fb0000dc0000fc0000dd0000fd0000de0000fe00010000010100010200010300010400010500010600010700010800010900010a00010b00010c00010d00010e00010f00011000011100011200011300011400011500011600011700011800011900011a00011b00011c00011d00011e00011f00012000012100012200012300012400012500012600012700012800012900012a00012b00012c00012d00012e00012f00013200013300013400013500013600013700013900013a00013b00013c00013d00013e00013f00014000014100014200014300014400014500014600014700014800014a00014b00014c00014d00014e00014f00015000015100015200015300015400015500015600015700015800015900015a00015b00015c00015d00015e00015f00016000016100016200016300016400016500016600016700016800016900016a00016b00016c00016d00016e00016f0001700001710001720001730001740001750001760001770001780000ff00017900017a00017b00017c00017d00017e00017f00007300018100025300018200018300018400018500018600025400018700018800018900025600018a00025700018b00018c00018e0001dd00018f00025900019000025b00019100019200019300026000019400026300019600026900019700026800019800019900019c00026f00019d00027200019f0002750001a00001a10001a20001a30001a40001a50001a60002800001a70001a80001a90002830001ac0001ad0001ae0002880001af0001b00001b100028a0001b200028b0001b30001b40001b50001b60001b70002920001b80001b90001bc0001bd0001c40001c60001c50001c60001c70001c90001c80001c90001ca0001cc0001cb0001cc0001cd0001ce0001cf0001d00001d10001d20001d30001d40001d500
01d60001d70001d80001d90001da0001db0001dc0001de0001df0001e00001e10001e20001e30001e40001e50001e60001e70001e80001e90001ea0001eb0001ec0001ed0001ee0001ef0001f10001f30001f20001f30001f40001f50001f60001950001f70001bf0001f80001f90001fa0001fb0001fc0001fd0001fe0001ff00020000020100020200020300020400020500020600020700020800020900020a00020b00020c00020d00020e00020f00021000021100021200021300021400021500021600021700021800021900021a00021b00021c00021d00021e00021f00022000019e00022200022300022400022500022600022700022800022900022a00022b00022c00022d00022e00022f00023000023100023200023300023a002c6500023b00023c00023d00019a00023e002c6600024100024200024300018000024400028900024500028c00024600024700024800024900024a00024b00024c00024d00024e00024f0003450003b90003700003710003720003730003760003770003860003ac0003880003ad0003890003ae00038a0003af00038c0003cc00038e0003cd00038f0003ce0003910003b10003920003b20003930003b30003940003b40003950003b50003960003b60003970003b70003980003b80003990003b900039a0003ba00039b0003bb00039c0003bc00039d0003bd00039e0003be00039f0003bf0003a00003c00003a10003c10003a30003c30003a40003c40003a50003c50003a60003c60003a70003c70003a80003c80003a90003c90003aa0003ca0003ab0003cb0003c20003c30003cf0003d70003d00003b20003d10003b80003d50003c60003d60003c00003d80003d90003da0003db0003dc0003dd0003de0003df0003e00003e10003e20003e30003e40003e50003e60003e70003e80003e90003ea0003eb0003ec0003ed0003ee0003ef0003f00003ba0003f10003c10003f40003b80003f50003b50003f70003f80003f90003f20003fa0003fb0003fd00037b0003fe00037c0003ff00037d00040000045000040100045100040200045200040300045300040400045400040500045500040600045600040700045700040800045800040900045900040a00045a00040b00045b00040c00045c00040d00045d00040e00045e00040f00045f00041000043000041100043100041200043200041300043300041400043400041500043500041600043600041700043700041800043800041900043900041a00043a00041b00043b00041c00043c00041d00043d00041e00043e00041f00043f0004200004400004210004410004220004420004230004430004240004440004250004450004260004460004270004470004280004480004
2900044900042a00044a00042b00044b00042c00044c00042d00044d00042e00044e00042f00044f00046000046100046200046300046400046500046600046700046800046900046a00046b00046c00046d00046e00046f00047000047100047200047300047400047500047600047700047800047900047a00047b00047c00047d00047e00047f00048000048100048a00048b00048c00048d00048e00048f00049000049100049200049300049400049500049600049700049800049900049a00049b00049c00049d00049e00049f0004a00004a10004a20004a30004a40004a50004a60004a70004a80004a90004aa0004ab0004ac0004ad0004ae0004af0004b00004b10004b20004b30004b40004b50004b60004b70004b80004b90004ba0004bb0004bc0004bd0004be0004bf0004c00004cf0004c10004c20004c30004c40004c50004c60004c70004c80004c90004ca0004cb0004cc0004cd0004ce0004d00004d10004d20004d30004d40004d50004d60004d70004d80004d90004da0004db0004dc0004dd0004de0004df0004e00004e10004e20004e30004e40004e50004e60004e70004e80004e90004ea0004eb0004ec0004ed0004ee0004ef0004f00004f10004f20004f30004f40004f50004f60004f70004f80004f90004fa0004fb0004fc0004fd0004fe0004ff00050000050100050200050300050400050500050600050700050800050900050a00050b00050c00050d00050e00050f00051000051100051200051300051400051500051600051700051800051900051a00051b00051c00051d00051e00051f00052000052100052200052300053100056100053200056200053300056300053400056400053500056500053600056600053700056700053800056800053900056900053a00056a00053b00056b00053c00056c00053d00056d00053e00056e00053f00056f00054000057000054100057100054200057200054300057300054400057400054500057500054600057600054700057700054800057800054900057900054a00057a00054b00057b00054c00057c00054d00057d00054e00057e00054f00057f0005500005800005510005810005520005820005530005830005540005840005550005850005560005860010a0002d000010a1002d010010a2002d020010a3002d030010a4002d040010a5002d050010a6002d060010a7002d070010a8002d080010a9002d090010aa002d0a0010ab002d0b0010ac002d0c0010ad002d0d0010ae002d0e0010af002d0f0010b0002d100010b1002d110010b2002d120010b3002d130010b4002d140010b5002d150010b6002d160010b7002d170010b8002d180010b9002d190010ba002d1a0010bb002d1b
0010bc002d1c0010bd002d1d0010be002d1e0010bf002d1f0010c0002d200010c1002d210010c2002d220010c3002d230010c4002d240010c5002d25001e00001e01001e02001e03001e04001e05001e06001e07001e08001e09001e0a001e0b001e0c001e0d001e0e001e0f001e10001e11001e12001e13001e14001e15001e16001e17001e18001e19001e1a001e1b001e1c001e1d001e1e001e1f001e20001e21001e22001e23001e24001e25001e26001e27001e28001e29001e2a001e2b001e2c001e2d001e2e001e2f001e30001e31001e32001e33001e34001e35001e36001e37001e38001e39001e3a001e3b001e3c001e3d001e3e001e3f001e40001e41001e42001e43001e44001e45001e46001e47001e48001e49001e4a001e4b001e4c001e4d001e4e001e4f001e50001e51001e52001e53001e54001e55001e56001e57001e58001e59001e5a001e5b001e5c001e5d001e5e001e5f001e60001e61001e62001e63001e64001e65001e66001e67001e68001e69001e6a001e6b001e6c001e6d001e6e001e6f001e70001e71001e72001e73001e74001e75001e76001e77001e78001e79001e7a001e7b001e7c001e7d001e7e001e7f001e80001e81001e82001e83001e84001e85001e86001e87001e88001e89001e8a001e8b001e8c001e8d001e8e001e8f001e90001e91001e92001e93001e94001e95001e9b001e61001ea0001ea1001ea2001ea3001ea4001ea5001ea6001ea7001ea8001ea9001eaa001eab001eac001ead001eae001eaf001eb0001eb1001eb2001eb3001eb4001eb5001eb6001eb7001eb8001eb9001eba001ebb001ebc001ebd001ebe001ebf001ec0001ec1001ec2001ec3001ec4001ec5001ec6001ec7001ec8001ec9001eca001ecb001ecc001ecd001ece001ecf001ed0001ed1001ed2001ed3001ed4001ed5001ed6001ed7001ed8001ed9001eda001edb001edc001edd001ede001edf001ee0001ee1001ee2001ee3001ee4001ee5001ee6001ee7001ee8001ee9001eea001eeb001eec001eed001eee001eef001ef0001ef1001ef2001ef3001ef4001ef5001ef6001ef7001ef8001ef9001efa001efb001efc001efd001efe001eff001f08001f00001f09001f01001f0a001f02001f0b001f03001f0c001f04001f0d001f05001f0e001f06001f0f001f07001f18001f10001f19001f11001f1a001f12001f1b001f13001f1c001f14001f1d001f15001f28001f20001f29001f21001f2a001f22001f2b001f23001f2c001f24001f2d001f25001f2e001f26001f2f001f27001f38001f30001f39001f31001f3a001f32001f3b001f33001f3c001f34001f3d001f35001f3e001f36001f3f001f37001f48001f40001f49001f41001f4a00
1f42001f4b001f43001f4c001f44001f4d001f45001f59001f51001f5b001f53001f5d001f55001f5f001f57001f68001f60001f69001f61001f6a001f62001f6b001f63001f6c001f64001f6d001f65001f6e001f66001f6f001f67001fb8001fb0001fb9001fb1001fba001f70001fbb001f71001fbe0003b9001fc8001f72001fc9001f73001fca001f74001fcb001f75001fd8001fd0001fd9001fd1001fda001f76001fdb001f77001fe8001fe0001fe9001fe1001fea001f7a001feb001f7b001fec001fe5001ff8001f78001ff9001f79001ffa001f7c001ffb001f7d0021260003c900212a00006b00212b0000e500213200214e00216000217000216100217100216200217200216300217300216400217400216500217500216600217600216700217700216800217800216900217900216a00217a00216b00217b00216c00217c00216d00217d00216e00217e00216f00217f0021830021840024b60024d00024b70024d10024b80024d20024b90024d30024ba0024d40024bb0024d50024bc0024d60024bd0024d70024be0024d80024bf0024d90024c00024da0024c10024db0024c20024dc0024c30024dd0024c40024de0024c50024df0024c60024e00024c70024e10024c80024e20024c90024e30024ca0024e40024cb0024e50024cc0024e60024cd0024e70024ce0024e80024cf0024e9002c00002c30002c01002c31002c02002c32002c03002c33002c04002c34002c05002c35002c06002c36002c07002c37002c08002c38002c09002c39002c0a002c3a002c0b002c3b002c0c002c3c002c0d002c3d002c0e002c3e002c0f002c3f002c10002c40002c11002c41002c12002c42002c13002c43002c14002c44002c15002c45002c16002c46002c17002c47002c18002c48002c19002c49002c1a002c4a002c1b002c4b002c1c002c4c002c1d002c4d002c1e002c4e002c1f002c4f002c20002c50002c21002c51002c22002c52002c23002c53002c24002c54002c25002c55002c26002c56002c27002c57002c28002c58002c29002c59002c2a002c5a002c2b002c5b002c2c002c5c002c2d002c5d002c2e002c5e002c60002c61002c6200026b002c63001d7d002c6400027d002c67002c68002c69002c6a002c6b002c6c002c6d000251002c6e000271002c6f000250002c72002c73002c75002c76002c80002c81002c82002c83002c84002c85002c86002c87002c88002c89002c8a002c8b002c8c002c8d002c8e002c8f002c90002c91002c92002c93002c94002c95002c96002c97002c98002c99002c9a002c9b002c9c002c9d002c9e002c9f002ca0002ca1002ca2002ca3002ca4002ca5002ca6002ca7002ca8002ca9002caa002cab002cac002cad002c
ae002caf002cb0002cb1002cb2002cb3002cb4002cb5002cb6002cb7002cb8002cb9002cba002cbb002cbc002cbd002cbe002cbf002cc0002cc1002cc2002cc3002cc4002cc5002cc6002cc7002cc8002cc9002cca002ccb002ccc002ccd002cce002ccf002cd0002cd1002cd2002cd3002cd4002cd5002cd6002cd7002cd8002cd9002cda002cdb002cdc002cdd002cde002cdf002ce0002ce1002ce2002ce300a64000a64100a64200a64300a64400a64500a64600a64700a64800a64900a64a00a64b00a64c00a64d00a64e00a64f00a65000a65100a65200a65300a65400a65500a65600a65700a65800a65900a65a00a65b00a65c00a65d00a65e00a65f00a66200a66300a66400a66500a66600a66700a66800a66900a66a00a66b00a66c00a66d00a68000a68100a68200a68300a68400a68500a68600a68700a68800a68900a68a00a68b00a68c00a68d00a68e00a68f00a69000a69100a69200a69300a69400a69500a69600a69700a72200a72300a72400a72500a72600a72700a72800a72900a72a00a72b00a72c00a72d00a72e00a72f00a73200a73300a73400a73500a73600a73700a73800a73900a73a00a73b00a73c00a73d00a73e00a73f00a74000a74100a74200a74300a74400a74500a74600a74700a74800a74900a74a00a74b00a74c00a74d00a74e00a74f00a75000a75100a75200a75300a75400a75500a75600a75700a75800a75900a75a00a75b00a75c00a75d00a75e00a75f00a76000a76100a76200a76300a76400a76500a76600a76700a76800a76900a76a00a76b00a76c00a76d00a76e00a76f00a77900a77a00a77b00a77c00a77d001d7900a77e00a77f00a78000a78100a78200a78300a78400a78500a78600a78700a78b00a78c00ff2100ff4100ff2200ff4200ff2300ff4300ff2400ff4400ff2500ff4500ff2600ff4600ff2700ff4700ff2800ff4800ff2900ff4900ff2a00ff4a00ff2b00ff4b00ff2c00ff4c00ff2d00ff4d00ff2e00ff4e00ff2f00ff4f00ff3000ff5000ff3100ff5100ff3200ff5200ff3300ff5300ff3400ff5400ff3500ff5500ff3600ff5600ff3700ff5700ff3800ff5800ff3900ff5900ff3a00ff5a01040001042801040101042901040201042a01040301042b01040401042c01040501042d01040601042e01040701042f01040801043001040901043101040a01043201040b01043301040c01043401040d01043501040e01043601040f01043701041001043801041101043901041201043a01041301043b01041401043c01041501043d01041601043e01041701043f01041801044001041901044101041a01044201041b01044301041c01044401041d01044501041e01044601041f010447010420010448
01042101044901042201044a01042301044b01042401044c01042501044d01042601044e01042701044f
|
@@ -0,0 +1 @@
|
|
1
|
+
0000df000073000073xxxxxx000130000069000307xxxxxx0001490002bc00006exxxxxx0001f000006a00030cxxxxxx0003900003b9000308000301xxxxxx0003b00003c5000308000301xxxxxx000587000565000582xxxxxx001e96000068000331xxxxxx001e97000074000308xxxxxx001e9800007700030axxxxxx001e9900007900030axxxxxx001e9a0000610002bexxxxxx001e9e000073000073xxxxxx001f500003c5000313xxxxxx001f520003c5000313000300xxxxxx001f540003c5000313000301xxxxxx001f560003c5000313000342xxxxxx001f80001f000003b9xxxxxx001f81001f010003b9xxxxxx001f82001f020003b9xxxxxx001f83001f030003b9xxxxxx001f84001f040003b9xxxxxx001f85001f050003b9xxxxxx001f86001f060003b9xxxxxx001f87001f070003b9xxxxxx001f88001f000003b9xxxxxx001f89001f010003b9xxxxxx001f8a001f020003b9xxxxxx001f8b001f030003b9xxxxxx001f8c001f040003b9xxxxxx001f8d001f050003b9xxxxxx001f8e001f060003b9xxxxxx001f8f001f070003b9xxxxxx001f90001f200003b9xxxxxx001f91001f210003b9xxxxxx001f92001f220003b9xxxxxx001f93001f230003b9xxxxxx001f94001f240003b9xxxxxx001f95001f250003b9xxxxxx001f96001f260003b9xxxxxx001f97001f270003b9xxxxxx001f98001f200003b9xxxxxx001f99001f210003b9xxxxxx001f9a001f220003b9xxxxxx001f9b001f230003b9xxxxxx001f9c001f240003b9xxxxxx001f9d001f250003b9xxxxxx001f9e001f260003b9xxxxxx001f9f001f270003b9xxxxxx001fa0001f600003b9xxxxxx001fa1001f610003b9xxxxxx001fa2001f620003b9xxxxxx001fa3001f630003b9xxxxxx001fa4001f640003b9xxxxxx001fa5001f650003b9xxxxxx001fa6001f660003b9xxxxxx001fa7001f670003b9xxxxxx001fa8001f600003b9xxxxxx001fa9001f610003b9xxxxxx001faa001f620003b9xxxxxx001fab001f630003b9xxxxxx001fac001f640003b9xxxxxx001fad001f650003b9xxxxxx001fae001f660003b9xxxxxx001faf001f670003b9xxxxxx001fb2001f700003b9xxxxxx001fb30003b10003b9xxxxxx001fb40003ac0003b9xxxxxx001fb60003b1000342xxxxxx001fb70003b10003420003b9xxxxxx001fbc0003b10003b9xxxxxx001fc2001f740003b9xxxxxx001fc30003b70003b9xxxxxx001fc40003ae0003b9xxxxxx001fc60003b7000342xxxxxx001fc70003b70003420003b9xxxxxx001fcc0003b70003b9xxxxxx001fd20003b9000308000300xxxxxx001fd30003b9000308000301xxxxxx001fd60003b9000342xxxxxx001fd70003b9000308000342xx
xxxx001fe20003c5000308000300xxxxxx001fe30003c5000308000301xxxxxx001fe40003c1000313xxxxxx001fe60003c5000342xxxxxx001fe70003c5000308000342xxxxxx001ff2001f7c0003b9xxxxxx001ff30003c90003b9xxxxxx001ff40003ce0003b9xxxxxx001ff60003c9000342xxxxxx001ff70003c90003420003b9xxxxxx001ffc0003c90003b9xxxxxx00fb00000066000066xxxxxx00fb01000066000069xxxxxx00fb0200006600006cxxxxxx00fb03000066000066000069xxxxxx00fb0400006600006600006cxxxxxx00fb05000073000074xxxxxx00fb06000073000074xxxxxx00fb13000574000576xxxxxx00fb14000574000565xxxxxx00fb1500057400056bxxxxxx00fb1600057e000576xxxxxx00fb1700057400056dxxxxxx
|
@@ -0,0 +1 @@
|
|
1
|
+
001e9e0000df001f88001f80001f89001f81001f8a001f82001f8b001f83001f8c001f84001f8d001f85001f8e001f86001f8f001f87001f98001f90001f99001f91001f9a001f92001f9b001f93001f9c001f94001f9d001f95001f9e001f96001f9f001f97001fa8001fa0001fa9001fa1001faa001fa2001fab001fa3001fac001fa4001fad001fa5001fae001fa6001faf001fa7001fbc001fb3001fcc001fc3001ffc001ff3
|
data/lib/unicode_utils.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
3
|
require "unicode_utils/version"
|
4
|
-
require "unicode_utils/
|
4
|
+
require "unicode_utils/char_name"
|
5
5
|
require "unicode_utils/simple_upcase"
|
6
6
|
require "unicode_utils/simple_downcase"
|
7
7
|
require "unicode_utils/upcase"
|
@@ -22,6 +22,10 @@ require "unicode_utils/nfc"
|
|
22
22
|
require "unicode_utils/compatibility_decomposition"
|
23
23
|
require "unicode_utils/nfkd"
|
24
24
|
require "unicode_utils/nfkc"
|
25
|
+
require "unicode_utils/codepoint"
|
26
|
+
require "unicode_utils/grep"
|
27
|
+
require "unicode_utils/simple_casefold"
|
28
|
+
require "unicode_utils/casefold"
|
25
29
|
|
26
30
|
# Read the README[link:files/README_txt.html] for an introduction.
|
27
31
|
#
|
@@ -33,6 +37,7 @@ require "unicode_utils/nfkc"
|
|
33
37
|
# UnicodeUtils.nfc:: Normalization Form C
|
34
38
|
# UnicodeUtils.nfkd:: Normalization Form KD
|
35
39
|
# UnicodeUtils.nfkc:: Normalization Form KC
|
36
|
-
# UnicodeUtils.
|
40
|
+
# UnicodeUtils.char_name:: character names
|
41
|
+
# UnicodeUtils.casefold:: case folding (case insensitive string comparison)
|
37
42
|
module UnicodeUtils
|
38
43
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/read_cdata"
|
4
|
+
require "unicode_utils/simple_casefold"
|
5
|
+
|
6
|
+
module UnicodeUtils
|
7
|
+
|
8
|
+
CASEFOLD_F_MAP = Impl.read_multivalued_map("casefold_f_map") # :nodoc:
|
9
|
+
|
10
|
+
# Perform full case folding. The returned string may be longer than
|
11
|
+
# +str+. The purpose of case folding is case insensitive string
|
12
|
+
# comparison.
|
13
|
+
#
|
14
|
+
# Examples:
|
15
|
+
#
|
16
|
+
# UnicodeUtils.casefold("Ümit") == UnicodeUtils.casefold("ümit") => true
|
17
|
+
# UnicodeUtils.casefold("WEISS") == UnicodeUtils.casefold("weiß") => true
|
18
|
+
def casefold(str)
|
19
|
+
String.new.force_encoding(str.encoding).tap do |res|
|
20
|
+
str.each_codepoint { |cp|
|
21
|
+
if mapping = CASEFOLD_C_MAP[cp]
|
22
|
+
res << mapping
|
23
|
+
elsif mapping = CASEFOLD_F_MAP[cp]
|
24
|
+
mapping.each { |m| res << m }
|
25
|
+
else
|
26
|
+
res << cp
|
27
|
+
end
|
28
|
+
}
|
29
|
+
end
|
30
|
+
end
|
31
|
+
module_function :casefold
|
32
|
+
|
33
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
|
-
require "unicode_utils/
|
3
|
+
require "unicode_utils/read_cdata"
|
4
4
|
require "unicode_utils/hangul_syllable_decomposition"
|
5
5
|
require "unicode_utils/jamo_short_name"
|
6
6
|
|
@@ -18,22 +18,29 @@ module UnicodeUtils
|
|
18
18
|
#
|
19
19
|
# Example:
|
20
20
|
#
|
21
|
-
# UnicodeUtils.
|
22
|
-
# UnicodeUtils.
|
23
|
-
def
|
24
|
-
|
21
|
+
# UnicodeUtils.char_name "ᾀ" => "GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI"
|
22
|
+
# UnicodeUtils.char_name "\t" => "<control>"
|
23
|
+
def char_name(char)
|
24
|
+
if char.kind_of?(Integer)
|
25
|
+
cp = char
|
26
|
+
str = nil
|
27
|
+
else
|
28
|
+
cp = char.ord
|
29
|
+
str = char
|
30
|
+
end
|
25
31
|
NAME_MAP[cp] ||
|
26
32
|
case cp
|
27
33
|
when 0x3400..0x4DB5, 0x4E00..0x9FC3, 0x20000..0x2A6D6
|
28
|
-
"CJK UNIFIED IDEOGRAPH-#{sprintf('%
|
34
|
+
"CJK UNIFIED IDEOGRAPH-#{sprintf('%04X', cp)}"
|
29
35
|
when 0xAC00..0xD7A3
|
36
|
+
str ||= cp.chr(Encoding::UTF_8)
|
30
37
|
"HANGUL SYLLABLE ".tap do |n|
|
31
|
-
hangul_syllable_decomposition(
|
38
|
+
hangul_syllable_decomposition(str).each_char { |c|
|
32
39
|
n << (jamo_short_name(c) || '')
|
33
40
|
}
|
34
41
|
end
|
35
42
|
end
|
36
43
|
end
|
37
|
-
module_function :
|
44
|
+
module_function :char_name
|
38
45
|
|
39
46
|
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/char_name"
|
4
|
+
|
5
|
+
module UnicodeUtils
|
6
|
+
|
7
|
+
# A Codepoint instance represents a single Unicode codepoint.
|
8
|
+
#
|
9
|
+
# UnicodeUtils::Codepoint.new(0x20ac) => #<U+20AC "€" EURO SIGN utf8:e2,82,ac>
|
10
|
+
class Codepoint
|
11
|
+
|
12
|
+
# The Unicode codespace. Any integer in this range is a Unicode
|
13
|
+
# codepoint.
|
14
|
+
RANGE = 0..0x10FFFF
|
15
|
+
|
16
|
+
# Create a Codepoint instance that wraps the given Integer. +int+
|
17
|
+
# must be in Codepoint::RANGE.
|
18
|
+
def initialize(int)
|
19
|
+
unless RANGE.include?(int)
|
20
|
+
raise ArgumentError, "#{int} not in codespace"
|
21
|
+
end
|
22
|
+
@int = int
|
23
|
+
end
|
24
|
+
|
25
|
+
# Convert to Integer.
|
26
|
+
def ord
|
27
|
+
@int
|
28
|
+
end
|
29
|
+
|
30
|
+
# Format in U+ notation.
|
31
|
+
#
|
32
|
+
# Codepoint.new(0xc5).uplus => "U+00C5"
|
33
|
+
def uplus
|
34
|
+
sprintf('U+%04X', @int)
|
35
|
+
end
|
36
|
+
|
37
|
+
# Get the normative Unicode name of this codepoint.
|
38
|
+
#
|
39
|
+
# See also: UnicodeUtils.char_name
|
40
|
+
def name
|
41
|
+
UnicodeUtils.char_name(@int)
|
42
|
+
end
|
43
|
+
|
44
|
+
# Convert this codepoint to a UTF-8 encoded string. Returns a new
|
45
|
+
# string on each call and thus it is allowed to mutate the return
|
46
|
+
# value.
|
47
|
+
def to_s
|
48
|
+
@int.chr(Encoding::UTF_8)
|
49
|
+
end
|
50
|
+
|
51
|
+
# Get the bytes used to encode this codepoint in UTF-8,
|
52
|
+
# hex-formatted.
|
53
|
+
#
|
54
|
+
# Codepoint.new(0xe4).hexbytes => "c3,a4"
|
55
|
+
def hexbytes
|
56
|
+
to_s.bytes.map { |b| sprintf("%02x", b) }.join(",")
|
57
|
+
end
|
58
|
+
|
59
|
+
# #<U+... char name utf8-hexbytes>
|
60
|
+
def inspect
|
61
|
+
"#<#{uplus} #{to_s.inspect} #{name || "nil"} utf8:#{hexbytes}>"
|
62
|
+
end
|
63
|
+
|
64
|
+
end
|
65
|
+
|
66
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/codepoint"
|
4
|
+
|
5
|
+
module UnicodeUtils
|
6
|
+
|
7
|
+
# Get an array of all Codepoint instances in Codepoint::RANGE whose
|
8
|
+
# name matches regexp. Matching is case insensitive.
|
9
|
+
#
|
10
|
+
# UnicodeUtils.grep(/angstrom/) => [#<U+212B "Å" ANGSTROM SIGN utf8:e2,84,ab>]
|
11
|
+
def grep(regexp)
|
12
|
+
unless regexp.casefold?
|
13
|
+
regexp = Regexp.new(regexp.source, Regexp::IGNORECASE)
|
14
|
+
end
|
15
|
+
Codepoint::RANGE.select { |cp|
|
16
|
+
regexp =~ UnicodeUtils.char_name(cp)
|
17
|
+
}.map { |cp| Codepoint.new(cp) }
|
18
|
+
end
|
19
|
+
module_function :grep
|
20
|
+
|
21
|
+
end
|
@@ -8,8 +8,9 @@ module UnicodeUtils
|
|
8
8
|
#
|
9
9
|
# UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
|
10
10
|
def hangul_syllable_decomposition(char)
|
11
|
-
|
12
|
-
|
11
|
+
String.new.force_encoding(char.encoding).tap do |str|
|
12
|
+
Impl.append_hangul_syllable_decomposition(str , char.ord)
|
13
|
+
end
|
13
14
|
end
|
14
15
|
module_function :hangul_syllable_decomposition
|
15
16
|
|
data/lib/unicode_utils/nfc.rb
CHANGED
@@ -0,0 +1,71 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
module UnicodeUtils
|
4
|
+
|
5
|
+
# Absolute path to the directory from which UnicodeUtils loads its
|
6
|
+
# compiled Unicode data files at runtime.
|
7
|
+
CDATA_DIR =
|
8
|
+
File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
|
9
|
+
|
10
|
+
module Impl # :nodoc:
|
11
|
+
|
12
|
+
def self.open_cdata_file(filename, &block)
|
13
|
+
File.open(File.join(CDATA_DIR, filename), "r:US-ASCII:-", &block)
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.read_codepoint_set(filename)
|
17
|
+
Hash.new.tap { |set|
|
18
|
+
open_cdata_file(filename) do |input|
|
19
|
+
buffer = "x" * 6
|
20
|
+
buffer.force_encoding(Encoding::US_ASCII)
|
21
|
+
while input.read(6, buffer)
|
22
|
+
set[buffer.to_i(16)] = true
|
23
|
+
end
|
24
|
+
end
|
25
|
+
}
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.read_codepoint_map(filename)
|
29
|
+
Hash.new.tap { |map|
|
30
|
+
open_cdata_file(filename) do |input|
|
31
|
+
buffer = "x" * 6
|
32
|
+
buffer.force_encoding(Encoding::US_ASCII)
|
33
|
+
while input.read(6, buffer)
|
34
|
+
map[buffer.to_i(16)] = input.read(6, buffer).to_i(16)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
}
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.read_multivalued_map(filename)
|
41
|
+
Hash.new.tap { |map|
|
42
|
+
open_cdata_file(filename) do |input|
|
43
|
+
buffer = "x" * 6
|
44
|
+
buffer.force_encoding(Encoding::US_ASCII)
|
45
|
+
while input.read(6, buffer)
|
46
|
+
cp = buffer.to_i(16)
|
47
|
+
mapping = []
|
48
|
+
while input.read(6, buffer).getbyte(0) != 120
|
49
|
+
mapping << buffer.to_i(16)
|
50
|
+
end
|
51
|
+
map[cp] = mapping
|
52
|
+
end
|
53
|
+
end
|
54
|
+
}
|
55
|
+
end
|
56
|
+
|
57
|
+
def self.read_names(filename)
|
58
|
+
Hash.new.tap { |map|
|
59
|
+
open_cdata_file(filename) do |input|
|
60
|
+
buffer = "x" * 6
|
61
|
+
buffer.force_encoding(Encoding::US_ASCII)
|
62
|
+
while input.read(6, buffer)
|
63
|
+
map[buffer.to_i(16)] = input.gets.tap { |x| x.chomp! }
|
64
|
+
end
|
65
|
+
end
|
66
|
+
}
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/read_cdata"
|
4
|
+
|
5
|
+
module UnicodeUtils
|
6
|
+
|
7
|
+
CASEFOLD_C_MAP = Impl.read_codepoint_map("casefold_c_map") # :nodoc:
|
8
|
+
|
9
|
+
CASEFOLD_S_MAP = Impl.read_codepoint_map("casefold_s_map") # :nodoc:
|
10
|
+
|
11
|
+
# Perform simple case folding. Contrary to full case folding, this
|
12
|
+
# uses only one-to-one mappings, so that the length of the returned
|
13
|
+
# string is equal to the length of +str+.
|
14
|
+
#
|
15
|
+
# The purpose of case folding is case insensitive string comparison.
|
16
|
+
#
|
17
|
+
# Examples:
|
18
|
+
#
|
19
|
+
# UnicodeUtils.simple_casefold("Ümit") == UnicodeUtils.simple_casefold("ümit") => true
|
20
|
+
# UnicodeUtils.simple_casefold("WEISS") == UnicodeUtils.simple_casefold("weiß") => false
|
21
|
+
#
|
22
|
+
# See also: UnicodeUtils.casefold
|
23
|
+
def simple_casefold(str)
|
24
|
+
String.new.force_encoding(str.encoding).tap do |res|
|
25
|
+
str.each_codepoint { |cp|
|
26
|
+
res << (CASEFOLD_C_MAP[cp] || CASEFOLD_S_MAP[cp] || cp)
|
27
|
+
}
|
28
|
+
end
|
29
|
+
end
|
30
|
+
module_function :simple_casefold
|
31
|
+
|
32
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils"
|
4
|
+
|
5
|
+
# Shortcut for usage in irb. This shortcut is only defined when
|
6
|
+
# <tt>unicode_utils/u</tt> is explicitly required. It is intended for
|
7
|
+
# interactive use only!
|
8
|
+
#
|
9
|
+
# $ irb -r unicode_utils/u
|
10
|
+
# irb(main):001:0> U.grep(/angstrom/)
|
11
|
+
# => [#<U+212B "Å" ANGSTROM SIGN utf8:e2,84,ab>]
|
12
|
+
U = UnicodeUtils
|
data/lib/unicode_utils/upcase.rb
CHANGED
data/test/test_unicode_utils.rb
CHANGED
@@ -4,17 +4,18 @@ require "test/unit"
|
|
4
4
|
|
5
5
|
require "unicode_utils"
|
6
6
|
|
7
|
+
# Fast tests for almost all UnicodeUtils functions.
|
7
8
|
class TestUnicodeUtils < Test::Unit::TestCase
|
8
9
|
|
9
10
|
def test_name
|
10
|
-
assert_equal "LATIN SMALL LETTER F", UnicodeUtils.
|
11
|
-
assert_equal Encoding::US_ASCII, UnicodeUtils.
|
12
|
-
assert_equal nil, UnicodeUtils.
|
13
|
-
assert_equal "<control>", UnicodeUtils.
|
14
|
-
assert_equal "CJK UNIFIED IDEOGRAPH-4E00", UnicodeUtils.
|
15
|
-
assert_equal "CJK UNIFIED IDEOGRAPH-2A6D6", UnicodeUtils.
|
16
|
-
assert_equal "CJK UNIFIED IDEOGRAPH-2A3D6", UnicodeUtils.
|
17
|
-
assert_equal "HANGUL SYLLABLE PWILH", UnicodeUtils.
|
11
|
+
assert_equal "LATIN SMALL LETTER F", UnicodeUtils.char_name("f")
|
12
|
+
assert_equal Encoding::US_ASCII, UnicodeUtils.char_name("f").encoding
|
13
|
+
assert_equal nil, UnicodeUtils.char_name("\u{e000}") # private use
|
14
|
+
assert_equal "<control>", UnicodeUtils.char_name("\t")
|
15
|
+
assert_equal "CJK UNIFIED IDEOGRAPH-4E00", UnicodeUtils.char_name("\u{4e00}")
|
16
|
+
assert_equal "CJK UNIFIED IDEOGRAPH-2A6D6", UnicodeUtils.char_name("\u{2a6d6}")
|
17
|
+
assert_equal "CJK UNIFIED IDEOGRAPH-2A3D6", UnicodeUtils.char_name("\u{2a3d6}")
|
18
|
+
assert_equal "HANGUL SYLLABLE PWILH", UnicodeUtils.char_name("\u{d4db}")
|
18
19
|
end
|
19
20
|
|
20
21
|
def test_simple_upcase
|
@@ -160,4 +161,20 @@ class TestUnicodeUtils < Test::Unit::TestCase
|
|
160
161
|
assert_equal "\u{66}\u{69}\u{e4}", UnicodeUtils.nfkc("\u{fb01}\u{e4}")
|
161
162
|
end
|
162
163
|
|
164
|
+
def test_simple_casefold
|
165
|
+
assert_equal "abc123", UnicodeUtils.simple_casefold("ABC123")
|
166
|
+
assert UnicodeUtils.simple_casefold("ÜMIT") ==
|
167
|
+
UnicodeUtils.simple_casefold("ümit")
|
168
|
+
assert UnicodeUtils.simple_casefold("WEISS") !=
|
169
|
+
UnicodeUtils.simple_casefold("weiß")
|
170
|
+
end
|
171
|
+
|
172
|
+
def test_casefold
|
173
|
+
assert_equal "abc123", UnicodeUtils.casefold("ABC123")
|
174
|
+
assert UnicodeUtils.casefold("ÜMIT") ==
|
175
|
+
UnicodeUtils.casefold("ümit")
|
176
|
+
assert UnicodeUtils.casefold("WEISS") ==
|
177
|
+
UnicodeUtils.casefold("weiß")
|
178
|
+
end
|
179
|
+
|
163
180
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode_utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stefan Lang
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-12-07 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -22,31 +22,33 @@ extensions: []
|
|
22
22
|
extra_rdoc_files:
|
23
23
|
- README.txt
|
24
24
|
files:
|
25
|
+
- lib/unicode_utils/u.rb
|
26
|
+
- lib/unicode_utils/read_cdata.rb
|
25
27
|
- lib/unicode_utils/conditional_casing.rb
|
26
28
|
- lib/unicode_utils/hangul_syllable_decomposition.rb
|
27
29
|
- lib/unicode_utils/simple_downcase.rb
|
28
|
-
- lib/unicode_utils/
|
29
|
-
- lib/unicode_utils/read_names.rb
|
30
|
-
- lib/unicode_utils/read_codepoint_set.rb
|
30
|
+
- lib/unicode_utils/casefold.rb
|
31
31
|
- lib/unicode_utils/titlecase_char_q.rb
|
32
32
|
- lib/unicode_utils/cased_char_q.rb
|
33
33
|
- lib/unicode_utils/downcase.rb
|
34
|
-
- lib/unicode_utils/name.rb
|
35
34
|
- lib/unicode_utils/uppercase_char_q.rb
|
36
|
-
- lib/unicode_utils/read_multivalued_map.rb
|
37
35
|
- lib/unicode_utils/canonical_equivalents_q.rb
|
36
|
+
- lib/unicode_utils/char_name.rb
|
38
37
|
- lib/unicode_utils/nfkc.rb
|
39
38
|
- lib/unicode_utils/nfkd.rb
|
39
|
+
- lib/unicode_utils/codepoint.rb
|
40
40
|
- lib/unicode_utils/canonical_decomposition.rb
|
41
41
|
- lib/unicode_utils/upcase.rb
|
42
42
|
- lib/unicode_utils/nfc.rb
|
43
43
|
- lib/unicode_utils/nfd.rb
|
44
44
|
- lib/unicode_utils/case_ignorable_char_q.rb
|
45
45
|
- lib/unicode_utils/compatibility_decomposition.rb
|
46
|
+
- lib/unicode_utils/grep.rb
|
46
47
|
- lib/unicode_utils/simple_upcase.rb
|
47
48
|
- lib/unicode_utils/lowercase_char_q.rb
|
48
49
|
- lib/unicode_utils/jamo_short_name.rb
|
49
50
|
- lib/unicode_utils/combining_class.rb
|
51
|
+
- lib/unicode_utils/simple_casefold.rb
|
50
52
|
- lib/unicode_utils/version.rb
|
51
53
|
- lib/unicode_utils/soft_dotted_char_q.rb
|
52
54
|
- lib/unicode_utils.rb
|
@@ -54,7 +56,10 @@ files:
|
|
54
56
|
- cdata/cond_lc_map
|
55
57
|
- cdata/prop_set_lowercase
|
56
58
|
- cdata/cat_set_titlecase
|
59
|
+
- cdata/casefold_c_map
|
60
|
+
- cdata/casefold_f_map
|
57
61
|
- cdata/special_lc_map
|
62
|
+
- cdata/casefold_s_map
|
58
63
|
- cdata/names
|
59
64
|
- cdata/cond_uc_map
|
60
65
|
- cdata/special_uc_map
|
@@ -67,7 +72,6 @@ files:
|
|
67
72
|
- cdata/jamo_short_names
|
68
73
|
- cdata/compatibility_decomposition_map
|
69
74
|
- cdata/prop_set_uppercase
|
70
|
-
- test/test_normalization.rb
|
71
75
|
- test/test_unicode_utils.rb
|
72
76
|
- README.txt
|
73
77
|
- LICENSE.txt
|
@@ -99,5 +103,4 @@ signing_key:
|
|
99
103
|
specification_version: 2
|
100
104
|
summary: additional Unicode aware functions for Ruby 1.9
|
101
105
|
test_files:
|
102
|
-
- test/test_normalization.rb
|
103
106
|
- test/test_unicode_utils.rb
|
data/lib/unicode_utils/read_codepoint_map.rb
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
module UnicodeUtils
|
4
|
-
|
5
|
-
module Impl # :nodoc:
|
6
|
-
|
7
|
-
def self.read_codepoint_map(filename)
|
8
|
-
path = File.join(File.dirname(__FILE__), "..", "..", "cdata", filename)
|
9
|
-
Hash.new.tap { |map|
|
10
|
-
File.open(path, "r:US-ASCII:-") do |input|
|
11
|
-
buffer = "x" * 6
|
12
|
-
buffer.force_encoding(Encoding::US_ASCII)
|
13
|
-
while input.read(6, buffer)
|
14
|
-
map[buffer.to_i(16)] = input.read(6, buffer).to_i(16)
|
15
|
-
end
|
16
|
-
end
|
17
|
-
}
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
21
|
-
|
22
|
-
end
|
data/lib/unicode_utils/read_codepoint_set.rb
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
module UnicodeUtils
|
4
|
-
|
5
|
-
module Impl # :nodoc:
|
6
|
-
|
7
|
-
def self.read_codepoint_set(filename)
|
8
|
-
path = File.join(File.dirname(__FILE__), "..", "..", "cdata", filename)
|
9
|
-
Hash.new.tap { |set|
|
10
|
-
File.open(path, "r:US-ASCII:-") do |input|
|
11
|
-
buffer = "x" * 6
|
12
|
-
buffer.force_encoding(Encoding::US_ASCII)
|
13
|
-
while input.read(6, buffer)
|
14
|
-
set[buffer.to_i(16)] = true
|
15
|
-
end
|
16
|
-
end
|
17
|
-
}
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
21
|
-
|
22
|
-
end
|
data/lib/unicode_utils/read_multivalued_map.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
module UnicodeUtils
|
4
|
-
|
5
|
-
module Impl # :nodoc:
|
6
|
-
|
7
|
-
def self.read_multivalued_map(filename)
|
8
|
-
path = File.join(File.dirname(__FILE__), "..", "..", "cdata", filename)
|
9
|
-
Hash.new.tap { |map|
|
10
|
-
File.open(path, "r:US-ASCII:-") do |input|
|
11
|
-
buffer = "x" * 6
|
12
|
-
buffer.force_encoding(Encoding::US_ASCII)
|
13
|
-
while input.read(6, buffer)
|
14
|
-
cp = buffer.to_i(16)
|
15
|
-
mapping = []
|
16
|
-
while input.read(6, buffer).getbyte(0) != 120
|
17
|
-
mapping << buffer.to_i(16)
|
18
|
-
end
|
19
|
-
map[cp] = mapping
|
20
|
-
end
|
21
|
-
end
|
22
|
-
}
|
23
|
-
end
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
data/lib/unicode_utils/read_names.rb
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
module UnicodeUtils
|
4
|
-
|
5
|
-
module Impl # :nodoc:
|
6
|
-
|
7
|
-
def self.read_names(filename)
|
8
|
-
path = File.join(File.dirname(__FILE__), "..", "..", "cdata", filename)
|
9
|
-
Hash.new.tap { |map|
|
10
|
-
File.open(path, "r:US-ASCII:-") do |input|
|
11
|
-
buffer = "x" * 6
|
12
|
-
buffer.force_encoding(Encoding::US_ASCII)
|
13
|
-
while input.read(6, buffer)
|
14
|
-
map[buffer.to_i(16)] = input.gets.tap { |x| x.chomp! }
|
15
|
-
end
|
16
|
-
end
|
17
|
-
}
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
21
|
-
|
22
|
-
end
|
data/test/test_normalization.rb
DELETED
@@ -1,95 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
require "test/unit"
|
4
|
-
|
5
|
-
require "unicode_utils/nfd"
|
6
|
-
require "unicode_utils/nfc"
|
7
|
-
|
8
|
-
# See data/NormalizationTest.txt
|
9
|
-
class TestNormalization < Test::Unit::TestCase
|
10
|
-
|
11
|
-
class Record
|
12
|
-
def initialize(ary)
|
13
|
-
@ary = ary
|
14
|
-
end
|
15
|
-
def c1
|
16
|
-
@ary[0]
|
17
|
-
end
|
18
|
-
def c2
|
19
|
-
@ary[1]
|
20
|
-
end
|
21
|
-
def c3
|
22
|
-
@ary[2]
|
23
|
-
end
|
24
|
-
def c4
|
25
|
-
@ary[3]
|
26
|
-
end
|
27
|
-
def c5
|
28
|
-
@ary[4]
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
def each_testdata_record
|
33
|
-
fn = File.join(File.dirname(__FILE__),
|
34
|
-
"..", "data", "NormalizationTest.txt")
|
35
|
-
File.open(fn, "r:utf-8:-") do |input|
|
36
|
-
input.each_line { |line|
|
37
|
-
if line =~ /^([^#]*)#/
|
38
|
-
line = $1
|
39
|
-
end
|
40
|
-
line.strip!
|
41
|
-
next if line.empty? || line =~ /^@Part/
|
42
|
-
columns = line.split(";")
|
43
|
-
ary = columns.map { |column|
|
44
|
-
String.new.force_encoding(Encoding::UTF_8).tap do |str|
|
45
|
-
column.split(" ").each { |c|
|
46
|
-
str << c.strip.to_i(16)
|
47
|
-
}
|
48
|
-
end
|
49
|
-
}
|
50
|
-
yield Record.new(ary)
|
51
|
-
}
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
def test_nfd
|
56
|
-
each_testdata_record { |r|
|
57
|
-
assert_equal r.c3, UnicodeUtils.nfd(r.c1)
|
58
|
-
assert_equal r.c3, UnicodeUtils.nfd(r.c2)
|
59
|
-
assert_equal r.c3, UnicodeUtils.nfd(r.c3)
|
60
|
-
assert_equal r.c5, UnicodeUtils.nfd(r.c4)
|
61
|
-
assert_equal r.c5, UnicodeUtils.nfd(r.c5)
|
62
|
-
}
|
63
|
-
end
|
64
|
-
|
65
|
-
def test_nfc
|
66
|
-
each_testdata_record { |r|
|
67
|
-
assert_equal r.c2, UnicodeUtils.nfc(r.c1)
|
68
|
-
assert_equal r.c2, UnicodeUtils.nfc(r.c2)
|
69
|
-
assert_equal r.c2, UnicodeUtils.nfc(r.c3)
|
70
|
-
assert_equal r.c4, UnicodeUtils.nfc(r.c4)
|
71
|
-
assert_equal r.c4, UnicodeUtils.nfc(r.c5)
|
72
|
-
}
|
73
|
-
end
|
74
|
-
|
75
|
-
def test_nfkd
|
76
|
-
each_testdata_record { |r|
|
77
|
-
assert_equal r.c5, UnicodeUtils.nfkd(r.c1)
|
78
|
-
assert_equal r.c5, UnicodeUtils.nfkd(r.c2)
|
79
|
-
assert_equal r.c5, UnicodeUtils.nfkd(r.c3)
|
80
|
-
assert_equal r.c5, UnicodeUtils.nfkd(r.c4)
|
81
|
-
assert_equal r.c5, UnicodeUtils.nfkd(r.c5)
|
82
|
-
}
|
83
|
-
end
|
84
|
-
|
85
|
-
def test_nfkc
|
86
|
-
each_testdata_record { |r|
|
87
|
-
assert_equal r.c4, UnicodeUtils.nfkc(r.c1)
|
88
|
-
assert_equal r.c4, UnicodeUtils.nfkc(r.c2)
|
89
|
-
assert_equal r.c4, UnicodeUtils.nfkc(r.c3)
|
90
|
-
assert_equal r.c4, UnicodeUtils.nfkc(r.c4)
|
91
|
-
assert_equal r.c4, UnicodeUtils.nfkc(r.c5)
|
92
|
-
}
|
93
|
-
end
|
94
|
-
|
95
|
-
end
|