u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316)
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,31 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'u/build'
4
+
5
# Writes the preprocessor constants describing the Unicode data set.
class Constants
  # data    - object answering #last and #last_char_part1_x.
  # version - value interpolated into UNICODE_DATA_VERSION.
  # io      - stream that receives the defines (defaults to $stdout).
  def initialize(data, version, io = $stdout)
    # TODO: Defines should be made with a list, but we can't match formatting
    # quite yet, so wait until we have stable tests.
    U::Build::Header.new(io) do
      last_char = format('%#04x', data.last)
      last_char_part1 = data.last_char_part1_x
      defines = <<EOH
#define UNICODE_DATA_VERSION "#{version}"

#define UNICODE_LAST_CHAR #{last_char}

#define UNICODE_MAX_TABLE_INDEX ((UNICODE_LAST_CHAR + 1) / 256)

#define UNICODE_LAST_CHAR_PART1 #{last_char_part1}

#define UNICODE_FIRST_CHAR_PART2 0xe0000

#define UNICODE_LAST_PAGE_PART1 (UNICODE_LAST_CHAR_PART1 / 256)

#define UNICODE_LAST_PAGE_PART2 ((UNICODE_LAST_CHAR + 1 - UNICODE_FIRST_CHAR_PART2) / 256)

#define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
EOH
      io.puts defines
    end
  end
end
30
+
31
# First argument: path of the marshalled data file; second argument: the
# version string handed to Constants.
# NOTE(review): Marshal.load must only ever see trusted input - presumably the
# file is produced by this project's own build; confirm.
marshalled = File.open(ARGV[0], 'rb') { |file| file.read }
Constants.new(Marshal.load(marshalled), ARGV[1])
@@ -0,0 +1,85 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'u/build'
4
+
5
# Writes the C header holding the decomposition table (decomp_table) and the
# string of NUL-terminated expansions it indexes (decomp_expansion_string).
class Decompose
  # Largest value that fits the uint16_t offset columns; it matches the
  # UINT16_MAX sentinel emitted as UNICODE_NOT_PRESENT_OFFSET.  The parentheses
  # are required: binary minus binds tighter than <<, so the previous
  # `1 << 16 - 1` evaluated to 1 << 15 (32768) rather than 65535.
  NotPresentOffset = (1 << 16) - 1

  # data - object answering #last and #[], whose entries answer #decomposition.
  # io   - stream that receives the header (defaults to $stdout).
  def initialize(data, io = $stdout)
    # TODO: Defines should be made with a list, but we can't match formatting
    # quite yet, so wait until we have stable tests.
    U::Build::Header.new(io) do
      io.puts <<EOD
#define UNICODE_NOT_PRESENT_OFFSET UINT16_MAX
EOD
      io.puts DecompositionTable.new(data)
    end
  end

  private

  # Table with one row per code point that has a decomposition; each row
  # carries the offsets of its canonical and compatibility expansions.
  class DecompositionTable < U::Build::Header::Table
    # Builds every row up front.
    #
    # Raises RuntimeError once the running string offset exceeds
    # NotPresentOffset, because the offsets are stored as uint16_t.
    def initialize(data)
      super "static const struct {
\tuint32_t ch;
\tuint16_t canon_offset;
\tuint16_t compat_offset;
} decomp_table[]"
      @data = data
      @decomp_offsets = {}
      @decomp_string = ''
      @decomp_string_offset = 0
      0.upto(data.last) do |i|
        next if data[i].decomposition.empty?
        canon_decomp = data[i].decomposition.canonical? ? make_decomp(i, false) : nil
        compat_decomp = make_decomp(i, true)
        # A compatibility expansion equal to the canonical one is not stored
        # a second time; the row gets the not-present sentinel instead.
        compat_decomp = nil if canon_decomp and compat_decomp == canon_decomp
        canon_offset = handle_decomp(canon_decomp)
        compat_offset = handle_decomp(compat_decomp)
        raise RuntimeError,
          'decomposition string offset beyond not-present offset, upgrade value: offset: %d, max: %d' %
            [@decomp_string_offset, NotPresentOffset] if
              @decomp_string_offset > NotPresentOffset
        self << U::Build::Header::Table::Row.new('%#06x' % i, canon_offset.to_s, compat_offset.to_s)
      end
    end

    # Returns the table followed by the declaration of the expansion string.
    def to_s
      "%s\nstatic const char decomp_expansion_string[] = %s;" % [super, @decomp_string]
    end

    private

    # Returns the fully expanded decomposition of point packed as UTF-8.
    def make_decomp(point, compatible)
      expand_decomp(point, compatible).flatten.pack('U*')
    end

    # Recursively expands the decomposition of point.  A part is expanded
    # further when it has a decomposition of its own and either compatible is
    # set or that decomposition is canonical.  The result may be nested.
    def expand_decomp(point, compatible)
      @data[point].decomposition.map{ |part|
        if not @data[part].decomposition.empty? and
            (compatible or @data[part].decomposition.canonical?)
          expand_decomp(part, compatible)
        else
          part
        end
      }
    end

    # Returns the offset of decomp inside the expansion string, appending it
    # on first sight; returns the sentinel macro name when decomp is nil.
    def handle_decomp(decomp)
      return 'UNICODE_NOT_PRESENT_OFFSET' unless decomp
      return @decomp_offsets[decomp] if @decomp_offsets.include? decomp
      @decomp_offsets[decomp] = @decomp_string_offset
      @decomp_string << "\n \"%s\\0\" /* offset %d */" %
        [escape(decomp), @decomp_string_offset]
      result = @decomp_string_offset
      # One extra byte for the "\0" terminator written after each expansion.
      @decomp_string_offset += decomp.bytesize + 1
      result
    end

    # Renders every byte of decomp as a \xNN escape.
    def escape(decomp)
      decomp.unpack('H*')[0].gsub(/(.{2})/, '\\x\1')
    end
  end
end

# The first argument names the marshalled data file.
# NOTE(review): Marshal.load must only ever see trusted input - presumably the
# file is produced by this project's own build; confirm.
Decompose.new(Marshal.load(File.open(ARGV[0], 'rb', &:read)))
@@ -0,0 +1,61 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'u/build'
4
+
5
# Writes the split general-category table.
class Category
  # data - object indexable by code point whose entries answer #type.
  # io   - stream that receives the header (defaults to $stdout).
  def initialize(data, io = $stdout)
    U::Build::Header.new(io) do
      table = U::Build::Header::Tables::Split.new(data, 'uint8_t', 'general_category') do |point|
        format('U_GENERAL_CATEGORY_%s', Mappings[data[point].type])
      end
      io.puts table
    end
  end

  private

  # Map general category code onto symbolic name.
  module Mappings
    Mappings = {
      'Cc' => 'OTHER_CONTROL',
      'Cf' => 'OTHER_FORMAT',
      'Cn' => 'OTHER_NOT_ASSIGNED',
      'Co' => 'OTHER_PRIVATE_USE',
      'Cs' => 'OTHER_SURROGATE',
      'Ll' => 'LETTER_LOWERCASE',
      'Lm' => 'LETTER_MODIFIER',
      'Lo' => 'LETTER_OTHER',
      'Lt' => 'LETTER_TITLECASE',
      'Lu' => 'LETTER_UPPERCASE',
      'Mc' => 'MARK_SPACING_COMBINING',
      'Me' => 'MARK_ENCLOSING',
      'Mn' => 'MARK_NON_SPACING',
      'Nd' => 'NUMBER_DECIMAL',
      'Nl' => 'NUMBER_LETTER',
      'No' => 'NUMBER_OTHER',
      'Pc' => 'PUNCTUATION_CONNECTOR',
      'Pd' => 'PUNCTUATION_DASH',
      'Pe' => 'PUNCTUATION_CLOSE',
      'Pf' => 'PUNCTUATION_FINAL_QUOTE',
      'Pi' => 'PUNCTUATION_INITIAL_QUOTE',
      'Po' => 'PUNCTUATION_OTHER',
      'Ps' => 'PUNCTUATION_OPEN',
      'Sc' => 'SYMBOL_CURRENCY',
      'Sk' => 'SYMBOL_MODIFIER',
      'Sm' => 'SYMBOL_MATH',
      'So' => 'SYMBOL_OTHER',
      'Zl' => 'SEPARATOR_LINE',
      'Zp' => 'SEPARATOR_PARAGRAPH',
      'Zs' => 'SEPARATOR_SPACE',
    }.freeze

    # Looks up the symbolic name for a two-letter category code and raises
    # RuntimeError for a code that is not in the table.
    def self.[](value)
      Mappings.fetch(value) do
        raise 'unknown General_Category property value: %s; the Unicode standard has changed' %
          value
      end
    end
  end
end
60
+
61
# The first argument names the marshalled data file.
# NOTE(review): Marshal.load must only ever see trusted input - presumably the
# file is produced by this project's own build; confirm.
marshalled = File.open(ARGV[0], 'rb') { |file| file.read }
Category.new(Marshal.load(marshalled))
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'u/build'
4
+
5
# Writes a split table of break property values.
class Break
  # data   - handed to the split table unchanged.
  # breaks - indexable by code point, yielding the property name.
  # name   - break kind; used lower-case in the table name and upper-case in
  #          the generated constant names.
  # io     - stream that receives the header (defaults to $stdout).
  def initialize(data, breaks, name, io = $stdout)
    U::Build::Header.new(io) do
      table = U::Build::Header::Tables::Split.new(data, 'uint8_t', format('%s_break_property', name)) do |point|
        format('U_%s_BREAK_%s', name.upcase, breaks[point].upcase)
      end
      io.puts table
    end
  end
end
14
+
15
# Arguments: marshalled data file, break property file, break kind name.
# NOTE(review): Marshal.load must only ever see trusted input - presumably the
# file is produced by this project's own build; confirm.
marshalled = File.open(ARGV[0], 'rb') { |file| file.read }
Break.new(Marshal.load(marshalled), U::Build::Data::Break.new(ARGV[1]), ARGV[2])
@@ -0,0 +1,5 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'u/build'
4
+
5
# Marshal output is binary, and the scripts that consume it open the file in
# 'rb' mode.  Switch standard output to binary mode as well so that platforms
# which translate newlines on text streams cannot corrupt the dump.
$stdout.binmode
Marshal.dump(U::Build::Data::Unicode.new(ARGV[0]), $stdout)
@@ -0,0 +1,91 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'u/build'
4
+
5
# Writes the script tables: a directly indexed table for the code points in
# EasyRange and a run-length table for every code point at or above its end.
class Script
  EasyRange = 0x0000...0x2000

  # Returns the C constant name for a script name.
  def self.name(script)
    format('U_SCRIPT_%s', script.upcase)
  end

  # scripts - indexable by code point and enumerable as (point, script) pairs.
  # io      - stream that receives the header (defaults to $stdout).
  def initialize(scripts, io = $stdout)
    U::Build::Header.new(io) do
      io.puts <<EOH
#define EASY_SCRIPTS_RANGE #{EasyRange.end}
EOH
      [EasyTable, Table].each do |table|
        io.puts table.new(scripts)
      end
    end
  end

  private

  # One entry per code point in EasyRange, all collected in a single row.
  class EasyTable < U::Build::Header::Table
    def initialize(scripts)
      super format('static const unsigned char script_easy_table[%d]', EasyRange.end)
      entries = Row.new
      EasyRange.each do |point|
        entries << Script.name(scripts[point])
      end
      self << entries
    end

    private

    # Row overriding the formatting hooks of the generic table row.
    class Row < U::Build::Header::Table::Row
      private

      def single_format
        "\t%s"
      end

      def multi_format
        single_format
      end

      def multi_joiner
        ",\n\t"
      end

      def multi_limit
        72
      end
    end
  end

  # Runs of consecutive code points sharing a script, stored as
  # (start, length, script) rows.
  class Table < U::Build::Header::Table
    def initialize(scripts)
      super "static const struct {
\tuint32_t start;
\tuint16_t chars;
\tuint16_t script;
} script_table[]"
      pairs = scripts.select{ |point, _script| point >= EasyRange.end }.sort
      run_start = pairs[0][0]
      # Starts one below the first point so that the first pair extends the
      # (still empty) run instead of emitting a row.
      run_end = run_start - 1
      run_script = pairs[0][1]
      pairs.each do |point, script|
        if point == run_end + 1 && script == run_script
          run_end += 1
          next
        end
        row run_start, run_end, run_script
        run_start = run_end = point
        run_script = script
      end
      row run_start, run_end, run_script
    end

    private

    # Appends one row covering first..last for script.
    def row(first, last, script)
      cells = format('%#06x, %5d, %s', first, last - first + 1, Script.name(script))
      self << U::Build::Header::Table::Row.new(cells)
    end
  end
end

# The first argument is handed to the scripts data reader.
Script.new(U::Build::Data::Scripts.new(ARGV[0]))
@@ -0,0 +1,17 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'u/build'
4
+
5
# Writes the interval table of code points carrying the Soft_Dotted property.
class SoftDotted
  # prop_list - enumerable of (point, properties) pairs.
  # io        - stream that receives the header (defaults to $stdout).
  def initialize(prop_list, io = $stdout)
    U::Build::Header.new(io) do
      soft_dotted = prop_list.select{ |_point, properties| properties.include? 'Soft_Dotted' }
      points = soft_dotted.map{ |point, _properties| point }
      io.puts U::Build::Header::Tables::Intervals.new(points, 'soft_dotted')
    end
  end
end
16
+
17
# The first argument is handed to the property list reader.
prop_list = U::Build::Data::PropList.new(ARGV[0])
SoftDotted.new(prop_list)
@@ -0,0 +1,30 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'u/build'
4
+
5
# Writes the title-case table.
class TitleTable
  # data - enumerable of entries answering #title_to_lower and #title_to_upper.
  # io   - stream that receives the header (defaults to $stdout).
  def initialize(data, io = $stdout)
    U::Build::Header.new(io) do
      # Inside this class the constant resolves to the nested table class
      # declared below, not to the generator itself.
      io.puts TitleTable.new(data)
    end
  end

  private

  # One (ch, upper, lower) row for every entry that has a title-to-lower
  # mapping; the index of the entry in data is used as ch.
  class TitleTable < U::Build::Header::Table
    def initialize(data)
      super "static const struct {
\tuint32_t ch;
\tuint32_t upper;
\tuint32_t lower;
} title_table[]"
      # TODO: Add #code to Entry and use #select on data here.
      data.each_with_index do |entry, code|
        next unless entry.title_to_lower
        cells = [code, entry.title_to_upper, entry.title_to_lower].map{ |value| '0x%04x' % value }
        self << U::Build::Header::Table::Row.new(*cells)
      end
    end
  end
end

# The first argument names the marshalled data file.
# NOTE(review): Marshal.load must only ever see trusted input - presumably the
# file is produced by this project's own build; confirm.
TitleTable.new(Marshal.load(File.open(ARGV[0], 'rb', &:read)))
@@ -0,0 +1,17 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'u/build'
4
+
5
# Writes the interval table of code points whose width value is one of the
# requested types.
class Wide
  # derived_east_asian_width - enumerable of (point, width) pairs.
  # types                    - collection of width values to keep.
  # io                       - stream that receives the header (defaults to
  #                            $stdout).
  def initialize(derived_east_asian_width, types, io = $stdout)
    U::Build::Header.new(io) do
      matching = derived_east_asian_width.select{ |_point, width| types.include? width }
      points = matching.map{ |point, _width| point }
      io.puts U::Build::Header::Tables::Intervals.new(points, 'wide')
    end
  end
end
16
+
17
# The first argument is handed to the width data reader; every remaining
# argument is a width value to include.
widths = U::Build::Data::DerivedEastAsianWidth.new(ARGV[0])
Wide.new(widths, ARGV[1..-1])
@@ -0,0 +1,8 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Namespace of the build-time helpers; loads the data readers first and the
# header writers second.
module U
  module Build
    %w[data header].each do |part|
      require "u/build/#{part}"
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Loads every data reader, in the order listed.
# NOTE(review): no derivednormalizationprops.rb appears among the files listed
# for this release - confirm it ships, otherwise that require raises LoadError.
module U::Build::Data
  %w[
    file
    bidimirroring
    break
    casefolding
    compositionexclusions
    derivedeastasianwidth
    derivednormalizationprops
    linebreak
    proplist
    scripts
    specialcasing
    unicode
  ].each do |reader|
    require "u/build/data/#{reader}"
  end
end
@@ -0,0 +1,26 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Enumerable collection of the mirroring pairs read from a data file.
class U::Build::Data::BidiMirroring
  include Enumerable

  # A code point together with its mirrored counterpart.
  class Entry
    attr_reader :char, :mirrored

    def initialize(char, mirrored)
      @char = char
      @mirrored = mirrored
    end
  end

  # Reads path as two-field records; the second field is parsed as hex.
  def initialize(path)
    @entries = []
    U::Build::Data::File.each(path, 2) do |point, mirrored|
      @entries.push(Entry.new(point, mirrored.hex))
    end
  end

  # Yields every Entry in the order it was read.
  def each
    @entries.each{ |entry| yield entry }
  end
end
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Lookup from code point to break property value; points absent from the data
# file answer 'Other'.
class U::Build::Data::Break
  # Reads path as two-field (point, property) records.
  def initialize(path)
    # The default block hands out a fresh 'Other' on every miss and does not
    # store it in the hash.
    @entries = Hash.new{ |_entries, _point| 'Other' }
    U::Build::Data::File.each(path, 2) do |point, property|
      @entries.store(point, property)
    end
  end

  # Returns the property recorded for point, or 'Other'.
  def [](point)
    @entries[point]
  end
end
@@ -0,0 +1,77 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Enumerable collection of the case foldings that are not already covered by
# the simple lower-case mapping of the character.
class U::Build::Data::CaseFolding
  include Enumerable

  # data           - indexable by code point; entries answer #type, #value and
  #                  #title_to_lower.
  # special_casing - answers #include? for a code point.
  # path           - case folding data file.
  def initialize(data, special_casing, path)
    @entries = []
    File.new(data, special_casing, path).each{ |entry| @entries.push(entry) }
  end

  # Yields every Entry in the order it was read.
  def each
    @entries.each{ |entry| yield entry }
  end

  # A character and the points it folds to.
  class Entry
    attr_reader :char

    def initialize(char, points)
      @char = char
      @points = points
    end

    def to_s
      @points.to_s
    end

    def to_escaped_s
      @points.to_escaped_s
    end
  end

  private

  # Reader for the folding file.  Inside CaseFolding this constant takes
  # precedence over the core File class.
  class File
    def initialize(data, special_casing, path)
      @data = data
      @special_casing = special_casing
      @path = path
    end

    # Yields an Entry for each record that is neither of status S or T nor a
    # simple folding.
    def each
      U::Build::Data::File.each(@path, 4) do |point, status, mapping|
        next if simple_or_turkic? status
        points = U::Build::Data::Unicode::Points.new(mapping)
        next if simple? points, point
        yield Entry.new(point, points)
      end
    end

    private

    def simple_or_turkic?(status)
      %w[S T].include?(status)
    end

    # True when the folding is a single point equal to the lower-case form of
    # code, code has no special casing and its entry has a type.
    def simple?(points, code)
      points.length == 1 &&
        !@special_casing.include?(code) &&
        @data[code].type &&
        lower(code) == points.first
    end

    # Lower-case form of code: the title-to-lower mapping for 'Lt', the entry
    # value for 'Lu', and code itself for every other type.
    def lower(code)
      entry = @data[code]
      case entry.type
      when 'Lt' then entry.title_to_lower
      when 'Lu' then entry.value
      else code
      end
    end
  end
end