u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,64 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# A single brace-enclosed row of cells in a generated table.
#
# A row whose cells fit within the width limit is rendered on one line
# ("\t{ a, b }"); a longer row is wrapped over several lines.  The private
# formatting hooks (multi_limit, multi_format, multi_joiner, single_format)
# can be overridden by subclasses to change the layout.
class U::Build::Header::Table::Row
  # cells: the initial cells; each must respond to #length and #to_s.
  def initialize(*cells)
    @cells = cells
  end

  # Appends a cell and returns the row so that calls can be chained.
  def <<(cell)
    @cells << cell
    self
  end

  # Renders the row, wrapping it when the cells (each counted with two extra
  # characters for its ", " separator) exceed multi_limit.
  def to_s
    total = @cells.reduce(0) { |sum, cell| sum + cell.length + 2 }
    return single_format % @cells.join(', ') unless total > multi_limit

    lines = [Line.new]
    @cells.each do |cell|
      current = lines.last
      # Only break when the current line already holds a cell.  Breaking on
      # an empty line would leave an empty first line behind, which renders
      # as a dangling "," in front of an over-long cell.
      if !current.empty? && current.length + cell.length + 2 > multi_limit - 3
        lines << Line.new
      end
      lines.last << cell
    end
    multi_format % lines.join(multi_joiner)
  end

  private

  # Width above which a row is wrapped.
  def multi_limit
    65
  end

  # Format of a wrapped row; %s receives the joined lines.
  def multi_format
    "\t{\n\t\t%s\n\t}"
  end

  # Separator placed between the lines of a wrapped row.
  def multi_joiner
    ",\n\t\t"
  end

  # Format of a row that fits on one line; %s receives the joined cells.
  def single_format
    "\t{ %s }"
  end

  # One output line of a wrapped row.
  class Line
    def initialize
      @cells = []
      @length = 0
    end

    # Appends a cell and returns the line.
    def <<(cell)
      @cells << cell
      @length += cell.length
      self
    end

    # True while no cell has been added to the line.
    def empty?
      @cells.empty?
    end

    # Rendered width: the cell lengths plus two characters per separator.
    def length
      @length + (@cells.length - 1) * 2
    end

    def to_s
      @cells.join(', ')
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Namespace for the table generators; loading it pulls in the Intervals and
# Split table implementations required below.
+ module U::Build::Header::Tables
4
+ require 'u/build/header/tables/intervals'
5
+ require 'u/build/header/tables/split'
6
+ end
@@ -0,0 +1,50 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# A table of closed [first, last] intervals built from a collection of
# integer points: runs of consecutive points are merged into one interval.
# The rendered text also contains the C declaration of the interval struct
# and a comparison function for looking a value up in the table.
class U::Build::Header::Tables::Intervals < U::Build::Header::Table
  # points: a sortable collection of integers (may be empty, in which case
  #         no rows are added).
  # name:   identifier of the generated array.
  def initialize(points, name)
    super 'static const struct uint32_t_interval %s[]' % name
    return if points.empty?
    first = last = nil
    # uniq keeps a repeated point from closing the current interval and
    # opening an overlapping one.
    points.sort.uniq.each do |point|
      if last && point == last + 1
        last = point
      else
        row first, last if first
        first = last = point
      end
    end
    row first, last
  end

  # Prepends the struct and comparison-function definitions to the table
  # text produced by the superclass.
  def to_s
    '%s%s' % [<<EOH, super]
struct uint32_t_interval {
uint32_t first;
uint32_t last;
};

static int
u_char_interval_compare(const void *key, const void *element)
{
uint32_t c = *(uint32_t *)key;
struct uint32_t_interval *interval = (struct uint32_t_interval *)element;

if (c < interval->first)
return -1;
else if (c > interval->last)
return +1;
else
return 0;
}
EOH
  end

  private

  # Adds one interval row, formatted as two zero-padded hexadecimal numbers.
  def row(first, last)
    self << U::Build::Header::Table::Row.new('%#06x, %#06x' % [first, last])
  end
end
@@ -0,0 +1,20 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Renders a lookup table split into three pieces: a data table holding the
# non-homogenous 256-entry pages, and two index tables (part 1 and part 2)
# that refer to those pages.
class U::Build::Header::Tables::Split
  require 'u/build/header/tables/split/row'
  require 'u/build/header/tables/split/rows'
  require 'u/build/header/tables/split/data'
  require 'u/build/header/tables/split/part1'
  require 'u/build/header/tables/split/part2'

  # data: must respond to #last and #last_char_part1_i (defined outside this
  #       file; presumably the highest code point overall and of part 1).
  # type: type of the generated data table's elements.
  # name: prefix used for the names of the generated tables.
  # The block is called with each code point and returns that code point's
  # cell value.
  def initialize(data, type, name)
    pages = Rows.new(0, data.last) { |code| yield(code) }
    @data = Data.new("#{type} #{name}_data", pages)
    part1_name = '%s_table_part1' % name
    @part1 = Part1.new(part1_name, 0, data.last_char_part1_i, pages, 0)
    part2_name = '%s_table_part2' % name
    @part2 = Part2.new(part2_name, data.last, pages, @part1)
  end

  # The three tables, separated by newlines.
  def to_s
    "#{@data}\n#{@part1}\n#{@part2}"
  end
end
@@ -0,0 +1,29 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# The data part of a split table: one 256-cell row for every page that is
# not homogenous.  Homogenous pages are left out.
class U::Build::Header::Tables::Split::Data < U::Build::Header::Table
  # declaration: type and name of the generated array.
  # rows:        the pages; each must respond to #homogenous?, #start and
  #              #each.
  def initialize(declaration, rows)
    super('static const %s[][256]' % declaration)
    mixed_pages = rows.reject { |page| page.homogenous? }
    mixed_pages.each_with_index do |page, index|
      table_row = Row.new(page.start / 256, index)
      page.each { |cell| table_row << cell }
      self << table_row
    end
  end

  # A table row whose opening brace is followed by a comment naming the page
  # and the row's index within the data table.
  class Row < U::Build::Header::Table::Row
    def initialize(page, index)
      super()
      # "%%s" survives this formatting step as "%s", which is filled in with
      # the cells when the row is rendered.
      @format = "\t{ /* page %d, index %d */\n\t\t%%s\n\t}" % [page, index]
    end

    private

    def multi_format
      @format
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Index table for one range of a split table.  Each page of 256 code points
# gets one entry: a homogenous page is encoded as its single value plus
# UNICODE_MAX_TABLE_INDEX, any other page as a running index.
class U::Build::Header::Tables::Split::Part1 < U::Build::Header::Table
  # Name of the macro used for the size of the generated array.
  def self.last_page
    'UNICODE_LAST_PAGE_PART1'
  end

  # The running page index after this table's pages have been numbered.
  attr_reader :set

  # name:        identifier of the generated array.
  # first, last: first and last code point covered (stepped in pages of 256).
  # rows:        pages, indexed by page number.
  # set:         index given to the first non-homogenous page.
  def initialize(name, first, last, rows, set = 0)
    header = "/* U+%04X through U+%04X */\nstatic const int16_t %s[%s + 1]" %
             [first, last, name, self.class.last_page]
    super(header)
    # TODO: Check if we can use each or something, instead, or if we can use
    # an array for Rows.
    next_index = set
    first.step(last, 256) do |code|
      page = rows[code / 256]
      if page.homogenous?
        self << "\t%s + UNICODE_MAX_TABLE_INDEX" % page[0]
      else
        self << "\t%d /* page %d */" % [next_index, page.start / 256]
        next_index += 1
      end
    end
    @set = next_index
  end
end
@@ -0,0 +1,13 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Index table for the second range of a split table.  It starts at U+E0000
# and continues the page numbering where the given part 1 table stopped.
class U::Build::Header::Tables::Split::Part2 < U::Build::Header::Tables::Split::Part1
  # Name of the macro used for the size of the generated array.
  def self.last_page
    'UNICODE_LAST_PAGE_PART2'
  end

  # name:  identifier of the generated array.
  # last:  last code point covered.
  # rows:  pages, indexed by page number.
  # part1: the part 1 table, whose #set gives the first free page index.
  def initialize(name, last, rows, part1)
    super(name, 0xe0000, last, rows, part1.set)
  end
end
@@ -0,0 +1,34 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# One page of a split table: the 256 cell values for the code points
# start..start + 255, as computed by the block given to the constructor.
class U::Build::Header::Tables::Split::Row
  include Enumerable

  # The first code point of the page.
  attr_reader :start

  # start: first code point of the page.  The block is called once for each
  # of the 256 code points and returns that code point's cell.
  def initialize(start)
    @start = start
    @cells = Array.new(256) { |offset| yield(start + offset) }
    # TODO: Perhaps don’t cache this if this turns out to be too memory
    # consuming.
    @homogenous = @cells.none? { |cell| cell != @cells[0] }
  end

  # Yields every cell in order and returns the row itself.
  def each
    @cells.each { |cell| yield cell }
    self
  end

  # The cell at the given offset within the page.
  def [](index)
    @cells[index]
  end

  # True if every cell of the page equals the first one.
  def homogenous?
    @homogenous
  end
end
@@ -0,0 +1,22 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# The pages of a split table, stored by page number (code point / 256).
class U::Build::Header::Tables::Split::Rows
  include Enumerable

  # first, last: range of code points to cover, in steps of 256.  The block
  # is passed on to every page and returns the cell for a code point.
  def initialize(first, last)
    @rows = []
    first.step(last, 256) do |start|
      page = U::Build::Header::Tables::Split::Row.new(start) { |code| yield(code) }
      @rows[start / 256] = page
    end
  end

  # Yields every stored page in page-number order.
  def each
    @rows.each { |page| yield page }
  end

  # The page with the given page number.
  def [](index)
    @rows[index]
  end
end
@@ -0,0 +1,45 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Generates a file of expectations from a break test file.
#
# Every data line of the input lists code points in hexadecimal, separated by
# U+00F7 (break) and U+00D7 (no break) markers.  For each such line one
# expectation is written that compares the expected pieces with the result of
# calling the given method on the whole sequence.
class Break
  # path:   the break test file to read.
  # method: name of the method to call in the generated expectations.
  # io:     where the generated code is written.
  def initialize(path, method, io = $stdout)
    io.puts '# -*- coding: utf-8 -*-',
            '# Automatically generated, so don’t edit!',
            '',
            'Expectations do'
    Lines.new(path).each do |splits|
      pieces = splits.map { |e| uify(e) }.join(', ')
      io.printf " expect [%s] do %s.%s.to_a end\n",
                pieces,
                uify(splits.join(' ')),
                method
    end
    io.puts 'end'
  end

  private

  # Turns a space-separated list of hexadecimal code points into source code
  # that builds the corresponding string.  D800 is replaced by U+0001
  # (presumably because a surrogate cannot appear in valid UTF-8; confirm).
  def uify(chars)
    codes = chars.split(' ').map { |e| '0x%04x' % (e == 'D800' ? 0x0001 : e.to_i(16)) }
    "[%s].pack('U*').u" % codes.join(', ')
  end

  # Enumerates the data lines of a break test file.  Each line is yielded as
  # an array with one space-separated code point list per unbroken piece.
  class Lines
    include Enumerable

    def initialize(path)
      @path = path
    end

    def each
      # The markers matched below are non-ASCII, so the file has to be read
      # as UTF-8 whatever the default external encoding is; otherwise the
      # matches raise Encoding::CompatibilityError.
      File.open(@path, 'r:UTF-8') do |file|
        file.each_line do |line|
          # Skip comment lines and blank lines.
          next if line =~ /\A(?:#|\s*\Z)/
          yield line.gsub(/\s*\u{d7}\s*/, ' ').
                     sub(/\A\s*\u{f7}\s*/, '').
                     sub(/\s*#.*\Z/, '').
                     split(/\s*\u{f7}\s*/)
        end
      end
    end
  end
end
44
+
45
# Script entry point: the first command-line argument is passed to Break as
# the input path and the second as the method name; output goes to $stdout.
+ Break.new ARGV[0], ARGV[1]
@@ -0,0 +1,178 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Helper for turning textual code point lists into strings.
module Utfify
  class << self
    # Converts a whitespace-separated list of hexadecimal code points, such
    # as "0053 0073", into the string made up of those characters.  An empty
    # list gives an empty string.
    def utfify(codepoints)
      return '' if codepoints.empty?
      # A plain split ignores leading and trailing whitespace.  Splitting on
      # /\s+/ would produce a leading empty field, which "hex" turns into a
      # spurious U+0000.
      codepoints.split.map { |cp| cp.hex }.pack('U*')
    end
  end
end
11
+
12
# The special case mappings read from a file of semicolon-separated fields:
# code point, lowercase, titlecase and uppercase mappings and, optionally, a
# condition.
#
# Entries are stored per code point, so when a code point appears on several
# lines the last of those lines wins.
class SpecialCasing
  # Positions of the fields on a line.
  Code = 0
  Lower = 1
  Title = 2
  Upper = 3
  Condition = 4

  # path: the file to read.  Raises RuntimeError if a data line does not have
  # four or five fields.
  def initialize(path)
    # A Hash keeps memory use proportional to the number of entries instead
    # of to the highest code point.
    @entries = {}

    Lines.new(path).each do |fields|
      @entries[fields[Code].hex] =
        Entry.new(Utfify.utfify(fields[Upper]),
                  Utfify.utfify(fields[Lower]),
                  Utfify.utfify(fields[Title]),
                  fields[Condition])
    end
  end

  # True if there is an entry for the code point +code+.
  def include?(code)
    @entries.key?(code)
  end

  # The entry for the code point +code+, or nil if there is none.
  def [](code)
    @entries[code]
  end

  # The mappings of one code point.  +condition+ is nil when the line had no
  # condition field.
  class Entry
    def initialize(upper, lower, title, condition)
      @upper, @lower, @title, @condition = upper, lower, title, condition
    end

    attr_reader :upper, :lower, :title, :condition
  end

  # Enumerates the data lines of the file as arrays of fields, with the
  # trailing comment removed.
  class Lines
    include Enumerable

    def initialize(path)
      @path = path
    end

    def each
      File.open(@path, 'rb') do |file|
        file.each_line.with_index do |line, index|
          # Skip comment lines and blank lines.
          next if line =~ /\A(?:#|\s*\Z)/
          fields = line.chomp.sub(/\s*;\s*#.*\Z/, '').split(/\s*;\s*/)
          unless fields.size.between? 4, 5
            raise RuntimeError,
                  '%s:%d: wrong number of fields: %d instead of 4..5' %
                  [@path, index + 1, fields.size]
          end
          yield fields
        end
      end
    end
  end
end
66
+
67
# The entries of a Unicode data file with 15 semicolon-separated fields per
# line, reduced to code point, general category and the upper- and lowercase
# mapping fields.
class UnicodeData
  include Enumerable

  # Positions of the fields that are used.
  Code = 0
  Name = 1
  Category = 2
  Upper = 12
  Lower = 13
  Title = 14

  # path: the file to read.  Raises RuntimeError if a data line does not have
  # exactly 15 fields.
  def initialize(path)
    @entries = []

    Lines.new(path).each do |code, fields|
      @entries << Entry.new(code, fields[Category], fields[Upper], fields[Lower])
    end
  end

  # Yields every entry in file order.
  def each
    @entries.each do |entry|
      yield entry
    end
  end

  # One code point.  +upper+ and +lower+ hold the unconverted field text.
  class Entry
    def initialize(code, category, upper, lower)
      @code, @category, @upper, @lower = code, category, upper, lower
    end

    attr_reader :code, :category, :upper, :lower
  end

  # Enumerates the data lines of the file as a code point and its fields.
  # A "Last>" line of a cased letter category (Ll, Lu or Lt) that follows a
  # gap is expanded so that every code point in the gap is yielded too, with
  # the fields of the "Last>" line.
  class Lines
    include Enumerable

    def initialize(path)
      @path = path
    end

    def each
      File.open(@path, 'rb') do |file|
        previous = -1
        file.each_line.with_index do |line, index|
          # Skip comment lines and blank lines.
          next if line =~ /\A(?:#|\s*\Z)/
          # The limit of -1 keeps trailing empty fields, so a final line
          # without a newline still has all of its fields.
          fields = line.chomp.split(';', -1)
          unless fields.size == 15
            raise RuntimeError,
                  '%s:%d: wrong number of fields: %d instead of 15' %
                  [@path, index + 1, fields.size]
          end
          code = fields[Code].hex
          if code > previous + 1 && fields[Name] =~ /Last>$/ && fields[Category] =~ /^L[lut]$/
            # Start after +previous+: that code point was already yielded
            # when its own line was read.
            (previous + 1).upto(code - 1) do |c|
              yield c, fields
            end
          end
          yield code, fields
          previous = code
        end
      end
    end
  end
end
125
+
126
# Writes a file of case conversion expectations: one upcase expectation for
# every entry of category Ll or Lt and one downcase expectation for every
# entry of category Lu or Lt.  A special casing entry takes precedence over
# the simple mapping; nothing is written for a code point whose special
# casing entry has a condition or whose simple mapping is empty.
class Case
  # special:      must respond to #include? and #[] for a code point.
  # unicode_data: must respond to #each, yielding entries with #code,
  #               #category, #upper and #lower.
  # io:           where the generated code is written.
  def initialize(special, unicode_data, io = $stdout)
    @special = special
    @io = io
    @io.puts '# -*- coding: utf-8 -*-',
             '# Automatically generated, so don’t edit!',
             '',
             'Expectations do'
    unicode_data.each { |entry| generate entry }
    @io.puts 'end'
  end

  private

  # Writes the expectations that apply to the entry's category.
  def generate(entry)
    category = entry.category
    upcase entry if category == 'Ll' || category == 'Lt'
    downcase entry if category == 'Lu' || category == 'Lt'
  end

  def upcase(entry)
    convert entry, :upper, :upcase
  end

  def downcase(entry)
    convert entry, :lower, :downcase
  end

  # Writes the expectation for calling +method+ on the entry's character,
  # where +field+ names the mapping (upper or lower) holding the result.
  def convert(entry, field, method)
    if @special.include? entry.code
      return if @special[entry.code].condition
      generate1 @special[entry.code].send(field), u(entry), method
    else
      return if entry.send(field).empty?
      generate1 Utfify.utfify(entry.send(field)), u(entry), method
    end
  end

  def generate1(expected, u, method)
    @io.printf " expect '%s'.u do %s.%s end\n", expected, u, method
  end

  # Source code for the entry's character as a quoted literal followed by .u.
  def u(entry)
    character = [entry.code].pack('U*')
    "'%s'.u" % character
  end
end
177
+
178
# Script entry point: ARGV[0] is read as the special casing file and ARGV[1]
# as the Unicode data file; the generated expectations go to $stdout.
+ Case.new SpecialCasing.new(ARGV[0]), UnicodeData.new(ARGV[1])