u 0.5.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -1,67 +0,0 @@
1
- require 'mkmf'
2
-
3
- def try_compiler_option(opt, &block)
4
- checking_for "#{opt} option to compiler" do
5
- $CFLAGS += " #{opt}" if try_compile '', opt, &block
6
- end
7
- end
8
-
9
- try_compiler_option '-std=c99'
10
- try_compiler_option '-finline-functions'
11
- try_compiler_option '-fno-common'
12
- try_compiler_option '-Wall'
13
- try_compiler_option '-Waggregate-return'
14
- try_compiler_option '-Wcast-align'
15
- try_compiler_option '-Wextra'
16
- try_compiler_option '-Wformat=2'
17
- try_compiler_option '-Winit-self'
18
- try_compiler_option '-Winline'
19
- try_compiler_option '-Wmissing-declarations'
20
- try_compiler_option '-Wmissing-format-attribute'
21
- try_compiler_option '-Wmissing-include-dirs'
22
- try_compiler_option '-Wmissing-noreturn'
23
- try_compiler_option '-Wmissing-prototypes'
24
- try_compiler_option '-Wnested-externs'
25
- try_compiler_option '-Wold-style-definition'
26
- try_compiler_option '-Wpacked'
27
- try_compiler_option '-Wp,-D_FORTIFY_SOURCE=2'
28
- try_compiler_option '-Wpointer-arith'
29
- try_compiler_option '-Wsign-compare'
30
- try_compiler_option '-Wstrict-aliasing=2'
31
- try_compiler_option '-Wswitch-default'
32
- try_compiler_option '-Wswitch-enum'
33
- try_compiler_option '-Wundef'
34
- try_compiler_option '-Wunsafe-loop-optimizations'
35
- try_compiler_option '-Wwrite-strings'
36
-
37
- checking_for 'GNUC visibility attribute' do
38
- $defs.push('-DHAVE_GNUC_VISIBILITY') if try_compile <<EOC, '-Werror'
39
- void f_hidden(void);
40
- void __attribute__((visibility("hidden")))
41
- f_hidden(void)
42
- {
43
- }
44
- int main(void)
45
- {
46
- f_hidden();
47
- return 0;
48
- }
49
- EOC
50
- end
51
-
52
- have_header 'assert.h'
53
- have_header 'limits.h'
54
- have_header 'locale.h'
55
- have_header 'stdbool.h'
56
- have_header 'stddef.h'
57
- have_header 'stdint.h'
58
- have_header 'stdio.h'
59
- have_header 'stdlib.h'
60
- have_header 'string.h'
61
- have_header 'sys/types.h'
62
- have_header 'wchar.h'
63
-
64
- $INSTALLFILES ||= []
65
- $INSTALLFILES << ['unicode.h', '$(RUBYARCHDIR)', 'lib']
66
-
67
- create_makefile 'encoding/character/utf-8/utf8'
@@ -1,51 +0,0 @@
1
- /*
2
- * contents: Private Unicode related information.
3
- *
4
- * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
- */
6
-
7
- #ifndef PRIVATE_H
8
- #define PRIVATE_H
9
-
10
- #define NUL '\0'
11
- #define lengthof(ary) (sizeof(ary) / sizeof((ary)[0]))
12
-
13
- #if defined(HAVE_GNUC_VISIBILITY)
14
- # define HIDDEN \
15
- __attribute__((visibility("hidden")))
16
- #else
17
- # define HIDDEN
18
- #endif
19
-
20
- #if defined(__GNUC__)
21
- # define UNUSED(u) \
22
- u __attribute__((__unused__))
23
- #else
24
- # define UNUSED(u) \
25
- u
26
- #endif
27
-
28
- #define binary_search_middle_of(begin, end) \
29
- (((unsigned)((begin) + (end))) >> 1)
30
-
31
- #define unicode_table_lookup(table, c, index) \
32
- binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
33
-
34
- bool binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index) HIDDEN;
35
-
36
- #define SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part, page, c) \
37
- ((part[page] >= UNICODE_MAX_TABLE_INDEX) \
38
- ? (part[page] - UNICODE_MAX_TABLE_INDEX) \
39
- : (data[part[page]][(c) & 0xff]))
40
-
41
- #define SPLIT_UNICODE_TABLE_LOOKUP(data, part1, part2, c, fallback) \
42
- (((c) <= UNICODE_LAST_CHAR_PART1) \
43
- ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part1, (c) >> 8, c) \
44
- : (((c) >= UNICODE_FIRST_CHAR_PART2 && (c) <= UNICODE_LAST_CHAR) \
45
- ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part2, ((c) - UNICODE_FIRST_CHAR_PART2) >> 8, c) \
46
- : (fallback)))
47
-
48
- unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
49
- NormalizeMode mode) HIDDEN;
50
-
51
- #endif /* PRIVATE_H */
@@ -1,1056 +0,0 @@
1
- /*
2
- * contents: Unicode character properties.
3
- *
4
- * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
- */
6
-
7
- #include <ruby.h>
8
- #include <assert.h>
9
- #include <locale.h>
10
- #include <stdbool.h>
11
- #include <stddef.h>
12
- #include <stdint.h>
13
- #include <string.h>
14
- #include "unicode.h"
15
- #include "private.h"
16
- #include "data/character-tables.h"
17
-
18
-
19
- #define COMBINING_DOT_ABOVE ((unichar)0x0307)
20
- #define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((unichar)0x0130)
21
- #define COMBINING_GREEK_YPOGEGRAMMENI ((unichar)0x0345)
22
- #define GREEK_CAPITAL_LETTER_IOTA ((unichar)0x0399)
23
- #define LATIN_SMALL_LETTER_I ((unichar)0x0069)
24
- #define LATIN_SMALL_LETTER_DOTLESS_I ((unichar)0x0131)
25
- #define LATIN_CAPITAL_LETTER_I_WITH_GRAVE ((unichar)0x00cc)
26
- #define LATIN_CAPITAL_LETTER_I_WITH_ACUTE ((unichar)0x00cd)
27
- #define LATIN_CAPITAL_LETTER_I_WITH_TILDE ((unichar)0x0128)
28
- #define LATIN_CAPITAL_LETTER_I_WITH_OGONEK ((unichar)0x012e)
29
- #define COMBINING_GRAVE_ACCENT ((unichar)0x0300)
30
- #define COMBINING_ACUTE_ACCENT ((unichar)0x0301)
31
- #define COMBINING_TILDE ((unichar)0x0303)
32
- #define GREEK_CAPITAL_LETTER_SIGMA ((unichar)0x03a3)
33
- #define GREEK_SMALL_LETTER_SIGMA ((unichar)0x03c3)
34
- #define GREEK_SMALL_LETTER_FINAL_SIGMA ((unichar)0x03c2)
35
-
36
- #define OFFSET_IF(buf, len) (((buf) != NULL) ? (buf) + (len) : NULL)
37
-
38
- /* {{{1
39
- * Macros for accessing the Unicode character attribute table.
40
- *
41
- * TODO: Turn these macros into full-fledged functions, as this is rather silly
42
- * when we have ‹inline› in C99.
43
- */
44
- #define ATTR_TABLE(page) \
45
- (((page) <= UNICODE_LAST_PAGE_PART1) \
46
- ? attr_table_part1[page] \
47
- : attr_table_part2[(page) - 0xe00])
48
-
49
- #define ATTTABLE(page, char) \
50
- ((ATTR_TABLE(page) == UNICODE_MAX_TABLE_INDEX) \
51
- ? 0 : (attr_data[ATTR_TABLE(page)][char]))
52
-
53
-
54
- /* {{{1
55
- * Internal function used for figuring out the type of a given character.
56
- */
57
- static inline int
58
- s_type(unichar c)
59
- {
60
- const int16_t *table;
61
- unsigned int page;
62
-
63
- if (c <= UNICODE_LAST_CHAR_PART1) {
64
- page = c >> 8;
65
- table = type_table_part1;
66
- } else if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR) {
67
- page = (c - UNICODE_FIRST_CHAR_PART2) >> 8;
68
- table = type_table_part2;
69
- } else {
70
- return UNICODE_UNASSIGNED;
71
- }
72
-
73
- if (table[page] >= UNICODE_MAX_TABLE_INDEX)
74
- return table[page] - UNICODE_MAX_TABLE_INDEX;
75
- else
76
- return type_data[table[page]][c & 0xff];
77
- }
78
-
79
-
80
- /* {{{1
81
- * Bit-fiddling macros for testing the class of a type.
82
- */
83
- #define IS(type, class) (((unsigned int)1 << (type)) & (class))
84
- #define OR(type, rest) (((unsigned int)1 << (type)) | (rest))
85
-
86
-
87
- /* {{{1
88
- * Internal function used to check if the given type represents a digit type.
89
- */
90
- static inline bool
91
- s_isdigit(int type)
92
- {
93
- return IS(type,
94
- OR(UNICODE_DECIMAL_NUMBER,
95
- OR(UNICODE_LETTER_NUMBER,
96
- OR(UNICODE_OTHER_NUMBER, 0))));
97
- }
98
-
99
-
100
- /* {{{1
101
- * Internal function used to check if the given type represents an alphabetic
102
- * type.
103
- */
104
- static inline bool
105
- s_isalpha(int type)
106
- {
107
- return IS(type,
108
- OR(UNICODE_LOWERCASE_LETTER,
109
- OR(UNICODE_UPPERCASE_LETTER,
110
- OR(UNICODE_TITLECASE_LETTER,
111
- OR(UNICODE_MODIFIER_LETTER,
112
- OR(UNICODE_OTHER_LETTER, 0))))));
113
- }
114
-
115
-
116
- /* {{{1
117
- * Internal function used to check if the given type represents a mark type.
118
- */
119
- static inline bool
120
- s_ismark(int type)
121
- {
122
- return IS(type,
123
- OR(UNICODE_NON_SPACING_MARK,
124
- OR(UNICODE_COMBINING_MARK,
125
- OR(UNICODE_ENCLOSING_MARK, 0))));
126
- }
127
-
128
-
129
- /* {{{1
130
- * Determine whether ‘c’ is an alphanumeric, such as A, B, C, 0, 1, or 2.
131
- */
132
- bool
133
- unichar_isalnum(unichar c)
134
- {
135
- int type = s_type(c);
136
-
137
- return s_isdigit(type) || s_isalpha(type);
138
- }
139
-
140
-
141
- /* {{{1
142
- * Determine whether ‘c’ is an alphabetic (i.e. a letter), such as A, B, or C.
143
- */
144
- bool
145
- unichar_isalpha(unichar c)
146
- {
147
- return s_isalpha(s_type(c));
148
- }
149
-
150
-
151
- /* {{{1
152
- * Determine whether ‘c’ is a control character, such as ‹NUL›.
153
- */
154
- bool
155
- unichar_iscntrl(unichar c)
156
- {
157
- return s_type(c) == UNICODE_CONTROL;
158
- }
159
-
160
-
161
- /* {{{1
162
- * Determine whether ‘c’ is a digit, such as 0, 1, or 2.
163
- */
164
- bool
165
- unichar_isdigit(unichar c)
166
- {
167
- return s_type(c) == UNICODE_DECIMAL_NUMBER;
168
- }
169
-
170
-
171
- /* {{{1
172
- * Determine whether ‘c’ is printable and not a space or control character such
173
- * as tab or ‹NUL›, such as A, B, or C.
174
- */
175
- bool
176
- unichar_isgraph(unichar c)
177
- {
178
- return !IS(s_type(c),
179
- OR(UNICODE_CONTROL,
180
- OR(UNICODE_FORMAT,
181
- OR(UNICODE_UNASSIGNED,
182
- OR(UNICODE_PRIVATE_USE,
183
- OR(UNICODE_SURROGATE,
184
- OR(UNICODE_SPACE_SEPARATOR, 0)))))));
185
- }
186
-
187
-
188
- /* {{{1
189
- * Determine whether ‘c’ is a lowercase letter, such as a, b, or c.
190
- */
191
- bool
192
- unichar_islower(unichar c)
193
- {
194
- return s_type(c) == UNICODE_LOWERCASE_LETTER;
195
- }
196
-
197
-
198
- /* {{{1
199
- * Determine whether ‘c’ is printable, which works the same as
200
- * unichar_isgraph(), except that space characters are also printable.
201
- */
202
- bool
203
- unichar_isprint(unichar c)
204
- {
205
- return !IS(s_type(c),
206
- OR(UNICODE_CONTROL,
207
- OR(UNICODE_FORMAT,
208
- OR(UNICODE_UNASSIGNED,
209
- OR(UNICODE_PRIVATE_USE,
210
- OR(UNICODE_SURROGATE, 0))))));
211
- }
212
-
213
-
214
- /* {{{1
215
- * Determine whether ‘c’ is some form of punctuation or other symbol.
216
- */
217
- bool
218
- unichar_ispunct(unichar c)
219
- {
220
- return IS(s_type(c),
221
- OR(UNICODE_CONNECT_PUNCTUATION,
222
- OR(UNICODE_DASH_PUNCTUATION,
223
- OR(UNICODE_OPEN_PUNCTUATION,
224
- OR(UNICODE_CLOSE_PUNCTUATION,
225
- OR(UNICODE_INITIAL_PUNCTUATION,
226
- OR(UNICODE_FINAL_PUNCTUATION,
227
- OR(UNICODE_OTHER_PUNCTUATION,
228
- OR(UNICODE_MODIFIER_SYMBOL,
229
- OR(UNICODE_MATH_SYMBOL,
230
- OR(UNICODE_CURRENCY_SYMBOL,
231
- OR(UNICODE_OTHER_SYMBOL, 0)))))))))))) ? true : false;
232
- }
233
-
234
-
235
- /* {{{1
236
- * Determine whether ‘c’ is some form of whitespace, such as space, tab or a
237
- * line separator (newline, carriage return, etc.).
238
- */
239
- bool
240
- unichar_isspace(unichar c)
241
- {
242
- switch (c) {
243
- case '\t':
244
- case '\n':
245
- case '\r':
246
- case '\f':
247
- return true;
248
- default:
249
- return IS(s_type(c),
250
- OR(UNICODE_SPACE_SEPARATOR,
251
- OR(UNICODE_LINE_SEPARATOR,
252
- OR(UNICODE_PARAGRAPH_SEPARATOR, 0)))) ? true : false;
253
- }
254
- }
255
-
256
-
257
- /* {{{1
258
- * Determine whether ‘c’ is an uppercase letter, such as A, B, or C.
259
- */
260
- bool
261
- unichar_isupper(unichar c)
262
- {
263
- return s_type(c) == UNICODE_UPPERCASE_LETTER;
264
- }
265
-
266
-
267
- /* {{{1
268
- * Determine whether ‘c’ is a titlecase letter, such as the slavic digraph DZ,
269
- * which at the beginning of a word is written as Dz, where only the initial D
270
- * is capitalized. (Complicated huh?)
271
- */
272
- bool
273
- unichar_istitle(unichar c)
274
- {
275
- /* TODO: binary search helpful? */
276
- for (size_t i = 0; i < lengthof(title_table); i++)
277
- if (title_table[i][0] == c)
278
- return true;
279
-
280
- return false;
281
- }
282
-
283
-
284
- /* {{{1
285
- * Determine whether ‘c’ is a new-line.
286
- */
287
- #define UNICHAR_NEXT_LINE ((unichar)0x0085)
288
- #define UNICHAR_LINE_SEPARATOR ((unichar)0x2028)
289
- #define UNICHAR_PARAGRAPH_SEPARATOR ((unichar)0x2029)
290
-
291
- bool
292
- unichar_isnewline(unichar c)
293
- {
294
- switch (c) {
295
- case '\n': case '\f': case '\r': case UNICHAR_NEXT_LINE:
296
- case UNICHAR_LINE_SEPARATOR: case UNICHAR_PARAGRAPH_SEPARATOR:
297
- return true;
298
- default:
299
- return false;
300
- }
301
- }
302
-
303
- /* {{{1
304
- * Determine whether ‘c’ is a hexadecimal digit, such as 0, 1, ..., 9, a, b,
305
- * ..., f, or A, B, ..., F.
306
- */
307
- #define UNICHAR_FULLWIDTH_A 0xff21
308
- #define UNICHAR_FULLWIDTH_F 0xff26
309
- #define UNICHAR_FULLWIDTH_a 0xff41
310
- #define UNICHAR_FULLWIDTH_f 0xff46
311
- bool
312
- unichar_isxdigit(unichar c)
313
- {
314
- return ((c >= 'a' && c <= 'f') ||
315
- (c >= 'A' && c <= 'F') ||
316
- (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f) ||
317
- (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F) ||
318
- (s_type(c) == UNICODE_DECIMAL_NUMBER));
319
- // s_isdigit(s_type(c)));
320
- }
321
-
322
-
323
- /* {{{1
324
- * Determine whether code point ‘c’ has been assigned a code value.
325
- */
326
- bool
327
- unichar_isassigned(unichar c)
328
- {
329
- return s_type(c) != UNICODE_UNASSIGNED;
330
- }
331
-
332
-
333
- /* {{{1
334
- * Determine whether ‘c’ is a wide character, thus is typically rendered in a
335
- * double-width cell on a terminal.
336
- */
337
- bool
338
- unichar_iswide(unichar c)
339
- {
340
- if (c < 0x1100)
341
- return false;
342
-
343
- return (c <= 0x115f || /* Hangul Jamo init. consonants */
344
- c == 0x2329 || c == 0x232a || /* angle brackets */
345
- (c >= 0x2e80 && c <= 0xa4cf && /* CJK ... Yi */
346
- (c < 0x302a || c > 0x302f) &&
347
- c != 0x303f && c != 0x3099 && c != 0x309a) ||
348
- (c >= 0xac00 && c <= 0xd7a3) || /* Hangul syllables */
349
- (c >= 0xf900 && c <= 0xfaff) || /* CJK comp. graphs */
350
- (c >= 0xfe30 && c <= 0xfe6f) || /* CJK comp. forms */
351
- (c >= 0xff00 && c <= 0xff60) || /* fullwidth forms */
352
- (c >= 0xffe0 && c <= 0xffe6) || /* -"- */
353
- (c >= 0x20000 && c <= 0x2fffd) || /* CJK extra stuff */
354
- (c >= 0x30000 && c <= 0x3fffd)); /* -"- */
355
- }
356
-
357
-
358
- /* {{{1
359
- * Convert ‘c’ to its uppercase representation (if any).
360
- */
361
- static unichar
362
- special_case_table_lookup(unichar c)
363
- {
364
- unichar tv = ATTTABLE(c >> 8, c & 0xff);
365
-
366
- if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
367
- tv = utf_char(special_case_table +
368
- tv - UNICODE_SPECIAL_CASE_TABLE_START);
369
-
370
- if (tv == '\0')
371
- return c;
372
-
373
- return tv;
374
- }
375
-
376
- static unichar
377
- titlecase_table_lookup(unichar c, bool want_upper)
378
- {
379
- for (size_t i = 0; i < lengthof(title_table); i++)
380
- if (title_table[i][0] == c)
381
- return title_table[i][want_upper ? 1 : 2];
382
-
383
- return c;
384
- }
385
-
386
- unichar
387
- unichar_toupper(unichar c)
388
- {
389
- int type = s_type(c);
390
-
391
- if (type == UNICODE_LOWERCASE_LETTER)
392
- return special_case_table_lookup(c);
393
-
394
- if (type == UNICODE_TITLECASE_LETTER)
395
- return titlecase_table_lookup(c, true);
396
-
397
- return c;
398
- }
399
-
400
-
401
- /* {{{1
402
- * Convert ‘c’ to its lowercase representation (if any).
403
- */
404
- unichar
405
- unichar_tolower(unichar c)
406
- {
407
- int type = s_type(c);
408
-
409
- if (type == UNICODE_UPPERCASE_LETTER)
410
- return special_case_table_lookup(c);
411
-
412
- if (type == UNICODE_TITLECASE_LETTER)
413
- return titlecase_table_lookup(c, false);
414
-
415
- return c;
416
- }
417
-
418
-
419
- /* {{{1
420
- * Convert ‘c’ to its titlecase representation (if any).
421
- */
422
- unichar
423
- unichar_totitle(unichar c)
424
- {
425
- for (size_t i = 0; i < lengthof(title_table); i++)
426
- if (title_table[i][0] == c ||
427
- title_table[i][1] == c ||
428
- title_table[i][2] == c)
429
- return title_table[i][0];
430
-
431
- if (s_type(c) == UNICODE_LOWERCASE_LETTER)
432
- return unichar_toupper(c);
433
-
434
- return c;
435
- }
436
-
437
-
438
- /* {{{1
439
- * Return the numeric value of ‘c’ if it's a decimal digit, or -1 if not.
440
- */
441
- int
442
- unichar_digit_value(unichar c)
443
- {
444
- if (s_type(c) == UNICODE_DECIMAL_NUMBER)
445
- return ATTTABLE(c >> 8, c & 0xff);
446
-
447
- return -1;
448
- }
449
-
450
-
451
- /* {{{1
452
- * Return the numeric value of ‘c’ if it's a hexadecimal digit, or -1 if not.
453
- */
454
- int
455
- unichar_xdigit_value(unichar c)
456
- {
457
- if (c >= 'a' && c <= 'f')
458
- return c - 'a' + 10;
459
- else if (c >= 'A' && c <= 'F')
460
- return c - 'A' + 10;
461
- else if (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f)
462
- return c - UNICHAR_FULLWIDTH_a + 10;
463
- else if (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F)
464
- return c - UNICHAR_FULLWIDTH_A + 10;
465
- else
466
- return unichar_digit_value(c);
467
- }
468
-
469
-
470
- /* {{{1
471
- * Determine the Unicode character type of ‘c’.
472
- */
473
- UnicodeType
474
- unichar_type(unichar c)
475
- {
476
- return s_type(c);
477
- }
478
-
479
-
480
/* {{{1
 * LocaleType: This ‹enum› is used for dealing with different locales for
 * turning strings into uppercase or lowercase.
 */
typedef enum {
	LOCALE_NORMAL,
	LOCALE_TURKIC,
	LOCALE_LITHUANIAN
} LocaleType;


/* {{{1
 * Retrieve the locale type from the environment (LC_CTYPE).
 *
 * Only the first two characters of the locale name are inspected: “az” and
 * “tr” select LOCALE_TURKIC, “lt” selects LOCALE_LITHUANIAN, and everything
 * else is LOCALE_NORMAL.
 */
static LocaleType
get_locale_type(void)
{
	const char *locale = setlocale(LC_CTYPE, NULL);

	/* setlocale() reports failure with a null pointer; treat that like
	 * an unremarkable locale instead of dereferencing it. */
	if (locale == NULL)
		return LOCALE_NORMAL;

	/* The second character is only read once the first one matched, so
	 * an empty or one-character name is never read past its NUL. */
	if ((locale[0] == 'a' && locale[1] == 'z') ||
	    (locale[0] == 't' && locale[1] == 'r'))
		return LOCALE_TURKIC;

	if (locale[0] == 'l' && locale[1] == 't')
		return LOCALE_LITHUANIAN;

	return LOCALE_NORMAL;
}
508
-
509
-
510
- /* {{{1
511
- * Put character marks found in ‘p_inout’ into itself. If ‘remove_dot’ is
512
- * true, remove the dot over an uppercase I for a turkish locale.
513
- */
514
- static size_t
515
- output_marks(const char **p_inout, char *buf, bool remove_dot)
516
- {
517
- size_t len = 0;
518
- const char *p = *p_inout;
519
-
520
- for ( ; *p != '\0'; p = utf_next(p)) {
521
- unichar c = utf_char(p);
522
-
523
- if (!s_ismark(s_type(c)))
524
- break;
525
-
526
- if (!remove_dot || c != COMBINING_DOT_ABOVE)
527
- len += unichar_to_utf(c, (buf != NULL) ? buf + len : NULL);
528
- }
529
-
530
- *p_inout = p;
531
-
532
- return len;
533
- }
534
-
535
- /* {{{1
536
- * Output titlecases where appropriate.
537
- */
538
- static size_t
539
- output_special_case(char *buf, int offset, int type, bool upper)
540
- {
541
- const char *p = special_case_table + offset;
542
-
543
- if (type != UNICODE_TITLECASE_LETTER)
544
- p = utf_next(p);
545
-
546
- if (upper)
547
- p += utf_byte_length(p) + 1;
548
-
549
- size_t len = utf_byte_length(p);
550
-
551
- if (buf != NULL)
552
- memcpy(buf, p, len);
553
-
554
- return len;
555
- }
556
-
557
- /* {{{1
558
- * Do uppercasing of ‘p’ for Lithuanian locales.
559
- */
560
- static size_t
561
- remove_all_combining_dot_above(unichar c, char *buf)
562
- {
563
- size_t decomp_len;
564
- unichar *decomp = unicode_canonical_decomposition(c, &decomp_len);
565
-
566
- size_t len = 0;
567
- for (size_t i = 0; i < decomp_len; i++)
568
- if (decomp[i] != COMBINING_DOT_ABOVE)
569
- len += unichar_to_utf(unichar_toupper(decomp[i]),
570
- OFFSET_IF(buf, len));
571
-
572
- free(decomp);
573
-
574
- return len;
575
- }
576
-
577
- static size_t
578
- real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
579
- bool *was_i)
580
- {
581
- if (c == 'i') {
582
- *was_i = true;
583
- return 0;
584
- }
585
-
586
- if (*was_i) {
587
- size_t len = remove_all_combining_dot_above(c, buf);
588
- return len + output_marks(p, OFFSET_IF(buf, len), true);
589
- }
590
-
591
- if (!s_ismark(type))
592
- *was_i = false;
593
-
594
- return 0;
595
- }
596
-
597
- /* {{{1
598
- * Do real upcasing. */
599
- static inline size_t
600
- real_do_toupper(unichar c, int type, char *buf)
601
- {
602
- bool upper = (type != UNICODE_LOWERCASE_LETTER);
603
- unichar tv = ATTTABLE(c >> 8, c & 0xff);
604
-
605
- if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
606
- return output_special_case(buf,
607
- tv - UNICODE_SPECIAL_CASE_TABLE_START,
608
- type, upper);
609
-
610
- /* TODO: this should really use titlecase_table_lookup somehow. */
611
- if (type == UNICODE_TITLECASE_LETTER)
612
- for (size_t i = 0; i < lengthof(title_table); i++)
613
- if (title_table[i][0] == c)
614
- return unichar_to_utf(title_table[i][1], buf);
615
-
616
- return unichar_to_utf(tv != '\0' ? tv : c, buf);
617
- }
618
-
619
- /* {{{1
620
- * Do real uppercasing of ‘str’.
621
- */
622
- static size_t
623
- real_toupper_one(const char **p, const char *prev, char *buf,
624
- LocaleType locale_type, bool *was_i)
625
- {
626
- unichar c = utf_char(prev);
627
- int type = s_type(c);
628
-
629
- if (locale_type == LOCALE_LITHUANIAN) {
630
- size_t len = real_toupper_lithuanian(p, c, type, buf, was_i);
631
- if (len > 0)
632
- return len;
633
- }
634
-
635
- if (locale_type == LOCALE_TURKIC && c == 'i')
636
- return unichar_to_utf(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
637
- buf);
638
-
639
- if (c == COMBINING_GREEK_YPOGEGRAMMENI) {
640
- /* Nasty, need to move it after other combining marks...this
641
- * would go away if we normalized first. */
642
- /* TODO: don’t we need to make sure we don’t go beyond the end
643
- * of ‘p’? */
644
- size_t len = output_marks(p, buf, false);
645
- return len + unichar_to_utf(GREEK_CAPITAL_LETTER_IOTA,
646
- OFFSET_IF(buf, len));
647
- }
648
-
649
- if (IS(type, OR(UNICODE_LOWERCASE_LETTER,
650
- OR(UNICODE_TITLECASE_LETTER, 0))))
651
- return real_do_toupper(c, type, buf);
652
-
653
- size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
654
-
655
- if (buf != NULL)
656
- memcpy(buf, prev, len);
657
-
658
- return len;
659
- }
660
-
661
- static size_t
662
- real_toupper(const char *str, size_t max, bool use_max, char *buf,
663
- LocaleType locale_type)
664
- {
665
- const char *p = str;
666
- size_t len = 0;
667
- bool p_was_i = false;
668
-
669
- while ((!use_max || p < str + max) && *p != '\0') {
670
- const char *prev = p;
671
- p = utf_next(p);
672
-
673
- len += real_toupper_one(&p, prev, OFFSET_IF(buf, len),
674
- locale_type, &p_was_i);
675
- }
676
-
677
- return len;
678
- }
679
-
680
- /* {{{1
681
- * Wrapper around real_toupper() for dealing with memory allocation and such.
682
- */
683
- static char *
684
- utf_upcase_impl(const char *str, size_t max, bool use_max)
685
- {
686
- assert(str != NULL);
687
-
688
- LocaleType locale_type = get_locale_type();
689
-
690
- size_t len = real_toupper(str, max, use_max, NULL, locale_type);
691
- char *result = ALLOC_N(char, len + 1);
692
- real_toupper(str, max, use_max, result, locale_type);
693
- result[len] = '\0';
694
-
695
- return result;
696
- }
697
-
698
-
699
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character has been
 * replaced by its uppercase representation where one applies.  The whole
 * NUL-terminated string is processed.
 */
char *
utf_upcase(const char *str)
{
	char *upcased = utf_upcase_impl(str, 0, false);

	return upcased;
}
708
-
709
-
710
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character has been
 * replaced by its uppercase representation where one applies.  At most ‘len’
 * bytes of ‘str’ are processed.
 */
char *
utf_upcase_n(const char *str, size_t len)
{
	char *upcased = utf_upcase_impl(str, len, true);

	return upcased;
}
720
-
721
-
722
/* {{{1
 * Scan ‘str’ for a character of combining class 230 that occurs before the
 * next character of combining class 0.
 *
 * Returns true as soon as a class-230 character is met, and false when a
 * class-0 character or the end of the string is reached first.
 */
static bool
has_more_above(const char *str)
{
	const char *p = str;

	while (*p != '\0') {
		switch (unichar_combining_class(utf_char(p))) {
		case 230:
			return true;
		case 0:
			return false;
		default:
			break;
		}

		p = utf_next(p);
	}

	return false;
}
741
-
742
- static inline size_t
743
- real_do_tolower(unichar c, int type, char *buf)
744
- {
745
- unichar tv = ATTTABLE(c >> 8, c & 0xff);
746
-
747
- if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
748
- return output_special_case(buf,
749
- tv - UNICODE_SPECIAL_CASE_TABLE_START,
750
- type, false);
751
-
752
- /* TODO: this should really use titlecase_table_lookup somehow. */
753
- if (type == UNICODE_TITLECASE_LETTER)
754
- for (size_t i = 0; i < lengthof(title_table); i++)
755
- if (title_table[i][0] == c)
756
- return unichar_to_utf(title_table[i][2], buf);
757
-
758
- return unichar_to_utf(tv != '\0' ? tv : c, buf);
759
- }
760
-
761
- /* {{{1
762
- * The real implementation of downcase.
763
- */
764
- static size_t
765
- tolower_turkic_i(const char **p, char *buf)
766
- {
767
- unichar i = LATIN_SMALL_LETTER_DOTLESS_I;
768
-
769
- if (utf_char(*p) == COMBINING_DOT_ABOVE) {
770
- /* TODO: don’t we need to make sure we don’t go beyond the end
771
- * of ‘p’? */
772
- *p = utf_next(*p);
773
- i = LATIN_SMALL_LETTER_I;
774
- }
775
-
776
- return unichar_to_utf(i, buf);
777
- }
778
-
779
- static size_t
780
- tolower_lithuianian_i(char *buf, unichar base, unichar combiner)
781
- {
782
- size_t len = unichar_to_utf(base, buf);
783
- len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
784
- if (combiner != '\0')
785
- len += unichar_to_utf(combiner, OFFSET_IF(buf, len));
786
-
787
- return len;
788
- }
789
-
790
- static size_t
791
- tolower_sigma(const char **p, char *buf, const char *end, bool use_end)
792
- {
793
- unichar sigma = GREEK_SMALL_LETTER_FINAL_SIGMA;
794
-
795
- /* SIGMA maps differently depending on whether it is final or not. The
796
- * following simplified test would fail in the case of combining marks
797
- * following the sigma, but I don't think that occurs in real text.
798
- * The test here matches that in ICU. */
799
- if ((!use_end || *p < end) && **p != '\0' && s_isalpha(s_type(utf_char(*p))))
800
- sigma = GREEK_SMALL_LETTER_SIGMA;
801
-
802
- return unichar_to_utf(sigma, buf);
803
- }
804
-
805
- static size_t
806
- real_tolower_one(const char **p, const char *prev, char *buf,
807
- LocaleType locale_type, const char *end, bool use_end)
808
- {
809
- unichar c = utf_char(prev);
810
- int type = s_type(c);
811
-
812
- if (locale_type == LOCALE_TURKIC && c == 'I')
813
- return tolower_turkic_i(p, buf);
814
-
815
- /* Introduce an explicit dot above the lowercasing capital I’s
816
- * and J’s whenever there are more accents above.
817
- * [SpecialCasing.txt] */
818
- if (locale_type == LOCALE_LITHUANIAN) {
819
- unichar base = LATIN_SMALL_LETTER_I;
820
- unichar combiner = '\0';
821
-
822
- switch (c) {
823
- case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
824
- combiner = COMBINING_GRAVE_ACCENT;
825
- break;
826
- case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
827
- combiner = COMBINING_ACUTE_ACCENT;
828
- break;
829
- case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
830
- combiner = COMBINING_TILDE;
831
- break;
832
- case 'I':
833
- case 'J':
834
- case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
835
- if (!has_more_above(*p))
836
- goto no_lithuanian_i_casing;
837
-
838
- base = unichar_tolower(c);
839
- break;
840
- default:
841
- goto no_lithuanian_i_casing;
842
- }
843
-
844
- return tolower_lithuianian_i(buf, base, combiner);
845
- }
846
-
847
- no_lithuanian_i_casing:
848
-
849
- if (c == GREEK_CAPITAL_LETTER_SIGMA)
850
- return tolower_sigma(p, buf, end, use_end);
851
-
852
- if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
853
- OR(UNICODE_TITLECASE_LETTER, 0))))
854
- return real_do_tolower(c, type, buf);
855
-
856
- size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
857
-
858
- if (buf != NULL)
859
- memcpy(buf, prev, len);
860
-
861
- return len;
862
- }
863
-
864
- static size_t
865
- real_tolower(const char *str, size_t max, bool use_max, char *buf,
866
- LocaleType locale_type)
867
- {
868
- const char *p = str;
869
- const char *end = str + max;
870
- size_t len = 0;
871
-
872
- while ((!use_max || p < end) && *p != '\0') {
873
- const char *prev = p;
874
- p = utf_next(p);
875
-
876
- len += real_tolower_one(&p, prev, OFFSET_IF(buf, len),
877
- locale_type, end, use_max);
878
- }
879
-
880
- return len;
881
- }
882
-
883
-
884
- /* {{{1 */
885
- static char *
886
- utf_downcase_impl(const char *str, size_t max, bool use_max)
887
- {
888
- assert(str != NULL);
889
-
890
- LocaleType locale_type = get_locale_type();
891
-
892
- size_t len = real_tolower(str, max, use_max, NULL, locale_type);
893
- char *result = ALLOC_N(char, len + 1);
894
- real_tolower(str, max, use_max, result, locale_type);
895
- result[len] = '\0';
896
-
897
- return result;
898
- }
899
-
900
-
901
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character has been
 * replaced by its lowercase representation where one applies.  The whole
 * NUL-terminated string is processed.
 */
char *
utf_downcase(const char *str)
{
	char *downcased = utf_downcase_impl(str, 0, false);

	return downcased;
}
910
-
911
-
912
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character has been
 * replaced by its lowercase representation where one applies.  At most ‘len’
 * bytes of ‘str’ are processed.
 */
char *
utf_downcase_n(const char *str, size_t len)
{
	char *downcased = utf_downcase_impl(str, len, true);

	return downcased;
}
922
-
923
-
924
- /* {{{1
925
- * The real implementation of case folding below.
926
- */
927
-
928
- static bool
929
- casefold_table_lookup(unichar c, char *folded, size_t *len)
930
- {
931
- int index;
932
-
933
- if (!unicode_table_lookup(casefold_table, c, &index))
934
- return false;
935
-
936
- char const *folded_c = casefold_table[index].data;
937
-
938
- if (folded != NULL)
939
- strcpy(folded, folded_c);
940
-
941
- *len += utf_byte_length(folded_c);
942
-
943
- return true;
944
- }
945
-
946
- static char *
947
- utf_foldcase_impl(const char *str, size_t max, bool use_max)
948
- {
949
- assert(str != NULL);
950
-
951
- char *folded = NULL;
952
- size_t len = 0;
953
-
954
- again:
955
- for (const char *p = str; (!use_max || p < str + max) && *p != '\0'; p = utf_next(p)) {
956
- unichar c = utf_char(p);
957
-
958
- if (casefold_table_lookup(c, OFFSET_IF(folded, len), &len))
959
- continue;
960
-
961
- len += unichar_to_utf(unichar_tolower(c), OFFSET_IF(folded, len));
962
- }
963
-
964
- if (folded == NULL) {
965
- folded = ALLOC_N(char, len + 1);
966
- folded[0] = NUL;
967
- len = 0;
968
- goto again;
969
- }
970
-
971
- folded[len] = '\0';
972
-
973
- return folded;
974
- }
975
-
976
-
977
/* {{{1
 * Return a freshly allocated, case-independent form of ‘str’.  The whole
 * NUL-terminated string is processed.
 */
char *
utf_foldcase(const char *str)
{
	char *folded = utf_foldcase_impl(str, 0, false);

	return folded;
}
986
-
987
-
988
/* {{{1
 * Return a freshly allocated, case-independent form of ‘str’.  At most
 * ‘len’ bytes of the string are processed.
 */
char *
utf_foldcase_n(const char *str, size_t len)
{
	char *folded = utf_foldcase_impl(str, len, true);

	return folded;
}
998
-
999
-
1000
- /* {{{1
1001
- * The real implementation of utf_width() and utf_width_n() below.
1002
- */
1003
- static size_t
1004
- utf_width_impl(const char *str, size_t len, bool use_len)
1005
- {
1006
- assert(str != NULL);
1007
-
1008
- size_t width = 0;
1009
-
1010
- for (const char *p = str; (!use_len || p < str + len) && *p != NUL; p = utf_next(p))
1011
- width += unichar_iswide(utf_char(p)) ? 2 : 1;
1012
-
1013
- return width;
1014
- }
1015
-
1016
-
1017
/* {{{1
 * Width of the NUL-terminated string ‘str’, measured in cells.
 */
size_t
utf_width(const char *str)
{
	size_t cells = utf_width_impl(str, 0, false);

	return cells;
}
1025
-
1026
-
1027
/* {{{1
 * Width, measured in cells, of the first ‘len’ bytes of ‘str’ (or of the
 * whole string if it ends sooner).
 */
size_t
utf_width_n(const char *str, size_t len)
{
	size_t cells = utf_width_impl(str, len, true);

	return cells;
}
1035
-
1036
-
1037
- /* {{{1
1038
- * Retrieve the mirrored representation of ‘c’ (if any) and store it in
1039
- * ‘mirrored’.
1040
- */
1041
- bool
1042
- unichar_mirror(unichar c, unichar *mirrored)
1043
- {
1044
- int index;
1045
-
1046
- if (!unicode_table_lookup(bidi_mirroring_table, c, &index))
1047
- return false;
1048
-
1049
- if (mirrored != NULL)
1050
- *mirrored = bidi_mirroring_table[index].mirrored_ch;
1051
-
1052
- return true;
1053
- }
1054
-
1055
-
1056
- /* }}}1 */