u 0.5.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -1,1334 +0,0 @@
1
- /*
2
- * contents: UTF-8 string operations.
3
- *
4
- * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
- */
6
-
7
-
8
- #include <ruby.h>
9
- #include <assert.h>
10
- #include <locale.h>
11
- #include <stdbool.h>
12
- #include <stddef.h>
13
- #include <stdint.h>
14
- #include <stdlib.h>
15
- #include <string.h>
16
- #include <wchar.h>
17
-
18
- #include "unicode.h"
19
- #include "private.h"
20
-
21
-
22
/*
 * Determine whether the code point ‘c’ is a valid Unicode character: it must
 * be below 0x110000, must not be a surrogate (0xd800 through 0xdfff), must
 * not lie in the range 0xfdd0 through 0xfdef, and its low 16 bits must not be
 * 0xfffe or 0xffff.
 *
 * The argument is evaluated several times, so it must not have side effects.
 */
#define UNICODE_ISVALID(c) \
	((c) < 0x110000 && \
	 (((c) & 0xfffff800) != 0xd800) && \
	 ((c) < 0xfdd0 || (c) > 0xfdef) && \
	 (((c) & 0xfffe) != 0xfffe))
27
-
28
-
29
/* {{{1
 * Constants used for the bit-twiddling necessary when dealing with UTF-8
 * character sequences.
 *
 * BIT_n is the number of payload bits in the lead byte of an n-byte sequence
 * and BIT_X the number of payload bits in a continuation byte.  OCT_n and
 * OCT_X are the marker bits that sit above that payload.  UNI_LENn is the
 * first code point that no longer fits in an n-byte sequence.
 */
enum {
	BIT_1 = 7,
	BIT_X = 6,
	BIT_2 = 5,
	BIT_3 = 4,
	BIT_4 = 3,
	BIT_5 = 2,
	BIT_6 = 1,

	OCT_1 = 0xff & (0xff << (BIT_1 + 1)),	/* 0000 0000 */
	OCT_X = 0xff & (0xff << (BIT_X + 1)),	/* 1000 0000 */
	OCT_2 = 0xff & (0xff << (BIT_2 + 1)),	/* 1100 0000 */
	OCT_3 = 0xff & (0xff << (BIT_3 + 1)),	/* 1110 0000 */
	OCT_4 = 0xff & (0xff << (BIT_4 + 1)),	/* 1111 0000 */
	OCT_5 = 0xff & (0xff << (BIT_5 + 1)),	/* 1111 1000 */
	OCT_6 = 0xff & (0xff << (BIT_6 + 1)),	/* 1111 1100 */

	UNI_LEN1 = 1 << 7,	/* 0x80 */
	UNI_LEN2 = 1 << 11,	/* 0x800 */
	UNI_LEN3 = 1 << 16,	/* 0x10000 */
	UNI_LEN4 = 1 << 21,	/* 0x200000 */
	UNI_LEN5 = 1 << 26,	/* 0x4000000 */

	MASK_X = 0xff >> (8 - BIT_X),	/* 0011 1111 */
	TEST_X = 0xff & ~MASK_X,	/* 1100 0000 */
};
59
-
60
/* {{{1
 * Determine whether the byte ‘p’ is a continuation byte of a UTF-8 multi-byte
 * sequence, that is, whether its bits selected by TEST_X equal OCT_X.
 */
#define CONT_X(p) ((((unsigned char)(p)) & TEST_X) == OCT_X)

/* {{{1
 * Add the payload bits of the continuation byte ‘p’ to ‘c’, which is first
 * shifted left by BIT_X to make room for the additional bits.
 */
#define ADD_X(c, p) (((c) << BIT_X) | (((unsigned char)(p)) & MASK_X))

/* {{{1
 * Store the low BIT_X bits of ‘c’ in ‘p’ as a continuation byte.  The macro
 * evaluates to ‘c’ with those bits shifted off; ‘c’ itself is not modified,
 * so the caller has to assign the result back.
 */
#define PUT_X(c, p) ((p) = OCT_X | ((c) & MASK_X), (c) >> BIT_X)
75
-
76
-
77
/* {{{1
 * s_utf_skip_lengths: Table giving, for every possible first byte, the
 * length in bytes of the UTF-8 character sequence that it introduces.  Bytes
 * that cannot start a sequence (0x80 through 0xbf, 0xfe and 0xff) map to 1.
 */
static const uint8_t s_utf_skip_length_data[256] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* The same table, exposed to the rest of the library as plain chars. */
const char * const s_utf_skip_lengths = (const char *)s_utf_skip_length_data;
94
-
95
-
96
-
97
/* {{{1
 * Private function used to calculate the length and mask to use when dealing
 * with a given UTF-8 character sequence.
 *
 * ‘c’ is the first byte of the sequence.  On return ‘*len’ holds the number
 * of bytes in the sequence (1 through 6) and ‘*mask’ selects the payload bits
 * of ‘c’.  If ‘c’ cannot start a sequence, ‘*len’ is set to -1 and ‘*mask’
 * to 0, so that both outputs are always written.
 */
static inline void
_utf_compute(unsigned char c, int *mask, int *len)
{
	if (c < 0x80) {
		*len = 1;
		*mask = 0x7f;
	} else if ((c & 0xe0) == 0xc0) {
		*len = 2;
		*mask = 0x1f;
	} else if ((c & 0xf0) == 0xe0) {
		*len = 3;
		*mask = 0x0f;
	} else if ((c & 0xf8) == 0xf0) {
		*len = 4;
		*mask = 0x07;
	} else if ((c & 0xfc) == 0xf8) {
		*len = 5;
		*mask = 0x03;
	} else if ((c & 0xfe) == 0xfc) {
		*len = 6;
		*mask = 0x01;
	} else {
		/* Continuation byte (10xxxxxx), 0xfe or 0xff. */
		*len = -1;
		*mask = 0;
	}
}
126
-
127
- /* {{{1
128
- * Private function used to figure out the length of the UTF-8 representation
129
- * of a given Unicode character (UTF-32).
130
- */
131
- static inline unsigned short
132
- _utf_length(const unichar c)
133
- {
134
- if (c < UNI_LEN1)
135
- return 1;
136
- else if (c < UNI_LEN2)
137
- return 2;
138
- else if (c < UNI_LEN3)
139
- return 3;
140
- else if (c < UNI_LEN4)
141
- return 4;
142
- else if (c < UNI_LEN5)
143
- return 5;
144
- else
145
- return 6;
146
- }
147
-
148
- /* {{{1
149
- * Private function used to retrieve a UTF-32 character from an UTF-8 character
150
- * sequence given a mask and length previously retrieved with _utf_compute().
151
- */
152
- static inline unichar
153
- _utf_get(const char *str, int mask, int len)
154
- {
155
- unichar c = (unsigned char)str[0] & mask;
156
-
157
- for (int i = 1; i < len; i++) {
158
- unsigned char ch = ((const unsigned char *)str)[i];
159
-
160
- if (CONT_X(ch)) {
161
- c = ADD_X(c, ch);
162
- } else {
163
- c = UTF_BAD_INPUT_UNICHAR;
164
- break;
165
- }
166
- }
167
-
168
- return c;
169
- }
170
-
171
-
172
- /* {{{1
173
- * Retrieve a UTF-32 character from a UTF-8 character sequence.
174
- */
175
- unichar
176
- utf_char(const char *str)
177
- {
178
- int mask;
179
- int len;
180
-
181
- _utf_compute(*str, &mask, &len);
182
-
183
- return (len > -1) ? _utf_get(str, mask, len) : UTF_BAD_INPUT_UNICHAR;
184
- }
185
-
186
-
187
- /* {{{1
188
- * TODO
189
- */
190
- unichar
191
- utf_char_n(const char *str, size_t max)
192
- {
193
- if (max == 0)
194
- return UTF_INCOMPLETE_INPUT_UNICHAR;
195
-
196
- size_t len;
197
- unichar c = (unsigned char)*str;
198
-
199
- /* TODO: _utf_compute() here */
200
- if (c < 0x80) {
201
- return c;
202
- } else if (c < 0xc0) {
203
- return UTF_BAD_INPUT_UNICHAR;
204
- } else if (c < 0xe0) {
205
- len = 2;
206
- c &= 0x1f;
207
- } else if (c < 0xf0) {
208
- len = 3;
209
- c &= 0x0f;
210
- } else if (c < 0xf8) {
211
- len = 4;
212
- c &= 0x07;
213
- } else if (c < 0xfc) {
214
- len = 5;
215
- c &= 0x03;
216
- } else if (c < 0xfe) {
217
- len = 6;
218
- c &= 0x01;
219
- } else {
220
- return UTF_BAD_INPUT_UNICHAR;
221
- }
222
-
223
- if (len > max) {
224
- for (size_t i = 1; i < max; i++) {
225
- if (!CONT_X(str[i]))
226
- return UTF_BAD_INPUT_UNICHAR;
227
- }
228
-
229
- return UTF_INCOMPLETE_INPUT_UNICHAR;
230
- }
231
-
232
- for (size_t i = 1; i < len; i++) {
233
- unsigned char ch = ((const unsigned char *)str)[i];
234
-
235
- if (!CONT_X(ch))
236
- return (ch != NUL) ? UTF_BAD_INPUT_UNICHAR : UTF_INCOMPLETE_INPUT_UNICHAR;
237
-
238
- c = ADD_X(c, ch);
239
- }
240
-
241
- return (_utf_length(c) == len) ? c : UTF_BAD_INPUT_UNICHAR;
242
- }
243
-
244
-
245
- /* {{{1
246
- * Retrieve a UTF-32 character from a UTF-8 character sequence. This function
247
- * does additional checking while converitng, such as not overruning a maximum
248
- * length and checks for incomplete, invalid or out-of-range characters.
249
- */
250
- unichar
251
- utf_char_validated(const char *str)
252
- {
253
- unichar result = utf_char(str);
254
-
255
- if (result & 0x80000000) {
256
- return result;
257
- } else if (!unichar_isvalid(result)) {
258
- return UTF_BAD_INPUT_UNICHAR;
259
- } else {
260
- return result;
261
- }
262
- }
263
-
264
-
265
- /* {{{1 */
266
- unichar
267
- utf_char_validated_n(const char *str, size_t max)
268
- {
269
- unichar result = utf_char_n(str, max);
270
-
271
- if (result & 0x80000000) {
272
- return result;
273
- } else if (!unichar_isvalid(result)) {
274
- return UTF_BAD_INPUT_UNICHAR;
275
- } else {
276
- return result;
277
- }
278
- }
279
-
280
-
281
- /* {{{1
282
- * Return a pointer to the next UTF-8 character sequence in ‘str’. This
283
- * requires that it is at the start of the previous one already and no
284
- * additional error checking is done.
285
- */
286
- /*
287
- inline char *
288
- utf_next(const char *str)
289
- {
290
- return (char *)str + s_utf_skip_lengths[*(const uchar *)str];
291
- }
292
- */
293
-
294
-
295
- /* {{{1
296
- * Synchronize and go to the next UTF-8 character sequence in ‘p’. This search
297
- * will not go beyond ‘end’. ‹NULL› is returned if it couldn't be found.
298
- */
299
- char *
300
- utf_find_next(const char *p, const char *end)
301
- {
302
- if (*p != NUL) {
303
- if (end != NULL) {
304
- for (p++; p < end && CONT_X(*p); p++) {
305
- /* this loop intentionally left empty */
306
- }
307
- } else {
308
- for (p++; CONT_X(*p); p++) {
309
- /* this loop intentionally left empty */
310
- }
311
- }
312
- }
313
- return (p == end) ? NULL : (char *)p;
314
- }
315
-
316
-
317
/* {{{1
 * Return a pointer to the UTF-8 character sequence that precedes ‘p’, found
 * by stepping backwards until a byte that isn't a continuation byte turns
 * up.  No lower bound is checked, so such a byte must exist before ‘p’.
 */
char *
utf_prev(const char *p)
{
	do {
		p--;
	} while (CONT_X(*p));

	return (char *)p;
}
330
-
331
-
332
/* {{{1
 * Synchronize and go to the previous UTF-8 character sequence before ‘p’.
 * This search will not go beyond ‘begin’.  ‹NULL› is returned if no byte
 * that starts a sequence could be found in that range.
 */
char *
utf_find_prev(const char *begin, const char *p)
{
	/*
	 * Test before stepping back, so that a pointer to before ‘begin’ is
	 * never formed.
	 */
	while (p > begin) {
		p--;

		if (!CONT_X(*p))
			return (char *)p;
	}

	return NULL;
}
347
-
348
-
349
/* {{{1
 * Convert an offset counted in characters into a pointer within ‘str’.  A
 * positive ‘offset’ walks forward with utf_next(); a negative one walks
 * backwards from ‘str’.
 */
char *
utf_offset_to_pointer(const char *str, long offset)
{
	const char *p = str;

	if (offset > 0) {
		for (long i = 0; i < offset; i++)
			p = utf_next(p);

		return (char *)p;
	}

	/*
	 * Jump back one byte per remaining character, settle on the start of
	 * a sequence, and then account for the characters actually passed.
	 */
	while (offset != 0) {
		const char *base = p;

		for (p += offset; CONT_X(*p); p--) {
			/* this loop intentionally left empty */
		}

		offset += utf_pointer_to_offset(p, base);
	}

	return (char *)p;
}
374
-
375
-
376
/* {{{1
 * Convert the pointer ‘pos’ into an offset, counted in characters, relative
 * to ‘str’.  The offset is negative if ‘pos’ comes before ‘str’.
 */
long
utf_pointer_to_offset(const char *str, const char *pos)
{
	if (pos < str)
		return -utf_pointer_to_offset(pos, str);

	long n_chars = 0;
	const char *p = str;

	while (p < pos) {
		p = utf_next(p);
		n_chars++;
	}

	return n_chars;
}
391
-
392
-
393
/* {{{1
 * Copy the contents of the UTF-8 string ‘src’, including its terminating
 * NUL byte, to ‘dest’, which must be large enough to hold it.
 */
void
utf_copy(char *dest, const char *src)
{
	memcpy(dest, src, strlen(src) + 1);
}
401
-
402
-
403
- /* {{{1
404
- * Copy at most n Unicode characters from an UTF-8 string to another. The
405
- * destination string will be ‹NUL›-terminated properly.
406
- */
407
- void
408
- utf_copy_n(char *dest, const char *src, size_t n)
409
- {
410
- const char *p;
411
-
412
- for (p = src; n > 0 && *p != NUL; p = utf_next(p), n--) {
413
- /* this loop intentionally left empty */;
414
- }
415
-
416
- strncpy(dest, src, p - src);
417
- dest[p - src] = NUL;
418
- }
419
-
420
-
421
/* {{{1
 * Append the UTF-8 string ‘src’ onto the end of ‘dest’, which must have room
 * for the combined string and its terminating NUL byte.
 */
void
utf_append(char *dest, const char *src)
{
	strcpy(dest + strlen(dest), src);
}
429
-
430
-
431
- /* {{{1
432
- * Append at most ‘n’ Unicode character from an UTF-8 string onto another.
433
- */
434
- void
435
- utf_append_n(char *dest, const char *src, size_t n)
436
- {
437
- const char *p;
438
-
439
- for (p = src; n > 0 && *p != NUL; p = utf_next(p), n--) {
440
- /* this loop intentionally left empty */;
441
- }
442
-
443
- strncat(dest, src, p - src);
444
- dest[p - src] = NUL;
445
- }
446
-
447
-
448
- /* {{{1
449
- * Compare two strings for ordering using the linguistically correct rules of
450
- * the current locale.
451
- */
452
- int
453
- utf_collate(const char *a, const char *b)
454
- {
455
- assert(a != NULL);
456
- assert(b != NULL);
457
-
458
- unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
459
- unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
460
-
461
- int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
462
-
463
- free(a_norm);
464
- free(b_norm);
465
-
466
- return result;
467
- }
468
-
469
-
470
/* {{{1
 * We need UTF-8 encoding of numbers to encode the weights if
 * we are using wcsxfrm. However, we aren't encoding Unicode
 * characters, so we can't simply use unichar_to_utf.
 *
 * Stores the encoding of ‘c’ in ‘buf’ and returns the number of bytes that
 * it takes (1 through 6).  If ‘buf’ is ‹NULL›, nothing is stored and only
 * the length is returned.
 *
 * The following routine is taken (with modification) from GNU
 * libc's strxfrm routine:
 *
 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
 */
static inline int
_utf_encode(char *buf, wchar_t c)
{
	if (c < 0x80) {
		if (buf != NULL)
			*buf = (char)c;
		return 1;
	}

	/*
	 * A sequence of ‘step’ bytes holds 5 * step + 1 payload bits; pick
	 * the shortest one, six bytes at most, that ‘c’ fits in.
	 */
	int step;

	for (step = 2; step < 6; step++) {
		if ((c & (~(uint32_t)0 << (5 * step + 1))) == 0)
			break;
	}

	const int n_bytes = step;

	if (buf != NULL) {
		/*
		 * Lead byte marker: ‘step’ one-bits followed by a zero.  It is
		 * derived from 0xff00 rather than ~0xff so that no negative
		 * value gets shifted right.
		 */
		*buf = (unsigned char)(0xff00 >> step);
		step--;
		/* Fill in the continuation bytes, last one first. */
		do {
			c = PUT_X(c, buf[step]);
		} while (--step > 0);
		/* What is left of ‘c’ goes into the lead byte. */
		*buf |= c;
	}

	return n_bytes;
}
512
-
513
-
514
- /* {{{1
515
- * Generate a collation key from a string which can be compared with other
516
- * collation keys using str_compare().
517
- */
518
- static char *
519
- utf_collate_key_impl(const char *str, size_t len, bool use_len)
520
- {
521
- assert(str != NULL);
522
-
523
- unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
524
- size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
525
- wchar_t result_wc[xfrm_len + 1];
526
- wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
527
-
528
- int result_len = 0;
529
- for (size_t i = 0; i < xfrm_len; i++)
530
- result_len += _utf_encode(NULL, result_wc[i]);
531
-
532
- char *result = ALLOC_N(char, result_len + 1);
533
- result_len = 0;
534
- for (size_t i = 0; i < xfrm_len; i++)
535
- result_len += _utf_encode(result + result_len, result_wc[i]);
536
- result[result_len] = NUL;
537
-
538
- free(str_norm);
539
-
540
- return result;
541
- }
542
-
543
-
544
/* {{{1
 * Generate a collation key from the string ‘str’ which can be compared with
 * other collation keys using str_compare().  No length is given to
 * utf_collate_key_impl(), which does the actual work.
 */
char *
utf_collate_key(const char *str)
{
	char *key = utf_collate_key_impl(str, 0, false);

	return key;
}
553
-
554
-
555
/* {{{1
 * Generate a collation key from the string ‘str’ (of length ‘len’) which can
 * be compared with other collation keys using str_compare().  The length is
 * handed on to utf_collate_key_impl(), which does the actual work.
 */
char *
utf_collate_key_n(const char *str, size_t len)
{
	char *key = utf_collate_key_impl(str, len, true);

	return key;
}
564
-
565
-
566
/* {{{1
 * Retrieve the byte offset of the left-most occurence of ‘needle’ in
 * ‘haystack’, of which at most ‘haystack_len’ bytes are looked at, or -1 if
 * it doesn't exist.  The search also stops at a NUL byte in ‘haystack’.  An
 * empty ‘needle’ is found at offset 0.
 */
static int
str_index_n(const char *haystack, const char *needle, size_t haystack_len)
{
	assert(haystack != NULL);
	assert(needle != NULL);

	const size_t needle_len = strlen(needle);

	if (needle_len == 0)
		return 0;

	if (haystack_len < needle_len)
		return -1;

	/* The last position at which ‘needle’ still fits. */
	const char *last = haystack + (haystack_len - needle_len);

	/*
	 * The bound is checked before ‘*p’ is read, so that nothing beyond
	 * ‘haystack_len’ bytes is ever looked at.
	 */
	for (const char *p = haystack; p <= last && *p != '\0'; p++) {
		/* strncmp() stops at a NUL byte, just like a byte-wise loop. */
		if (strncmp(p, needle, needle_len) == 0)
			return (int)(p - haystack);
	}

	return -1;
}
599
-
600
-
601
/* {{{1
 * Retrieve the byte offset of the right-most occurence of ‘needle’ in
 * ‘haystack’, or -1 if it doesn't exist.  An empty ‘needle’ is found at the
 * very end of ‘haystack’.
 */
static int
str_rindex(const char *haystack, const char *needle)
{
	assert(haystack != NULL);
	assert(needle != NULL);

	const size_t needle_len = strlen(needle);
	const size_t haystack_len = strlen(haystack);

	if (needle_len == 0)
		return (int)haystack_len;

	if (haystack_len < needle_len)
		return -1;

	/*
	 * Count down by offset rather than by pointer, so that no pointer to
	 * before ‘haystack’ is ever formed.
	 */
	for (size_t start = haystack_len - needle_len + 1; start-- > 0; ) {
		if (memcmp(haystack + start, needle, needle_len) == 0)
			return (int)start;
	}

	return -1;
}
634
-
635
-
636
/* {{{1
 * Retrieve the byte offset of the right-most occurence of ‘needle’ in
 * ‘haystack’, of which at most ‘haystack_len’ bytes are looked at, or -1 if
 * it doesn't exist.  The part of ‘haystack’ that is searched also ends at
 * its first NUL byte.  An empty ‘needle’ is found at the end of that part.
 */
static int
str_rindex_n(const char *haystack, const char *needle, size_t haystack_len)
{
	assert(haystack != NULL);
	assert(needle != NULL);

	const size_t needle_len = strlen(needle);

	/* How much of ‘haystack’ may actually be searched. */
	size_t usable = 0;
	while (usable < haystack_len && haystack[usable] != '\0')
		usable++;

	if (usable < needle_len)
		return -1;

	/*
	 * Count down by offset rather than by pointer, so that no pointer to
	 * before ‘haystack’ is ever formed.
	 */
	for (size_t start = usable - needle_len + 1; start-- > 0; ) {
		if (memcmp(haystack + start, needle, needle_len) == 0)
			return (int)start;
	}

	return -1;
}
672
-
673
-
674
- /* {{{1
675
- * Retrieve the index of the left-most occurence of ‘c’ in ‘str’, or -1 if it
676
- * doesn't exist.
677
- */
678
- int
679
- utf_char_index(const char *str, unichar c)
680
- {
681
- char ch[7];
682
-
683
- ch[unichar_to_utf(c, ch)] = NUL;
684
- char *p = strstr(str, ch);
685
- return (p != NULL) ? p - str : -1;
686
- }
687
-
688
-
689
- /* {{{1
690
- * Retrieve the index of the left-most occurence of ‘c’ in ‘str’, or -1 if it
691
- * doesn't exist, going over at most ‘len’ bytes in ‘str’.
692
- */
693
- int
694
- utf_char_index_n(const char *str, unichar c, size_t len)
695
- {
696
- char ch[7];
697
-
698
- ch[unichar_to_utf(c, ch)] = NUL;
699
-
700
- return str_index_n(str, ch, len);
701
- }
702
-
703
-
704
- /* {{{1
705
- * Retrieve the index of the right-most occurence of ‘c’ in ‘str’, or -1 if it
706
- * doesn't exist.
707
- */
708
- int
709
- utf_char_rindex(const char *str, unichar c)
710
- {
711
- char ch[7];
712
-
713
- ch[unichar_to_utf(c, ch)] = NUL;
714
-
715
- return str_rindex(str, ch);
716
- }
717
-
718
-
719
- /* {{{1
720
- * Retrieve the index of the right-most occurence of ‘c’ in ‘str’, or -1 if it
721
- * doesn't exist, going over at most ‘len’ bytes in ‘str’.
722
- */
723
- int
724
- utf_char_rindex_n(const char *str, unichar c, size_t len)
725
- {
726
- char ch[7];
727
-
728
- ch[unichar_to_utf(c, ch)] = NUL;
729
-
730
- return str_rindex_n(str, ch, len);
731
- }
732
-
733
-
734
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘needle’ in
 * ‘haystack’, or -1 if it doesn't exist.
 */
int
utf_index(const char *haystack, const char *needle)
{
	const char *hit = strstr(haystack, needle);

	/* strstr() yields NULL on a miss; subtracting ‘haystack’ from that is
	 * undefined, so report the documented -1 instead. */
	return (hit != NULL) ? (int)(hit - haystack) : -1;
}
743
-
744
-
745
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘needle’ in
 * ‘haystack’, or -1 if it doesn't exist, going over at most ‘len’ bytes in
 * ‘haystack’.  Thin public wrapper around str_index_n().
 */
int
utf_index_n(const char *haystack, const char *needle, size_t len)
{
	int offset = str_index_n(haystack, needle, len);

	return offset;
}
754
-
755
-
756
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘needle’ in
 * ‘haystack’, or -1 if it doesn't exist.  Thin public wrapper around
 * str_rindex().
 */
int
utf_rindex(const char *haystack, const char *needle)
{
	int offset = str_rindex(haystack, needle);

	return offset;
}
765
-
766
-
767
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘needle’ in
 * ‘haystack’, or -1 if it doesn't exist, going over at most ‘len’ bytes in
 * ‘haystack’.  Thin public wrapper around str_rindex_n().
 */
int
utf_rindex_n(const char *haystack, const char *needle, size_t len)
{
	int offset = str_rindex_n(haystack, needle, len);

	return offset;
}
776
-
777
-
778
/* {{{1
 * Check if the given string begins with ‘prefix’.  The comparison is done
 * byte by byte; an empty ‘prefix’ is a prefix of every string.
 */
bool
utf_has_prefix(const char *str, const char *prefix)
{
	assert(str != NULL);
	assert(prefix != NULL);

	for (;;) {
		/* Ran out of prefix first: everything matched. */
		if (*prefix == '\0')
			return true;

		/* Ran out of string while prefix bytes remain. */
		if (*str == '\0')
			return false;

		if (*str != *prefix)
			return false;

		str++;
		prefix++;
	}
}
796
-
797
-
798
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in ‘str’, stepping
 * from character to character with utf_next() until the terminating NUL.
 */
long
utf_length(const char *str)
{
	assert(str != NULL);

	long count = 0;

	for (const char *cursor = str; *cursor != '\0'; cursor = utf_next(cursor))
		count++;

	return count;
}
815
-
816
-
817
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in ‘str’, examining
 * ‘len’ bytes.  A character whose bytes extend past ‘len’ is not counted.
 * A ‘len’ of zero or less yields 0.
 */
long
utf_length_n(const char *str, long len)
{
	assert(str != NULL || len == 0);

	/* A negative length used to skip the loop and then hit the
	 * “incomplete character” decrement below, returning -1. */
	if (len <= 0)
		return 0;

	long n = 0;
	const char *p = str;
	const char *end = str + len;
	while (p < end) {
		n++;
		p = utf_next(p);
	}

	/* This makes sure that we don’t count incomplete characters.  It won’t
	 * save us from illegal UTF-8-sequences, however. */
	if (p > end)
		n--;

	return n;
}
844
-
845
-
846
/* {{{1
 * Retrieve the number of bytes making up the given NUL-terminated UTF-8
 * string, not counting the terminator.
 */
size_t
utf_byte_length(const char *str)
{
	size_t bytes = strlen(str);

	return bytes;
}
854
-
855
-
856
- /* {{{1
857
- * The real implementation of utf_reverse() and utf_reverse_n() below.
858
- */
859
- static char *
860
- utf_reverse_impl(const char *str, size_t len, bool use_len)
861
- {
862
- if (!use_len)
863
- len = utf_byte_length(str);
864
-
865
- char *result = ALLOC_N(char, len + 1);
866
- char *r = result + len;
867
- const char *p = str;
868
- while (r > result) {
869
- uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
870
- r -= skip;
871
- for (char *m = r; skip > 0; skip--)
872
- *m++ = *p++;
873
- }
874
- result[len] = 0;
875
-
876
- return result;
877
- }
878
-
879
-
880
/* {{{1
 * Return a new string which is ‘str’ reversed.  The whole NUL-terminated
 * string is processed.
 */
char *
utf_reverse(const char *str)
{
	char *reversed = utf_reverse_impl(str, 0, false);

	return reversed;
}
888
-
889
-
890
/* {{{1
 * Return a new string which is ‘str’ reversed, examining at most ‘len’ bytes
 * of it.
 */
char *
utf_reverse_n(const char *str, size_t len)
{
	char *reversed = utf_reverse_impl(str, len, true);

	return reversed;
}
899
-
900
-
901
- /* {{{1
902
- * The real implementation of utf_isvalid() and utf_isvalid_n() below.
903
- *
904
- * TODO: this needs optimizing. Look at glib's new optimized implementation
905
- * (2.6.0) and also separate the ‘use_max’ into two cases.
906
- */
907
- #define CONTINUATION_CHAR do { \
908
- if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
909
- goto error; \
910
- val <<= 6; \
911
- val |= (*(unsigned char *)p) & 0x3f; \
912
- } while (0);
913
-
914
- static const char *
915
- fast_validate(const char *str)
916
- {
917
- unichar val = 0;
918
- unichar min = 0;
919
- const char *p;
920
-
921
- for (p = str; *p != NUL; p++) {
922
- if (*(unsigned char *)p < 128)
923
- continue;
924
-
925
- const char *last = p;
926
-
927
- if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
928
- if ((*(unsigned char *)p & 0x1e) == 0)
929
- goto error;
930
- p++;
931
- if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */
932
- goto error;
933
- } else {
934
- if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
935
- min = (1 << 11);
936
- val = *(unsigned char *)p & 0x0f;
937
- goto two_remaining;
938
- } else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
939
- min = (1 << 16);
940
- val = *(unsigned char *)p & 0x07;
941
- } else {
942
- goto error;
943
- }
944
-
945
- p++;
946
- CONTINUATION_CHAR;
947
- two_remaining:
948
- p++;
949
- CONTINUATION_CHAR;
950
- p++;
951
- CONTINUATION_CHAR;
952
-
953
- if (val < min)
954
- goto error;
955
-
956
- if (!UNICODE_ISVALID(val))
957
- goto error;
958
- }
959
-
960
- continue;
961
- error:
962
- return last;
963
- }
964
-
965
- return p;
966
- }
967
-
968
- static const char *
969
- fast_validate_len(const char *str, size_t max_len)
970
- {
971
- unichar val = 0;
972
- unichar min = 0;
973
- const char *p;
974
-
975
- for (p = str; (size_t)(p - str) < max_len && *p != NUL; p++) {
976
- if (*(unsigned char *)p < 128)
977
- continue;
978
-
979
- const char *last = p;
980
-
981
- if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
982
- if (max_len - (p - str) < 2)
983
- goto error;
984
-
985
- if ((*(unsigned char *)p & 0x1e) == 0)
986
- goto error;
987
- p++;
988
- if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */
989
- goto error;
990
- } else {
991
- if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
992
- if (max_len - (p - str) < 3)
993
- goto error;
994
-
995
- min = (1 << 11);
996
- val = *(unsigned char *)p & 0x0f;
997
- goto two_remaining;
998
- } else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
999
- if (max_len - (p - str) < 4)
1000
- goto error;
1001
-
1002
- min = (1 << 16);
1003
- val = *(unsigned char *)p & 0x07;
1004
- } else {
1005
- goto error;
1006
- }
1007
-
1008
- p++;
1009
- CONTINUATION_CHAR;
1010
- two_remaining:
1011
- p++;
1012
- CONTINUATION_CHAR;
1013
- p++;
1014
- CONTINUATION_CHAR;
1015
-
1016
- if (val < min)
1017
- goto error;
1018
- if (!UNICODE_ISVALID(val))
1019
- goto error;
1020
- }
1021
-
1022
- continue;
1023
- error:
1024
- return last;
1025
- }
1026
-
1027
- return p;
1028
- }
1029
-
1030
-
1031
- /* {{{1
1032
- * Check if ‘str’ constitutes a valid UTF-8 character sequence.
1033
- */
1034
- bool
1035
- utf_isvalid(const char *str)
1036
- {
1037
- const char *p = fast_validate(str);
1038
-
1039
- return *p == NUL;
1040
- }
1041
-
1042
-
1043
/* {{{1
 * Check if the first ‘max’ bytes of ‘str’ constitute a valid UTF-8 character
 * sequence.  If ‘end’ is non-‹NULL›, it is always set to the point where
 * validation stopped, i.e. the end of the valid range of bytes in ‘str’.
 * The result is true only if that point is exactly ‘str + max’, so a NUL
 * byte before ‘max’ bytes have been seen yields false.
 */
bool
utf_isvalid_n(const char *str, size_t max, const char **end)
{
	const char *stop = fast_validate_len(str, max);
	bool whole_range_valid = (stop == str + max);

	if (end != NULL)
		*end = stop;

	return whole_range_valid;
}
1059
-
1060
-
1061
- /* {{{1
1062
- * Check whether ‘c’ is a valid Unicode character.
1063
- */
1064
- bool
1065
- unichar_isvalid(unichar c)
1066
- {
1067
- return UNICODE_ISVALID(c);
1068
- }
1069
-
1070
-
1071
- /* {{{1
1072
- * Turn an Unicode character (UTF-32) into an UTF-8 character sequence and
1073
- * store it in ‘result’, returning the length of the stored sequence.
1074
- */
1075
- int
1076
- unichar_to_utf(unichar c, char *result)
1077
- {
1078
- int len = 0;
1079
- int first;
1080
-
1081
- if (c < UNI_LEN1) {
1082
- first = 0;
1083
- len = 1;
1084
- } else if (c < UNI_LEN2) {
1085
- first = 0xc0;
1086
- len = 2;
1087
- } else if (c < UNI_LEN3) {
1088
- first = 0xe0;
1089
- len = 3;
1090
- } else if (c < UNI_LEN4) {
1091
- first = 0xf0;
1092
- len = 4;
1093
- } else if (c < UNI_LEN5) {
1094
- first = 0xf8;
1095
- len = 5;
1096
- } else {
1097
- first = 0xfc;
1098
- len = 6;
1099
- }
1100
-
1101
- if (result != NULL) {
1102
- for (int i = len - 1; i > 0; i--)
1103
- c = PUT_X(c, result[i]);
1104
-
1105
- result[0] = c | first;
1106
- }
1107
-
1108
- return len;
1109
- }
1110
-
1111
-
1112
- /* {{{1
1113
- * The real implementation of ucs4_to_utf8() and ucs4_to_utf8_n() below.
1114
- */
1115
- static char *
1116
- ucs4_to_utf8_n_impl(unichar *str, size_t len, bool use_len,
1117
- size_t *items_read, size_t *items_written)
1118
- {
1119
- size_t result_len = 0;
1120
- char *result = NULL, *p;
1121
-
1122
- for (size_t i = 0; (!use_len || i < len) && str[i] != NUL; i++) {
1123
- if (str[i] >= 0x80000000) {
1124
- if (items_read != NULL)
1125
- *items_read = i;
1126
-
1127
- rb_raise(rb_eArgError, "UCS-4 input contains character outside of range for UTF-8 (%lc))", str[i]);
1128
- }
1129
-
1130
- result_len += _utf_length(str[i]);
1131
- }
1132
-
1133
- p = result = ALLOC_N(char, result_len + 1);
1134
- size_t i;
1135
- for (i = 0; p < result + result_len; i++)
1136
- p += unichar_to_utf(str[i], p);
1137
- *p = NUL;
1138
-
1139
- if (items_written != NULL)
1140
- *items_written = p - result;
1141
- if (items_read != NULL)
1142
- *items_read = i;
1143
-
1144
- return result;
1145
- }
1146
-
1147
- /* {{{1
1148
- * Turn an UTF-32 encoded string into an UTF-8 encoded one. If non-‹NULL›,
1149
- * store the number of characters read and bytes written in ‘items_read’ and
1150
- * ‘items_written’ respectivelly.
1151
- */
1152
- char *
1153
- ucs4_to_utf8(unichar *str, size_t *items_read, size_t *items_written)
1154
- {
1155
- return ucs4_to_utf8_n_impl(str, 0, false, items_read, items_written);
1156
- }
1157
-
1158
- /* {{{1
1159
- * Turn an UTF-32 encoded string into an UTF-8 encoded one. If non-‹NULL›,
1160
- * store the number of characters read and bytes written in ‘items_read’ and
1161
- * ‘items_written’ respectivelly. Examine at most ‘len’ characters from ‘str’.
1162
- */
1163
- char *
1164
- ucs4_to_utf8_n(unichar *str, size_t len, size_t *items_read, size_t *items_written)
1165
- {
1166
- return ucs4_to_utf8_n_impl(str, len, true, items_read, items_written);
1167
- }
1168
-
1169
-
1170
- /* {{{1
1171
- * The real implementation of utf8_to_ucs4_fast() and utf8_to_ucs4_fast_n()
1172
- * below.
1173
- */
1174
- static unichar *
1175
- utf8_to_ucs4_fast_impl(const char *str, size_t len, bool use_len, size_t *items_written)
1176
- {
1177
- assert(str != NULL);
1178
-
1179
- const char *p = str;
1180
- size_t n = 0;
1181
- if (use_len) {
1182
- while (p < str + len && *p != NUL) {
1183
- p = utf_next(p);
1184
- n++;
1185
- }
1186
- } else {
1187
- while (p != NUL) {
1188
- p = utf_next(p);
1189
- n++;
1190
- }
1191
- }
1192
-
1193
- unichar *result = ALLOC_N(unichar, n + 1);
1194
- p = str;
1195
- size_t i;
1196
- for (i = 0; i < n; i++) {
1197
- unichar c = ((unsigned char *)p)[0];
1198
- int c_len;
1199
-
1200
- if (c < 0x80) {
1201
- result[i] = c;
1202
- p++;
1203
- } else {
1204
- /* TODO: use _utf_compute() here */
1205
- if (c < 0xe0) {
1206
- c_len = 2;
1207
- c &= 0x1f;
1208
- } else if (c < 0xf0) {
1209
- c_len = 3;
1210
- c &= 0x0f;
1211
- } else if (c < 0xf8) {
1212
- c_len = 4;
1213
- c &= 0x07;
1214
- } else if (c < 0xfc) {
1215
- c_len = 5;
1216
- c &= 0x03;
1217
- } else {
1218
- c_len = 6;
1219
- c &= 0x01;
1220
- }
1221
-
1222
- for (int j = 1; j < c_len; j++) {
1223
- c <<= BIT_X;
1224
- c |= ((unsigned char *)p)[j] & MASK_X;
1225
- }
1226
-
1227
- result[i] = c;
1228
- p += c_len;
1229
- }
1230
- }
1231
- result[i] = NUL;
1232
-
1233
- if (items_written != NULL)
1234
- *items_written = i;
1235
-
1236
- return result;
1237
- }
1238
-
1239
-
1240
- /* {{{1
1241
- * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1242
- * the number of characters written in ‘items_written’.
1243
- */
1244
- unichar *
1245
- utf8_to_ucs4_fast(const char *str, size_t *items_written)
1246
- {
1247
- return utf8_to_ucs4_fast_impl(str, 0, false, items_written);
1248
- }
1249
-
1250
-
1251
- /* {{{1
1252
- * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1253
- * the number of characters written in ‘items_written’. Examine at most ‘len’
1254
- * bytes from ‘str’.
1255
- */
1256
- unichar *
1257
- utf8_to_ucs4_fast_n(const char *str, size_t len, size_t *items_written)
1258
- {
1259
- return utf8_to_ucs4_fast_impl(str, len, true, items_written);
1260
- }
1261
-
1262
-
1263
- /* {{{1
1264
- * The real implementation of utf8_to_ucs4() and utf8_to_ucs4_n() below.
1265
- */
1266
- static unichar *
1267
- utf8_to_ucs4_impl(const char *str, size_t len, bool use_len, size_t *items_read, size_t *items_written)
1268
- {
1269
- size_t n = 0;
1270
- const char *p = str;
1271
- for (; (!use_len || str + len - p > 0) && *p != NUL; p = utf_next(p)) {
1272
- unichar c = utf_char_n(p, str + len - p);
1273
- if (c & 0x80000000) {
1274
- if (c == UTF_INCOMPLETE_INPUT_UNICHAR) {
1275
- if (items_read != NULL)
1276
- break;
1277
-
1278
- rb_raise(rb_eArgError, "partial character sequence in UTF-8 input");
1279
- } else {
1280
- rb_raise(rb_eArgError, "UTF-8 input contains character outside of range for UTF-8 (%lc))", c);
1281
- }
1282
-
1283
- if (items_read != NULL)
1284
- *items_read = p - str;
1285
-
1286
- return NULL;
1287
- } else {
1288
- n++;
1289
- }
1290
- }
1291
-
1292
- unichar *result = ALLOC_N(unichar, n + 1);
1293
- size_t i;
1294
- for (i = 0, p = str; i < n; i++) {
1295
- result[i] = utf_char(p);
1296
- p = utf_next(p);
1297
- }
1298
- result[i] = NUL;
1299
-
1300
- if (items_written != NULL)
1301
- *items_written = n;
1302
- if (items_read != NULL)
1303
- *items_read = p - str;
1304
-
1305
- return result;
1306
- }
1307
-
1308
-
1309
- /* {{{1
1310
- * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1311
- * the number of characters written in ‘items_written’. This function does
1312
- * additional error-checking on the input.
1313
- */
1314
- unichar *
1315
- utf8_to_ucs4(const char *str, size_t *items_read, size_t *items_written)
1316
- {
1317
- return utf8_to_ucs4_impl(str, 0, false, items_read, items_written);
1318
- }
1319
-
1320
-
1321
- /* {{{1
1322
- * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1323
- * the number of characters written in ‘items_written’. Examine at most ‘len’
1324
- * bytes from ‘str’. This function does additional error-checking on the
1325
- * input.
1326
- */
1327
- unichar *
1328
- utf8_to_ucs4_n(const char *str, int len, size_t *items_read, size_t *items_written)
1329
- {
1330
- return utf8_to_ucs4_impl(str, len, true, items_read, items_written);
1331
- }
1332
-
1333
-
1334
- /* }}}1 */