u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,28 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+ #include "u.h"
5
+
6
+ enum u_normalized
7
+ u_normalized(const char *u, size_t n, enum u_normalization_form form)
8
+ {
9
+ enum u_canonical_combining_class pcc = 0;
10
+ enum u_normalized r = U_NORMALIZED_YES;
11
+ for (const char *p = u, *end = u + n; p < end; ) {
12
+ uint32_t c = u_decode(&p, p, end);
13
+ enum u_canonical_combining_class cc = u_char_canonical_combining_class(c);
14
+ if (pcc > cc && cc != 0)
15
+ return U_NORMALIZED_NO;
16
+ switch (u_char_normalized(c, form)) {
17
+ case U_NORMALIZED_NO:
18
+ return U_NORMALIZED_NO;
19
+ case U_NORMALIZED_MAYBE:
20
+ r = U_NORMALIZED_MAYBE;
21
+ break;
22
+ case U_NORMALIZED_YES:
23
+ break;
24
+ }
25
+ pcc = cc;
26
+ }
27
+ return r;
28
+ }
@@ -0,0 +1,62 @@
1
+ #include <stddef.h>
2
+ #include <stdint.h>
3
+ #include <stdbool.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
/* {{{1
 * Convert a character offset into a pointer within ‘str’.
 *
 * A positive ‘offset’ advances that many characters with u_next().  A
 * negative ‘offset’ moves backwards: it jumps back by the remaining offset in
 * bytes, retreats to the first byte that is not a UTF-8 continuation byte
 * (10xxxxxx), and then corrects the remaining offset by the number of
 * characters actually skipped, repeating until nothing remains.
 *
 * No bounds are checked; the caller must guarantee that the target lies
 * inside the string.
 */
char *
u_offset_to_pointer(const char *str, long offset)
{
	const char *cursor = str;

	if (offset > 0) {
		for (long remaining = offset; remaining > 0; remaining--)
			cursor = u_next(cursor);
		return (char *)cursor;
	}

	while (offset != 0) {
		const char *from = cursor;

		cursor += offset;
		while ((*cursor & 0xc0) == 0x80)
			cursor--;

		/* Characters skipped so far reduce what is left to go. */
		offset += u_pointer_to_offset(cursor, from);
	}

	return (char *)cursor;
}
33
+
34
/* {{{1
 * Convert a character offset into a pointer within ‘str’, looking at no more
 * than ‘n’ bytes.
 *
 * For a positive ‘offset’ the ‘n’ bytes start at ‘str’; for a negative
 * ‘offset’ they are the ‘n’ bytes that precede ‘str’.  Returns NULL when the
 * requested character lies outside that range.
 *
 * The backward walk checks the distance before moving the pointer, so it
 * never forms a pointer below ‘str - n’.
 */
char *
u_offset_to_pointer_n(const char *str, long offset, size_t n)
{
	const char *p = str;

	if (offset > 0) {
		const char *end = p + n;
		while (p < end && offset-- > 0)
			p = u_next(p);

		/* Ran out of bytes before consuming the whole offset. */
		if (offset > 0)
			return NULL;
	} else {
		const char *start = p - n;
		while (offset != 0) {
			const char *base = p;
			/* |offset|; -(offset + 1) cannot overflow, even for LONG_MIN. */
			unsigned long back = (unsigned long)-(offset + 1) + 1;

			if (back > (size_t)(p - start))
				return NULL;
			p -= back;

			/* Retreat to the first byte of the character we landed in. */
			while ((*p & 0xc0) == 0x80) {
				if (p == start)
					return NULL;
				p--;
			}

			offset += u_pointer_to_offset(p, base);
		}
	}

	return (char *)p;
}
@@ -0,0 +1,23 @@
1
+ #include <stddef.h>
2
+ #include <stdint.h>
3
+ #include <stdbool.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
+
9
/* {{{1
 * Convert a pointer to a character offset relative to ‘str’.
 *
 * Counts the u_next() steps needed to get from the lower of the two pointers
 * to the higher one.  The result is negative when ‘pos’ lies before ‘str’.
 */
long
u_pointer_to_offset(const char *str, const char *pos)
{
	bool backwards = pos < str;
	const char *from = backwards ? pos : str;
	const char *to = backwards ? str : pos;
	long count = 0;

	while (from < to) {
		from = u_next(from);
		count++;
	}

	return backwards ? -count : count;
}
@@ -0,0 +1,73 @@
1
+ #include "extconf.h"
2
+ #include <assert.h>
3
+ #include <errno.h>
4
+ #include <stdbool.h>
5
+ #include <stddef.h>
6
+ #include <stdint.h>
7
+ #include <string.h>
8
+
9
+ #include "u.h"
10
+
11
+ #ifdef HAVE_ICONV
12
+ # include <iconv.h>
13
+ # include <limits.h>
14
+ #endif
15
+
16
/*
 * Convert the N bytes of UTF-8 in STRING to CODESET with iconv(3), storing at
 * most M bytes in RESULT.
 *
 * Output that does not fit in RESULT is converted into a scratch buffer and
 * discarded, but still counted.  The return value is therefore the number of
 * bytes the conversion produced, which may exceed M.  Passing M == 0 (RESULT
 * is then never written) only measures the output.
 *
 * A terminating NUL is stored after the converted bytes only when the buffer
 * currently being written still has room for it; it is never counted in the
 * return value.
 *
 * Returns 0 if the conversion descriptor cannot be opened.  An incomplete
 * sequence at the end of the input (EINVAL) ends the conversion quietly; any
 * other iconv() error stops it and leaves that error in errno.
 */
size_t
u_recode(char *result, size_t m, const char *string, size_t n,
         const char *codeset)
{
	iconv_t cd = iconv_open(codeset, "UTF-8");
	if (cd == (iconv_t)-1)
		return 0;

	bool done = false;
	bool failed = false;
	bool final = false;
	bool too_big = false;
	/* iconv() takes a non-const input pointer but does not write through it. */
	char *p = (char *)string;
	size_t p_left = n;
	// We need to align the buffer.
	union { unsigned int align; char buffer[4096]; } b;
	char *base = m > 0 ? result : b.buffer;
	char *q = base;
	size_t q_left = m > 0 ? m : sizeof(b.buffer);
	size_t written = 0;
	while (!done && !failed) {
		/* Once the input is consumed, one more call flushes any shift state. */
		size_t err = final ?
			iconv(cd, NULL, NULL, &q, &q_left) :
			iconv(cd, &p, &p_left, &q, &q_left);
		if (err == (size_t)-1) {
			switch (errno) {
			case EINVAL:
				done = true;
				break;
			case E2BIG:
				/* Count what was produced, then keep going in the
				 * scratch buffer so the total stays accurate. */
				written += q - base;
				if (!too_big) {
					too_big = true;
					base = b.buffer;
				}
				q = base;
				q_left = sizeof(b.buffer);
				errno = 0;
				break;
			default:
				failed = true;
				break;
			}
		} else {
			if (!final)
				final = true;
			else
				done = true;
		}
	}
	/* Only terminate when there is room: with q_left == 0 the byte at *q
	 * lies one past the end of the buffer. */
	if (q_left > 0)
		*q = '\0';

	/* Report the conversion error, not whatever iconv_close() leaves behind. */
	int saved_errno = errno;
	(void)iconv_close(cd);
	if (failed)
		errno = saved_errno;

	return written + (q - base);
}
@@ -0,0 +1,21 @@
1
+ #include <assert.h>
2
+ #include <stdint.h>
3
+ #include <stdlib.h>
4
+ #include <stdbool.h>
5
+ #include <string.h>
6
+
7
+ #include "u.h"
8
+ #include "private.h"
9
+ #include "output.h"
10
+
11
+
12
+ size_t
13
+ u_reverse(char *result, size_t m, const char *string, size_t n)
14
+ {
15
+ assert(string != NULL);
16
+ assert(result != NULL || m == 0);
17
+ struct output output = OUTPUT_INIT(result, m);
18
+ for (const char *p = string + n; p > string; )
19
+ output_char(&output, u_decode_r(&p, string, p));
20
+ return output_finalize(&output);
21
+ }
@@ -0,0 +1,132 @@
1
+ #include <assert.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+ #include <stdbool.h>
5
+ #include <string.h>
6
+
7
+ #include "u.h"
8
+ #include "private.h"
9
+
10
+ /*
11
+ * Retrieve the index/offset of the right-most occurence of NEEDLE in
12
+ * HAYSTACK, or -1 if it doesn’t exist.
13
+ */
14
+ static U_PURE size_t
15
+ str_rindex(const char *haystack, const char *needle)
16
+ {
17
+ assert(haystack != NULL);
18
+ assert(needle != NULL);
19
+
20
+ size_t needle_n = strlen(needle);
21
+ size_t haystack_n = strlen(haystack);
22
+
23
+ if (needle_n == 0)
24
+ return haystack_n;
25
+
26
+ if (haystack_n < needle_n)
27
+ return -1;
28
+
29
+ for (const char *p = haystack + haystack_n - needle_n; p >= haystack; p--) {
30
+ size_t i;
31
+
32
+ for (i = 0; i < needle_n; i++) {
33
+ if (p[i] != needle[i])
34
+ break;
35
+ }
36
+
37
+ if (i == needle_n)
38
+ return p - haystack;
39
+ }
40
+
41
+ return -1;
42
+ }
43
+
44
+
45
+ /*
46
+ * Retrieve the index/offset of the right-most occurence of NEEDLE in
47
+ * HAYSTACK, or -1 if it doesn’t exist.
48
+ */
49
+ static U_PURE size_t
50
+ str_rindex_n(const char *haystack, const char *needle, size_t haystack_n)
51
+ {
52
+ assert(haystack != NULL);
53
+ assert(needle != NULL);
54
+
55
+ size_t needle_n = strlen(needle);
56
+ const char *haystack_max = haystack + haystack_n;
57
+ const char *p = haystack;
58
+
59
+ while (p < haystack_max && *p != '\0')
60
+ p++;
61
+
62
+ if (p < haystack + needle_n)
63
+ return -1;
64
+
65
+ p -= needle_n;
66
+
67
+ for ( ; p >= haystack; p--) {
68
+ size_t i;
69
+
70
+ for (i = 0; i < needle_n; i++) {
71
+ if (p[i] != needle[i])
72
+ break;
73
+ }
74
+
75
+ if (i == needle_n)
76
+ return p - haystack;
77
+ }
78
+
79
+ return -1;
80
+ }
81
+
82
+
83
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘c’ in ‘str’, or
 * (size_t)-1 if it doesn't exist.
 */
size_t
u_char_rindex(const char *str, uint32_t c)
{
	/* NOTE(review): 7 bytes presumably covers the longest sequence
	 * u_char_to_u() can emit plus the terminator — confirm. */
	char encoded[7];

	/* Encode ‘c’ and terminate it so it can be searched for as a string. */
	encoded[u_char_to_u(c, encoded)] = '\0';

	size_t found = str_rindex(str, encoded);
	return found;
}
96
+
97
+
98
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘c’ in ‘str’, or
 * (size_t)-1 if it doesn't exist, going over at most ‘n’ bytes in ‘str’.
 */
size_t
u_char_rindex_n(const char *str, uint32_t c, size_t n)
{
	/* NOTE(review): 7 bytes presumably covers the longest sequence
	 * u_char_to_u() can emit plus the terminator — confirm. */
	char encoded[7];

	/* Encode ‘c’ and terminate it so it can be searched for as a string. */
	encoded[u_char_to_u(c, encoded)] = '\0';

	size_t found = str_rindex_n(str, encoded, n);
	return found;
}
111
+
112
+
113
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘needle’ in
 * ‘haystack’, or (size_t)-1 if it doesn't exist.  Public entry point for
 * str_rindex().
 */
size_t
u_rindex(const char *haystack, const char *needle)
{
	size_t found = str_rindex(haystack, needle);

	return found;
}
122
+
123
+
124
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘needle’ in
 * ‘haystack’, or (size_t)-1 if it doesn't exist, going over at most ‘n’ bytes
 * in ‘haystack’.  Public entry point for str_rindex_n().
 */
size_t
u_rindex_n(const char *haystack, const char *needle, size_t n)
{
	size_t found = str_rindex_n(haystack, needle, n);

	return found;
}
@@ -0,0 +1,68 @@
1
+ #include <ruby.h>
2
+ #include <assert.h>
3
+ #include <stdbool.h>
4
+ #include <stddef.h>
5
+ #include <stdint.h>
6
+ #include <string.h>
7
+
8
+ #include "u.h"
9
+ #include "private.h"
10
+ #include "data/constants.h"
11
+ #include "attributes.h"
12
+ #include "titled.h"
13
+ #include "output.h"
14
+ #include "u_locale.h"
15
+ #include "case.h"
16
+
17
+ #define LATIN_CAPITAL_LETTER_I ((uint32_t)0x0049)
18
+ #define LATIN_CAPITAL_LETTER_J ((uint32_t)0x004a)
19
+ #define LATIN_SMALL_LETTER_I ((uint32_t)0x0069)
20
+ #define LATIN_SMALL_LETTER_J ((uint32_t)0x006a)
21
+
22
+ struct titlecase_closure {
23
+ const char *string;
24
+ enum locale locale;
25
+ struct output *output;
26
+ };
27
+
28
+ static void
29
+ titlecase_step(const char *p, const char *q, struct titlecase_closure *closure)
30
+ {
31
+ const char *t = p;
32
+ const char *u;
33
+ while (t < q && !u_char_iscased(u_decode(&u, t, q)))
34
+ t = u;
35
+ output_string(closure->output, p, t - p);
36
+ if (t == q)
37
+ return;
38
+ _u_upcase_step(closure->string, t, q, closure->locale, true,
39
+ closure->output);
40
+ if (u < q && closure->locale == LOCALE_DUTCH &&
41
+ (*t == LATIN_CAPITAL_LETTER_I || *t == LATIN_SMALL_LETTER_I) &&
42
+ (*u == LATIN_CAPITAL_LETTER_J || *u == LATIN_SMALL_LETTER_J)) {
43
+ output_char(closure->output, LATIN_CAPITAL_LETTER_J);
44
+ u++;
45
+ }
46
+ while (u < q)
47
+ u = _u_downcase_step(closure->string, u, q, closure->locale,
48
+ closure->output);
49
+ }
50
+
51
/*
 * Callback for u_words(): titlecase the N-byte word starting at P by handing
 * it to titlecase_step() as the range [p, p + n).
 */
static void
titlecase_words(const char *p, size_t n, struct titlecase_closure *closure)
{
	const char *word_end = p + n;

	titlecase_step(p, word_end, closure);
}
56
+
57
+ size_t
58
+ u_titlecase(char *result, size_t m, const char *string, size_t n,
59
+ const char *locale)
60
+ {
61
+ assert(string != NULL);
62
+ assert(result != NULL || m == 0);
63
+ struct output output = OUTPUT_INIT(result, m);
64
+ struct titlecase_closure closure =
65
+ { string, _u_locale_from_string(locale), &output };
66
+ u_words(string, n, (u_substring_fn)titlecase_words, &closure);
67
+ return output_finalize(&output);
68
+ }
@@ -0,0 +1,89 @@
1
+ #include <assert.h>
2
+ #include <stdbool.h>
3
+ #include <stddef.h>
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+
8
+ #include "u.h"
9
+ #include "private.h"
10
+
11
+ #include "data/constants.h"
12
+ #include "attributes.h"
13
+ #include "titled.h"
14
+ #include "output.h"
15
+ #include "u_locale.h"
16
+ #include "case.h"
17
+
18
+ #define LATIN_SMALL_LETTER_I ((uint32_t)0x0069)
19
+ #define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((uint32_t)0x0130)
20
+ #define COMBINING_DOT_ABOVE ((uint32_t)0x0307)
21
+ #define COMBINING_GREEK_YPOGEGRAMMENI ((uint32_t)0x0345)
22
+ #define GREEK_CAPITAL_LETTER_IOTA ((uint32_t)0x0399)
23
+
24
+
25
+ static inline bool
26
+ ismark(int category)
27
+ {
28
+ return IS(category,
29
+ OR(U_GENERAL_CATEGORY_MARK_NON_SPACING,
30
+ OR(U_GENERAL_CATEGORY_MARK_SPACING_COMBINING,
31
+ OR(U_GENERAL_CATEGORY_MARK_ENCLOSING, 0))));
32
+ }
33
+
34
/*
 * Copy the run of mark characters starting at Q (and ending before END) to
 * OUTPUT.  Returns a pointer to the first character that is not a mark, or
 * END if the run reaches it.
 */
static inline const char *
output_marks(const char *q, const char *end,
             struct output *output)
{
	const char *next;

	for (; q < end; q = next) {
		uint32_t c = u_decode(&next, q, end);

		if (!ismark(u_char_general_category(c)))
			break;
		output_char(output, c);
	}

	return q;
}
48
+
49
+ const char *
50
+ _u_upcase_step(const char *string, const char *p, const char *end,
51
+ enum locale locale, bool title, struct output *output)
52
+ {
53
+ const char *q;
54
+ uint32_t c = u_decode(&q, p, end);
55
+ enum u_general_category gc;
56
+ if (!title && c == COMBINING_GREEK_YPOGEGRAMMENI) {
57
+ q = output_marks(q, end, output);
58
+ output_char(output, GREEK_CAPITAL_LETTER_IOTA);
59
+ } else if (locale == LOCALE_LITHUANIAN &&
60
+ c == COMBINING_DOT_ABOVE &&
61
+ is_after(string, p, u_char_issoftdotted))
62
+ ;
63
+ else if (locale == LOCALE_TURKIC && c == LATIN_SMALL_LETTER_I)
64
+ output_char(output, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
65
+ else if (IS(gc = u_char_general_category(c),
66
+ OR(U_GENERAL_CATEGORY_LETTER_LOWERCASE,
67
+ OR(U_GENERAL_CATEGORY_LETTER_TITLECASE, 0))))
68
+ case_simple(c, gc,
69
+ title || gc != U_GENERAL_CATEGORY_LETTER_LOWERCASE,
70
+ true,
71
+ output);
72
+ else
73
+ output_string(output, p, q - p);
74
+ return q;
75
+ }
76
+
77
+ size_t
78
+ u_upcase(char *result, size_t m, const char *string, size_t n,
79
+ const char *locale)
80
+ {
81
+ assert(string != NULL);
82
+ assert(result != NULL || m == 0);
83
+ enum locale l = _u_locale_from_string(locale);
84
+ const char *end = string + n;
85
+ struct output output = OUTPUT_INIT(result, m);
86
+ for (const char *p = string; p < end; )
87
+ p = _u_upcase_step(string, p, end, l, false, &output);
88
+ return output_finalize(&output);
89
+ }