u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1 @@
1
+ VALUE rb_u_string_to_inum(VALUE str, int base, bool verify);
@@ -0,0 +1,17 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return The String representation of the receiver, inheriting any taint and
4
+ * untrust, encoded as UTF-8 */
5
+ VALUE
6
+ rb_u_string_to_str(VALUE self)
7
+ {
8
+ const struct rb_u_string *string = RVAL2USTRING(self);
9
+
10
+ VALUE result = NIL_P(string->rb) ?
11
+ rb_u_str_new(USTRING_STR(string), USTRING_LENGTH(string)) :
12
+ string->rb;
13
+
14
+ OBJ_INFECT(result, self);
15
+
16
+ return result;
17
+ }
@@ -0,0 +1,12 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @raise [EncodingError] If the receiver contains an invalid UTF-8 sequence
4
+ * @raise [RuntimeError] If there’s no more room for a new Symbol in Ruby’s
5
+ * Symbol table
6
+ * @return [Symbol] The Symbol representation of the receiver */
7
+ VALUE
8
+ rb_u_string_to_sym(VALUE self)
9
+ {
10
+ /* NOTE: Lazy, but MRI makes it hard to implement this method. */
11
+ return rb_str_intern(StringValue(self));
12
+ }
@@ -0,0 +1,290 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_internal_tr.h"
3
+
4
+ #include "output.h"
5
+
6
+ struct tr_range
7
+ {
8
+ uint32_t begin;
9
+ uint32_t end;
10
+ };
11
+
12
+ static int
13
+ tr_ranges_setup(struct tr *tr, struct tr_range *ranges)
14
+ {
15
+ int n = 0;
16
+ bool was_inside_range = false;
17
+ while (tr_next(tr) != TR_FINISHED) {
18
+ if (tr->inside_range) {
19
+ if (!was_inside_range) {
20
+ ranges[n].begin = tr->now;
21
+ was_inside_range = true;
22
+ }
23
+ } else {
24
+ if (was_inside_range)
25
+ ranges[n].end = tr->now;
26
+ else
27
+ ranges[n].begin = ranges[n].end = tr->now;
28
+ n++;
29
+ was_inside_range = false;
30
+ }
31
+ }
32
+
33
+ return n;
34
+ }
35
+
36
+
37
+ struct tr_trans_closure
38
+ {
39
+ struct tr_range *from;
40
+ int n_from;
41
+ struct tr_range *to;
42
+ int n_to;
43
+ };
44
+
45
+ static uint32_t
46
+ tr_trans_replace_exclude(UNUSED(uint32_t c), void *closure)
47
+ {
48
+ return *((uint32_t *)closure);
49
+ }
50
+
51
+ static int
52
+ tr_trans_replace_include_offset_of(struct tr_range *ranges, int range, uint32_t c)
53
+ {
54
+ int offset = 0;
55
+
56
+ for (int i = 0; i < range; i++)
57
+ offset += ranges[i].end - ranges[i].begin + 1;
58
+ offset += c - ranges[range].begin;
59
+
60
+ return offset;
61
+ }
62
+
63
+ static int
64
+ tr_trans_replace_include_find_from_range(struct tr_trans_closure *closure, uint32_t c)
65
+ {
66
+ for (int i = closure->n_from - 1; i >= 0; i--)
67
+ if (closure->from[i].begin <= c && c <= closure->from[i].end)
68
+ return i;
69
+
70
+ return -1;
71
+ }
72
+
73
+ static uint32_t
74
+ tr_trans_replace_include_find_to_u_char(struct tr_trans_closure *closure, int offset)
75
+ {
76
+ for (int i = 0, seen = 0; i < closure->n_to; i++) {
77
+ int size = closure->to[i].end - closure->to[i].begin + 1;
78
+ if (seen + size > offset)
79
+ return closure->to[i].begin + (offset - seen);
80
+ seen += size;
81
+ }
82
+
83
+ return closure->to[closure->n_to - 1].end;
84
+ }
85
+
86
+ static uint32_t
87
+ tr_trans_replace_include(uint32_t c, void *v_closure)
88
+ {
89
+ struct tr_trans_closure *closure = (struct tr_trans_closure *)v_closure;
90
+
91
+ int from = tr_trans_replace_include_find_from_range(closure, c);
92
+ if (from == -1)
93
+ return closure->to[closure->n_to - 1].end;
94
+
95
+ int offset = tr_trans_replace_include_offset_of(closure->from, from, c);
96
+
97
+ return tr_trans_replace_include_find_to_u_char(closure, offset);
98
+ }
99
+
100
+ static void
101
+ tr_trans_real_squeeze(const char *str, const char *end,
102
+ struct tr_table *translation,
103
+ uint32_t replace(uint32_t, void *), void *closure,
104
+ struct output *output, bool *modified)
105
+ {
106
+ size_t n = output->n;
107
+ const char *p = str;
108
+ uint32_t prev_c = U_N_CODEPOINTS;
109
+ while (p < end) {
110
+ const char *prev = p;
111
+ uint32_t c0 = u_decode(&p, p, end);
112
+ if (tr_table_lookup(translation, c0)) {
113
+ uint32_t c = replace(c0, closure);
114
+ if (prev_c == c)
115
+ continue;
116
+ prev_c = c;
117
+ output_char(output, c);
118
+ if (c != c0)
119
+ *modified = true;
120
+ } else {
121
+ output_string(output, prev, p - prev);
122
+ prev_c = U_N_CODEPOINTS;
123
+ }
124
+ }
125
+
126
+ if ((size_t)(end - str) > (output->n - n))
127
+ *modified = true;
128
+ }
129
+
130
+ static void
131
+ tr_trans_real_standard(const char *str, const char *end,
132
+ struct tr_table *translation,
133
+ uint32_t replace(uint32_t, void *), void *closure,
134
+ struct output *output, bool *modified)
135
+ {
136
+ const char *p = str;
137
+
138
+ while (p < end) {
139
+ const char *prev = p;
140
+ uint32_t c = u_decode(&p, p, end);
141
+ if (tr_table_lookup(translation, c)) {
142
+ uint32_t replacement = replace(c, closure);
143
+ output_char(output, replacement);
144
+ if (replacement != c)
145
+ *modified = true;
146
+ } else
147
+ output_string(output, prev, p - prev);
148
+ }
149
+ }
150
+
151
+ static void
152
+ tr_trans_real(const char *str, const char *end,
153
+ struct tr_table *translation,
154
+ uint32_t replace(uint32_t, void *), void *closure, bool squeeze,
155
+ struct output *output, bool *modified)
156
+ {
157
+ if (squeeze)
158
+ tr_trans_real_squeeze(str, end,
159
+ translation,
160
+ replace, closure,
161
+ output, modified);
162
+ else
163
+ tr_trans_real_standard(str, end,
164
+ translation,
165
+ replace, closure,
166
+ output, modified);
167
+ }
168
+
169
+ static VALUE
170
+ tr_trans_do(VALUE self, struct tr_table *translation,
171
+ uint32_t (*replace)(uint32_t, void *), void *closure, bool squeeze)
172
+ {
173
+ const struct rb_u_string *string = RVAL2USTRING(self);
174
+
175
+ const char *begin = USTRING_STR(string);
176
+ const char *end = USTRING_END(string);
177
+ bool modified = false;
178
+ struct output output = OUTPUT_INIT(NULL, 0);
179
+ tr_trans_real(begin, end,
180
+ translation,
181
+ replace, closure, squeeze,
182
+ &output, &modified);
183
+ if (!modified)
184
+ return self;
185
+ output.result = ALLOC_N(char, output.n + 1);
186
+ output.m = output.n + 1;
187
+ output.n = 0;
188
+ tr_trans_real(begin, end,
189
+ translation,
190
+ replace, closure, squeeze,
191
+ &output, &modified);
192
+ output_finalize(&output);
193
+
194
+ return rb_u_string_new_c_own(self, output.result, output.n);
195
+ }
196
+
197
+ static VALUE
198
+ tr_trans(VALUE self, VALUE rbfrom, VALUE rbto, bool squeeze)
199
+ {
200
+ const struct rb_u_string *string = RVAL2USTRING(self);
201
+ const struct rb_u_string *from = RVAL2USTRING_ANY(rbfrom);
202
+ const struct rb_u_string *to = RVAL2USTRING_ANY(rbto);
203
+
204
+ if (USTRING_STR(string) == NULL || USTRING_LENGTH(string) == 0)
205
+ return self;
206
+
207
+ if (USTRING_LENGTH(to) == 0)
208
+ return rb_u_string_delete(1, &rbfrom, self);
209
+
210
+ struct tr tr_from;
211
+ tr_init(&tr_from, USTRING_STR(from), USTRING_END(from));
212
+
213
+ struct tr tr_to;
214
+ tr_init(&tr_to, USTRING_STR(to), USTRING_END(to));
215
+
216
+ struct tr_table translation;
217
+ tr_table_initialize(&translation, rbfrom);
218
+
219
+ if (tr_should_exclude(&tr_from)) {
220
+ /* This case is easy. Just include everything by default and
221
+ * exclude the rest as always. Replace characters found by the
222
+ * last character found in tr_to. */
223
+ while (tr_next(&tr_to) != TR_FINISHED)
224
+ ; /* We just need the last replacement character. */
225
+ return tr_trans_do(self, &translation, tr_trans_replace_exclude,
226
+ &tr_to.now, squeeze);
227
+ }
228
+
229
+ /* This case is hard. We need a full-fledged lookup of what character
230
+ * to translate to, not simply a check whether to include it or not. */
231
+ struct tr_trans_closure trans_closure;
232
+
233
+ struct tr_range from_ranges[u_n_chars_n(USTRING_STR(from), USTRING_LENGTH(from))];
234
+ trans_closure.from = from_ranges;
235
+ trans_closure.n_from = tr_ranges_setup(&tr_from, from_ranges);
236
+
237
+ struct tr_range to_ranges[u_n_chars_n(USTRING_STR(to), USTRING_LENGTH(to))];
238
+ trans_closure.to = to_ranges;
239
+ trans_closure.n_to = tr_ranges_setup(&tr_to, to_ranges);
240
+
241
+ return tr_trans_do(self, &translation, tr_trans_replace_include,
242
+ &trans_closure, squeeze);
243
+ }
244
+
245
+ /* @overload tr(from, to)
246
+ *
247
+ * Returns the receiver, translating characters in FROM to their equivalent
248
+ * character, by index, in TO, inheriting any taint and untrust. If
249
+ * TO{#length} < FROM{#length}, TO[-1] will be used for any index _i_ ≥
250
+ * TO{#length}.
251
+ *
252
+ * The complement of all Unicode characters and a given set of characters may
253
+ * be specified by prefixing a non-empty set with ‘`^`’ (U+005E CIRCUMFLEX
254
+ * ACCENT).
255
+ *
256
+ * Any sequence of characters _a_-_b_ inside a set will expand to also
257
+ * include all characters whose code points lie between those of _a_ and _b_.
258
+ *
259
+ * @param [#to_str] from
260
+ * @param [#to_str] to
261
+ * @return [U::String] */
262
+ VALUE
263
+ rb_u_string_tr(VALUE self, VALUE from, VALUE to)
264
+ {
265
+ return tr_trans(self, from, to, false);
266
+ }
267
+
268
+ /* @overload tr_s(from, to)
269
+ *
270
+ * Returns the receiver, translating characters in FROM to their equivalent
271
+ * character, by index, in TO and then squeezing any substrings of
272
+ * {#length} > 1 consisting of the same character _c_ into _c_, inheriting
273
+ * any taint and untrust. If TO{#length} < FROM{#length}, TO[-1] will be
274
+ * used for any index _i_ ≥ TO{#length}.
275
+ *
276
+ * The complement of all Unicode characters and a given set of characters may
277
+ * be specified by prefixing a non-empty set with ‘`^`’ (U+005E CIRCUMFLEX
278
+ * ACCENT).
279
+ *
280
+ * Any sequence of characters _a_-_b_ inside a set will expand to also
281
+ * include all characters whose code points lie between those of _a_ and _b_.
282
+ *
283
+ * @param [#to_str] from
284
+ * @param [#to_str] to
285
+ * @return [U::String] */
286
+ VALUE
287
+ rb_u_string_tr_s(VALUE self, VALUE from, VALUE to)
288
+ {
289
+ return tr_trans(self, from, to, true);
290
+ }
@@ -0,0 +1,12 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload upcase(locale = ENV['LC_CTYPE'])
4
+ * @param [#to_str] locale
5
+ * @return [U::String] The upcasing of the receiver according to the rules of
6
+ * the language of LOCALE, which may be empty to specifically use the
7
+ * default, language-independent, rules, inheriting any taint and untrust */
8
+ VALUE
9
+ rb_u_string_upcase(int argc, VALUE *argv, VALUE self)
10
+ {
11
+ return _rb_u_string_convert_locale(argc, argv, self, u_upcase, NULL);
12
+ }
@@ -0,0 +1,13 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload upper?(locale = ENV['LC_CTYPE'])
4
+ * @param [#to_str] locale
5
+ * @return [Boolean] True if the receiver has been upcased according to the
6
+ * rules of the language of LOCALE, which may be empty to specifically use
7
+ * the default, language-independent, rules, that is, if _a_ =
8
+ * _a_{#upcase}(LOCALE), where _a_ = {#normalize}(`:nfd`) */
9
+ VALUE
10
+ rb_u_string_upper(int argc, VALUE *argv, VALUE self)
11
+ {
12
+ return _rb_u_string_test_locale(argc, argv, self, u_upcase);
13
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload valid?
4
+ * @return [Boolean] True if the receiver contains only valid Unicode
5
+ * characters */
6
+ VALUE
7
+ rb_u_string_valid(VALUE self)
8
+ {
9
+ return _rb_u_character_test(self, u_char_isvalid);
10
+ }
@@ -0,0 +1,12 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload valid_encoding?
4
+ * @return [Boolean] True if the receiver contains only valid UTF-8
5
+ * sequences */
6
+ VALUE
7
+ rb_u_string_valid_encoding(VALUE self)
8
+ {
9
+ const struct rb_u_string *string = RVAL2USTRING(self);
10
+
11
+ return u_valid(USTRING_STR(string), USTRING_LENGTH(string), NULL) ? Qtrue : Qfalse;
12
+ }
@@ -0,0 +1,21 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload wide?
4
+ *
5
+ * Returns true if the receiver contains only “wide” characters. Wide
6
+ * characters are those that have their East_Asian_Width property set to Wide
7
+ * or Fullwidth.
8
+ *
9
+ * This is mostly useful for determining how many “cells” a character will
10
+ * take up on a terminal or similar cell-based display.
11
+ *
12
+ * @return [Boolean]
13
+ * @see http://www.unicode.org/reports/tr11/
14
+ * Unicode Standard Annex #11: East Asian Width
15
+ * @see #wide_cjk?
16
+ * @see #width */
17
+ VALUE
18
+ rb_u_string_wide(VALUE self)
19
+ {
20
+ return _rb_u_character_test(self, u_char_iswide);
21
+ }
@@ -0,0 +1,21 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload wide_cjk?
4
+ *
5
+ * Returns true if the receiver contains only “wide” and “ambiguously wide”
6
+ * characters. Wide and ambiguously wide characters are those that have their
7
+ * East_Asian_Width property set to Ambiguous, Wide or Fullwidth.
8
+ *
9
+ * This is mostly useful for determining how many “cells” a character will
10
+ * take up on a terminal or similar cell-based display.
11
+ *
12
+ * @return [Boolean]
13
+ * @see http://www.unicode.org/reports/tr11/
14
+ * Unicode Standard Annex #11: East Asian Width
15
+ * @see #wide?
16
+ * @see #width */
17
+ VALUE
18
+ rb_u_string_wide_cjk(VALUE self)
19
+ {
20
+ return _rb_u_character_test(self, u_char_iswide_cjk);
21
+ }
@@ -0,0 +1,19 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* Returns the width of the receiver. The width is defined as the sum of the
4
+ * number of “cells” on a terminal or similar cell-based display that the
5
+ * characters in the string will require.
6
+ *
7
+ * Characters that are {#wide?} have a width of 2. Characters that are
8
+ * {#zero_width?} have a width of 0. Other characters have a width of 1.
9
+ *
10
+ * @return [Integer]
11
+ * @see http://www.unicode.org/reports/tr11/
12
+ * Unicode Standard Annex #11: East Asian Width */
13
+ VALUE
14
+ rb_u_string_width(VALUE self)
15
+ {
16
+ const struct rb_u_string *string = RVAL2USTRING(self);
17
+
18
+ return UINT2NUM(u_width_n(USTRING_STR(string), USTRING_LENGTH(string)));
19
+ }
@@ -0,0 +1,63 @@
1
+ #include "rb_includes.h"
2
+
3
+ #define BREAK2ID(value, symbol) \
4
+ case U_WORD_BREAK_##value: { \
5
+ static ID id_##symbol; \
6
+ if (id_##symbol == 0) \
7
+ id_##symbol = rb_intern(#symbol); \
8
+ return ID2SYM(id_##symbol); \
9
+ }
10
+
11
+ static VALUE
12
+ break_to_symbol(enum u_word_break value)
13
+ {
14
+ switch (value) {
15
+ BREAK2ID(ALETTER, aletter);
16
+ BREAK2ID(CR, cr);
17
+ BREAK2ID(EXTEND, extend);
18
+ BREAK2ID(EXTENDNUMLET, extendnumlet);
19
+ BREAK2ID(FORMAT, format);
20
+ BREAK2ID(KATAKANA, katakana);
21
+ BREAK2ID(LF, lf);
22
+ BREAK2ID(MIDLETTER, midletter);
23
+ BREAK2ID(MIDNUM, midnum);
24
+ BREAK2ID(MIDNUMLET, midnumlet);
25
+ BREAK2ID(NEWLINE, newline);
26
+ BREAK2ID(NUMERIC, numeric);
27
+ BREAK2ID(OTHER, other);
28
+ BREAK2ID(REGIONAL_INDICATOR, regional_indicator);
29
+ default:
30
+ rb_u_raise(rb_eNotImpError, "unknown word break value: %d", value);
31
+ }
32
+ }
33
+
34
+ /* Returns the word break property value of the characters of the receiver.
35
+ *
36
+ * The possible word break values are
37
+ *
38
+ * * :aletter
39
+ * * :cr
40
+ * * :extend
41
+ * * :extendnumlet
42
+ * * :format
43
+ * * :katakana
44
+ * * :lf
45
+ * * :midletter
46
+ * * :midnum
47
+ * * :midnumlet
48
+ * * :newline
49
+ * * :numeric
50
+ * * :other
51
+ * * :regional_indicator
52
+ *
53
+ * @raise [ArgumentError] If the string consists of more than one break type
54
+ * @return [Symbol]
55
+ * @see http://www.unicode.org/reports/tr29/
56
+ * Unicode Standard Annex #29: Unicode Text Segmentation */
57
+ VALUE
58
+ rb_u_string_word_break(VALUE self)
59
+ {
60
+ return _rb_u_string_property(self, "word break", U_WORD_BREAK_OTHER,
61
+ (int (*)(uint32_t))u_char_word_break,
62
+ (VALUE (*)(int))break_to_symbol);
63
+ }