u 0.5.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,189 @@
1
+ #include "rb_includes.h"
2
+
3
+ static void
4
+ rb_u_string_inspect_bad_input(const char *p, const char *q, VALUE result)
5
+ {
6
+ while (p < q) {
7
+ char hex[5];
8
+ snprintf(hex, lengthof(hex), "\\x%02X", *p & 0xff);
9
+ rb_str_buf_cat2(result, hex);
10
+ p++;
11
+ }
12
+ }
13
+
14
+ static void
15
+ rb_u_string_inspect_special_char(uint32_t c, VALUE result)
16
+ {
17
+ char str[U_CHAR_MAX_BYTE_LENGTH];
18
+
19
+ rb_str_buf_cat2(result, "\\");
20
+ rb_str_buf_cat(result, str, u_char_to_u(c, str));
21
+ }
22
+
23
+ static void
24
+ rb_u_string_inspect_escaped_char(uint32_t c, VALUE result)
25
+ {
26
+ char str[4 + 8 + 1];
27
+
28
+ if (c < 0x10000)
29
+ snprintf(str, lengthof(str), "\\u%04X", c);
30
+ else
31
+ snprintf(str, lengthof(str), "\\u{%X}", c & 0xffffffff);
32
+
33
+ rb_str_buf_cat2(result, str);
34
+ }
35
+
36
+ static void
37
+ rb_u_string_inspect_default(uint32_t c, VALUE result)
38
+ {
39
+ if (!u_char_isprint(c)) {
40
+ rb_u_string_inspect_escaped_char(c, result);
41
+ return;
42
+ }
43
+
44
+ char str[U_CHAR_MAX_BYTE_LENGTH];
45
+ rb_str_buf_cat(result, str, u_char_to_u(c, str));
46
+ }
47
+
48
+ #define REPLACEMENT_CHARACTER ((uint32_t)0xfffd)
49
+
50
+ static const char *
51
+ rb_u_string_inspect_hash_char(const char *q, const char *end,
52
+ VALUE result)
53
+ {
54
+ if (q == end) {
55
+ rb_str_buf_cat2(result, "#");
56
+ return q;
57
+ }
58
+
59
+ const char *p = q;
60
+ uint32_t c = u_decode(&q, p, end);
61
+ switch (c) {
62
+ case REPLACEMENT_CHARACTER:
63
+ rb_str_buf_cat2(result, "#");
64
+ if (!u_valid(p, q - p, NULL))
65
+ rb_u_string_inspect_bad_input(p, q, result);
66
+ else
67
+ rb_u_string_inspect_default(c, result);
68
+ return q;
69
+ case '$':
70
+ case '@':
71
+ case '{':
72
+ rb_str_buf_cat2(result, "\\#");
73
+ rb_u_string_inspect_default(c, result);
74
+ return q;
75
+ default:
76
+ rb_str_buf_cat2(result, "#");
77
+ return p;
78
+ }
79
+ }
80
+
81
+ /* Returns the receiver in a reader-friendly inspectable format, inheriting
82
+ * any taint and untrust, encoded using UTF-8.
83
+ *
84
+ * The reader-friendly inspectable format looks like “`"…".u`”. Inside the
85
+ * “…”, any {#print?} characters are output as-is, the following special
86
+ * characters are escaped according to the following table:
87
+ *
88
+ * <table>
89
+ * <thead><tr><th>Character</th><th>Dumped Sequence</th></tr></thead>
90
+ * <tbody>
91
+ * <tr><td>U+0022 QUOTATION MARK</td><td><code>\"</code></td></tr>
92
+ * <tr><td>U+005C REVERSE SOLIDUS</td><td><code>\\</code></td></tr>
93
+ * <tr><td>U+000A LINE FEED (LF)</td><td><code>\n</code></td></tr>
94
+ * <tr><td>U+000D CARRIAGE RETURN (CR)</td><td><code>\r</code></td></tr>
95
+ * <tr><td>U+0009 CHARACTER TABULATION</td><td><code>\t</code></td></tr>
96
+ * <tr><td>U+000C FORM FEED (FF)</td><td><code>\f</code></td></tr>
97
+ * <tr><td>U+000B LINE TABULATION</td><td><code>\v</code></td></tr>
98
+ * <tr><td>U+0008 BACKSPACE</td><td><code>\b</code></td></tr>
99
+ * <tr><td>U+0007 BELL</td><td><code>\a</code></td></tr>
100
+ * <tr><td>U+001B ESCAPE</td><td><code>\e</code></td></tr>
101
+ * </tbody>
102
+ * </table>
103
+ *
104
+ * the following special sequences are also escaped:
105
+ *
106
+ * <table>
107
+ * <thead><tr><th>Character</th><th>Dumped Sequence</th></tr></thead>
108
+ * <tbody>
109
+ * <tr><td><code>#$</code></td><td><code>\#$</code></td></tr>
110
+ * <tr><td><code>#@</code></td><td><code>\#@</code></td></tr>
111
+ * <tr><td><code>#{</code></td><td><code>\#{</code></td></tr>
112
+ * </tbody>
113
+ * </table>
114
+ *
115
+ * Valid UTF-8 byte sequences representing code points < 0x10000 are output as
116
+ * `\u`_n_, where _n_ is the four-digit uppercase hexadecimal representation
117
+ * of the code point.
118
+ *
119
+ * Valid UTF-8 byte sequences representing code points ≥ 0x10000 are output as
120
+ * `\u{`_n_`}`, where _n_ is the uppercase hexadecimal representation of the
121
+ * code point.
122
+ *
123
+ * Any other byte is output as `\x`_n_, where _n_ is the two-digit uppercase
124
+ * hexadecimal representation of the byte’s value.
125
+ *
126
+ * @return [String] */
127
+ VALUE
128
+ rb_u_string_inspect(VALUE self)
129
+ {
130
+ const struct rb_u_string *string = RVAL2USTRING(self);
131
+
132
+ VALUE result = rb_u_str_buf_new(0);
133
+ rb_str_buf_cat2(result, "\"");
134
+ const char *p = USTRING_STR(string);
135
+ const char *end = USTRING_END(string);
136
+ while (p < end) {
137
+ const char *q;
138
+ uint32_t c = u_decode(&q, p, end);
139
+ switch (c) {
140
+ case '"':
141
+ case '\\':
142
+ rb_u_string_inspect_special_char(c, result);
143
+ break;
144
+ case '#':
145
+ p = rb_u_string_inspect_hash_char(q, end, result);
146
+ continue;
147
+ case '\n':
148
+ rb_str_buf_cat2(result, "\\n");
149
+ break;
150
+ case '\r':
151
+ rb_str_buf_cat2(result, "\\r");
152
+ break;
153
+ case '\t':
154
+ rb_str_buf_cat2(result, "\\t");
155
+ break;
156
+ case '\f':
157
+ rb_str_buf_cat2(result, "\\f");
158
+ break;
159
+ case '\013':
160
+ rb_str_buf_cat2(result, "\\v");
161
+ break;
162
+ case '\010':
163
+ rb_str_buf_cat2(result, "\\b");
164
+ break;
165
+ case '\007':
166
+ rb_str_buf_cat2(result, "\\a");
167
+ break;
168
+ case '\033':
169
+ rb_str_buf_cat2(result, "\\e");
170
+ break;
171
+ case REPLACEMENT_CHARACTER:
172
+ if (!u_valid(p, q - p, NULL)) {
173
+ rb_u_string_inspect_bad_input(p, q, result);
174
+ break;
175
+ }
176
+ /* fall through */
177
+ default:
178
+ rb_u_string_inspect_default(c, result);
179
+ break;
180
+ }
181
+ p = q;
182
+ }
183
+
184
+ rb_str_buf_cat2(result, "\".u");
185
+
186
+ OBJ_INFECT(result, self);
187
+
188
+ return result;
189
+ }
@@ -0,0 +1,148 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_internal_tr.h"
3
+
4
+ void
5
+ tr_init(struct tr *tr, const char *p, const char *end)
6
+ {
7
+ tr->p = p;
8
+ tr->end = end;
9
+ tr->inside_range = false;
10
+ }
11
+
12
+ bool
13
+ tr_should_exclude(struct tr *tr)
14
+ {
15
+ if (tr->p + 1 < tr->end && *tr->p == '^') {
16
+ tr->p++;
17
+ return true;
18
+ }
19
+
20
+ return false;
21
+ }
22
+
23
+ static enum tr_state
24
+ tr_next_char(struct tr *t)
25
+ {
26
+ if (t->p == t->end)
27
+ return TR_FINISHED;
28
+
29
+ t->now = u_decode(&t->p, t->p, t->end);
30
+ if (t->p == t->end)
31
+ return TR_FOUND;
32
+ if (t->now == '\\') {
33
+ t->now = u_decode(&t->p, t->p, t->end);
34
+ if (t->p == t->end)
35
+ return TR_FOUND;
36
+ }
37
+
38
+ const char *next;
39
+ if (u_decode(&next, t->p, t->end) == '-') {
40
+ /* TODO: Make this simpler. Perhaps we don’t need
41
+ * TR_READ_ANOTHER, as we advance it here ourselves. I got to
42
+ * check the offsets here. Perhaps TR_READ_ANOTHER should also
43
+ * have advanced t->p one more step. */
44
+ if (next < t->end) {
45
+ uint32_t max = u_decode(&t->p, next, t->end);
46
+ if (max < t->now) {
47
+ t->p = next;
48
+ return TR_READ_ANOTHER;
49
+ }
50
+ t->inside_range = true;
51
+ t->max = max;
52
+ }
53
+ }
54
+
55
+ return TR_FOUND;
56
+ }
57
+
58
+ enum tr_state
59
+ tr_next(struct tr *t)
60
+ {
61
+ while (true) {
62
+ if (!t->inside_range) {
63
+ enum tr_state state;
64
+
65
+ if ((state = tr_next_char(t)) == TR_READ_ANOTHER)
66
+ continue;
67
+
68
+ return state;
69
+ } else if (++t->now < t->max) {
70
+ return TR_FOUND;
71
+ } else {
72
+ t->inside_range = false;
73
+ return TR_FOUND;
74
+ }
75
+ }
76
+ }
77
+
78
+ static void
79
+ tr_table_set(struct tr_table *table, bool *buffer, uint32_t c, bool value)
80
+ {
81
+ if (c < lengthof(table->continuous)) {
82
+ buffer[c] = value;
83
+ return;
84
+ }
85
+
86
+ if (NIL_P(table->sparse))
87
+ table->sparse = rb_hash_new();
88
+
89
+ rb_hash_aset(table->sparse, UINT2NUM(c), value ? Qtrue : Qfalse);
90
+ }
91
+
92
+ static void
93
+ tr_table_add(struct tr_table *table, const struct rb_u_string *string)
94
+ {
95
+ struct tr tr;
96
+ tr_init(&tr, USTRING_STR(string), USTRING_END(string));
97
+
98
+ bool exclude = tr_should_exclude(&tr);
99
+
100
+ bool buffer[lengthof(table->continuous)];
101
+
102
+ for (size_t i = 0; i < lengthof(buffer); i++)
103
+ buffer[i] = exclude;
104
+
105
+ while (tr_next(&tr) != TR_FINISHED)
106
+ tr_table_set(table, buffer, tr.now, !exclude);
107
+
108
+ for (size_t i = 0; i < lengthof(table->continuous); i++)
109
+ table->continuous[i] = table->continuous[i] && buffer[i];
110
+ }
111
+
112
+ void
113
+ tr_table_initialize(struct tr_table *table, VALUE rbstring)
114
+ {
115
+ const struct rb_u_string *string = RVAL2USTRING_ANY(rbstring);
116
+
117
+ struct tr tr;
118
+ tr_init(&tr, USTRING_STR(string), USTRING_END(string));
119
+
120
+ table->exclude = tr_should_exclude(&tr);
121
+
122
+ for (size_t i = 0; i < lengthof(table->continuous); i++)
123
+ table->continuous[i] = true;
124
+
125
+ table->sparse = Qnil;
126
+
127
+ tr_table_add(table, string);
128
+ }
129
+
130
+ void
131
+ tr_table_initialize_from_strings(struct tr_table *table, int argc, VALUE *argv)
132
+ {
133
+ tr_table_initialize(table, argv[0]);
134
+ for (int i = 1; i < argc; i++)
135
+ tr_table_add(table, RVAL2USTRING_ANY(argv[i]));
136
+ }
137
+
138
+ bool
139
+ tr_table_lookup(struct tr_table *table, uint32_t c)
140
+ {
141
+ if (c < lengthof(table->continuous))
142
+ return table->continuous[c];
143
+
144
+ VALUE value = NIL_P(table->sparse) ?
145
+ Qnil : rb_hash_lookup(table->sparse, UINT2NUM(c));
146
+
147
+ return NIL_P(value) ? table->exclude : RTEST(value);
148
+ }
@@ -0,0 +1,29 @@
1
/* Iteration state over a tr-style character specification. */
struct tr {
        bool inside_range;      /* Currently enumerating a `lower-upper` range */
        uint32_t now;           /* Character most recently produced */
        uint32_t max;           /* Upper bound of the range being enumerated */
        const char *p;          /* Next unread byte of the specification */
        const char *end;        /* One past the last byte of the specification */
};
8
+
9
/* Result of advancing a struct tr. */
enum tr_state {
        TR_FOUND,               /* A character was produced */
        TR_READ_ANOTHER,        /* Nothing produced; read again */
        TR_FINISHED             /* The specification is exhausted */
};
15
+
16
+ struct tr_table {
17
+ bool exclude;
18
+ bool continuous[256];
19
+ VALUE sparse;
20
+ };
21
+
22
+ void tr_init(struct tr *tr, const char *p, const char *end);
23
+ bool tr_should_exclude(struct tr *tr);
24
+ enum tr_state tr_next(struct tr *t);
25
+ void tr_table_initialize(struct tr_table *table, VALUE rbstring);
26
+ void tr_table_initialize_from_strings(struct tr_table *table,
27
+ int argc,
28
+ VALUE *argv);
29
+ bool tr_table_lookup(struct tr_table *table, uint32_t c);
@@ -0,0 +1,169 @@
1
+ #include "rb_includes.h"
2
+
3
/* Writes padding that is N columns wide to P and returns the position just
 * after the last byte written.
 *
 * PADDING is copied in full as many times as it fits into N (each copy being
 * PADDING_WIDTH columns wide); the remaining N % PADDING_WIDTH columns are
 * filled with leading characters of PADDING.  The number of bytes written
 * therefore matches what rb_u_string_justified_size() reserves for one side,
 * which is what keeps the caller’s buffer fully initialized.
 *
 * Whole repetitions must all be handled by the first loop: the second loop
 * stops as soon as the requested width has been reached and would drop any
 * trailing zero-width characters of PADDING if it were asked to produce a
 * full repetition. */
static char *
rb_u_string_justify_one_side(char *p, const struct rb_u_string *padding, long padding_width, long n)
{
        const char *padding_str = USTRING_STR(padding);
        long padding_size = USTRING_LENGTH(padding);

        long i = 0;

        /* Written as a subtraction so that the comparison cannot overflow. */
        for ( ; n - i >= padding_width; i += padding_width, p += padding_size)
                memcpy(p, padding_str, padding_size);

        /* Round-off: take characters from the start of PADDING until the
         * remaining columns are covered, never reading beyond PADDING. */
        const char *q = padding_str;
        const char *end = padding_str + padding_size;
        while (i < n && q < end)
                i += u_char_width(u_decode(&q, q, end));
        memcpy(p, padding_str, q - padding_str);
        p += q - padding_str;

        return p;
}
23
+
24
+ static long
25
+ rounding_size(const struct rb_u_string *padding, long padding_width, long n)
26
+ {
27
+ const char *padding_str = USTRING_STR(padding);
28
+ const char *q = padding_str, *end = padding_str + USTRING_LENGTH(padding);
29
+ long r = n % padding_width;
30
+ long i = 0;
31
+ while (i < r && q < end)
32
+ i += u_char_width(u_decode(&q, q, end));
33
+ // NOTE I think i ≮ r is guaranteed, but I can’t seem to prove it, so
34
+ // leave this in for safety.
35
+ if (i < r)
36
+ rb_u_raise(rb_eArgError,
37
+ "padding isn’t wide enough to complete rounding (%ld < %ld)",
38
+ i, r);
39
+ if (i > r)
40
+ rb_u_raise(rb_eArgError,
41
+ "padding is too wide to complete rounding (%ld > %ld)",
42
+ i, r);
43
+ return q - padding_str;
44
+ }
45
+
46
+ static long
47
+ rb_u_string_justified_size(long string_size,
48
+ const struct rb_u_string *padding, long padding_width,
49
+ long left_n, long right_n)
50
+ {
51
+ long size;
52
+
53
+ long left_n_2 = rounding_size(padding, padding_width, left_n);
54
+ long right_n_2 = rounding_size(padding, padding_width, right_n);
55
+ if ((size = left_n / padding_width + right_n / padding_width) >= LONG_MAX / USTRING_LENGTH(padding) ||
56
+ (size *= USTRING_LENGTH(padding)) >= LONG_MAX - left_n_2 - right_n_2 ||
57
+ (size += left_n_2 + right_n_2) >= LONG_MAX - string_size)
58
+ rb_u_raise(rb_eArgError, "argument too big");
59
+ size += string_size;
60
+
61
+ return size;
62
+ }
63
+
64
+ static VALUE
65
+ rb_u_string_justify_impl(VALUE self,
66
+ const struct rb_u_string *string, long string_width,
67
+ const struct rb_u_string *padding, long padding_width,
68
+ long width, char jflag)
69
+ {
70
+ long n = width - string_width;
71
+ long left_n = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n / 2);
72
+ long right_n = n - left_n;
73
+
74
+ long string_size = USTRING_LENGTH(string);
75
+ long justified_size = rb_u_string_justified_size(string_size,
76
+ padding, padding_width,
77
+ left_n, right_n);
78
+ char *justified = ALLOC_N(char, justified_size + 1);
79
+
80
+ char *p = rb_u_string_justify_one_side(justified, padding, padding_width, left_n);
81
+ memcpy(p, USTRING_STR(string), string_size);
82
+ p += string_size;
83
+ p = rb_u_string_justify_one_side(p, padding, padding_width, right_n);
84
+ justified[justified_size] = '\0';
85
+
86
+ return rb_u_string_new_c_own(self, justified, justified_size);
87
+ }
88
+
89
+ static VALUE
90
+ rb_u_string_justify(int argc, VALUE *argv, VALUE self, char jflag)
91
+ {
92
+ const struct rb_u_string *string = RVAL2USTRING(self);
93
+
94
+ VALUE rbwidth, rbpadding;
95
+ const struct rb_u_string *padding = USTRING_LOCAL(Qnil, " ", 1);
96
+ long padding_width = 1;
97
+ if (rb_scan_args(argc, argv, "11", &rbwidth, &rbpadding) == 2) {
98
+ padding = RVAL2USTRING_ANY(rbpadding);
99
+ padding_width = u_width_n(USTRING_STR(padding), USTRING_LENGTH(padding));
100
+ if (padding_width == 0)
101
+ rb_u_raise(rb_eArgError, "zero-width padding");
102
+ }
103
+
104
+ long string_width = u_width_n(USTRING_STR(string), USTRING_LENGTH(string));
105
+
106
+ long width = NUM2LONG(rbwidth);
107
+ if (width < 0 || string_width >= width)
108
+ return self;
109
+
110
+ VALUE result = rb_u_string_justify_impl(self,
111
+ string, string_width,
112
+ padding, padding_width,
113
+ width, jflag);
114
+ if (!NIL_P(rbpadding))
115
+ OBJ_INFECT(result, rbpadding);
116
+ return result;
117
+ }
118
+
119
+ /* @overload center(width, padding = ' ')
120
+ * @param [#to_int] width
121
+ * @param [U::String, #to_str] padding
122
+ * @raise [ArgumentError] If PADDING{#width} = 0
123
+ * @raise [ArgumentError] If characters inside PADDING that should be used
124
+ * for round-off padding are too wide
125
+ * @return [U::String] The receiver padded as evenly as possible on both
126
+ * sides with PADDING to make it max({#length}, WIDTH) wide, inheriting any
127
+ * taint and untrust from the receiver and also from PADDING if PADDING is
128
+ * used
129
+ * @see #ljust
130
+ * @see #rjust */
131
+ VALUE
132
+ rb_u_string_center(int argc, VALUE *argv, VALUE self)
133
+ {
134
+ return rb_u_string_justify(argc, argv, self, 'c');
135
+ }
136
+
137
+ /* @overload ljust(width, padding = ' ')
138
+ * @param [#to_int] width
139
+ * @param [U::String, #to_str] padding
140
+ * @raise [ArgumentError] If PADDING{#width} = 0
141
+ * @raise [ArgumentError] If characters inside PADDING that should be used
142
+ * for round-off padding are too wide
143
+ * @return [U::String] The receiver padded on the right with PADDING to make
144
+ * it max({#length}, WIDTH) wide, inheriting any taint and untrust from
145
+ * the receiver and also from PADDING if PADDING is used
146
+ * @see #center
147
+ * @see #rjust */
148
+ VALUE
149
+ rb_u_string_ljust(int argc, VALUE *argv, VALUE self)
150
+ {
151
+ return rb_u_string_justify(argc, argv, self, 'l');
152
+ }
153
+
154
+ /* @overload rjust(width, padding = ' ')
155
+ * @param [#to_int] width
156
+ * @param [U::String, #to_str] padding
157
+ * @raise [ArgumentError] If PADDING{#width} = 0
158
+ * @raise [ArgumentError] If characters inside PADDING that should be used
159
+ * for round-off padding are too wide
160
+ * @return [U::String] The receiver padded on the left with PADDING to make
161
+ * it max({#length}, WIDTH) wide, inheriting any taint and untrust from the
162
+ * receiver and also from PADDING if PADDING is used
163
+ * @see #center
164
+ * @see #ljust */
165
+ VALUE
166
+ rb_u_string_rjust(int argc, VALUE *argv, VALUE self)
167
+ {
168
+ return rb_u_string_justify(argc, argv, self, 'r');
169
+ }