u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,189 @@
1
+ #include "rb_includes.h"
2
+
3
+ static void
4
+ rb_u_string_inspect_bad_input(const char *p, const char *q, VALUE result)
5
+ {
6
+ while (p < q) {
7
+ char hex[5];
8
+ snprintf(hex, lengthof(hex), "\\x%02X", *p & 0xff);
9
+ rb_str_buf_cat2(result, hex);
10
+ p++;
11
+ }
12
+ }
13
+
14
+ static void
15
+ rb_u_string_inspect_special_char(uint32_t c, VALUE result)
16
+ {
17
+ char str[U_CHAR_MAX_BYTE_LENGTH];
18
+
19
+ rb_str_buf_cat2(result, "\\");
20
+ rb_str_buf_cat(result, str, u_char_to_u(c, str));
21
+ }
22
+
23
+ static void
24
+ rb_u_string_inspect_escaped_char(uint32_t c, VALUE result)
25
+ {
26
+ char str[4 + 8 + 1];
27
+
28
+ if (c < 0x10000)
29
+ snprintf(str, lengthof(str), "\\u%04X", c);
30
+ else
31
+ snprintf(str, lengthof(str), "\\u{%X}", c & 0xffffffff);
32
+
33
+ rb_str_buf_cat2(result, str);
34
+ }
35
+
36
+ static void
37
+ rb_u_string_inspect_default(uint32_t c, VALUE result)
38
+ {
39
+ if (!u_char_isprint(c)) {
40
+ rb_u_string_inspect_escaped_char(c, result);
41
+ return;
42
+ }
43
+
44
+ char str[U_CHAR_MAX_BYTE_LENGTH];
45
+ rb_str_buf_cat(result, str, u_char_to_u(c, str));
46
+ }
47
+
48
+ #define REPLACEMENT_CHARACTER ((uint32_t)0xfffd)
49
+
50
+ static const char *
51
+ rb_u_string_inspect_hash_char(const char *q, const char *end,
52
+ VALUE result)
53
+ {
54
+ if (q == end) {
55
+ rb_str_buf_cat2(result, "#");
56
+ return q;
57
+ }
58
+
59
+ const char *p = q;
60
+ uint32_t c = u_decode(&q, p, end);
61
+ switch (c) {
62
+ case REPLACEMENT_CHARACTER:
63
+ rb_str_buf_cat2(result, "#");
64
+ if (!u_valid(p, q - p, NULL))
65
+ rb_u_string_inspect_bad_input(p, q, result);
66
+ else
67
+ rb_u_string_inspect_default(c, result);
68
+ return q;
69
+ case '$':
70
+ case '@':
71
+ case '{':
72
+ rb_str_buf_cat2(result, "\\#");
73
+ rb_u_string_inspect_default(c, result);
74
+ return q;
75
+ default:
76
+ rb_str_buf_cat2(result, "#");
77
+ return p;
78
+ }
79
+ }
80
+
81
+ /* Returns the receiver in a reader-friendly inspectable format, inheriting
82
+ * any taint and untrust, encoded using UTF-8.
83
+ *
84
+ * The reader-friendly inspectable format looks like “`"…".u`”. Inside the
85
+ * “…”, any {#print?} characters are output as-is, the following special
86
+ * characters are escaped according to the following table:
87
+ *
88
+ * <table>
89
+ * <thead><tr><th>Character</th><th>Dumped Sequence</th></tr></thead>
90
+ * <tbody>
91
+ * <tr><td>U+0022 QUOTATION MARK</td><td><code>\"</code></td></tr>
92
+ * <tr><td>U+005C REVERSE SOLIDUS</td><td><code>\\</code></td></tr>
93
+ * <tr><td>U+000A LINE FEED (LF)</td><td><code>\n</code></td></tr>
94
+ * <tr><td>U+000D CARRIAGE RETURN (CR)</td><td><code>\r</code></td></tr>
95
+ * <tr><td>U+0009 CHARACTER TABULATION</td><td><code>\t</code></td></tr>
96
+ * <tr><td>U+000C FORM FEED (FF)</td><td><code>\f</code></td></tr>
97
+ * <tr><td>U+000B LINE TABULATION</td><td><code>\v</code></td></tr>
98
+ * <tr><td>U+0008 BACKSPACE</td><td><code>\b</code></td></tr>
99
+ * <tr><td>U+0007 BELL</td><td><code>\a</code></td></tr>
100
+ * <tr><td>U+001B ESCAPE</td><td><code>\e</code></td></tr>
101
+ * </tbody>
102
+ * </table>
103
+ *
104
+ * the following special sequences are also escaped:
105
+ *
106
+ * <table>
107
+ * <thead><tr><th>Character</th><th>Dumped Sequence</th></tr></thead>
108
+ * <tbody>
109
+ * <tr><td><code>#$</code></td><td><code>\#$</code></td></tr>
110
+ * <tr><td><code>#@</code></td><td><code>\#@</code></td></tr>
111
+ * <tr><td><code>#{</code></td><td><code>\#{</code></td></tr>
112
+ * </tbody>
113
+ * </table>
114
+ *
115
+ * Valid UTF-8 byte sequences representing code points < 0x10000 are output as
116
+ * `\u`_n_, where _n_ is the four-digit uppercase hexadecimal representation
117
+ * of the code point.
118
+ *
119
+ * Valid UTF-8 byte sequences representing code points ≥ 0x10000 are output as
120
+ * `\u{`_n_`}`, where _n_ is the uppercase hexadecimal representation of the
121
+ * code point.
122
+ *
123
+ * Any other byte is output as `\x`_n_, where _n_ is the two-digit uppercase
124
+ * hexadecimal representation of the byte’s value.
125
+ *
126
+ * @return [String] */
127
+ VALUE
128
+ rb_u_string_inspect(VALUE self)
129
+ {
130
+ const struct rb_u_string *string = RVAL2USTRING(self);
131
+
132
+ VALUE result = rb_u_str_buf_new(0);
133
+ rb_str_buf_cat2(result, "\"");
134
+ const char *p = USTRING_STR(string);
135
+ const char *end = USTRING_END(string);
136
+ while (p < end) {
137
+ const char *q;
138
+ uint32_t c = u_decode(&q, p, end);
139
+ switch (c) {
140
+ case '"':
141
+ case '\\':
142
+ rb_u_string_inspect_special_char(c, result);
143
+ break;
144
+ case '#':
145
+ p = rb_u_string_inspect_hash_char(q, end, result);
146
+ continue;
147
+ case '\n':
148
+ rb_str_buf_cat2(result, "\\n");
149
+ break;
150
+ case '\r':
151
+ rb_str_buf_cat2(result, "\\r");
152
+ break;
153
+ case '\t':
154
+ rb_str_buf_cat2(result, "\\t");
155
+ break;
156
+ case '\f':
157
+ rb_str_buf_cat2(result, "\\f");
158
+ break;
159
+ case '\013':
160
+ rb_str_buf_cat2(result, "\\v");
161
+ break;
162
+ case '\010':
163
+ rb_str_buf_cat2(result, "\\b");
164
+ break;
165
+ case '\007':
166
+ rb_str_buf_cat2(result, "\\a");
167
+ break;
168
+ case '\033':
169
+ rb_str_buf_cat2(result, "\\e");
170
+ break;
171
+ case REPLACEMENT_CHARACTER:
172
+ if (!u_valid(p, q - p, NULL)) {
173
+ rb_u_string_inspect_bad_input(p, q, result);
174
+ break;
175
+ }
176
+ /* fall through */
177
+ default:
178
+ rb_u_string_inspect_default(c, result);
179
+ break;
180
+ }
181
+ p = q;
182
+ }
183
+
184
+ rb_str_buf_cat2(result, "\".u");
185
+
186
+ OBJ_INFECT(result, self);
187
+
188
+ return result;
189
+ }
@@ -0,0 +1,148 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_internal_tr.h"
3
+
4
+ void
5
+ tr_init(struct tr *tr, const char *p, const char *end)
6
+ {
7
+ tr->p = p;
8
+ tr->end = end;
9
+ tr->inside_range = false;
10
+ }
11
+
12
+ bool
13
+ tr_should_exclude(struct tr *tr)
14
+ {
15
+ if (tr->p + 1 < tr->end && *tr->p == '^') {
16
+ tr->p++;
17
+ return true;
18
+ }
19
+
20
+ return false;
21
+ }
22
+
23
+ static enum tr_state
24
+ tr_next_char(struct tr *t)
25
+ {
26
+ if (t->p == t->end)
27
+ return TR_FINISHED;
28
+
29
+ t->now = u_decode(&t->p, t->p, t->end);
30
+ if (t->p == t->end)
31
+ return TR_FOUND;
32
+ if (t->now == '\\') {
33
+ t->now = u_decode(&t->p, t->p, t->end);
34
+ if (t->p == t->end)
35
+ return TR_FOUND;
36
+ }
37
+
38
+ const char *next;
39
+ if (u_decode(&next, t->p, t->end) == '-') {
40
+ /* TODO: Make this simpler. Perhaps we don’t need
41
+ * TR_READ_ANOTHER, as we advance it here ourselves. I got to
42
+ * check the offsets here. Perhaps TR_READ_ANOTHER should also
43
+ * have advanced t->p one more step. */
44
+ if (next < t->end) {
45
+ uint32_t max = u_decode(&t->p, next, t->end);
46
+ if (max < t->now) {
47
+ t->p = next;
48
+ return TR_READ_ANOTHER;
49
+ }
50
+ t->inside_range = true;
51
+ t->max = max;
52
+ }
53
+ }
54
+
55
+ return TR_FOUND;
56
+ }
57
+
58
+ enum tr_state
59
+ tr_next(struct tr *t)
60
+ {
61
+ while (true) {
62
+ if (!t->inside_range) {
63
+ enum tr_state state;
64
+
65
+ if ((state = tr_next_char(t)) == TR_READ_ANOTHER)
66
+ continue;
67
+
68
+ return state;
69
+ } else if (++t->now < t->max) {
70
+ return TR_FOUND;
71
+ } else {
72
+ t->inside_range = false;
73
+ return TR_FOUND;
74
+ }
75
+ }
76
+ }
77
+
78
+ static void
79
+ tr_table_set(struct tr_table *table, bool *buffer, uint32_t c, bool value)
80
+ {
81
+ if (c < lengthof(table->continuous)) {
82
+ buffer[c] = value;
83
+ return;
84
+ }
85
+
86
+ if (NIL_P(table->sparse))
87
+ table->sparse = rb_hash_new();
88
+
89
+ rb_hash_aset(table->sparse, UINT2NUM(c), value ? Qtrue : Qfalse);
90
+ }
91
+
92
+ static void
93
+ tr_table_add(struct tr_table *table, const struct rb_u_string *string)
94
+ {
95
+ struct tr tr;
96
+ tr_init(&tr, USTRING_STR(string), USTRING_END(string));
97
+
98
+ bool exclude = tr_should_exclude(&tr);
99
+
100
+ bool buffer[lengthof(table->continuous)];
101
+
102
+ for (size_t i = 0; i < lengthof(buffer); i++)
103
+ buffer[i] = exclude;
104
+
105
+ while (tr_next(&tr) != TR_FINISHED)
106
+ tr_table_set(table, buffer, tr.now, !exclude);
107
+
108
+ for (size_t i = 0; i < lengthof(table->continuous); i++)
109
+ table->continuous[i] = table->continuous[i] && buffer[i];
110
+ }
111
+
112
+ void
113
+ tr_table_initialize(struct tr_table *table, VALUE rbstring)
114
+ {
115
+ const struct rb_u_string *string = RVAL2USTRING_ANY(rbstring);
116
+
117
+ struct tr tr;
118
+ tr_init(&tr, USTRING_STR(string), USTRING_END(string));
119
+
120
+ table->exclude = tr_should_exclude(&tr);
121
+
122
+ for (size_t i = 0; i < lengthof(table->continuous); i++)
123
+ table->continuous[i] = true;
124
+
125
+ table->sparse = Qnil;
126
+
127
+ tr_table_add(table, string);
128
+ }
129
+
130
+ void
131
+ tr_table_initialize_from_strings(struct tr_table *table, int argc, VALUE *argv)
132
+ {
133
+ tr_table_initialize(table, argv[0]);
134
+ for (int i = 1; i < argc; i++)
135
+ tr_table_add(table, RVAL2USTRING_ANY(argv[i]));
136
+ }
137
+
138
+ bool
139
+ tr_table_lookup(struct tr_table *table, uint32_t c)
140
+ {
141
+ if (c < lengthof(table->continuous))
142
+ return table->continuous[c];
143
+
144
+ VALUE value = NIL_P(table->sparse) ?
145
+ Qnil : rb_hash_lookup(table->sparse, UINT2NUM(c));
146
+
147
+ return NIL_P(value) ? table->exclude : RTEST(value);
148
+ }
@@ -0,0 +1,29 @@
1
+ struct tr { /* Cursor over a tr-style character specification; set up by tr_init, advanced by tr_next. */
2
+ bool inside_range; /* True while tr_next is stepping through a range such as "a-z". */
3
+ uint32_t now; /* Character most recently produced by tr_next. */
4
+ uint32_t max; /* Upper bound of the range being stepped through; set when inside_range is set. */
5
+ const char *p; /* Next unread byte of the specification. */
6
+ const char *end; /* End of the specification; reading stops when p reaches it. */
7
+ };
8
+
9
+ enum tr_state /* Outcome of advancing a struct tr. */
10
+ {
11
+ TR_FOUND, /* A character was produced and is available in the now field. */
12
+ TR_READ_ANOTHER, /* Nothing was produced (reversed range); tr_next reads again when it sees this. */
13
+ TR_FINISHED /* The specification has been read to its end. */
14
+ };
15
+
16
+ struct tr_table { /* Membership table built from one or more tr-style specifications. */
17
+ bool exclude; /* Whether the first specification started with '^'; also the lookup answer for code points >= 256 with no sparse entry. */
18
+ bool continuous[256]; /* Membership flags for code points 0 through 255. */
19
+ VALUE sparse; /* Qnil until needed, then a Ruby Hash from code point (>= 256) to Qtrue/Qfalse. */
20
+ };
21
+
22
+ void tr_init(struct tr *tr, const char *p, const char *end); /* Start reading the specification in [p, end). */
23
+ bool tr_should_exclude(struct tr *tr); /* Skip a leading '^' (unless it is the last byte) and report whether one was skipped. */
24
+ enum tr_state tr_next(struct tr *t); /* Produce the next character in t->now, or TR_FINISHED. */
25
+ void tr_table_initialize(struct tr_table *table, VALUE rbstring); /* Build table from a single specification. */
26
+ void tr_table_initialize_from_strings(struct tr_table *table,
27
+ int argc,
28
+ VALUE *argv); /* Build table from argv[0], then narrow it by argv[1..argc-1]. */
29
+ bool tr_table_lookup(struct tr_table *table, uint32_t c); /* Report whether c is a member of table. */
@@ -0,0 +1,169 @@
1
+ #include "rb_includes.h"
2
+
3
/* Writes N columns worth of PADDING to P and returns the position just past
 * what was written.
 *
 * PADDING is copied whole as many times as it fits in N columns
 * (PADDING_WIDTH columns each), followed by the shortest prefix of PADDING
 * that covers the columns that remain, if any.  This is the layout that
 * rb_u_string_justified_size reserves room for: N / PADDING_WIDTH whole
 * copies plus rounding_size() bytes.
 *
 * @param p Destination; must have room for the bytes described above
 * @param padding Padding string to repeat
 * @param padding_width Width of PADDING in columns; must be > 0
 * @param n Number of columns to fill; must be >= 0 */
static char *
rb_u_string_justify_one_side(char *p, const struct rb_u_string *padding, long padding_width, long n)
{
        const char *padding_str = USTRING_STR(padding);
        long padding_size = USTRING_LENGTH(padding);

        long i = 0;

        /* Copy every repetition that fits completely, including the last one
         * when N is an exact multiple of PADDING_WIDTH.  Leaving that last
         * repetition to the per-character loop below would drop any trailing
         * zero-width characters of PADDING (the loop stops as soon as the
         * width is reached), writing fewer bytes than were allocated. */
        for ( ; n - i >= padding_width; i += padding_width, p += padding_size)
                memcpy(p, padding_str, padding_size);

        /* Cover the remaining columns with a prefix of PADDING, never reading
         * past its end. */
        const char *q = padding_str;
        const char *end = padding_str + padding_size;
        while (i < n && q < end)
                i += u_char_width(u_decode(&q, q, end));
        memcpy(p, padding_str, q - padding_str);
        p += q - padding_str;

        return p;
}
23
+
24
+ static long
25
+ rounding_size(const struct rb_u_string *padding, long padding_width, long n)
26
+ {
27
+ const char *padding_str = USTRING_STR(padding);
28
+ const char *q = padding_str, *end = padding_str + USTRING_LENGTH(padding);
29
+ long r = n % padding_width;
30
+ long i = 0;
31
+ while (i < r && q < end)
32
+ i += u_char_width(u_decode(&q, q, end));
33
+ // NOTE I think i ≮ r is guaranteed, but I can’t seem to prove it, so
34
+ // leave this in for safety.
35
+ if (i < r)
36
+ rb_u_raise(rb_eArgError,
37
+ "padding isn’t wide enough to complete rounding (%ld < %ld)",
38
+ i, r);
39
+ if (i > r)
40
+ rb_u_raise(rb_eArgError,
41
+ "padding is too wide to complete rounding (%ld > %ld)",
42
+ i, r);
43
+ return q - padding_str;
44
+ }
45
+
46
+ static long
47
+ rb_u_string_justified_size(long string_size,
48
+ const struct rb_u_string *padding, long padding_width,
49
+ long left_n, long right_n)
50
+ {
51
+ long size;
52
+
53
+ long left_n_2 = rounding_size(padding, padding_width, left_n);
54
+ long right_n_2 = rounding_size(padding, padding_width, right_n);
55
+ if ((size = left_n / padding_width + right_n / padding_width) >= LONG_MAX / USTRING_LENGTH(padding) ||
56
+ (size *= USTRING_LENGTH(padding)) >= LONG_MAX - left_n_2 - right_n_2 ||
57
+ (size += left_n_2 + right_n_2) >= LONG_MAX - string_size)
58
+ rb_u_raise(rb_eArgError, "argument too big");
59
+ size += string_size;
60
+
61
+ return size;
62
+ }
63
+
64
+ static VALUE
65
+ rb_u_string_justify_impl(VALUE self,
66
+ const struct rb_u_string *string, long string_width,
67
+ const struct rb_u_string *padding, long padding_width,
68
+ long width, char jflag)
69
+ {
70
+ long n = width - string_width;
71
+ long left_n = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n / 2);
72
+ long right_n = n - left_n;
73
+
74
+ long string_size = USTRING_LENGTH(string);
75
+ long justified_size = rb_u_string_justified_size(string_size,
76
+ padding, padding_width,
77
+ left_n, right_n);
78
+ char *justified = ALLOC_N(char, justified_size + 1);
79
+
80
+ char *p = rb_u_string_justify_one_side(justified, padding, padding_width, left_n);
81
+ memcpy(p, USTRING_STR(string), string_size);
82
+ p += string_size;
83
+ p = rb_u_string_justify_one_side(p, padding, padding_width, right_n);
84
+ justified[justified_size] = '\0';
85
+
86
+ return rb_u_string_new_c_own(self, justified, justified_size);
87
+ }
88
+
89
+ static VALUE
90
+ rb_u_string_justify(int argc, VALUE *argv, VALUE self, char jflag)
91
+ {
92
+ const struct rb_u_string *string = RVAL2USTRING(self);
93
+
94
+ VALUE rbwidth, rbpadding;
95
+ const struct rb_u_string *padding = USTRING_LOCAL(Qnil, " ", 1);
96
+ long padding_width = 1;
97
+ if (rb_scan_args(argc, argv, "11", &rbwidth, &rbpadding) == 2) {
98
+ padding = RVAL2USTRING_ANY(rbpadding);
99
+ padding_width = u_width_n(USTRING_STR(padding), USTRING_LENGTH(padding));
100
+ if (padding_width == 0)
101
+ rb_u_raise(rb_eArgError, "zero-width padding");
102
+ }
103
+
104
+ long string_width = u_width_n(USTRING_STR(string), USTRING_LENGTH(string));
105
+
106
+ long width = NUM2LONG(rbwidth);
107
+ if (width < 0 || string_width >= width)
108
+ return self;
109
+
110
+ VALUE result = rb_u_string_justify_impl(self,
111
+ string, string_width,
112
+ padding, padding_width,
113
+ width, jflag);
114
+ if (!NIL_P(rbpadding))
115
+ OBJ_INFECT(result, rbpadding);
116
+ return result;
117
+ }
118
+
119
+ /* @overload center(width, padding = ' ')
120
+ * @param [#to_int] width
121
+ * @param [U::String, #to_str] padding
122
+ * @raise [ArgumentError] If PADDING{#width} = 0
123
+ * @raise [ArgumentError] If characters inside PADDING that should be used
124
+ * for round-off padding are too wide
125
+ * @return [U::String] The receiver padded as evenly as possible on both
126
+ * sides with PADDING to make it max({#length}, WIDTH) wide, inheriting any
127
+ * taint and untrust from the receiver and also from PADDING if PADDING is
128
+ * used
129
+ * @see #ljust
130
+ * @see #rjust */
131
+ VALUE
132
+ rb_u_string_center(int argc, VALUE *argv, VALUE self)
133
+ {
134
+ return rb_u_string_justify(argc, argv, self, 'c');
135
+ }
136
+
137
+ /* @overload ljust(width, padding = ' ')
138
+ * @param [#to_int] width
139
+ * @param [U::String, #to_str] padding
140
+ * @raise [ArgumentError] If PADDING{#width} = 0
141
+ * @raise [ArgumentError] If characters inside PADDING that should be used
142
+ * for round-off padding are too wide
143
+ * @return [U::String] The receiver padded on the right with PADDING to make
144
+ * it max({#length}, WIDTH) wide, inheriting any taint and untrust from
145
+ * the receiver and also from PADDING if PADDING is used
146
+ * @see #center
147
+ * @see #rjust */
148
+ VALUE
149
+ rb_u_string_ljust(int argc, VALUE *argv, VALUE self)
150
+ {
151
+ return rb_u_string_justify(argc, argv, self, 'l');
152
+ }
153
+
154
+ /* @overload rjust(width, padding = ' ')
155
+ * @param [#to_int] width
156
+ * @param [U::String, #to_str] padding
157
+ * @raise [ArgumentError] If PADDING{#width} = 0
158
+ * @raise [ArgumentError] If characters inside PADDING that should be used
159
+ * for round-off padding are too wide
160
+ * @return [U::String] The receiver padded on the left with PADDING to make
161
+ * it max({#length}, WIDTH) wide, inheriting any taint and untrust from the
162
+ * receiver and also from PADDING if PADDING is used
163
+ * @see #center
164
+ * @see #ljust */
165
+ VALUE
166
+ rb_u_string_rjust(int argc, VALUE *argv, VALUE self)
167
+ {
168
+ return rb_u_string_justify(argc, argv, self, 'r');
169
+ }