u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,35 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload *(n)
4
+ * @param [#to_int] n
5
+ * @raise [ArgumentError] If N < 0
6
+ * @raise [ArgumentError] If N > 0 and N × {#bytesize} > LONG_MAX
7
+ * @return [U::String] The concatenation of N copies of the receiver,
8
+ * inheriting any taint and untrust */
9
+ VALUE
10
+ rb_u_string_times(VALUE self, VALUE rbtimes)
11
+ {
12
+ const struct rb_u_string *string = RVAL2USTRING(self);
13
+
14
+ long times = NUM2LONG(rbtimes);
15
+ if (times < 0)
16
+ rb_u_raise(rb_eArgError, "negative argument: %ld", times);
17
+
18
+ /* TODO: Isn’t this off by one, as we add one to length for the
19
+ * ALLOC_N() call? */
20
+ if (times > 0 && LONG_MAX / times < USTRING_LENGTH(string))
21
+ rb_u_raise(rb_eArgError, "argument too big: %ld", times);
22
+ long length = times * USTRING_LENGTH(string);
23
+
24
+ char *product = ALLOC_N(char, length + 1);
25
+ long i = USTRING_LENGTH(string);
26
+ if (i > 0) {
27
+ memcpy(product, USTRING_STR(string), i);
28
+ for ( ; i <= times / 2; i *= 2)
29
+ memcpy(product + i, product, i);
30
+ memcpy(product + i, product, times - i);
31
+ }
32
+ product[length] = '\0';
33
+
34
+ return rb_u_string_new_c_own(self, product, length);
35
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload title?
4
+ * @return [Boolean] True if the receiver contains only characters in the
5
+ * general category Letter, Titlecase (Lt) */
6
+ VALUE
7
+ rb_u_string_title(VALUE self)
8
+ {
9
+ return _rb_u_character_test(self, u_char_istitle);
10
+ }
@@ -0,0 +1,13 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload titlecase(locale = ENV['LC_CTYPE'])
4
+ * @param [#to_str] locale
5
+ * @return [U::String] The title-casing of the receiver according to the
6
+ * rules of the language of LOCALE, which may be empty to specifically use
7
+ * the default, language-independent, rules, inheriting any taint and
8
+ * untrust */
9
+ VALUE
10
+ rb_u_string_titlecase(int argc, VALUE *argv, VALUE self)
11
+ {
12
+ return _rb_u_string_convert_locale(argc, argv, self, u_titlecase, NULL);
13
+ }
@@ -0,0 +1,45 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_to_inum.h"
3
+
4
+ /* @overload to_i(base = 16)
5
+ *
6
+ * Returns the Integer value that results from treating the receiver as a
7
+ * string of digits in BASE.
8
+ *
9
+ * The conversion algorithm is
10
+ *
11
+ * 1. Skip any leading {#space?}s
12
+ * 2. Check for an optional sign, ‘+’ or ‘-’
13
+ * 3. If base is 2, skip an optional “0b” or “0B” prefix
14
+ * 4. If base is 8, skip an optional “0o” or “0o” prefix
15
+ * 5. If base is 10, skip an optional “0d” or “0D” prefix
16
+ * 6. If base is 16, skip an optional “0x” or “0X” prefix
17
+ * 7. Skip any ‘0’s
18
+ * 8. Read an as long sequence of digits in BASE separated by optional U+005F
19
+ * LOW LINE characters, using letters in the following ranges of characters
20
+ * for digits or the characters digit value, if any
21
+ *
22
+ * * U+0041 LATIN CAPITAL LETTER A through U+005A LATIN CAPITAL LETTER Z
23
+ * * U+0061 LATIN SMALL LETTER A through U+007A LATIN SMALL LETTER Z
24
+ * * U+FF21 FULLWIDTH LATIN CAPITAL LETTER A through U+FF3A FULLWIDTH LATIN CAPITAL LETTER Z
25
+ * * U+FF41 FULLWIDTH LATIN SMALL LETTER A through U+FF5A FULLWIDTH LATIN SMALL LETTER Z
26
+ *
27
+ * Note that only one separator is allowed in a row.
28
+ *
29
+ * @param [#to_int] base
30
+ * @raise [ArgumentError] Unless 2 ≤ BASE ≤ 36
31
+ * @return [Integer] */
32
+ VALUE
33
+ rb_u_string_to_i(int argc, VALUE *argv, VALUE self)
34
+ {
35
+ int base = 10;
36
+
37
+ VALUE rbbase;
38
+ if (rb_scan_args(argc, argv, "01", &rbbase) == 1)
39
+ base = NUM2INT(rbbase);
40
+
41
+ if (base < 0)
42
+ rb_u_raise(rb_eArgError, "illegal radix %d", base);
43
+
44
+ return rb_u_string_to_inum(self, base, false);
45
+ }
@@ -0,0 +1,364 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_to_inum.h"
3
+
/* XXX: The bignum digit helpers below are taken straight from Ruby’s
 * bignum.c, where they are private. */

/* The BDIGIT array backing bignum X. */
#define BDIGITS(x)  (RBIGNUM_DIGITS(x))

/* Number of bits held by a single BDIGIT. */
#define BITSPERDIG  (SIZEOF_BDIGITS * CHAR_BIT)

/* One more than the largest value a BDIGIT can hold, as a BDIGIT_DBL. */
#define BIGRAD      ((BDIGIT_DBL)1 << BITSPERDIG)

/* X shifted down by one digit, that is, whatever carries into the next
 * digit. */
#define BIGDN(x)    RSHIFT((x), BITSPERDIG)

/* The lowest digit of X. */
#define BIGLO(x)    ((BDIGIT)((x) & (BIGRAD - 1)))
10
+
11
+ static VALUE
12
+ bignew_1(VALUE klass, long len, int sign)
13
+ {
14
+ NEWOBJ(big, struct RBignum);
15
+ OBJSETUP(big, klass, T_BIGNUM);
16
+ RBIGNUM_SET_SIGN(big, sign ? 1 : 0);
17
+ #ifdef RBIGNUM_EMBED_LEN_MAX
18
+ #define RBIGNUM_SET_LEN(b,l) \
19
+ ((RBASIC(b)->flags & RBIGNUM_EMBED_FLAG) ? \
20
+ (void)(RBASIC(b)->flags = \
21
+ (RBASIC(b)->flags & ~RBIGNUM_EMBED_LEN_MASK) | \
22
+ ((l) << RBIGNUM_EMBED_LEN_SHIFT)) : \
23
+ (void)(RBIGNUM(b)->as.heap.len = (l)))
24
+
25
+ if (len <= RBIGNUM_EMBED_LEN_MAX) {
26
+ RBASIC(big)->flags |= RBIGNUM_EMBED_FLAG;
27
+ RBIGNUM_SET_LEN(big, len);
28
+ }
29
+ else {
30
+ RBIGNUM(big)->as.heap.digits = ALLOC_N(BDIGIT, len);
31
+ RBIGNUM(big)->as.heap.len = len;
32
+ }
33
+ #else
34
+ big->len = len;
35
+ big->digits = ALLOC_N(BDIGIT, len);
36
+ #endif
37
+
38
+ return (VALUE)big;
39
+ }
40
+
41
+ #define bignew(len, sign) bignew_1(rb_cBignum, len, sign)
42
+
/* Consumes an optional ASCII sign at S.
 *
 * *SIGN is set to 0 if S begins with ‘-’ and to 1 otherwise.  Returns the
 * position following the sign if there was one, S itself if not. */
static const char *
rb_u_string_to_inum_sign(const char *s, int *sign)
{
        switch (*s) {
        case '-':
                *sign = 0;
                return s + 1;
        case '+':
                *sign = 1;
                return s + 1;
        default:
                *sign = 1;
                return s;
        }
}
56
+
/* Deals with an optional base prefix (“0x”, “0b”, “0o”, “0d”, in either
 * case) at S, which must be NUL-terminated.
 *
 * If *BASE is positive the caller has already decided on a base.  *BASE is
 * then left alone and a prefix is skipped only if it is the one belonging to
 * that base; anything else, such as the “0b” of “0b1” in base 16, is made up
 * of digits that must not be swallowed.
 *
 * If *BASE is zero or negative the base is detected: a recognized prefix
 * selects 16, 2, 8, or 10; a ‘0’ not followed by a prefix letter selects 8;
 * otherwise a *BASE less than -1 selects -*BASE and anything else selects 10.
 *
 * Returns the position at which the digits begin. */
static const char *
rb_u_string_to_inum_base(const char *s, int *base)
{
        if (*base > 0) {
                char lower, upper;

                switch (*base) {
                case 2:
                        lower = 'b';
                        upper = 'B';
                        break;
                case 8:
                        lower = 'o';
                        upper = 'O';
                        break;
                case 10:
                        lower = 'd';
                        upper = 'D';
                        break;
                case 16:
                        lower = 'x';
                        upper = 'X';
                        break;
                default:
                        /* No prefix exists for any other base. */
                        return s;
                }

                if (s[0] == '0' && (s[1] == lower || s[1] == upper))
                        return s + 2;

                return s;
        }

        if (s[0] == '0') {
                int offset = 2;
                switch (s[1]) {
                case 'x': case 'X':
                        *base = 16;
                        break;
                case 'b': case 'B':
                        *base = 2;
                        break;
                case 'o': case 'O':
                        *base = 8;
                        break;
                case 'd': case 'D':
                        *base = 10;
                        break;
                default:
                        /* A lone leading zero means octal; only the zero
                         * itself is consumed. */
                        *base = 8;
                        offset = 1;
                        break;
                }
                return s + offset;
        } else if (*base < -1) {
                *base = -*base;
        } else {
                *base = 10;
        }

        return s;
}
89
+
90
+ static size_t
91
+ rb_u_string_to_inum_base_bit_length(const char *s, int base)
92
+ {
93
+ if (base < 2 || base > 36)
94
+ rb_u_raise(rb_eArgError, "illegal radix %d", base);
95
+
96
+ size_t bit_length;
97
+ switch (base) {
98
+ case 2:
99
+ bit_length = 1;
100
+ case 3:
101
+ bit_length = 2;
102
+ case 4: case 5: case 6: case 7: case 8:
103
+ bit_length = 3;
104
+ case 9: case 10: case 11: case 12: case 13: case 14: case 15: case 16:
105
+ bit_length = 4;
106
+ default:
107
+ if (base <= 32)
108
+ bit_length = 5;
109
+
110
+ bit_length = 6;
111
+ }
112
+
113
+ return bit_length * u_n_chars(s);
114
+ }
115
+
116
+ static bool
117
+ rb_u_string_to_inum_num_separator(const char *str, const char *s, bool verify,
118
+ uint32_t c, bool *previous_was_separator)
119
+ {
120
+ if (c != '_') {
121
+ *previous_was_separator = false;
122
+
123
+ return false;
124
+ }
125
+
126
+ if (*previous_was_separator) {
127
+ if (!verify)
128
+ return false;
129
+ char buf[U_CHAR_MAX_BYTE_LENGTH];
130
+ int length = u_char_to_u(c, buf);
131
+ rb_u_raise(rb_eArgError,
132
+ "unexpected ‘%.*s’ found at position %ld",
133
+ length, buf, u_pointer_to_offset(str, s));
134
+ }
135
+
136
+ *previous_was_separator = true;
137
+
138
+ return true;
139
+ }
140
+
/* End points of the fullwidth Latin letter ranges that are accepted as
 * digits. */
#define FULLWIDTH_A ((uint32_t)0xff21)
#define FULLWIDTH_Z ((uint32_t)0xff3a)
#define FULLWIDTH_a ((uint32_t)0xff41)
#define FULLWIDTH_z ((uint32_t)0xff5a)

/* Returns the value of C when used as a digit: 10 through 35 for the ASCII
 * and fullwidth Latin letters, regardless of case, and whatever
 * u_char_digit_value() reports for any other character. */
static int
u_char_zdigit_value(uint32_t c)
{
        /* The ranges are disjoint, so at most one of them can match. */
        static const struct {
                uint32_t first;
                uint32_t last;
        } letters[] = {
                { 'a', 'z' },
                { 'A', 'Z' },
                { FULLWIDTH_a, FULLWIDTH_z },
                { FULLWIDTH_A, FULLWIDTH_Z },
        };

        for (size_t i = 0; i < sizeof(letters) / sizeof(letters[0]); i++)
                if (c >= letters[i].first && c <= letters[i].last)
                        return c - letters[i].first + 10;

        return u_char_digit_value(c);
}
160
+
161
+ static bool
162
+ rb_u_string_to_inum_digit_value(const char *str, const char *s, uint32_t c,
163
+ int base, bool verify, int *digit_value)
164
+ {
165
+ /* If we stumble upon a space, return false so that we may end our
166
+ * processing and skip over any trailing white-space. */
167
+ if (u_char_isspace(c))
168
+ return false;
169
+
170
+ int value = u_char_zdigit_value(c);
171
+
172
+ if (value == -1) {
173
+ if (!verify)
174
+ return false;
175
+ char buf[U_CHAR_MAX_BYTE_LENGTH];
176
+ int length = u_char_to_u(c, buf);
177
+ rb_u_raise(rb_eArgError,
178
+ "non-digit character ‘%.*s’ found at position %ld",
179
+ length, buf, u_pointer_to_offset(str, s));
180
+ }
181
+
182
+ if (value >= base) {
183
+ if (!verify)
184
+ return false;
185
+
186
+ rb_u_raise(rb_eArgError,
187
+ "value (%d) greater than base (%d) at position %ld",
188
+ value, base, u_pointer_to_offset(str, s));
189
+ }
190
+
191
+ *digit_value = value;
192
+
193
+ return true;
194
+ }
195
+
196
+ static VALUE
197
+ rb_u_string_to_inum_as_fix(const char *str, const char *s, int sign, int base,
198
+ bool verify)
199
+ {
200
+ unsigned long value = 0;
201
+
202
+ bool previous_was_separator = false;
203
+ while (*s != '\0') {
204
+ uint32_t c = u_decode(&s, s, s + 4);
205
+
206
+ if (rb_u_string_to_inum_num_separator(str, s, verify, c, &previous_was_separator))
207
+ continue;
208
+
209
+ int digit_value;
210
+ if (!rb_u_string_to_inum_digit_value(str, s, c, base, verify, &digit_value))
211
+ break;
212
+ value *= base;
213
+ value += digit_value;
214
+ }
215
+
216
+ if (verify) {
217
+ const char *t;
218
+ while (*s != '\0' && u_char_isspace(u_decode(&t, s, s + 4)))
219
+ s = t;
220
+ if (*s != '\0')
221
+ rb_u_raise(rb_eArgError,
222
+ "trailing garbage found at position %ld",
223
+ u_pointer_to_offset(str, s));
224
+ }
225
+
226
+ if (POSFIXABLE(value))
227
+ return sign ? LONG2FIX(value) : LONG2FIX(-(long)value);
228
+
229
+ VALUE big = rb_uint2big(value);
230
+ RBIGNUM_SET_SIGN(big, sign);
231
+ return rb_big_norm(big);
232
+ }
233
+
234
+ static VALUE
235
+ rb_cutf_to_inum(const char * const str, int base, bool verify)
236
+ {
237
+ /* FIXME: How can this even happen? */
238
+ if (str == NULL) {
239
+ if (verify)
240
+ rb_invalid_str(str, "Integer");
241
+ return INT2FIX(0);
242
+ }
243
+
244
+ const char *s = str;
245
+ const char *t;
246
+ /* Skip any leading whitespace. */
247
+ while (u_char_isspace(u_decode(&t, s, s + 4)))
248
+ s = t;
249
+
250
+ /* Figure out what sign this number uses. */
251
+ int sign;
252
+ s = rb_u_string_to_inum_sign(s, &sign);
253
+
254
+ /* Do we have another sign? If so, that’s not correct. */
255
+ if (*s == '+' || *s == '-') {
256
+ if (verify)
257
+ rb_u_raise(rb_eArgError,
258
+ "extra sign ‘%c’ found at position %ld",
259
+ *s, u_pointer_to_offset(str, s));
260
+ return INT2FIX(0);
261
+ }
262
+
263
+ int tmp_base = base;
264
+ s = rb_u_string_to_inum_base(s, &tmp_base);
265
+ if (base <= 0)
266
+ base = tmp_base;
267
+
268
+ /* Remove preceeding 0s. */
269
+ while (*s == '0')
270
+ s++;
271
+
272
+ /* Figure out how many bits we need to represent the number. */
273
+ size_t bit_length = rb_u_string_to_inum_base_bit_length(str, base);
274
+
275
+ /* If the bit_length is less than the number of bits in a VALUE we can
276
+ * try to store it as a FIXNUM. */
277
+ if (bit_length <= sizeof(VALUE) * CHAR_BIT)
278
+ return rb_u_string_to_inum_as_fix(str, s, sign, base, verify);
279
+
280
+ if (verify && *str == '_')
281
+ rb_u_raise(rb_eArgError,
282
+ "leading digit-separator ‘_’ found at position %ld",
283
+ u_pointer_to_offset(str, s));
284
+
285
+ bit_length = bit_length / BITSPERDIG + 1;
286
+
287
+ /* TODO: Rename these variables. */
288
+ VALUE z = bignew(bit_length, sign);
289
+ BDIGIT *zds = BDIGITS(z);
290
+ MEMZERO(zds, BDIGIT, bit_length);
291
+ int big_len = 1;
292
+
293
+ bool previous_was_separator = false;
294
+ while (true) {
295
+ uint32_t c = u_decode(&s, s, s + 4);
296
+
297
+ if (rb_u_string_to_inum_num_separator(str, s, verify, c, &previous_was_separator))
298
+ continue;
299
+
300
+ int digit_value;
301
+ if (!rb_u_string_to_inum_digit_value(str, s, c, base, verify, &digit_value))
302
+ break;
303
+
304
+ int i = 0;
305
+ BDIGIT_DBL num = digit_value;
306
+ while (true) {
307
+ for ( ; i < big_len; i++) {
308
+ num += (BDIGIT_DBL)zds[i] * base;
309
+ zds[i] = BIGLO(num);
310
+ num = BIGDN(num);
311
+ }
312
+
313
+ if (num == 0)
314
+ break;
315
+
316
+ big_len++;
317
+ }
318
+ }
319
+
320
+ if (!verify)
321
+ return rb_big_norm(z);
322
+
323
+ s--;
324
+ if (str + 1 < s && s[-1] == '_')
325
+ rb_u_raise(rb_eArgError,
326
+ "trailing digit-separator ‘_’ found at position %ld",
327
+ u_pointer_to_offset(str, s));
328
+
329
+ if (*s != '\0')
330
+ rb_u_raise(rb_eArgError,
331
+ "trailing garbage found at position %ld",
332
+ u_pointer_to_offset(str, s));
333
+
334
+ return rb_big_norm(z);
335
+ }
336
+
337
+ VALUE
338
+ rb_u_string_to_inum(VALUE self, int base, bool verify)
339
+ {
340
+ const struct rb_u_string *string = RVAL2USTRING(self);
341
+
342
+ const char *s = USTRING_STR(string);
343
+ if (verify && (s == NULL || memchr(s, '\0', USTRING_LENGTH(string))))
344
+ rb_u_raise(rb_eArgError, "string contains null byte");
345
+
346
+ bool allocated = false;
347
+ if (s != NULL) {
348
+ long len = USTRING_LENGTH(string);
349
+ /* no sentinel somehow */
350
+ if (s[len] != '\0') {
351
+ char *p = ALLOC_N(char, len + 1);
352
+
353
+ MEMCPY(p, s, char, len);
354
+ p[len] = '\0';
355
+ s = p;
356
+ allocated = true;
357
+ }
358
+ }
359
+
360
+ VALUE result = rb_cutf_to_inum(s, base, verify);
361
+ if (allocated)
362
+ free((char *)s);
363
+ return result;
364
+ }