u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,33 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload +(other)
4
+ * @param [U::String, #to_str] other
5
+ * @raise [ArgumentError] If {#bytesize} + OTHER{#bytesize} > LONG_MAX
6
+ * @return [U::String] The concatenation of OTHER to the receiver, inheriting
7
+ * any taint on either */
8
+ VALUE
9
+ rb_u_string_plus(VALUE self, VALUE rbother)
10
+ {
11
+ const struct rb_u_string *string = RVAL2USTRING(self);
12
+ const struct rb_u_string *other = RVAL2USTRING_ANY(rbother);
13
+
14
+ long string_length = USTRING_LENGTH(string);
15
+ long other_length = USTRING_LENGTH(other);
16
+
17
+ /* TODO: Isn’t this off by one, as we add one to length for the
18
+ * ALLOC_N() call? */
19
+ if (string_length > LONG_MAX - other_length)
20
+ rb_u_raise(rb_eArgError, "length of resulting string would be too big");
21
+ long length = string_length + other_length;
22
+
23
+ char *sum = ALLOC_N(char, length + 1);
24
+ memcpy(sum, USTRING_STR(string), string_length);
25
+ memcpy(sum + string_length, USTRING_STR(other), other_length);
26
+ sum[length] = '\0';
27
+
28
+ VALUE result = rb_u_string_new_uninfected_own(sum, length);
29
+ if (OBJ_TAINTED(self) || OBJ_TAINTED(rbother))
30
+ OBJ_TAINT(result);
31
+
32
+ return result;
33
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload print?
4
+ * @return [Boolean] True if the receiver contains only characters not in the
5
+ * general category Other */
6
+ VALUE
7
+ rb_u_string_print(VALUE self)
8
+ {
9
+ return _rb_u_character_test(self, u_char_isprint);
10
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload punct?
4
+ * @return [Boolean] True if the receiver contains only characters in the
5
+ * general categories Punctuation and Symbol */
6
+ VALUE
7
+ rb_u_string_punct(VALUE self)
8
+ {
9
+ return _rb_u_character_test(self, u_char_ispunct);
10
+ }
@@ -0,0 +1,13 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return [U::String] The reversal of the receiver, inheriting any taint and
4
+ * untrust from the receiver
5
+ * @note This doesn’t take into account proper handling of combining marks,
6
+ * direction indicators, and similarly relevant characters, so this method is
7
+ * mostly useful when you know the contents of the string is simple and the
8
+ * result isn’t intended for display. */
9
+ VALUE
10
+ rb_u_string_reverse(VALUE self)
11
+ {
12
+ return _rb_u_string_convert(self, u_reverse);
13
+ }
@@ -0,0 +1,104 @@
1
+ #include "rb_includes.h"
2
+
3
+ long
4
+ rb_u_string_rindex(VALUE self, VALUE rbsubstring, long offset)
5
+ {
6
+ const struct rb_u_string *string = RVAL2USTRING(self);
7
+ const struct rb_u_string *substring = RVAL2USTRING_ANY(rbsubstring);
8
+
9
+ if (USTRING_LENGTH(string) < USTRING_LENGTH(substring))
10
+ return -1;
11
+
12
+ const char *s = rb_u_string_begin_from_offset(string, offset);
13
+ if (s == NULL)
14
+ return -1;
15
+
16
+ if (USTRING_LENGTH(substring) == 0)
17
+ return offset;
18
+
19
+ const char *begin = USTRING_STR(string);
20
+ const char *t = USTRING_STR(substring);
21
+ long t_length = USTRING_LENGTH(substring);
22
+ while (s >= begin) {
23
+ if (rb_memcmp(s, t, t_length) == 0)
24
+ return u_pointer_to_offset(begin, s);
25
+ s--;
26
+ }
27
+
28
+ return -1;
29
+ }
30
+
31
+ /* @overload rindex(pattern, offset = -1)
32
+ *
33
+ * Returns the maximal index of the receiver where PATTERN matches, equal to
34
+ * or less than _i_, where _i_ = OFFSET if OFFSET ≥ 0, _i_ = {#length} -
35
+ * abs(OFFSET) otherwise, or nil if there is no match.
36
+ *
37
+ * If PATTERN is a Regexp, the Regexp special variables `$&`, `$'`,
38
+ * <code>$\`</code>, `$1`, `$2`, …, `$`_n_ are updated accordingly.
39
+ *
40
+ * If PATTERN responds to `#to_str`, the matching is performed by a byte
41
+ * comparison.
42
+ *
43
+ * @param [Regexp, #to_str] pattern
44
+ * @param [#to_int] offset
45
+ * @return [Integer, nil]
46
+ * @see #index */
47
+ VALUE
48
+ rb_u_string_rindex_m(int argc, VALUE *argv, VALUE self)
49
+ {
50
+ const struct rb_u_string *string = RVAL2USTRING(self);
51
+
52
+ VALUE sub, rboffset;
53
+ long offset;
54
+ if (rb_scan_args(argc, argv, "11", &sub, &rboffset) == 2)
55
+ offset = NUM2LONG(rboffset);
56
+ else
57
+ /* TODO: Why not simply use -1? Benchmark which is faster. */
58
+ offset = u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string));
59
+
60
+ const char *begin = rb_u_string_begin_from_offset(string, offset);
61
+ const char *end = USTRING_END(string);
62
+ if (begin == NULL) {
63
+ if (offset <= 0) {
64
+ if (TYPE(sub) == T_REGEXP)
65
+ rb_backref_set(Qnil);
66
+
67
+ return Qnil;
68
+ }
69
+
70
+ begin = end;
71
+ /* TODO: this converting back and forward can be optimized away
72
+ * if rb_u_string_index_regexp() and rb_u_string_rindex() were split up
73
+ * into two additional functions, adding
74
+ * rb_u_string_index_regexp_pointer() and rb_u_string_rindex_pointer(),
75
+ * so that one can pass a pointer to start at immediately
76
+ * instead of an offset that gets calculated into a pointer. */
77
+ offset = u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string));
78
+ }
79
+
80
+ switch (TYPE(sub)) {
81
+ case T_REGEXP:
82
+ /* TODO: What’s this first test for, exactly? */
83
+ if (RREGEXP(sub)->ptr == NULL || RREGEXP_SRC_LEN(sub) > 0)
84
+ offset = rb_u_string_index_regexp(self, begin, sub, true);
85
+ break;
86
+ default: {
87
+ VALUE tmp = rb_check_string_type(sub);
88
+ if (NIL_P(tmp))
89
+ rb_u_raise(rb_eTypeError, "type mismatch: %s given",
90
+ rb_obj_classname(sub));
91
+
92
+ sub = tmp;
93
+ }
94
+ /* fall through */
95
+ case T_STRING:
96
+ offset = rb_u_string_rindex(self, sub, offset);
97
+ break;
98
+ }
99
+
100
+ if (offset < 0)
101
+ return Qnil;
102
+
103
+ return LONG2NUM(offset);
104
+ }
@@ -0,0 +1,81 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ static VALUE
5
+ rb_u_string_rpartition_failure(VALUE self)
6
+ {
7
+ return rb_ary_new3(3,
8
+ rb_u_string_new_empty(self),
9
+ rb_u_string_new_empty(self),
10
+ self);
11
+ }
12
+
13
+ static VALUE
14
+ rb_u_string_rpartition_success(VALUE self, VALUE rbseparator, long offset)
15
+ {
16
+ const struct rb_u_string *string = RVAL2USTRING(self);
17
+ const struct rb_u_string *separator = RVAL2USTRING_ANY(rbseparator);
18
+
19
+ long after = offset + USTRING_LENGTH(separator);
20
+
21
+ return rb_ary_new3(3,
22
+ rb_u_string_new_subsequence(self, 0, offset),
23
+ TYPE(rbseparator) == T_STRING ?
24
+ rb_u_string_new_rb(rbseparator) :
25
+ rbseparator,
26
+ rb_u_string_new_subsequence(self,
27
+ after,
28
+ USTRING_LENGTH(string) - after));
29
+ }
30
+
31
+ static VALUE
32
+ rb_u_string_rpartition_regex(VALUE self, VALUE regex)
33
+ {
34
+ VALUE str = rb_str_to_str(self);
35
+
36
+ long offset = rb_reg_search(regex, str, RSTRING_LEN(str), 1);
37
+ if (offset < 0)
38
+ return rb_u_string_rpartition_failure(self);
39
+
40
+ VALUE separator = rb_u_pattern_match_reference(INT2FIX(0));
41
+
42
+ return rb_u_string_rpartition_success(self, separator, offset);
43
+ }
44
+
45
+ static VALUE
46
+ rb_u_string_rpartition_string(VALUE self, VALUE rbseparator)
47
+ {
48
+ const struct rb_u_string *string = RVAL2USTRING(self);
49
+ const char *begin = USTRING_STR(string);
50
+
51
+ VALUE validated = rb_u_string_validate_type(rbseparator);
52
+
53
+ long offset = rb_u_string_rindex(self,
54
+ validated,
55
+ u_n_chars_n(begin,
56
+ USTRING_LENGTH(string)));
57
+ if (offset < 0)
58
+ return rb_u_string_rpartition_failure(self);
59
+
60
+ long byte_offset = u_offset_to_pointer(begin, offset) - begin;
61
+
62
+ return rb_u_string_rpartition_success(self, validated, byte_offset);
63
+ }
64
+
65
+ /* @overload rpartition(separator)
66
+ * @param [Regexp, #to_str] separator
67
+ * @return [Array<U::String>] The receiver split into _s₁_ = {#slice}(0, _i_),
68
+ * _s₂_ = {#slice}(_i_, _n_), _s₃_ = {#slice}(_i_ + _n_, -1), where _i_ = _j_ if _j_ ≠
69
+ * nil, _i_ = 0 otherwise, _j_ = {#rindex}(SEPARATOR), _n_ =
70
+ * SEPARATOR{#length}, where _s₁_ and _s₃_ inherit any taint and untrust
71
+ * from the receiver and _s₂_ inherits any taint and untrust from SEPARATOR
72
+ * and also from the receiver if SEPARATOR is a Regexp
73
+ * @see #partition */
74
+ VALUE
75
+ rb_u_string_rpartition(VALUE self, VALUE separator)
76
+ {
77
+ if (TYPE(separator) == T_REGEXP)
78
+ return rb_u_string_rpartition_regex(self, separator);
79
+
80
+ return rb_u_string_rpartition_string(self, separator);
81
+ }
@@ -0,0 +1,29 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return [U::String] The receiver with its maximum {#space?} suffix removed,
4
+ * inheriting any taint and untrust from the receiver
5
+ * @see #lstrip
6
+ * @see #strip */
7
+ VALUE
8
+ rb_u_string_rstrip(VALUE self)
9
+ {
10
+ const struct rb_u_string *string = RVAL2USTRING(self);
11
+
12
+ const char *begin = USTRING_STR(string);
13
+ if (begin == NULL)
14
+ return self;
15
+
16
+ const char *end = USTRING_END(string);
17
+ const char *q = end;
18
+ while (begin < q) {
19
+ const char *p;
20
+ uint32_t c = u_decode_r(&p, begin, q);
21
+ if (c != '\0' && !u_char_isspace(c))
22
+ break;
23
+ q = p;
24
+ }
25
+ if (q == end)
26
+ return self;
27
+
28
+ return rb_u_string_new_c(self, begin, q - begin);
29
+ }
@@ -0,0 +1,109 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ static VALUE
5
+ rb_u_string_scan_once(VALUE string, VALUE pattern, long *start, long *last)
6
+ {
7
+ if (rb_reg_search(pattern, string, *start, false) < 0) {
8
+ if (*last >= 0)
9
+ rb_reg_search(pattern, string, *last, false);
10
+
11
+ return Qnil;
12
+ }
13
+
14
+ *last = *start;
15
+
16
+ VALUE match = rb_backref_get();
17
+ struct re_registers *registers = RMATCH_REGS(match);
18
+ if (registers->beg[0] == registers->end[0]) {
19
+ if (RSTRING_LEN(string) > registers->end[0])
20
+ *start = registers->end[0] +
21
+ (u_next(RSTRING_PTR(string) + registers->end[0]) -
22
+ (RSTRING_PTR(string) + registers->end[0]));
23
+ else
24
+ *start = registers->end[0] + 1;
25
+ } else {
26
+ *start = registers->end[0];
27
+ }
28
+
29
+ if (registers->num_regs == 1)
30
+ return rb_u_string_new_rb(rb_reg_nth_match(0, match));
31
+
32
+ VALUE result = rb_ary_new2(registers->num_regs);
33
+ for (int i = 1; i < registers->num_regs; i++)
34
+ rb_ary_push(result, rb_u_string_new_rb(rb_reg_nth_match(i, match)));
35
+
36
+ return result;
37
+ }
38
+
39
+ static VALUE
40
+ rb_u_string_scan_block(VALUE self, VALUE string, VALUE pattern)
41
+ {
42
+ VALUE result;
43
+ long start = 0;
44
+ long last = -1;
45
+
46
+ while (!NIL_P(result = rb_u_string_scan_once(string, pattern, &start, &last)))
47
+ rb_yield(result);
48
+
49
+ return self;
50
+ }
51
+
52
+ static VALUE
53
+ rb_u_string_scan_array(VALUE string, VALUE pattern)
54
+ {
55
+ VALUE result;
56
+ long start = 0;
57
+ long last = -1;
58
+
59
+ VALUE array = rb_ary_new();
60
+ while (!NIL_P(result = rb_u_string_scan_once(string, pattern, &start, &last)))
61
+ rb_ary_push(array, result);
62
+
63
+ return array;
64
+ }
65
+
66
+ /* @overload scan(pattern)
67
+ * @param [Regexp] pattern
68
+ * @return [Array<U::String>, Array<Array<U::String>>] All matches – or
69
+ * sub-matches, if they exist – of matches of PATTERN in the receiver, each
70
+ * inheriting any taint and untrust from both the receiver and from PATTERN
71
+ * @note The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`,
72
+ * `$2`, …, `$`_n_ are updated accordingly.
73
+ *
74
+ * @overload scan(pattern)
75
+ * @param [#to_str] pattern
76
+ * @return [Array<U::String>] All matches of PATTERN in the receiver, each
77
+ * inheriting any taint and untrust from the receiver
78
+ *
79
+ * @overload scan(pattern){ |*submatches| … }
80
+ *
81
+ * Enumerates the sub-matches of matches of PATTERN in the receiver, each
82
+ * inheriting any taint and untrust from both the receiver and from PATTERN.
83
+ *
84
+ * @param [Regexp] pattern
85
+ * @yieldparam [Array<U::String>] submatches
86
+ * @return [self]
87
+ * @note The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`,
88
+ * `$2`, …, `$`_n_ are updated accordingly.
89
+ *
90
+ * @overload scan(pattern){ |match| … }
91
+ *
92
+ * Enumerates the matches of PATTERN in the receiver, each inheriting any
93
+ * taint and untrust from the receiver.
94
+ *
95
+ * @param [#to_str] pattern
96
+ * @yieldparam [U::String] match
97
+ * @return [self] */
98
+ VALUE
99
+ rb_u_string_scan(VALUE self, VALUE pattern)
100
+ {
101
+ pattern = rb_u_pattern_argument(pattern, true);
102
+
103
+ VALUE string = rb_str_to_str(self);
104
+
105
+ if (rb_block_given_p())
106
+ return rb_u_string_scan_block(self, string, pattern);
107
+
108
+ return rb_u_string_scan_array(string, pattern);
109
+ }
@@ -0,0 +1,253 @@
1
+ #include "rb_includes.h"
2
+
3
+ #define SCRIPT2ID(script, symbol) /* expands to one complete `case` that maps U_SCRIPT_<script> to the Ruby Symbol :<symbol> */ \
4
+ case U_SCRIPT_##script: { \
5
+ static ID id_##symbol; /* one cache slot per case, kept across calls; zero-initialised because it is static */ \
6
+ if (id_##symbol == 0) /* 0 means the name has not been interned yet */ \
7
+ id_##symbol = rb_intern(#symbol); /* #symbol stringifies the macro argument */ \
8
+ return ID2SYM(id_##symbol); /* returns from the enclosing function, so cases never fall through */ \
9
+ }
10
+
11
+ static VALUE
12
+ script_to_symbol(enum u_script script) /* converts a script enum value into the Ruby Symbol naming it, e.g. U_SCRIPT_LATIN -> :latin */
13
+ {
14
+ switch (script) { /* every SCRIPT2ID(...) line below expands to a full `case` that returns the Symbol */
15
+ SCRIPT2ID(COMMON, common)
16
+ SCRIPT2ID(INHERITED, inherited)
17
+ SCRIPT2ID(ARABIC, arabic)
18
+ SCRIPT2ID(ARMENIAN, armenian)
19
+ SCRIPT2ID(BENGALI, bengali)
20
+ SCRIPT2ID(BOPOMOFO, bopomofo)
21
+ SCRIPT2ID(CHEROKEE, cherokee)
22
+ SCRIPT2ID(COPTIC, coptic)
23
+ SCRIPT2ID(CYRILLIC, cyrillic)
24
+ SCRIPT2ID(DESERET, deseret)
25
+ SCRIPT2ID(DEVANAGARI, devanagari)
26
+ SCRIPT2ID(ETHIOPIC, ethiopic)
27
+ SCRIPT2ID(GEORGIAN, georgian)
28
+ SCRIPT2ID(GOTHIC, gothic)
29
+ SCRIPT2ID(GREEK, greek)
30
+ SCRIPT2ID(GUJARATI, gujarati)
31
+ SCRIPT2ID(GURMUKHI, gurmukhi)
32
+ SCRIPT2ID(HAN, han)
33
+ SCRIPT2ID(HANGUL, hangul)
34
+ SCRIPT2ID(HEBREW, hebrew)
35
+ SCRIPT2ID(HIRAGANA, hiragana)
36
+ SCRIPT2ID(KANNADA, kannada)
37
+ SCRIPT2ID(KATAKANA, katakana)
38
+ SCRIPT2ID(KHMER, khmer)
39
+ SCRIPT2ID(LAO, lao)
40
+ SCRIPT2ID(LATIN, latin)
41
+ SCRIPT2ID(MALAYALAM, malayalam)
42
+ SCRIPT2ID(MONGOLIAN, mongolian)
43
+ SCRIPT2ID(MYANMAR, myanmar)
44
+ SCRIPT2ID(OGHAM, ogham)
45
+ SCRIPT2ID(OLD_ITALIC, old_italic)
46
+ SCRIPT2ID(ORIYA, oriya)
47
+ SCRIPT2ID(RUNIC, runic)
48
+ SCRIPT2ID(SINHALA, sinhala)
49
+ SCRIPT2ID(SYRIAC, syriac)
50
+ SCRIPT2ID(TAMIL, tamil)
51
+ SCRIPT2ID(TELUGU, telugu)
52
+ SCRIPT2ID(THAANA, thaana)
53
+ SCRIPT2ID(THAI, thai)
54
+ SCRIPT2ID(TIBETAN, tibetan)
55
+ SCRIPT2ID(CANADIAN_ABORIGINAL, canadian_aboriginal)
56
+ SCRIPT2ID(YI, yi)
57
+ SCRIPT2ID(TAGALOG, tagalog)
58
+ SCRIPT2ID(HANUNOO, hanunoo)
59
+ SCRIPT2ID(BUHID, buhid)
60
+ SCRIPT2ID(TAGBANWA, tagbanwa)
61
+ SCRIPT2ID(BRAILLE, braille)
62
+ SCRIPT2ID(CYPRIOT, cypriot)
63
+ SCRIPT2ID(LIMBU, limbu)
64
+ SCRIPT2ID(OSMANYA, osmanya)
65
+ SCRIPT2ID(SHAVIAN, shavian)
66
+ SCRIPT2ID(LINEAR_B, linear_b)
67
+ SCRIPT2ID(TAI_LE, tai_le)
68
+ SCRIPT2ID(UGARITIC, ugaritic)
69
+ SCRIPT2ID(NEW_TAI_LUE, new_tai_lue)
70
+ SCRIPT2ID(BUGINESE, buginese)
71
+ SCRIPT2ID(GLAGOLITIC, glagolitic)
72
+ SCRIPT2ID(TIFINAGH, tifinagh)
73
+ SCRIPT2ID(SYLOTI_NAGRI, syloti_nagri)
74
+ SCRIPT2ID(OLD_PERSIAN, old_persian)
75
+ SCRIPT2ID(KHAROSHTHI, kharoshthi)
76
+ SCRIPT2ID(UNKNOWN, unknown)
77
+ SCRIPT2ID(BALINESE, balinese)
78
+ SCRIPT2ID(CUNEIFORM, cuneiform)
79
+ SCRIPT2ID(PHOENICIAN, phoenician)
80
+ SCRIPT2ID(PHAGS_PA, phags_pa)
81
+ SCRIPT2ID(NKO, nko)
82
+ SCRIPT2ID(KAYAH_LI, kayah_li)
83
+ SCRIPT2ID(LEPCHA, lepcha)
84
+ SCRIPT2ID(REJANG, rejang)
85
+ SCRIPT2ID(SUNDANESE, sundanese)
86
+ SCRIPT2ID(SAURASHTRA, saurashtra)
87
+ SCRIPT2ID(CHAM, cham)
88
+ SCRIPT2ID(OL_CHIKI, ol_chiki)
89
+ SCRIPT2ID(VAI, vai)
90
+ SCRIPT2ID(CARIAN, carian)
91
+ SCRIPT2ID(LYCIAN, lycian)
92
+ SCRIPT2ID(LYDIAN, lydian)
93
+ SCRIPT2ID(AVESTAN, avestan)
94
+ SCRIPT2ID(BAMUM, bamum)
95
+ SCRIPT2ID(EGYPTIAN_HIEROGLYPHS, egyptian_hieroglyphs)
96
+ SCRIPT2ID(IMPERIAL_ARAMAIC, imperial_aramaic)
97
+ SCRIPT2ID(INSCRIPTIONAL_PAHLAVI, inscriptional_pahlavi)
98
+ SCRIPT2ID(INSCRIPTIONAL_PARTHIAN, inscriptional_parthian)
99
+ SCRIPT2ID(JAVANESE, javanese)
100
+ SCRIPT2ID(KAITHI, kaithi)
101
+ SCRIPT2ID(LISU, lisu)
102
+ SCRIPT2ID(MEETEI_MAYEK, meetei_mayek)
103
+ SCRIPT2ID(OLD_SOUTH_ARABIAN, old_south_arabian)
104
+ SCRIPT2ID(OLD_TURKIC, old_turkic)
105
+ SCRIPT2ID(SAMARITAN, samaritan)
106
+ SCRIPT2ID(TAI_THAM, tai_tham)
107
+ SCRIPT2ID(TAI_VIET, tai_viet)
108
+ SCRIPT2ID(BATAK, batak)
109
+ SCRIPT2ID(BRAHMI, brahmi)
110
+ SCRIPT2ID(MANDAIC, mandaic)
111
+ SCRIPT2ID(MEROITIC_HIEROGLYPHS, meroitic_hieroglyphs)
112
+ SCRIPT2ID(MEROITIC_CURSIVE, meroitic_cursive)
113
+ SCRIPT2ID(SORA_SOMPENG, sora_sompeng)
114
+ SCRIPT2ID(CHAKMA, chakma)
115
+ SCRIPT2ID(SHARADA, sharada)
116
+ SCRIPT2ID(TAKRI, takri)
117
+ SCRIPT2ID(MIAO, miao)
118
+ default: /* any enum value without a SCRIPT2ID case above */
119
+ rb_u_raise(rb_eNotImpError, "unknown script: %d", script); /* presumably raises and never returns -- no return statement follows the switch; confirm rb_u_raise is declared noreturn */
120
+ }
121
+ }
122
+
123
+ /* Returns the script of the characters of the receiver.
124
+ *
125
+ * The script of a character identifies the primary writing system that uses
126
+ * the character.
127
+ *
128
+ * <table>
129
+ * <thead><tr><th>Script</th><th>Description</th></tr></thead>
130
+ * <tbody>
131
+ * <tr><td>:arabic</td><td>Arabic</td></tr>
132
+ * <tr><td>:armenian</td><td>Armenian</td></tr>
133
+ * <tr><td>:avestan</td><td>Avestan</td></tr>
134
+ * <tr><td>:balinese</td><td>Balinese</td></tr>
135
+ * <tr><td>:bamum</td><td>Bamum</td></tr>
136
+ * <tr><td>:batak</td><td>Batak</td></tr>
137
+ * <tr><td>:bengali</td><td>Bengali</td></tr>
138
+ * <tr><td>:bopomofo</td><td>Bopomofo</td></tr>
139
+ * <tr><td>:brahmi</td><td>Brahmi</td></tr>
140
+ * <tr><td>:braille</td><td>Braille</td></tr>
141
+ * <tr><td>:buginese</td><td>Buginese</td></tr>
142
+ * <tr><td>:buhid</td><td>Buhid</td></tr>
143
+ * <tr><td>:canadian_aboriginal</td><td>Canadian Aboriginal</td></tr>
144
+ * <tr><td>:carian</td><td>Carian</td></tr>
145
+ * <tr><td>:chakma</td><td>Chakma</td></tr>
146
+ * <tr><td>:cham</td><td>Cham</td></tr>
147
+ * <tr><td>:cherokee</td><td>Cherokee</td></tr>
148
+ * <tr><td>:common</td><td>For other characters that may be used with multiple scripts</td></tr>
149
+ * <tr><td>:coptic</td><td>Coptic</td></tr>
150
+ * <tr><td>:cuneiform</td><td>Cuneiform</td></tr>
151
+ * <tr><td>:cypriot</td><td>Cypriot</td></tr>
152
+ * <tr><td>:cyrillic</td><td>Cyrillic</td></tr>
153
+ * <tr><td>:deseret</td><td>Deseret</td></tr>
154
+ * <tr><td>:devanagari</td><td>Devanagari</td></tr>
155
+ * <tr><td>:egyptian_hieroglyphs</td><td>Egyptian Hieroglyphs</td></tr>
156
+ * <tr><td>:ethiopic</td><td>Ethiopic</td></tr>
157
+ * <tr><td>:georgian</td><td>Georgian</td></tr>
158
+ * <tr><td>:glagolitic</td><td>Glagolitic</td></tr>
159
+ * <tr><td>:gothic</td><td>Gothic</td></tr>
160
+ * <tr><td>:greek</td><td>Greek</td></tr>
161
+ * <tr><td>:gujarati</td><td>Gujarati</td></tr>
162
+ * <tr><td>:gurmukhi</td><td>Gurmukhi</td></tr>
163
+ * <tr><td>:han</td><td>Han</td></tr>
164
+ * <tr><td>:hangul</td><td>Hangul</td></tr>
165
+ * <tr><td>:hanunoo</td><td>Hanunoo</td></tr>
166
+ * <tr><td>:hebrew</td><td>Hebrew</td></tr>
167
+ * <tr><td>:hiragana</td><td>Hiragana</td></tr>
168
+ * <tr><td>:imperial_aramaic</td><td>Imperial Aramaic</td></tr>
169
+ * <tr><td>:inherited</td><td>For characters that may be used with multiple
170
+ * scripts, and that inherit their script from the preceding characters;
171
+ * these include nonspacing marks, enclosing marks, and the zero-width
172
+ * joiner/non-joiner characters</td></tr>
173
+ * <tr><td>:inscriptional_pahlavi</td><td>Inscriptional Pahlavi</td></tr>
174
+ * <tr><td>:inscriptional_parthian</td><td>Inscriptional Parthian</td></tr>
175
+ * <tr><td>:javanese</td><td>Javanese</td></tr>
176
+ * <tr><td>:kaithi</td><td>Kaithi</td></tr>
177
+ * <tr><td>:kannada</td><td>Kannada</td></tr>
178
+ * <tr><td>:katakana</td><td>Katakana</td></tr>
179
+ * <tr><td>:kayah_li</td><td>Kayah Li</td></tr>
180
+ * <tr><td>:kharoshthi</td><td>Kharoshthi</td></tr>
181
+ * <tr><td>:khmer</td><td>Khmer</td></tr>
182
+ * <tr><td>:lao</td><td>Lao</td></tr>
183
+ * <tr><td>:latin</td><td>Latin</td></tr>
184
+ * <tr><td>:lepcha</td><td>Lepcha</td></tr>
185
+ * <tr><td>:limbu</td><td>Limbu</td></tr>
186
+ * <tr><td>:linear_b</td><td>Linear B</td></tr>
187
+ * <tr><td>:lisu</td><td>Lisu</td></tr>
188
+ * <tr><td>:lycian</td><td>Lycian</td></tr>
189
+ * <tr><td>:lydian</td><td>Lydian</td></tr>
190
+ * <tr><td>:malayalam</td><td>Malayalam</td></tr>
191
+ * <tr><td>:mandaic</td><td>Mandaic</td></tr>
192
+ * <tr><td>:meetei_mayek</td><td>Meetei Mayek</td></tr>
193
+ * <tr><td>:meroitic_hieroglyphs</td><td>Meroitic Hieroglyphs</td></tr>
194
+ * <tr><td>:meroitic_cursive</td><td>Meroitic Cursive</td></tr>
195
+ * <tr><td>:miao</td><td>Miao</td></tr>
196
+ * <tr><td>:mongolian</td><td>Mongolian</td></tr>
197
+ * <tr><td>:myanmar</td><td>Myanmar</td></tr>
198
+ * <tr><td>:new_tai_lue</td><td>New Tai Lue</td></tr>
199
+ * <tr><td>:nko</td><td>N'Ko</td></tr>
200
+ * <tr><td>:ogham</td><td>Ogham</td></tr>
201
+ * <tr><td>:old_italic</td><td>Old Italic</td></tr>
202
+ * <tr><td>:old_persian</td><td>Old Persian</td></tr>
203
+ * <tr><td>:old_south_arabian</td><td>Old South Arabian</td></tr>
204
+ * <tr><td>:old_turkic</td><td>Old Turkic</td></tr>
205
+ * <tr><td>:ol_chiki</td><td>Ol Chiki</td></tr>
206
+ * <tr><td>:oriya</td><td>Oriya</td></tr>
207
+ * <tr><td>:osmanya</td><td>Osmanya</td></tr>
208
+ * <tr><td>:phags_pa</td><td>Phags-pa</td></tr>
209
+ * <tr><td>:phoenician</td><td>Phoenician</td></tr>
210
+ * <tr><td>:rejang</td><td>Rejang</td></tr>
211
+ * <tr><td>:runic</td><td>Runic</td></tr>
212
+ * <tr><td>:samaritan</td><td>Samaritan</td></tr>
213
+ * <tr><td>:saurashtra</td><td>Saurashtra</td></tr>
214
+ * <tr><td>:sharada</td><td>Sharada</td></tr>
215
+ * <tr><td>:shavian</td><td>Shavian</td></tr>
216
+ * <tr><td>:sinhala</td><td>Sinhala</td></tr>
217
+ * <tr><td>:sora_sompeng</td><td>Sora Sompeng</td></tr>
218
+ * <tr><td>:sundanese</td><td>Sundanese</td></tr>
219
+ * <tr><td>:syloti_nagri</td><td>Syloti Nagri</td></tr>
220
+ * <tr><td>:syriac</td><td>Syriac</td></tr>
221
+ * <tr><td>:tagalog</td><td>Tagalog</td></tr>
222
+ * <tr><td>:tagbanwa</td><td>Tagbanwa</td></tr>
223
+ * <tr><td>:tai_le</td><td>Tai Le</td></tr>
224
+ * <tr><td>:tai_tham</td><td>Tai Tham</td></tr>
225
+ * <tr><td>:tai_viet</td><td>Tai Viet</td></tr>
226
+ * <tr><td>:takri</td><td>Takri</td></tr>
227
+ * <tr><td>:tamil</td><td>Tamil</td></tr>
228
+ * <tr><td>:telugu</td><td>Telugu</td></tr>
229
+ * <tr><td>:thaana</td><td>Thaana</td></tr>
230
+ * <tr><td>:thai</td><td>Thai</td></tr>
231
+ * <tr><td>:tibetan</td><td>Tibetan</td></tr>
232
+ * <tr><td>:tifinagh</td><td>Tifinagh</td></tr>
233
+ * <tr><td>:ugaritic</td><td>Ugaritic</td></tr>
234
+ * <tr><td>:unknown</td><td>For not assigned, private-use, non-character, and surrogate code points</td></tr>
235
+ * <tr><td>:vai</td><td>Vai</td></tr>
236
+ * <tr><td>:yi</td><td>Yi</td></tr>
237
+ * </tbody>
238
+ * </table>
239
+ *
240
+ * @raise [ArgumentError] If the receiver contains two characters belonging to
241
+ * different scripts
242
+ * @raise [ArgumentError] If the receiver contains an incomplete UTF-8 sequence
243
+ * @raise [ArgumentError] If the receiver contains an invalid UTF-8 sequence
244
+ * @return [Symbol]
245
+ * @see http://www.unicode.org/reports/tr24/
246
+ * Unicode Standard Annex #24 Unicode Script Property */
247
+ VALUE
248
+ rb_u_string_script(VALUE self)
249
+ {
250
+ return _rb_u_string_property(self, "script", U_SCRIPT_UNKNOWN, /* delegates to the shared property helper; "script" is presumably the property name used in error messages and U_SCRIPT_UNKNOWN the fallback value -- TODO confirm against the helper */
251
+ (int (*)(uint32_t))u_char_script, /* per-character lookup, cast to the int-returning signature the helper takes */
252
+ (VALUE (*)(int))script_to_symbol); /* result converter; NOTE(review): script_to_symbol is defined taking enum u_script, so calling it through an int-parameter pointer relies on the enum being int-compatible -- confirm */
253
+ }