u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,173 @@
1
+ /* TODO: Move this to rb_u_string.c and turn USTRING_STR() into
2
+ * rb_u_string_c_str() and USTRING_LENGTH() into rb_u_string_c_length() */
3
+ struct rb_u_string { /* Payload wrapped by U::String objects; read it through the USTRING_*() macros */
4
+ VALUE rb; /* Ruby String supplying the bytes, or nil when c/length are used instead */
5
+ const char *c; /* First byte of the content; only consulted when rb is nil */
6
+ long length; /* Number of bytes at c; only consulted when rb is nil */
7
+ };
8
+
9
+
10
/* Unwrap OBJECT into its struct rb_u_string.  The comma operator guarantees
 * that Check_Type() runs before DATA_PTR() is applied.  OBJECT is evaluated
 * twice, so it must be free of side effects. */
#define RVAL2USTRING(object) \
        (Check_Type(object, T_DATA), (struct rb_u_string *)DATA_PTR(object))

/* Like RVAL2USTRING(), but anything that isn't a U::String is passed through
 * StringValue() and wrapped in a temporary made by USTRING_LOCAL().  VALUE is
 * evaluated more than once, and the temporary only lives as long as the
 * enclosing block. */
#define RVAL2USTRING_ANY(value) \
        (RTEST(rb_obj_is_kind_of((value), rb_cUString)) ? \
         RVAL2USTRING(value) : \
         USTRING_LOCAL(StringValue(value), NULL, 0))

/* Wrap STRING in a new U::String object using rb_u_string_mark() and
 * rb_u_string_free() as its mark and free functions. */
#define USTRING2RVAL(string) \
        Data_Wrap_Struct(rb_cUString, rb_u_string_mark, rb_u_string_free, string)

/* Pointer to an unnamed struct rb_u_string (a compound literal) with the given
 * fields; it is valid until the enclosing block is left. */
#define USTRING_LOCAL(arb, ac, alength) \
        (&(struct rb_u_string){ .rb = (arb), .c = (ac), .length = (alength) })

/* The three accessors below read from the C buffer when STRING's rb field is
 * nil and from the Ruby String stored in rb otherwise.  STRING is evaluated
 * twice; every use is parenthesized so that arguments such as `&local` or
 * `strings + 1` expand correctly. */

/* TODO: Call this USTRING_BEGIN() instead */
/* Pointer to the first byte of STRING. */
#define USTRING_STR(string) \
        (NIL_P((string)->rb) ? (string)->c : RSTRING_PTR((string)->rb))

/* Number of bytes in STRING. */
#define USTRING_LENGTH(string) \
        (NIL_P((string)->rb) ? (string)->length : RSTRING_LEN((string)->rb))

/* Pointer one past the last byte of STRING. */
#define USTRING_END(string) \
        (NIL_P((string)->rb) ? (string)->c + (string)->length : RSTRING_END((string)->rb))
33
+
34
/* Translate OFFSET into a pointer into STRING by way of
 * u_offset_to_pointer_n().  A non-negative OFFSET is resolved relative to the
 * start of STRING and a negative one relative to its end; the byte length of
 * STRING is passed along as the limit.  Callers in this project treat a NULL
 * result as "OFFSET is out of range". */
static inline const char *
rb_u_string_begin_from_offset(const struct rb_u_string *string, long offset)
{
        const char *base;

        if (offset >= 0)
                base = USTRING_STR(string);
        else
                base = USTRING_END(string);

        return u_offset_to_pointer_n(base, offset, USTRING_LENGTH(string));
}
43
+
44
+
45
+ extern VALUE rb_cUString;
46
+
47
+
48
+ VALUE rb_u_string_new_uninfected(const char *str, long length);
49
+ VALUE rb_u_string_new_uninfected_own(const char *str, long length);
50
+ VALUE rb_u_string_new_c(VALUE self, const char *str, long length);
51
+ VALUE rb_u_string_new_c_own(VALUE self, const char *str, long length);
52
+ VALUE rb_u_string_new_rb(VALUE str);
53
+ VALUE rb_u_string_new_subsequence(VALUE self, long begin, long length);
54
+ VALUE rb_u_string_new_empty(VALUE self);
55
+
56
+ VALUE rb_u_string_check_type(VALUE str);
57
+ VALUE rb_u_string_validate_type(VALUE str);
58
+ VALUE rb_u_string_object_as_string(VALUE object);
59
+
60
+ const char *rb_u_string_begin_from_offset(const struct rb_u_string *string, long offset);
61
+
62
+ /* TODO: Move to rb_private.h. */
63
+ long rb_u_string_index(VALUE self, VALUE sub, long offset);
64
+ long rb_u_string_rindex(VALUE self, VALUE rbsubstring, long offset);
65
+
66
+ VALUE rb_u_string_alnum(VALUE self);
67
+ VALUE rb_u_string_alpha(VALUE self);
68
+ VALUE rb_u_string_assigned(VALUE self);
69
+ VALUE rb_u_string_aref_m(int argc, VALUE *argv, VALUE self);
70
+ VALUE rb_u_string_ascii_only(VALUE self);
71
+ VALUE rb_u_string_b(VALUE self);
72
+ VALUE rb_u_string_bytes(VALUE self);
73
+ VALUE rb_u_string_bytesize(VALUE self);
74
+ VALUE rb_u_string_byteslice_m(int argc, VALUE *argv, VALUE self);
75
+ VALUE rb_u_string_center(int argc, VALUE *argv, VALUE self);
76
+ VALUE rb_u_string_casecmp(int argc, VALUE *argv, VALUE self);
77
+ VALUE rb_u_string_cased(VALUE self);
78
+ VALUE rb_u_string_case_ignorable(VALUE self);
79
+ VALUE rb_u_string_chars(VALUE self);
80
+ VALUE rb_u_string_chomp(int argc, VALUE *argv, VALUE self);
81
+ VALUE rb_u_string_chop(VALUE self);
82
+ VALUE rb_u_string_chr(VALUE self);
83
+ VALUE rb_u_string_cntrl(VALUE self);
84
+ VALUE rb_u_string_codepoints(VALUE self);
85
+ VALUE rb_u_string_collate(int argc, VALUE *argv, VALUE self);
86
+ VALUE rb_u_string_collation_key(int argc, VALUE *argv, VALUE self);
87
+ VALUE rb_u_string_canonical_combining_class(VALUE self);
88
+ VALUE rb_u_string_count(int argc, VALUE *argv, VALUE self);
89
+ VALUE rb_u_string_defined(VALUE self);
90
+ VALUE rb_u_string_delete(int argc, VALUE *argv, VALUE self);
91
+ VALUE rb_u_string_digit(VALUE self);
92
+ VALUE rb_u_string_downcase(int argc, VALUE *argv, VALUE self);
93
+ VALUE rb_u_string_dump(VALUE self);
94
+ VALUE rb_u_string_dup(VALUE self);
95
+ VALUE rb_u_string_each_byte(VALUE self);
96
+ VALUE rb_u_string_each_char(VALUE self);
97
+ VALUE rb_u_string_each_codepoint(VALUE self);
98
+ VALUE rb_u_string_each_grapheme_cluster(VALUE self);
99
+ VALUE rb_u_string_each_line(int argc, VALUE *argv, VALUE self);
100
+ VALUE rb_u_string_each_word(VALUE self);
101
+ VALUE rb_u_string_empty(VALUE self);
102
+ VALUE rb_u_string_end_with(int argc, VALUE *argv, VALUE self);
103
+ VALUE rb_u_string_equal(VALUE self, VALUE other);
104
+ VALUE rb_u_string_eql(VALUE self, VALUE other);
105
+ VALUE rb_u_string_foldcase(int argc, VALUE *argv, VALUE self);
106
+ VALUE rb_u_string_folded(int argc, VALUE *argv, VALUE self);
107
+ VALUE rb_u_string_format(int argc, const VALUE *argv, VALUE self);
108
+ VALUE rb_u_string_format_m(VALUE self, VALUE argument);
109
+ VALUE rb_u_string_general_category(VALUE self);
110
+ VALUE rb_u_string_getbyte(VALUE self, VALUE rbindex);
111
+ VALUE rb_u_string_graph(VALUE self);
112
+ VALUE rb_u_string_grapheme_break(VALUE self);
113
+ VALUE rb_u_string_gsub(int argc, VALUE *argv, VALUE self);
114
+ VALUE rb_u_string_hash(VALUE self);
115
+ VALUE rb_u_string_hex(VALUE self);
116
+ VALUE rb_u_string_index_m(int argc, VALUE *argv, VALUE self);
117
+ VALUE rb_u_string_include(VALUE self, VALUE other);
118
+ VALUE rb_u_string_inspect(VALUE self);
119
+ VALUE rb_u_string_length(VALUE self);
120
+ VALUE rb_u_string_line_break(VALUE self);
121
+ VALUE rb_u_string_lines(int argc, VALUE *argv, VALUE self);
122
+ VALUE rb_u_string_ljust(int argc, VALUE *argv, VALUE self);
123
+ VALUE rb_u_string_lower(int argc, VALUE *argv, VALUE self);
124
+ VALUE rb_u_string_lstrip(VALUE self);
125
+ VALUE rb_u_string_match(VALUE self, VALUE other);
126
+ VALUE rb_u_string_match_m(int argc, VALUE *argv, VALUE self);
127
+ VALUE rb_u_string_mirror(VALUE self);
128
+ VALUE rb_u_string_normalize(int argc, VALUE *argv, VALUE self);
129
+ VALUE rb_u_string_normalized(int argc, VALUE *argv, VALUE self);
130
+ VALUE rb_u_string_newline(VALUE self);
131
+ VALUE rb_u_string_oct(VALUE self);
132
+ VALUE rb_u_string_ord(VALUE self);
133
+ VALUE rb_u_string_partition(VALUE self, VALUE separator);
134
+ VALUE rb_u_string_plus(VALUE self, VALUE rbother);
135
+ VALUE rb_u_string_print(VALUE self);
136
+ VALUE rb_u_string_punct(VALUE self);
137
+ VALUE rb_u_string_reverse(VALUE self);
138
+ VALUE rb_u_string_rindex_m(int argc, VALUE *argv, VALUE self);
139
+ VALUE rb_u_string_rjust(int argc, VALUE *argv, VALUE self);
140
+ VALUE rb_u_string_rpartition(VALUE self, VALUE separator);
141
+ VALUE rb_u_string_rstrip(VALUE self);
142
+ VALUE rb_u_string_scan(VALUE self, VALUE pattern);
143
+ VALUE rb_u_string_script(VALUE self);
144
+ VALUE rb_u_string_soft_dotted(VALUE self);
145
+ VALUE rb_u_string_space(VALUE self);
146
+ VALUE rb_u_string_split(VALUE self, const char *separator, long length);
147
+ VALUE rb_u_string_split_m(int argc, VALUE *argv, VALUE self);
148
+ VALUE rb_u_string_squeeze(int argc, VALUE *argv, VALUE self);
149
+ VALUE rb_u_string_start_with(int argc, VALUE *argv, VALUE self);
150
+ VALUE rb_u_string_strip(VALUE self);
151
+ VALUE rb_u_string_sub(int argc, VALUE *argv, VALUE self);
152
+ VALUE rb_u_string_substr(VALUE self, long offset, long len);
153
+ VALUE rb_u_string_times(VALUE self, VALUE rbtimes);
154
+ VALUE rb_u_string_title(VALUE self);
155
+ VALUE rb_u_string_titlecase(int argc, VALUE *argv, VALUE self);
156
+ VALUE rb_u_string_to_i(int argc, VALUE *argv, VALUE self);
157
+ VALUE rb_u_string_to_str(VALUE self);
158
+ VALUE rb_u_string_to_sym(VALUE self);
159
+ VALUE rb_u_string_tr(VALUE self, VALUE from, VALUE to);
160
+ VALUE rb_u_string_tr_s(VALUE self, VALUE from, VALUE to);
161
+ VALUE rb_u_string_upcase(int argc, VALUE *argv, VALUE self);
162
+ VALUE rb_u_string_upper(int argc, VALUE *argv, VALUE self);
163
+ VALUE rb_u_string_valid(VALUE self);
164
+ VALUE rb_u_string_valid_encoding(VALUE self);
165
+ VALUE rb_u_string_wide(VALUE self);
166
+ VALUE rb_u_string_wide_cjk(VALUE self);
167
+ VALUE rb_u_string_width(VALUE self);
168
+ VALUE rb_u_string_word_break(VALUE self);
169
+ VALUE rb_u_string_xdigit(VALUE self);
170
+ VALUE rb_u_string_zero_width(VALUE self);
171
+
172
+
173
+ void Init_u_string(VALUE mU);
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload alnum?
4
+ * @return [Boolean] True if the receiver contains only characters in the
5
+ * general categories Letter and Number */
6
+ VALUE
7
+ rb_u_string_alnum(VALUE self)
8
+ {
9
+ return _rb_u_character_test(self, u_char_isalnum);
10
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload alpha?
4
+ * @return [Boolean] True if the receiver contains only characters in the
5
+ * general category Alpha */
6
+ VALUE
7
+ rb_u_string_alpha(VALUE self)
8
+ {
9
+ return _rb_u_character_test(self, u_char_isalpha);
10
+ }
@@ -0,0 +1,142 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ static VALUE
5
+ rb_u_string_substr_impl(VALUE self, long offset, long len, bool nil_on_empty)
6
+ {
7
+ if (len < 0)
8
+ return Qnil;
9
+
10
+ const struct rb_u_string *string = RVAL2USTRING(self);
11
+
12
+ const char *begin = rb_u_string_begin_from_offset(string, offset);
13
+ if (begin == NULL)
14
+ return Qnil;
15
+
16
+ const char *end = u_offset_to_pointer_n(begin, len, USTRING_END(string) - begin);
17
+ if (end == NULL)
18
+ end = USTRING_END(string);
19
+
20
+ if (nil_on_empty && begin == end)
21
+ return Qnil;
22
+
23
+ if (begin == USTRING_STR(string) && end == USTRING_END(string))
24
+ return self;
25
+
26
+ return rb_u_string_new_c(self, begin, end - begin);
27
+ }
28
+
29
+ VALUE
30
+ rb_u_string_substr(VALUE self, long offset, long len)
31
+ {
32
+ return rb_u_string_substr_impl(self, offset, len, false);
33
+ }
34
+
35
+ static VALUE
36
+ rb_u_string_subpat(VALUE self, VALUE re, VALUE reference)
37
+ {
38
+ if (rb_reg_search(re, StringValue(self), 0, 0) < 0)
39
+ return Qnil;
40
+
41
+ volatile VALUE match = rb_u_pattern_match_reference(reference);
42
+
43
+ return NIL_P(match) ? Qnil : rb_u_string_new_rb(match);
44
+ }
45
+
46
+ static VALUE
47
+ rb_u_string_aref_num(VALUE self, long offset)
48
+ {
49
+ return rb_u_string_substr_impl(self, offset, 1, true);
50
+ }
51
+
52
+ static VALUE
53
+ rb_u_string_aref_default(VALUE self, VALUE index)
54
+ {
55
+ const struct rb_u_string *string = RVAL2USTRING(self);
56
+ long n_chars = u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string));
57
+
58
+ long begin, length;
59
+ switch (rb_range_beg_len(index, &begin, &length, n_chars, 0)) {
60
+ case Qfalse:
61
+ return rb_u_string_aref_num(self, NUM2LONG(index));
62
+ case Qnil:
63
+ return Qnil;
64
+ default:
65
+ return rb_u_string_substr(self, begin, length);
66
+ }
67
+ }
68
+
69
+ static VALUE
70
+ rb_u_string_aref(VALUE self, VALUE index)
71
+ {
72
+ if (TYPE(index) == T_STRING || RTEST(rb_obj_is_kind_of(index, rb_cUString))) {
73
+ if (rb_u_string_index(self, index, 0) == -1)
74
+ return Qnil;
75
+
76
+ return TYPE(index) == T_STRING ?
77
+ rb_u_string_new_c(index, RSTRING_PTR(index), RSTRING_LEN(index)) :
78
+ index;
79
+ }
80
+
81
+ switch (TYPE(index)) {
82
+ case T_FIXNUM:
83
+ return rb_u_string_aref_num(self, FIX2LONG(index));
84
+ case T_REGEXP:
85
+ return rb_u_string_subpat(self, index, INT2FIX(0));
86
+ default:
87
+ return rb_u_string_aref_default(self, index);
88
+ }
89
+ }
90
+
91
+ /* @overload [](index)
92
+ * @param [#to_int] index
93
+ * @return [U::String, nil] The substring [max(_i_, 0), min({#length}, _i_ +
94
+ * 1)], where _i_ = INDEX if INDEX ≥ 0, _i_ = {#length} - abs(INDEX)
95
+ * otherwise, inheriting any taint and untrust, or nil if this substring is
96
+ * empty
97
+ *
98
+ * @overload [](index, length)
99
+ * @param [#to_int] index
100
+ * @param [#to_int] length
101
+ * @return [U::String, nil] The substring [max(_i_, 0), min({#length}, _i_ +
102
+ * LENGTH)], where _i_ = INDEX if INDEX ≥ 0, _i_ = {#length} - abs(INDEX)
103
+ * otherwise, inheriting any taint or untrust, or nil if LENGTH < 0
104
+ *
105
+ * @overload [](range)
106
+ * @param [Range] range
107
+ * @return [U::String, nil] The result of `#[i, j - k]`, where _i_ =
108
+ * RANGE#begin if RANGE#begin ≥ 0, _i_ = {#length} - abs(RANGE#begin)
109
+ * otherwise, _j_ = RANGE#end if RANGE#end ≥ 0, _j_ = {#length} -
110
+ * abs(RANGE#end) otherwise, and _k_ = 1 if RANGE#exclude_end?, _k_ = 0
111
+ * otherwise, or nil if _j_ - _k_ < 0
112
+ *
113
+ * @overload [](regexp, reference = 0)
114
+ * @param [Regexp] regexp
115
+ * @param [#to_int, #to_str, Symbol] reference
116
+ * @raise [IndexError] If REFERENCE doesn’t refer to a submatch
117
+ * @return [U::String, nil] The submatch REFERENCE from the first match of
118
+ * REGEXP in the receiver, inheriting any taint and untrust from both
119
+ * the receiver and from REGEXP, or nil if there is no match or if the
120
+ * submatch isn’t part of the overall match
121
+ *
122
+ * @overload [](string)
123
+ * @param [U::String, ::String] string
124
+ * @return [U::String, nil] The substring STRING, inheriting any taint and
125
+ * untrust from STRING, if STRING is a substring of the receiver
126
+ *
127
+ * @overload [](object)
128
+ * @param [Object] object
129
+ * @return [nil] Nil for any object that doesn’t satisfy the other cases */
130
+ VALUE
131
+ rb_u_string_aref_m(int argc, VALUE *argv, VALUE self)
132
+ {
133
+ need_m_to_n_arguments(argc, 1, 2);
134
+
135
+ if (argc == 1)
136
+ return rb_u_string_aref(self, argv[0]);
137
+
138
+ if (TYPE(argv[0]) == T_REGEXP)
139
+ return rb_u_string_subpat(self, argv[0], argv[1]);
140
+
141
+ return rb_u_string_substr(self, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
142
+ }
@@ -0,0 +1,13 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload ascii_only?
4
+ * @return [Boolean] True if the receiver contains only characters in the ASCII region, that
5
+ * is, U+0000 through U+007F */
6
+ VALUE
7
+ rb_u_string_ascii_only(VALUE self)
8
+ {
9
+ const struct rb_u_string *string = RVAL2USTRING(self);
10
+
11
+ return u_is_ascii_only_n(USTRING_STR(string), USTRING_LENGTH(string)) ?
12
+ Qtrue : Qfalse;
13
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload assigned?
4
+ * @return [Boolean] True if the receiver contains only code points that have
5
+ * been assigned a code value */
6
+ VALUE
7
+ rb_u_string_assigned(VALUE self)
8
+ {
9
+ return _rb_u_character_test(self, u_char_isassigned);
10
+ }
@@ -0,0 +1,18 @@
1
+ #include "rb_includes.h"
2
+ #ifdef HAVE_RUBY_ENCODING_H
3
+ # include <ruby/encoding.h>
4
+ #endif
5
+
6
+ /* @return [String] The String representation of the receiver, inheriting any
7
+ * taint and untrust, encoded as ASCII-8BIT. */
8
+ VALUE
9
+ rb_u_string_b(VALUE self)
10
+ {
11
+ const struct rb_u_string *string = RVAL2USTRING(self);
12
+ VALUE result = rb_str_new(USTRING_STR(string), USTRING_LENGTH(string));
13
+ #ifdef HAVE_RUBY_ENCODING_H
14
+ rb_enc_associate(result, rb_ascii8bit_encoding());
15
+ #endif
16
+ OBJ_INFECT(result, self);
17
+ return result;
18
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return [Integer] The number of bytes required to represent the receiver */
4
+ VALUE
5
+ rb_u_string_bytesize(VALUE self)
6
+ {
7
+ const struct rb_u_string *string = RVAL2USTRING(self);
8
+
9
+ return LONG2NUM(USTRING_LENGTH(string));
10
+ }
@@ -0,0 +1,103 @@
1
+ #include "rb_includes.h"
2
+
3
+ static VALUE
4
+ rb_u_string_byte_substr(VALUE self, long offset, long length)
5
+ {
6
+ const struct rb_u_string *string = RVAL2USTRING(self);
7
+ long n = USTRING_LENGTH(string);
8
+
9
+ if (offset > n || length < 0)
10
+ return Qnil;
11
+
12
+ if (offset < 0) {
13
+ offset += n;
14
+ if (offset < 0)
15
+ return Qnil;
16
+ }
17
+
18
+ if (offset + length > n)
19
+ length = n - offset;
20
+
21
+ if (length <= 0)
22
+ return rb_u_string_new_empty(self);
23
+
24
+ return rb_u_string_new_subsequence(self, offset, length);
25
+ }
26
+
27
+ static VALUE
28
+ rb_u_string_byteslice_num(VALUE self, long offset)
29
+ {
30
+ VALUE result = rb_u_string_byte_substr(self, offset, 1);
31
+
32
+ if (NIL_P(result) || USTRING_LENGTH(RVAL2USTRING(result)) == 0)
33
+ return Qnil;
34
+
35
+ return result;
36
+ }
37
+
38
+ static VALUE
39
+ rb_u_string_byteslice_default(VALUE self, VALUE index)
40
+ {
41
+ const struct rb_u_string *string = RVAL2USTRING(self);
42
+ long n_bytes = USTRING_LENGTH(string);
43
+
44
+ long begin, length;
45
+ switch (rb_range_beg_len(index, &begin, &length, n_bytes, 0)) {
46
+ case Qfalse:
47
+ return rb_u_string_byteslice_num(self, NUM2LONG(index));
48
+ case Qnil:
49
+ return Qnil;
50
+ default:
51
+ return rb_u_string_byte_substr(self, begin, length);
52
+ }
53
+ }
54
+
55
+ static VALUE
56
+ rb_u_string_byteslice(VALUE self, VALUE index)
57
+ {
58
+ switch (TYPE(index)) {
59
+ case T_FIXNUM:
60
+ return rb_u_string_byteslice_num(self, FIX2LONG(index));
61
+ default:
62
+ return rb_u_string_byteslice_default(self, index);
63
+ }
64
+ }
65
+
66
+ /* @overload byteslice(index)
67
+ * @param [#to_int] index
68
+ * @return [U::String, nil] The byte-index-based substring [max(_i_, 0),
69
+ * min({#bytesize}, _i_ + 1)], where _i_ = INDEX if INDEX ≥ 0, _i_ =
70
+ * {#bytesize} - abs(INDEX) otherwise, inheriting any taint and untrust, or
71
+ * nil if this substring is empty
72
+ *
73
+ * @overload byteslice(index, length)
74
+ * @param [#to_int] index
75
+ * @param [#to_int] length
76
+ * @return [U::String, nil] The byte-index-based substring [max(_i_, 0),
77
+ * min({#bytesize}, _i_ + LENGTH)], where _i_ = INDEX if INDEX ≥ 0, _i_ =
78
+ * {#bytesize} - abs(INDEX) otherwise, inheriting any taint and untrust, or
79
+ * nil if LENGTH < 0
80
+ *
81
+ * @overload byteslice(range)
82
+ * @param [Range] range
83
+ * @return [U::String, nil] The result of `#[i, j - k]`, where _i_ =
84
+ * RANGE#begin if RANGE#begin ≥ 0, _i_ = {#bytesize} - abs(RANGE#begin)
85
+ * otherwise, _j_ = RANGE#end if RANGE#end ≥ 0, _j_ = {#bytesize} -
86
+ * abs(RANGE#end) otherwise, and _k_ = 1 if RANGE#exclude_end?, _k_ = 0
87
+ * otherwise, or nil if _j_ - _k_ < 0
88
+ *
89
+ * @overload byteslice(object)
90
+ * @param [Object] object
91
+ * @return [nil] Nil for any object that doesn’t satisfy the other cases */
92
+ VALUE
93
+ rb_u_string_byteslice_m(int argc, VALUE *argv, VALUE self)
94
+ {
95
+ need_m_to_n_arguments(argc, 1, 2);
96
+
97
+ if (argc == 1)
98
+ return rb_u_string_byteslice(self, argv[0]);
99
+
100
+ return rb_u_string_byte_substr(self,
101
+ NUM2LONG(argv[0]),
102
+ NUM2LONG(argv[1]));
103
+ }