u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,33 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* Returns the canonical combining class of the characters of the receiver.
4
+ *
5
+ * The canonical combining class of a character is a number in the range [0,
6
+ * 254]. The canonical combining class is used when generating a canonical
7
+ * ordering of the characters in a string.
8
+ *
9
+ * The empty string has a canonical combining class of 0.
10
+ *
11
+ * @raise [ArgumentError] If the receiver contains two characters belonging to
12
+ * different combining classes
13
+ * @raise [ArgumentError] If the receiver contains an incomplete UTF-8 sequence
14
+ * @raise [ArgumentError] If the receiver contains an invalid UTF-8 sequence
15
+ * @return [Fixnum] */
16
+ VALUE
17
+ rb_u_string_canonical_combining_class(VALUE self)
18
+ {
19
+ const struct rb_u_string *string = RVAL2USTRING(self);
20
+ const char *p = USTRING_STR(string);
21
+ const char *end = USTRING_END(string);
22
+ if (p == end)
23
+ return 0;
24
+ int first = u_char_canonical_combining_class(u_decode(&p, p, end));
25
+ while (p < end) {
26
+ int value = u_char_canonical_combining_class(u_decode(&p, p, end));
27
+ if (value != first)
28
+ rb_u_raise(rb_eArgError,
29
+ "string consists of characters with different canonical combining class values: %d+, %d",
30
+ first, value);
31
+ }
32
+ return INT2FIX(first);
33
+ }
@@ -0,0 +1,25 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload case_ignorable?
4
+ *
5
+ * @return [Boolean] True if the receiver contains only “case ignorable”
6
+ * characters, that is, characters in the general categories
7
+ *
8
+ * * Other, format (Cf)
9
+ * * Letter, modifier (Lm)
10
+ * * Mark, enclosing (Me)
11
+ * * Mark, nonspacing (Mn)
12
+ * * Symbol, modifier (Sk)
13
+ *
14
+ * and the characters
15
+ *
16
+ * * U+0027 APOSTROPHE
17
+ * * U+00AD SOFT HYPHEN
18
+ * * U+2019 RIGHT SINGLE QUOTATION MARK
19
+ * @see http://unicode.org/reports/tr21/tr21-5.html
20
+ * Unicode Standard Annex #21: Case Mappings */
21
+ VALUE
22
+ rb_u_string_case_ignorable(VALUE self)
23
+ {
24
+ return _rb_u_character_test(self, u_char_iscaseignorable);
25
+ }
@@ -0,0 +1,61 @@
1
+ #include <errno.h>
2
+ #include "rb_includes.h"
3
+
4
/* Case-folds STRING according to LOCALE into a newly allocated buffer that is
 * stored in *RESULT, and returns what the filling u_foldcase call returns.
 *
 * A first u_foldcase call with a NULL buffer determines the required size;
 * the buffer is then allocated with room for one extra byte.  PREVIOUS is
 * handed to _rb_u_guarded_alloc unchanged (NOTE(review): presumably so that
 * an earlier allocation can be released if this one fails — confirm against
 * _rb_u_guarded_alloc). */
static size_t
foldcase(char **result, const struct rb_u_string *string, const char *locale,
         char *previous)
{
        /* Sizing pass: nothing is written. */
        size_t needed = u_foldcase(NULL, 0,
                                   USTRING_STR(string), USTRING_LENGTH(string),
                                   locale);
        size_t capacity = needed + 1;

        *result = _rb_u_guarded_alloc(capacity, previous, NULL);

        /* Filling pass. */
        size_t written = u_foldcase(*result, capacity,
                                    USTRING_STR(string), USTRING_LENGTH(string),
                                    locale);
        return written;
}
16
+
17
+ /* @overload casecmp(other, locale = ENV['LC_COLLATE'])
18
+ *
19
+ * Returns the comparison of {#foldcase} to _other_{#foldcase} using the
20
+ * linguistically correct rules of LOCALE. This is, however, only an
21
+ * approximation of a case-insensitive comparison. The LOCALE must be given
22
+ * as a language, region, and encoding, for example, “en_US.UTF-8”.
23
+ *
24
+ * This operation is known as “collation” and you can find more information
25
+ * about the collation algorithm employed in the
26
+ * Unicode Technical Standard #10, see http://unicode.org/reports/tr10/.
27
+ *
28
+ * @param [U::String, #to_str] other
29
+ * @param [#to_str] locale
30
+ * @return [Fixnum] */
31
+ VALUE
32
+ rb_u_string_casecmp(int argc, VALUE *argv, VALUE self)
33
+ {
34
+ const char *locale = NULL;
35
+
36
+ VALUE rbother, rblocale;
37
+ if (rb_scan_args(argc, argv, "11", &rbother, &rblocale) == 2)
38
+ locale = StringValuePtr(rblocale);
39
+
40
+ const struct rb_u_string *string = RVAL2USTRING(self);
41
+ const struct rb_u_string *other = RVAL2USTRING_ANY(rbother);
42
+
43
+ char *folded;
44
+ size_t folded_n = foldcase(&folded, string, locale, NULL);
45
+
46
+ char *folded_other;
47
+ size_t folded_other_n = foldcase(&folded_other, other, locale, folded);
48
+
49
+ errno = 0;
50
+ int r = u_collate(folded, folded_n,
51
+ folded_other, folded_other_n,
52
+ locale);
53
+
54
+ free(folded_other);
55
+ free(folded);
56
+
57
+ if (errno != 0)
58
+ rb_u_raise_errno(errno, "can’t collate strings");
59
+
60
+ return INT2FIX(r);
61
+ }
@@ -0,0 +1,17 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload cased?
4
+ *
5
+ * @return [Boolean] True if the receiver only contains characters in the
6
+ * general categories
7
+ *
8
+ * * Letter, uppercase (Lu)
9
+ * * Letter, lowercase (Ll)
10
+ * * Letter, titlecase (Lt)
11
+ *
12
+ * or has the derived properties Other_Uppercase or Other_Lowercase */
13
+ VALUE
14
+ rb_u_string_cased(VALUE self)
15
+ {
16
+ return _rb_u_character_test(self, u_char_iscased);
17
+ }
@@ -0,0 +1,107 @@
1
+ #include "rb_includes.h"
2
+
3
+ static VALUE
4
+ rb_u_string_chomp_default(VALUE self)
5
+ {
6
+ const struct rb_u_string *string = RVAL2USTRING(self);
7
+
8
+ const char *begin = USTRING_STR(string);
9
+ const char *end = USTRING_END(string);
10
+
11
+ const char *last;
12
+ uint32_t c = u_decode_r(&last, begin, end);
13
+ if (c == '\n') {
14
+ if (*(last - 1) == '\r')
15
+ last--;
16
+ } else if (!u_char_isnewline(c))
17
+ return self;
18
+
19
+ return rb_u_string_new_c(self, begin, last - begin);
20
+ }
21
+
22
+ static VALUE
23
+ rb_u_string_chomp_newlines(VALUE self)
24
+ {
25
+ const struct rb_u_string *string = RVAL2USTRING(self);
26
+ const char *begin = USTRING_STR(string);
27
+ const char *end = USTRING_END(string);
28
+
29
+ const char *last = end;
30
+ const char *last_but_one;
31
+ while (last > begin &&
32
+ u_char_isnewline(u_decode_r(&last_but_one, begin, last)))
33
+ last = last_but_one;
34
+
35
+ if (last == end)
36
+ return self;
37
+
38
+ return rb_u_string_new_c(self, begin, last - begin);
39
+ }
40
+
41
+ /* @overload chomp(separator = $/)
42
+ *
43
+ * Returns the receiver, minus any SEPARATOR suffix, inheriting any taint and
44
+ * untrust, unless {#length} = 0, in which case nil is returned. If
45
+ * SEPARATOR is nil or invalidly encoded, the receiver is returned.
46
+ *
47
+ * If SEPARATOR is `$/` and `$/` has its default value or if SEPARATOR is
48
+ * U+000A LINE FEED, the longest suffix consisting of any of
49
+ *
50
+ * * U+000A LINE FEED
51
+ * * U+000D CARRIAGE RETURN
52
+ * * U+000D CARRIAGE RETURN, U+000D LINE FEED
53
+ *
54
+ * will be removed. If no such suffix exists and the last character is a
55
+ * {#newline?}, it will be removed instead.
56
+ *
57
+ * If SEPARATOR is {#empty?}, remove the longest {#newline?} suffix.
58
+ *
59
+ * @param [U::String, #to_str, nil] separator
60
+ * @return [U::String, self, nil]
61
+ * @see #chop
62
+ * @see #lstrip
63
+ * @see #rstrip
64
+ * @see #strip */
65
+ VALUE
66
+ rb_u_string_chomp(int argc, VALUE *argv, VALUE self)
67
+ {
68
+ const struct rb_u_string *string = RVAL2USTRING(self);
69
+
70
+ long length = USTRING_LENGTH(string);
71
+ if (length == 0)
72
+ return Qnil;
73
+
74
+ VALUE rs;
75
+ if (argc == 0) {
76
+ rs = rb_rs;
77
+ if (rs == rb_default_rs)
78
+ return rb_u_string_chomp_default(self);
79
+ } else {
80
+ rb_scan_args(argc, argv, "01", &rs);
81
+ }
82
+ if (NIL_P(rs))
83
+ return self;
84
+
85
+ const struct rb_u_string *separator = RVAL2USTRING_ANY(rs);
86
+
87
+ long separator_length = USTRING_LENGTH(separator);
88
+ if (separator_length == 0)
89
+ return rb_u_string_chomp_newlines(self);
90
+
91
+ if (separator_length > length)
92
+ return self;
93
+
94
+ char last_char = USTRING_STR(separator)[separator_length - 1];
95
+ if (separator_length == 1 && last_char == '\n')
96
+ return rb_u_string_chomp_default(self);
97
+
98
+ if (!u_valid(USTRING_STR(separator), separator_length, NULL) ||
99
+ USTRING_STR(string)[length - 1] != last_char ||
100
+ (separator_length > 1 &&
101
+ rb_memcmp(USTRING_STR(separator),
102
+ USTRING_END(string) - separator_length,
103
+ separator_length) != 0))
104
+ return self;
105
+
106
+ return rb_u_string_new_c(self, USTRING_STR(string), length - separator_length);
107
+ }
@@ -0,0 +1,33 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* Returns the receiver, minus its last character, inheriting any taint and
4
+ * untrust, unless the receiver is {#empty?} or if the last character is
5
+ * invalidly encoded, in which case the receiver is returned.
6
+ *
7
+ * If the last character is U+000A LINE FEED and the second-to-last character
8
+ * is U+000D CARRIAGE RETURN, both characters are removed.
9
+ *
10
+ * @return [U::String]
11
+ * @see #chomp
12
+ * @see #lstrip
13
+ * @see #rstrip
14
+ * @see #strip */
15
+ VALUE
16
+ rb_u_string_chop(VALUE self)
17
+ {
18
+ const struct rb_u_string *string = RVAL2USTRING(self);
19
+
20
+ if (USTRING_LENGTH(string) == 0)
21
+ return self;
22
+
23
+ const char *begin = USTRING_STR(string);
24
+ const char *end = USTRING_END(string);
25
+
26
+ const char *last;
27
+ uint32_t c = u_decode_r(&last, begin, end);
28
+ if (c == '\n')
29
+ if (*(last - 1) == '\r')
30
+ last--;
31
+
32
+ return rb_u_string_new_c(self, begin, last - begin);
33
+ }
@@ -0,0 +1,9 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return [U::String] The substring [0, min({#length}, 1)], inheriting any
4
+ * taint and untrust */
5
+ VALUE
6
+ rb_u_string_chr(VALUE self)
7
+ {
8
+ return rb_u_string_substr(self, 0, 1);
9
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload cntrl?
4
+ * @return [Boolean] True if the receiver contains only characters in the
5
+ * general category Other, control (Cc) */
6
+ VALUE
7
+ rb_u_string_cntrl(VALUE self)
8
+ {
9
+ return _rb_u_character_test(self, u_char_iscntrl);
10
+ }
@@ -0,0 +1,46 @@
1
+ #include "rb_includes.h"
2
+ #include <errno.h>
3
+
4
+ /* @overload <=>(other, locale = ENV['LC_COLLATE'])
5
+ *
6
+ * Returns the comparison of the receiver and OTHER using the linguistically
7
+ * correct rules of LOCALE. The LOCALE must be given as a language, region,
8
+ * and encoding, for example, “en_US.UTF-8”.
9
+ *
10
+ * This operation is known as “collation” and you can find more information
11
+ * about the collation algorithm employed in the
12
+ * Unicode Technical Standard #10, see http://unicode.org/reports/tr10/.
13
+ *
14
+ * @param [U::String, #to_str] other
15
+ * @param [#to_str] locale
16
+ * @raise [Errno::EILSEQ] If a character in the receiver can’t be converted
17
+ * into the encoding of the locale
18
+ * @return [Fixnum]
19
+ * @see #==
20
+ * @see #eql? */
21
+ VALUE
22
+ rb_u_string_collate(int argc, VALUE *argv, VALUE self)
23
+ {
24
+ const char *locale = NULL;
25
+
26
+ VALUE rbother, rblocale;
27
+ if (rb_scan_args(argc, argv, "11", &rbother, &rblocale) == 2)
28
+ locale = StringValuePtr(rblocale);
29
+ else {
30
+ const char * const env[] = { "LC_ALL", "LC_COLLATE", "LANG", NULL };
31
+ for (const char * const *p = env; *p != NULL; p++)
32
+ if ((locale = getenv(*p)) != NULL)
33
+ break;
34
+ }
35
+
36
+ const struct rb_u_string *string = RVAL2USTRING(self);
37
+ const struct rb_u_string *other = RVAL2USTRING_ANY(rbother);
38
+
39
+ errno = 0;
40
+ int r = u_collate(USTRING_STR(string), USTRING_LENGTH(string),
41
+ USTRING_STR(other), USTRING_LENGTH(other),
42
+ locale);
43
+ if (errno != 0)
44
+ rb_u_raise_errno(errno, "can’t collate strings");
45
+ return INT2FIX(r);
46
+ }
@@ -0,0 +1,18 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload collation_key(locale = ENV['LC_COLLATE'])
4
+ *
5
+ * @raise [Errno::EILSEQ] If a character in the receiver can’t be converted
6
+ * into the encoding of the locale
7
+ * @return [U::String] The locale-dependent collation key of the receiver in
8
+ * LOCALE, inheriting any taint and untrust
9
+ * @note Use the collation key when comparing U::Strings to each other
10
+ * repeatedly, as occurs when, for example, sorting a list of
11
+ * U::Strings.
12
+ * @note The LOCALE must be given as a language, region, and encoding, for
13
+ * example, “en_US.UTF-8”. */
14
+ VALUE
15
+ rb_u_string_collation_key(int argc, VALUE *argv, VALUE self)
16
+ {
17
+ return _rb_u_string_convert_locale(argc, argv, self, u_collation_key, "LC_COLLATE");
18
+ }
@@ -0,0 +1,38 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_internal_tr.h"
3
+
4
+ /* @overload count(set, *sets)
5
+ *
6
+ * Returns the number of characters in the receiver that are included in the
7
+ * intersection of SET and any additional SETS of characters.
8
+ *
9
+ * The complement of all Unicode characters and a given set of characters may
10
+ * be specified by prefixing a non-empty set with ‘`^`’ (U+005E CIRCUMFLEX
11
+ * ACCENT).
12
+ *
13
+ * Any sequence of characters _a_-_b_ inside a set will expand to also
14
+ * include all characters whose code points lay between those of _a_ and _b_.
15
+ *
16
+ * @param [U::String, #to_str] set
17
+ * @param [Array<U::String, #to_str>] sets
18
+ * @return [Integer] */
19
+ VALUE
20
+ rb_u_string_count(int argc, VALUE *argv, VALUE self)
21
+ {
22
+ const struct rb_u_string *string = RVAL2USTRING(self);
23
+
24
+ need_at_least_n_arguments(argc, 1);
25
+
26
+ if (USTRING_LENGTH(string) == 0)
27
+ return INT2FIX(0);
28
+
29
+ struct tr_table table;
30
+ tr_table_initialize_from_strings(&table, argc, argv);
31
+
32
+ long count = 0;
33
+ for (const char *p = USTRING_STR(string), *end = USTRING_END(string); p < end; )
34
+ if (tr_table_lookup(&table, u_decode(&p, p, end)))
35
+ count++;
36
+
37
+ return LONG2NUM(count);
38
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload defined?
4
+ * @return [Boolean] True if the receiver contains only characters not in the
5
+ * general categories Other, not assigned (Cn) and Other, surrogate (Cs) */
6
+ VALUE
7
+ rb_u_string_defined(VALUE self)
8
+ {
9
+ return _rb_u_character_test(self, u_char_isdefined);
10
+ }
@@ -0,0 +1,62 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_internal_tr.h"
3
+
4
/* Goes through the characters of STRING and keeps those that TABLE doesn’t
 * match.
 *
 * If RESULT isn’t NULL, the bytes of the kept characters are copied to it,
 * one after the other; no terminating NUL is written.  With a NULL RESULT
 * nothing is written, which makes it possible to determine the required size
 * first.
 *
 * Returns the number of bytes that are kept. */
static long
rb_u_string_delete_loop(const struct rb_u_string *string, struct tr_table *table,
                        char *result)
{
        long kept = 0;
        char *out = result;

        const char *p = USTRING_STR(string);
        const char *end = USTRING_END(string);
        while (p < end) {
                /* NEXT is set to where the following character begins. */
                const char *next;
                if (!tr_table_lookup(table, u_decode(&next, p, end))) {
                        long run = next - p;
                        if (out != NULL) {
                                memcpy(out, p, run);
                                out += run;
                        }
                        kept += run;
                }
                p = next;
        }

        return kept;
}
23
+
24
+ /* @overload delete(set, *sets)
25
+ *
26
+ * Returns the receiver, minus any characters that are included in the
27
+ * intersection of SET and any additional SETS of characters, inheriting any
28
+ * taint and untrust.
29
+ *
30
+ * The complement of all Unicode characters and a given set of characters may
31
+ * be specified by prefixing a non-empty set with ‘`^`’ (U+005E CIRCUMFLEX
32
+ * ACCENT).
33
+ *
34
+ * Any sequence of characters _a_-_b_ inside a set will expand to also
35
+ * include all characters whose code points lay between those of _a_ and _b_.
36
+ *
37
+ * @param [U::String, #to_str] set
38
+ * @param [Array<U::String, #to_str>] sets
39
+ * @return [U::String] */
40
+ VALUE
41
+ rb_u_string_delete(int argc, VALUE *argv, VALUE self)
42
+ {
43
+ const struct rb_u_string *string = RVAL2USTRING(self);
44
+
45
+ need_at_least_n_arguments(argc, 1);
46
+
47
+ if (USTRING_LENGTH(string) == 0)
48
+ return self;
49
+
50
+ struct tr_table table;
51
+ tr_table_initialize_from_strings(&table, argc, argv);
52
+
53
+ long count = rb_u_string_delete_loop(string, &table, NULL);
54
+ if (count == 0)
55
+ return self;
56
+
57
+ char *remaining = ALLOC_N(char, count + 1);
58
+ rb_u_string_delete_loop(string, &table, remaining);
59
+ remaining[count] = '\0';
60
+
61
+ return rb_u_string_new_c_own(self, remaining, count);
62
+ }