u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return [Integer] The number of characters in the receiver */
4
+ VALUE
5
+ rb_u_string_length(VALUE self)
6
+ {
7
+ const struct rb_u_string *string = RVAL2USTRING(self);
8
+
9
+ return UINT2NUM(u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string)));
10
+ }
@@ -0,0 +1,115 @@
1
+ #include "rb_includes.h"
2
+
3
+ #define BREAK2ID(value, symbol) \
4
+ case U_LINE_BREAK_##value: { \
5
+ static ID id_##symbol; \
6
+ if (id_##symbol == 0) \
7
+ id_##symbol = rb_intern(#symbol); \
8
+ return ID2SYM(id_##symbol); \
9
+ }
10
+
11
+ static VALUE
12
+ break_to_symbol(enum u_line_break value)
13
+ {
14
+ switch (value) {
15
+ BREAK2ID(MANDATORY, mandatory)
16
+ BREAK2ID(CARRIAGE_RETURN, carriage_return)
17
+ BREAK2ID(LINE_FEED, line_feed)
18
+ BREAK2ID(COMBINING_MARK, combining_mark)
19
+ BREAK2ID(NEXT_LINE, next_line)
20
+ BREAK2ID(SURROGATE, surrogate)
21
+ BREAK2ID(WORD_JOINER, word_joiner)
22
+ BREAK2ID(ZERO_WIDTH_SPACE, zero_width_space)
23
+ BREAK2ID(NON_BREAKING_GLUE, non_breaking_glue)
24
+ BREAK2ID(SPACE, space)
25
+ BREAK2ID(BREAK_OPPORTUNITY_BEFORE_AND_AFTER, break_opportunity_before_and_after)
26
+ BREAK2ID(BREAK_AFTER, break_after)
27
+ BREAK2ID(BREAK_BEFORE, break_before)
28
+ BREAK2ID(HYPHEN, hyphen)
29
+ BREAK2ID(CONTINGENT_BREAK_OPPORTUNITY, contingent_break_opportunity)
30
+ BREAK2ID(CLOSE_PUNCTUATION, close_punctuation)
31
+ BREAK2ID(CLOSE_PARENTHESIS, close_parenthesis)
32
+ BREAK2ID(EXCLAMATION_INTERROGATION, exclamation_interrogation)
33
+ BREAK2ID(INSEPARABLE, inseparable)
34
+ BREAK2ID(NONSTARTER, nonstarter)
35
+ BREAK2ID(OPEN_PUNCTUATION, open_punctuation)
36
+ BREAK2ID(QUOTATION, quotation)
37
+ BREAK2ID(INFIX_NUMERIC_SEPARATOR, infix_numeric_separator)
38
+ BREAK2ID(NUMERIC, numeric)
39
+ BREAK2ID(POSTFIX_NUMERIC, postfix_numeric)
40
+ BREAK2ID(PREFIX_NUMERIC, prefix_numeric)
41
+ BREAK2ID(SYMBOLS_ALLOWING_BREAK_AFTER, symbols_allowing_break_after)
42
+ BREAK2ID(AMBIGUOUS, ambiguous)
43
+ BREAK2ID(ALPHABETIC, alphabetic)
44
+ BREAK2ID(CONDITIONAL_JAPANESE_STARTER, conditional_japanese_starter)
45
+ BREAK2ID(HANGUL_LV_SYLLABLE, hangul_lv_syllable)
46
+ BREAK2ID(HANGUL_LVT_SYLLABLE, hangul_lvt_syllable)
47
+ BREAK2ID(HEBREW_LETTER, hebrew_letter)
48
+ BREAK2ID(IDEOGRAPHIC, ideographic)
49
+ BREAK2ID(HANGUL_L_JAMO, hangul_l_jamo)
50
+ BREAK2ID(HANGUL_V_JAMO, hangul_v_jamo)
51
+ BREAK2ID(HANGUL_T_JAMO, hangul_t_jamo)
52
+ BREAK2ID(REGIONAL_INDICATOR, regional_indicator)
53
+ BREAK2ID(COMPLEX_CONTEXT_DEPENDENT, complex_context_dependent)
54
+ BREAK2ID(UNKNOWN, unknown)
55
+ default:
56
+ rb_u_raise(rb_eNotImpError, "unknown line break: %d", value);
57
+ }
58
+ }
59
+
60
+ /* Returns the line break property value of the characters of the receiver.
61
+ *
62
+ * The possible break values are
63
+ *
64
+ * * :after
65
+ * * :alphabetic
66
+ * * :ambiguous
67
+ * * :before
68
+ * * :before_and_after
69
+ * * :carriage_return
70
+ * * :close_parenthesis
71
+ * * :close_punctuation
72
+ * * :combining_mark
73
+ * * :complex_context
74
+ * * :conditional_japanese_starter
75
+ * * :contingent
76
+ * * :exclamation
77
+ * * :hangul_l_jamo
78
+ * * :hangul_lv_syllable
79
+ * * :hangul_lvt_syllable
80
+ * * :hangul_t_jamo
81
+ * * :hangul_v_jamo
82
+ * * :hebrew_letter
83
+ * * :hyphen
84
+ * * :ideographic
85
+ * * :infix_separator
86
+ * * :inseparable
87
+ * * :line_feed
88
+ * * :mandatory
89
+ * * :next_line
90
+ * * :non_breaking_glue
91
+ * * :non_starter
92
+ * * :numeric
93
+ * * :open_punctuation
94
+ * * :postfix
95
+ * * :prefix
96
+ * * :quotation
97
+ * * :regional_indicator
98
+ * * :space
99
+ * * :surrogate
100
+ * * :symbol
101
+ * * :unknown
102
+ * * :word_joiner
103
+ * * :zero_width_space
104
+ *
105
+ * @raise [ArgumentError] If the string consists of more than one break type
106
+ * @return [Symbol]
107
+ * @see http://unicode.org/reports/tr14/
108
+ * Unicode Standard Annex #14: Unicode Line Breaking Algorithm */
109
+ VALUE
110
+ rb_u_string_line_break(VALUE self)
111
+ {
112
+ return _rb_u_string_property(self, "line break", U_LINE_BREAK_UNKNOWN,
113
+ (int (*)(uint32_t))u_char_line_break,
114
+ (VALUE (*)(int))break_to_symbol);
115
+ }
@@ -0,0 +1,13 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload lower?(locale = ENV[LC_CTYPE])
4
+ * @param [#to_str] locale
5
+ * @return [Boolean] True if the receiver has been downcased according to the
6
+ * rules of the language of LOCALE, which may be empty to specifically use
7
+ * the default, language-independent, rules, that is, if _a_ =
8
+ * _a_{#downcase}(LOCALE), where _a_ = {#normalize}(`:nfd`) */
9
+ VALUE
10
+ rb_u_string_lower(int argc, VALUE *argv, VALUE self)
11
+ {
12
+ return _rb_u_string_test_locale(argc, argv, self, u_downcase);
13
+ }
@@ -0,0 +1,24 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return [U::String] The receiver with its maximum {#space?} prefix removed,
4
+ * inheriting any taint and untrust
5
+ * @see #rstrip
6
+ * @see #strip */
7
+ VALUE
8
+ rb_u_string_lstrip(VALUE self)
9
+ {
10
+ const struct rb_u_string *string = RVAL2USTRING(self);
11
+
12
+ const char *begin = USTRING_STR(string);
13
+ if (begin == NULL)
14
+ return self;
15
+
16
+ const char *p = begin, *end = USTRING_END(string);
17
+ for (const char *q; p < end; p = q)
18
+ if (!u_char_isspace(u_decode(&q, p, end)))
19
+ break;
20
+ if (p == begin)
21
+ return self;
22
+
23
+ return rb_u_string_new_c(self, p, end - p);
24
+ }
@@ -0,0 +1,65 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ /* @overload =~(other)
5
+ * @param [Regexp, #=~] other
6
+ * @raise [TypeError] If OTHER is a {U::String} or String
7
+ * @return [Numeric, nil] The result of OTHER`#=~`(self), that is, the index
8
+ * of the first character of the match of OTHER in the receiver, if one
9
+ * exists */
10
+ VALUE
11
+ rb_u_string_match(VALUE self, VALUE other)
12
+ {
13
+ if (RTEST(rb_obj_is_kind_of(other, rb_cUString)))
14
+ rb_u_raise(rb_eTypeError, "type mismatch: U::String given");
15
+
16
+ switch (TYPE(other)) {
17
+ case T_STRING:
18
+ rb_u_raise(rb_eTypeError, "type mismatch: String given");
19
+ break;
20
+ case T_REGEXP: {
21
+ const struct rb_u_string *string = RVAL2USTRING(self);
22
+
23
+ long index = rb_reg_search(other, rb_str_to_str(self), 0, 0);
24
+ if (index < 0)
25
+ return Qnil;
26
+
27
+ return LONG2NUM(u_pointer_to_offset(USTRING_STR(string),
28
+ USTRING_STR(string) + index));
29
+ }
30
+ default:
31
+ return rb_funcall(other, rb_intern("=~"), 1, self);
32
+ }
33
+ }
34
+
35
+ /* @overload match(pattern, index = 0)
36
+ * @param [Regexp, #to_str] pattern
37
+ * @param [#to_int] index
38
+ * @return [MatchData, nil] The result of _r_#match(self, index), that is,
39
+ * the match data of the first match of _r_ in the receiver, inheriting any
40
+ * taint and untrust from both the receiver and from PATTERN, if one
41
+ * exists, where _r_ = PATTERN, if PATTERN is a Regexp, _r_ =
42
+ * Regexp.new(PATTERN) otherwise
43
+ * @overload match(pattern, index = 0){ |matchdata| … }
44
+ * @param [Regexp, #to_str] pattern
45
+ * @param [#to_int] index
46
+ * @yieldparam [MatchData] matchdata
47
+ * @return [Object, nil] The result of calling the given block with the
48
+ * result of _r_#match(self, index), that is, the match data of the first
49
+ * match of _r_ in the receiver, inheriting any taint and untrust from both
50
+ * the recevier and from PATTERN, if one exists, where _r_ = PATTERN, if
51
+ * PATTERN is a Regexp, _r_ = Regexp.new(PATTERN) otherwise */
52
+ VALUE
53
+ rb_u_string_match_m(int argc, VALUE *argv, VALUE self)
54
+ {
55
+ VALUE re;
56
+ if (argc < 0)
57
+ need_m_to_n_arguments(argc, 1, 2);
58
+ re = argv[0];
59
+ argv[0] = self;
60
+ VALUE result = rb_funcall2(rb_u_pattern_argument(re, false),
61
+ rb_intern("match"), argc, argv);
62
+ if (!NIL_P(result) && rb_block_given_p())
63
+ return rb_yield(result);
64
+ return result;
65
+ }
@@ -0,0 +1,16 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* Returns the mirroring of the receiver, inheriting any taint and untrust.
4
+ *
5
+ * Mirroring is done by replacing characters in the string with their
6
+ * horizontal mirror image, if any, in text that is laid out from right to
7
+ * left. For example, ‘(’ becomes ‘)’ and ‘)’ becomes ‘(’.
8
+ *
9
+ * @return [U::String]
10
+ * @see http://www.unicode.org/reports/tr9/
11
+ * Unicode Standard Annex #9: Unicode Bidirectional Algorithm */
12
+ VALUE
13
+ rb_u_string_mirror(VALUE self)
14
+ {
15
+ return _rb_u_string_convert(self, u_mirror);
16
+ }
@@ -0,0 +1,21 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload newline?
4
+ *
5
+ * Returns true if the receiver contains only “newline” characters. A
6
+ * character is a “newline” character if it is any of the following
7
+ * characters:
8
+ *
9
+ * * U+000A (LINE FEED (LF))
10
+ * * U+000C (FORM FEED (FF))
11
+ * * U+000D (CARRIAGE RETURN (CR))
12
+ * * U+0085 (NEXT LINE)
13
+ * * U+2028 (LINE SEPARATOR)
14
+ * * U+2029 (PARAGRAPH SEPARATOR)
15
+ *
16
+ * @return [Boolean] */
17
+ VALUE
18
+ rb_u_string_newline(VALUE self)
19
+ {
20
+ return _rb_u_character_test(self, u_char_isnewline);
21
+ }
@@ -0,0 +1,70 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload normalize(form = :nfd)
4
+ *
5
+ * Returns the receiver normalized into FORM, inheriting any taint and
6
+ * untrust.
7
+ *
8
+ * Normalization is the process of converting characters and sequences of
9
+ * characters in string into a canonical form. This process includes dealing
10
+ * with whether characters are represented by a composed character or a base
11
+ * character and combining marks, such as accents.
12
+ *
13
+ * The possible normalization forms are
14
+ *
15
+ * <table>
16
+ * <thead>
17
+ * <tr><th>Form</th><th>Description</th></tr>
18
+ * </thead>
19
+ * <tbody>
20
+ * <tr>
21
+ * <td><code>:nfd</code></td>
22
+ * <td>Normalizes characters to their maximally decomposed form,
23
+ * ordering accents and so on according to their combining class</td>
24
+ * </tr>
25
+ * <tr>
26
+ * <td><code>:nfc</code></td>
27
+ * <td>Normalizes according to <code>:nfd</code>, then composes any
28
+ * decomposed characters</td>
29
+ * </tr>
30
+ * <tr>
31
+ * <td><code>:nfkd</code></td>
32
+ * <td>Normalizes according to <code>:nfd</code> and also normalizes
33
+ * “compatibility” characters, such as replacing U+00B3 SUPERSCRIPT
34
+ * THREE with U+0033 DIGIT THREE</td>
35
+ * </tr>
36
+ * <tr>
37
+ * <td><code>:nfkc</code></td>
38
+ * <td>Normalizes according to <code>:nfkd</code>, then composes any
39
+ * decomposed characters</td>
40
+ * </tr>
41
+ * </tbody>
42
+ * </table>
43
+ *
44
+ * @param [#to_sym] form
45
+ * @return [U::String]
46
+ * @see http://unicode.org/reports/tr15/
47
+ * Unicode Standard Annex #15: Unicode Normalization Forms */
48
+ VALUE
49
+ rb_u_string_normalize(int argc, VALUE *argv, VALUE self)
50
+ {
51
+ const struct rb_u_string *string = RVAL2USTRING(self);
52
+
53
+ VALUE rbform;
54
+ enum u_normalization_form form = U_NORMALIZATION_FORM_D;
55
+ if (rb_scan_args(argc, argv, "01", &rbform) == 1)
56
+ form = _rb_u_symbol_to_normalization_form(rbform);
57
+
58
+ size_t n = u_normalize(NULL, 0,
59
+ USTRING_STR(string), USTRING_LENGTH(string),
60
+ form);
61
+ char *normalized = ALLOC_N(char, n + 1);
62
+ n = u_normalize(normalized, n + 1,
63
+ USTRING_STR(string), USTRING_LENGTH(string),
64
+ form);
65
+ char *t = REALLOC_N(normalized, char, n + 1);
66
+ if (t != NULL)
67
+ normalized = t;
68
+
69
+ return rb_u_string_new_c_own(self, normalized, n);
70
+ }
@@ -0,0 +1,28 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload normalize?(mode = :default)
4
+ *
5
+ * Returns true if it can be determined that the receiver is normalized
6
+ * according to MODE.
7
+ *
8
+ * See {#normalize} for a discussion on normalization and a list of the
9
+ * possible normalization modes.
10
+ *
11
+ * @param [#to_sym] mode
12
+ * @return [Boolean]
13
+ * @see http://unicode.org/reports/tr15/
14
+ * Unicode Standard Annex #15: Unicode Normalization Forms */
15
+ VALUE
16
+ rb_u_string_normalized(int argc, VALUE *argv, VALUE self)
17
+ {
18
+ const struct rb_u_string *string = RVAL2USTRING(self);
19
+
20
+ VALUE rbform;
21
+ enum u_normalization_form form = U_NORMALIZATION_FORM_D;
22
+ if (rb_scan_args(argc, argv, "01", &rbform) == 1)
23
+ form = _rb_u_symbol_to_normalization_form(rbform);
24
+
25
+ return u_normalized(USTRING_STR(string),
26
+ USTRING_LENGTH(string),
27
+ form) == U_NORMALIZED_YES ? Qtrue : Qfalse;
28
+ }
@@ -0,0 +1,11 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_to_inum.h"
3
+
4
+ /* @return [Integer] The result of {#to_i}(8), but with the added provision
5
+ * that any leading base specification in the receiver will override the
6
+ * suggested octal (8) base, that is, `'0b11'.u`{#oct} = 3, not 9. */
7
+ VALUE
8
+ rb_u_string_oct(VALUE self)
9
+ {
10
+ return rb_u_string_to_inum(self, -8, false);
11
+ }
@@ -0,0 +1,14 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return [Integer] The code point of the first character of the receiver */
4
+ VALUE
5
+ rb_u_string_ord(VALUE self)
6
+ {
7
+ const struct rb_u_string *s = RVAL2USTRING(self);
8
+ const char *p = USTRING_STR(s);
9
+ const char *end = USTRING_END(s);
10
+ if (p == end)
11
+ rb_u_raise(rb_eArgError, "empty string");
12
+ const char *q;
13
+ return UINT2NUM(u_decode(&q, p, end));
14
+ }
@@ -0,0 +1,80 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ static VALUE
5
+ rb_u_string_partition_failure(VALUE self)
6
+ {
7
+ return rb_ary_new3(3,
8
+ self,
9
+ rb_u_string_new_empty(self),
10
+ rb_u_string_new_empty(self));
11
+ }
12
+
13
+ static VALUE
14
+ rb_u_string_partition_success(VALUE self, VALUE rbseparator, long offset)
15
+ {
16
+ const struct rb_u_string *string = RVAL2USTRING(self);
17
+ const struct rb_u_string *separator = RVAL2USTRING_ANY(rbseparator);
18
+
19
+ long after = offset + USTRING_LENGTH(separator);
20
+
21
+ return rb_ary_new3(3,
22
+ rb_u_string_new_subsequence(self, 0, offset),
23
+ TYPE(rbseparator) == T_STRING ?
24
+ rb_u_string_new_rb(rbseparator) :
25
+ rbseparator,
26
+ rb_u_string_new_subsequence(self,
27
+ after,
28
+ USTRING_LENGTH(string) - after));
29
+ }
30
+
31
+ static VALUE
32
+ rb_u_string_partition_regex(VALUE self, VALUE regex)
33
+ {
34
+ VALUE str = rb_str_to_str(self);
35
+
36
+ long offset = rb_reg_search(regex, str, 0, 0);
37
+ if (offset < 0)
38
+ return rb_u_string_partition_failure(self);
39
+
40
+ VALUE separator = rb_u_pattern_match_reference(INT2FIX(0));
41
+
42
+ if (offset == 0 && RSTRING_LEN(separator) == 0)
43
+ return rb_u_string_partition_failure(self);
44
+
45
+ return rb_u_string_partition_success(self, separator, offset);
46
+ }
47
+
48
+ static VALUE
49
+ rb_u_string_partition_string(VALUE self, VALUE rbseparator)
50
+ {
51
+ VALUE validated = rb_u_string_validate_type(rbseparator);
52
+
53
+ long offset = rb_u_string_index(self, validated, 0);
54
+ if (offset < 0)
55
+ return rb_u_string_partition_failure(self);
56
+
57
+ const char *begin = USTRING_STR(RVAL2USTRING(self));
58
+ long byte_offset = u_offset_to_pointer(begin, offset) - begin;
59
+
60
+ return rb_u_string_partition_success(self, validated, byte_offset);
61
+ }
62
+
63
+ /* @overload partition(separator)
64
+ * @param [Regexp, #to_str] separator
65
+ * @return [Array<U::String>] The receiver split into _s₁_ = {#slice}(0,
66
+ * _i_), _s₂_ = {#slice}(_i_, _n_), _s₃_ = {#slice}(_i_+_n_, -1), where _i_
67
+ * = _j_ if _j_ ≠ nil, _i_ = {#length} otherwise, _j_ =
68
+ * {#index}(SEPARATOR), _n_ = SEPARATOR{#length}, where _s₁_ and _s₃_
69
+ * inherit any taint and untrust from the receiver and _s₂_ inherits any
70
+ * taint and untrust from SEPARATOR and also from the receiver if SEPARATOR
71
+ * is a Regexp
72
+ * @see #rpartition */
73
+ VALUE
74
+ rb_u_string_partition(VALUE self, VALUE separator)
75
+ {
76
+ if (TYPE(separator) == T_REGEXP)
77
+ return rb_u_string_partition_regex(self, separator);
78
+
79
+ return rb_u_string_partition_string(self, separator);
80
+ }