u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,109 @@
1
+ #include "rb_includes.h"
2
+
3
+ #define CATEGORY2ID(type, symbol) \
4
+ case U_GENERAL_CATEGORY_##type: { \
5
+ static ID id_##symbol; \
6
+ if (id_##symbol == 0) \
7
+ id_##symbol = rb_intern(#symbol); \
8
+ return ID2SYM(id_##symbol); \
9
+ }
10
+
11
+ static VALUE
12
+ category_to_symbol(enum u_general_category category)
13
+ {
14
+ switch (category) {
15
+ CATEGORY2ID(OTHER_CONTROL, other_control)
16
+ CATEGORY2ID(OTHER_FORMAT, other_format)
17
+ CATEGORY2ID(OTHER_NOT_ASSIGNED, other_not_assigned)
18
+ CATEGORY2ID(OTHER_PRIVATE_USE, other_private_use)
19
+ CATEGORY2ID(OTHER_SURROGATE, other_surrogate)
20
+ CATEGORY2ID(LETTER_LOWERCASE, letter_lowercase)
21
+ CATEGORY2ID(LETTER_MODIFIER, letter_modifier)
22
+ CATEGORY2ID(LETTER_OTHER, letter_other)
23
+ CATEGORY2ID(LETTER_TITLECASE, letter_titlecase)
24
+ CATEGORY2ID(LETTER_UPPERCASE, letter_uppercase)
25
+ CATEGORY2ID(MARK_SPACING_COMBINING, mark_spacing_combining)
26
+ CATEGORY2ID(MARK_ENCLOSING, mark_enclosing)
27
+ CATEGORY2ID(MARK_NON_SPACING, mark_non_spacing)
28
+ CATEGORY2ID(NUMBER_DECIMAL, number_decimal)
29
+ CATEGORY2ID(NUMBER_LETTER, number_letter)
30
+ CATEGORY2ID(NUMBER_OTHER, number_other)
31
+ CATEGORY2ID(PUNCTUATION_CONNECTOR, punctuation_connector)
32
+ CATEGORY2ID(PUNCTUATION_DASH, punctuation_dash)
33
+ CATEGORY2ID(PUNCTUATION_CLOSE, punctuation_close)
34
+ CATEGORY2ID(PUNCTUATION_FINAL_QUOTE, punctuation_final_quote)
35
+ CATEGORY2ID(PUNCTUATION_INITIAL_QUOTE, punctuation_initial_quote)
36
+ CATEGORY2ID(PUNCTUATION_OTHER, punctuation_other)
37
+ CATEGORY2ID(PUNCTUATION_OPEN, punctuation_open)
38
+ CATEGORY2ID(SYMBOL_CURRENCY, symbol_currency)
39
+ CATEGORY2ID(SYMBOL_MODIFIER, symbol_modifier)
40
+ CATEGORY2ID(SYMBOL_MATH, symbol_math)
41
+ CATEGORY2ID(SYMBOL_OTHER, symbol_other)
42
+ CATEGORY2ID(SEPARATOR_LINE, separator_line)
43
+ CATEGORY2ID(SEPARATOR_PARAGRAPH, separator_paragraph)
44
+ CATEGORY2ID(SEPARATOR_SPACE, separator_space)
45
+ default:
46
+ rb_u_raise(rb_eNotImpError, "unknown general category: %d", category);
47
+ }
48
+ }
49
+
50
+ /* Returns the general category of the characters of the receiver.
51
+ *
52
+ * The general category identifies what kind of symbol the character is.
53
+ *
54
+ * <table>
55
+ * <thead>
56
+ * <tr>
57
+ * <th>Category Major, minor</th>
58
+ * <th>Unicode Value</th>
59
+ * <th>Ruby Value</th>
60
+ * </tr>
61
+ * </thead>
62
+ * <tbody>
63
+ * <tr><td>Other, control</td><td>Cc</td><td>:other_control</td></tr>
64
+ * <tr><td>Other, format</td><td>Cf</td><td>:other_format</td></tr>
65
+ * <tr><td>Other, not assigned</td><td>Cn</td><td>:other_not_assigned</td></tr>
66
+ * <tr><td>Other, private use</td><td>Co</td><td>:other_private_use</td></tr>
67
+ * <tr><td>Other, surrogate</td><td>Cs</td><td>:other_surrogate</td></tr>
68
+ * <tr><td>Letter, lowercase</td><td>Ll</td><td>:letter_lowercase</td></tr>
69
+ * <tr><td>Letter, modifier</td><td>Lm</td><td>:letter_modifier</td></tr>
70
+ * <tr><td>Letter, other</td><td>Lo</td><td>:letter_other</td></tr>
71
+ * <tr><td>Letter, titlecase</td><td>Lt</td><td>:letter_titlecase</td></tr>
72
+ * <tr><td>Letter, uppercase</td><td>Lu</td><td>:letter_uppercase</td></tr>
73
+ * <tr><td>Mark, spacing combining</td><td>Mc</td><td>:mark_spacing_combining</td></tr>
74
+ * <tr><td>Mark, enclosing</td><td>Me</td><td>:mark_enclosing</td></tr>
75
+ * <tr><td>Mark, nonspacing</td><td>Mn</td><td>:mark_non_spacing</td></tr>
76
+ * <tr><td>Number, decimal digit</td><td>Nd</td><td>:number_decimal</td></tr>
77
+ * <tr><td>Number, letter</td><td>Nl</td><td>:number_letter</td></tr>
78
+ * <tr><td>Number, other</td><td>No</td><td>:number_other</td></tr>
79
+ * <tr><td>Punctuation, connector</td><td>Pc</td><td>:punctuation_connector</td></tr>
80
+ * <tr><td>Punctuation, dash</td><td>Pd</td><td>:punctuation_dash</td></tr>
81
+ * <tr><td>Punctuation, close</td><td>Pe</td><td>:punctuation_close</td></tr>
82
+ * <tr><td>Punctuation, final quote</td><td>Pf</td><td>:punctuation_final_quote</td></tr>
83
+ * <tr><td>Punctuation, initial quote</td><td>Pi</td><td>:punctuation_initial_quote</td></tr>
84
+ * <tr><td>Punctuation, other</td><td>Po</td><td>:punctuation_other</td></tr>
85
+ * <tr><td>Punctuation, open</td><td>Ps</td><td>:punctuation_open</td></tr>
86
+ * <tr><td>Symbol, currency</td><td>Sc</td><td>:symbol_currency</td></tr>
87
+ * <tr><td>Symbol, modifier</td><td>Sk</td><td>:symbol_modifier</td></tr>
88
+ * <tr><td>Symbol, math</td><td>Sm</td><td>:symbol_math</td></tr>
89
+ * <tr><td>Symbol, other</td><td>So</td><td>:symbol_other</td></tr>
90
+ * <tr><td>Separator, line</td><td>Zl</td><td>:separator_line</td></tr>
91
+ * <tr><td>Separator, paragraph</td><td>Zp</td><td>:separator_paragraph</td></tr>
92
+ * <tr><td>Separator, space</td><td>Zs</td><td>:separator_space</td></tr>
93
+ * </tbody>
94
+ * </table>
95
+ *
96
+ * @raise [ArgumentError] If the receiver contains two characters belonging to
97
+ * different general categories
98
+ * @raise [ArgumentError] If the receiver contains an incomplete UTF-8 sequence
99
+ * @raise [ArgumentError] If the receiver contains an invalid UTF-8 sequence
100
+ * @return [Symbol]
101
+ * @see http://www.unicode.org/notes/tn36/
102
+ * Unicode Technical Note #36: A Categorization of Unicode Characters */
103
+ VALUE
104
+ rb_u_string_general_category(VALUE self)
105
+ {
106
+ return _rb_u_string_property(self, "general category", U_GENERAL_CATEGORY_OTHER_NOT_ASSIGNED,
107
+ (int (*)(uint32_t))u_char_general_category,
108
+ (VALUE (*)(int))category_to_symbol);
109
+ }
@@ -0,0 +1,21 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload getbyte(index)
4
+ * @param [#to_int] index
5
+ * @return [Fixnum, nil] The byte at byte-index _i_, where _i_ = INDEX if
6
+ * INDEX ≥ 0, _i_ = {#bytesize} - abs(INDEX) otherwise, or nil if _i_ lies
7
+ * outside of [0, {#bytesize}) */
8
+ VALUE
9
+ rb_u_string_getbyte(VALUE self, VALUE rbindex)
10
+ {
11
+ const struct rb_u_string *string = RVAL2USTRING(self);
12
+ long index = NUM2LONG(rbindex);
13
+
14
+ if (index < 0)
15
+ index += USTRING_LENGTH(string);
16
+
17
+ if (index < 0 || USTRING_LENGTH(string) <= index)
18
+ return Qnil;
19
+
20
+ return INT2FIX((unsigned char)USTRING_STR(string)[index]);
21
+ }
@@ -0,0 +1,21 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload graph?
4
+ *
5
+ * Returns true if the receiver contains only non-space “printable” characters.
6
+ *
7
+ * Non-space “printable” characters are those not in the general categories
8
+ * Other or Space, separator (Zs):
9
+ *
10
+ * * Other, control (Cc)
11
+ * * Other, format (Cf)
12
+ * * Other, not assigned (Cn)
13
+ * * Other, surrogate (Cs)
14
+ * * Space, separator (Zs)
15
+ *
16
+ * @return [Boolean] */
17
+ VALUE
18
+ rb_u_string_graph(VALUE self)
19
+ {
20
+ return _rb_u_character_test(self, u_char_isgraph);
21
+ }
@@ -0,0 +1,61 @@
1
+ #include "rb_includes.h"
2
+
3
+ #define BREAK2ID(value, symbol) \
4
+ case U_GRAPHEME_BREAK_##value: { \
5
+ static ID id_##symbol; \
6
+ if (id_##symbol == 0) \
7
+ id_##symbol = rb_intern(#symbol); \
8
+ return ID2SYM(id_##symbol); \
9
+ }
10
+
11
+ static VALUE
12
+ break_to_symbol(enum u_grapheme_break value)
13
+ {
14
+ switch (value) {
15
+ BREAK2ID(CONTROL, control)
16
+ BREAK2ID(CR, cr)
17
+ BREAK2ID(EXTEND, extend)
18
+ BREAK2ID(L, l)
19
+ BREAK2ID(LF, lf)
20
+ BREAK2ID(LV, lv)
21
+ BREAK2ID(LVT, lvt)
22
+ BREAK2ID(OTHER, other)
23
+ BREAK2ID(PREPEND, prepend)
24
+ BREAK2ID(REGIONAL_INDICATOR, regional_indicator)
25
+ BREAK2ID(SPACINGMARK, spacingmark)
26
+ BREAK2ID(T, t)
27
+ BREAK2ID(V, v)
28
+ default:
29
+ rb_u_raise(rb_eNotImpError, "unknown grapheme break: %d", value);
30
+ }
31
+ }
32
+
33
+ /* Returns the grapheme break property value of the characters of the receiver.
34
+ *
35
+ * The possible break values are
36
+ *
37
+ * * :control
38
+ * * :cr
39
+ * * :extend
40
+ * * :l
41
+ * * :lf
42
+ * * :lv
43
+ * * :lvt
44
+ * * :other
45
+ * * :prepend
46
+ * * :regional_indicator
47
+ * * :spacingmark
48
+ * * :t
49
+ * * :v
50
+ *
51
+ * @raise [ArgumentError] If the string consists of more than one break type
52
+ * @return [Symbol]
53
+ * @see http://www.unicode.org/reports/tr29/
54
+ * Unicode Standard Annex #29: Unicode Text Segmentation */
55
+ VALUE
56
+ rb_u_string_grapheme_break(VALUE self)
57
+ {
58
+ return _rb_u_string_property(self, "grapheme break", U_GRAPHEME_BREAK_OTHER,
59
+ (int (*)(uint32_t))u_char_grapheme_break,
60
+ (VALUE (*)(int))break_to_symbol);
61
+ }
@@ -0,0 +1,164 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ /* @overload gsub(pattern, replacement)
5
+ *
6
+ * Returns the receiver with all matches of PATTERN replaced by REPLACEMENT,
7
+ * inheriting any taint and untrust from the receiver and from REPLACEMENT.
8
+ *
9
+ * The REPLACEMENT is used as a specification for what to replace matches
10
+ * with:
11
+ *
12
+ * <table>
13
+ * <thead>
14
+ * <tr><th>Specification</th><th>Replacement</th></tr>
15
+ * </thead>
16
+ * <tbody>
17
+ * <tr>
18
+ * <td><code>\1</code>, <code>\2</code>, …, <code>\</code><em>n</em></td>
19
+ * <td>Numbered sub-match <em>n</em></td>
20
+ * </tr>
21
+ * <tr>
22
+ * <td><code>\k&lt;</code><em>name</em><code>></code></td>
23
+ * <td>Named sub-match <em>name</em></td>
24
+ * </tr>
25
+ * </tbody>
26
+ * </table>
27
+ *
28
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
29
+ * `$`_n_ are updated accordingly.
30
+ *
31
+ * @param [Regexp, #to_str] pattern
32
+ * @param [#to_str] replacement
33
+ * @return [U::String]
34
+ *
35
+ * @overload gsub(pattern, replacements)
36
+ *
37
+ * Returns the receiver with all matches of PATTERN replaced by
38
+ * REPLACEMENTS#[_match_], where _match_ is the matched substring, inheriting
39
+ * any taint and untrust from the receiver and from the
40
+ * REPLACEMENTS#[_match_]es, as well as any taint on REPLACEMENTS.
41
+ *
42
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
43
+ * `$`_n_ are updated accordingly.
44
+ *
45
+ * @param [Regexp, #to_str] pattern
46
+ * @param [#to_hash] replacements
47
+ * @raise [RuntimeError] If any replacement is the result being constructed
48
+ * @raise [Exception] Any error raised by REPLACEMENTS#default, if it gets
49
+ * called
50
+ * @return [U::String]
51
+ *
52
+ * @overload gsub(pattern){ |match| … }
53
+ *
54
+ * Returns the receiver with all matches of PATTERN replaced by the results
55
+ * of the given block, inheriting any taint and untrust from the receiver and
56
+ * from the results of the given block.
57
+ *
58
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
59
+ * `$`_n_ are updated accordingly.
60
+ *
61
+ * @param [Regexp, #to_str] pattern
62
+ * @yieldparam [U::String] match
63
+ * @yieldreturn [#to_str]
64
+ * @return [U::String]
65
+ *
66
+ * @overload gsub(pattern)
67
+ *
68
+ * Returns an Enumerator over the matches of PATTERN in the receiver.
69
+ *
70
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
71
+ * `$`_n_ will be updated accordingly.
72
+ *
73
+ * @param [Regexp, #to_str] pattern
74
+ * @return [Enumerator] */
75
+ VALUE
76
+ rb_u_string_gsub(int argc, VALUE *argv, VALUE self)
77
+ {
78
+ VALUE pattern, replacement;
79
+ VALUE replacements = Qnil;
80
+ bool use_block = false;
81
+ bool tainted = false;
82
+
83
+ if (argc == 1) {
84
+ RETURN_ENUMERATOR(self, argc, argv);
85
+ use_block = true;
86
+ }
87
+
88
+ if (rb_scan_args(argc, argv, "11", &pattern, &replacement) == 2) {
89
+ replacements = rb_check_convert_type(replacement, T_HASH,
90
+ "Hash", "to_hash");
91
+ if (NIL_P(replacements))
92
+ StringValue(replacement);
93
+ if (OBJ_TAINTED(replacement))
94
+ tainted = true;
95
+ }
96
+
97
+ pattern = rb_u_pattern_argument(pattern, true);
98
+
99
+ VALUE str = rb_str_to_str(self);
100
+ long begin = rb_reg_search(pattern, str, 0, 0);
101
+ if (begin < 0)
102
+ return self;
103
+
104
+ const char *base = RSTRING_PTR(str);
105
+ const char *p = base;
106
+ const char *end = RSTRING_END(str);
107
+ VALUE substituted = rb_u_str_buf_new(RSTRING_LEN(str) + 30);
108
+ do {
109
+ VALUE match = rb_backref_get();
110
+ struct re_registers *registers = RMATCH_REGS(match);
111
+ VALUE result;
112
+
113
+ if (use_block || !NIL_P(replacements)) {
114
+ if (use_block) {
115
+ VALUE ustr = rb_u_string_new_rb(rb_reg_nth_match(0, match));
116
+ result = rb_u_string_object_as_string(rb_yield(ustr));
117
+ } else {
118
+ VALUE ustr = rb_u_string_new_c(self,
119
+ base + registers->beg[0],
120
+ registers->end[0] - registers->beg[0]);
121
+ result = rb_u_string_object_as_string(rb_hash_aref(replacements, ustr));
122
+ }
123
+
124
+ if (result == substituted)
125
+ rb_u_raise(rb_eRuntimeError,
126
+ "result of block is string being built; please try not to cheat");
127
+ } else
128
+ result =
129
+ #ifdef HAVE_RB_REG_REGSUB4
130
+ rb_reg_regsub(replacement, str, registers, pattern);
131
+ #else
132
+ rb_reg_regsub(replacement, str, registers);
133
+ #endif
134
+
135
+ if (OBJ_TAINTED(result))
136
+ tainted = true;
137
+
138
+ const struct rb_u_string *value = RVAL2USTRING_ANY(result);
139
+
140
+ rb_str_buf_cat(substituted, p, registers->beg[0] - (p - base));
141
+ rb_str_buf_cat(substituted, USTRING_STR(value), USTRING_LENGTH(value));
142
+ OBJ_INFECT(substituted, result);
143
+
144
+ p = base + registers->end[0];
145
+ if (registers->beg[0] == registers->end[0])
146
+ p = u_next(p);
147
+ if (p >= end)
148
+ break;
149
+
150
+ begin = rb_reg_search(pattern, str, registers->end[0], 0);
151
+ } while (begin >= 0);
152
+
153
+ if (p < end)
154
+ rb_str_buf_cat(substituted, p, end - p);
155
+
156
+ rb_reg_search(pattern, str, end - p, 0);
157
+
158
+ RBASIC(substituted)->klass = rb_obj_class(str);
159
+ OBJ_INFECT(substituted, str);
160
+ if (tainted)
161
+ OBJ_TAINT(substituted);
162
+
163
+ return rb_u_string_new_rb(substituted);
164
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return [Fixnum] The hash value of the receiver’s content */
4
+ VALUE
5
+ rb_u_string_hash(VALUE self)
6
+ {
7
+ const struct rb_u_string *string = RVAL2USTRING(self);
8
+
9
+ return INT2FIX(rb_memhash(USTRING_STR(string), USTRING_LENGTH(string)));
10
+ }
@@ -0,0 +1,9 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_to_inum.h"
3
+
4
+ /* @return [Integer] The result of {#to_i}(16) */
5
+ VALUE
6
+ rb_u_string_hex(VALUE self)
7
+ {
8
+ return rb_u_string_to_inum(self, 16, false);
9
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload include?(substring)
4
+ * @param [#to_str] substring
5
+ * @return [Boolean] True if {#index}(SUBSTRING) ≠ nil */
6
+ VALUE
7
+ rb_u_string_include(VALUE self, VALUE substring)
8
+ {
9
+ return rb_u_string_index(self, substring, 0) != -1 ? Qtrue : Qfalse;
10
+ }
@@ -0,0 +1,110 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ /* TODO: Return VALUE. */
5
+ long
6
+ rb_u_string_index_regexp(VALUE self, const char *begin, VALUE regex, bool reverse)
7
+ {
8
+ const struct rb_u_string *string = RVAL2USTRING(self);
9
+ VALUE rbstring = rb_str_to_str(self);
10
+
11
+ const char *base = USTRING_STR(string);
12
+
13
+ long index = rb_reg_search(regex, rbstring,
14
+ rb_reg_adjust_startpos(regex, rbstring,
15
+ begin - base,
16
+ reverse),
17
+ reverse);
18
+ if (index == -1)
19
+ return -1;
20
+
21
+ return u_pointer_to_offset(base, base + index);
22
+ }
23
+
24
+ /* TODO: Return VALUE. */
25
+ long
26
+ rb_u_string_index(VALUE self, VALUE rbsubstring, long offset)
27
+ {
28
+ const struct rb_u_string *string = RVAL2USTRING(self);
29
+ const struct rb_u_string *substring = RVAL2USTRING_ANY(rbsubstring);
30
+
31
+ const char *begin = rb_u_string_begin_from_offset(string, offset);
32
+ if (begin == NULL)
33
+ return -1;
34
+
35
+ const char *end = USTRING_END(string);
36
+
37
+ long substring_length = USTRING_LENGTH(substring);
38
+ if (end - begin < substring_length)
39
+ return -1;
40
+ if (substring_length == 0)
41
+ return offset;
42
+
43
+ /* TODO: Should we really be using rb_memsearch? Why not something
44
+ * more Unicodey? */
45
+ long index = rb_u_memsearch(USTRING_STR(substring), substring_length,
46
+ begin,
47
+ end - begin);
48
+ if (index < 0)
49
+ return -1;
50
+
51
+ return offset + u_pointer_to_offset(begin, begin + index);
52
+ }
53
+
54
+ /* @overload index(pattern, offset = 0)
55
+ *
56
+ * Returns the minimal index of the receiver where PATTERN matches, equal to or
57
+ * greater than _i_, where _i_ = OFFSET if OFFSET ≥ 0, _i_ = {#length} -
58
+ * abs(OFFSET) otherwise, or nil if there is no match.
59
+ *
60
+ * If PATTERN is a Regexp, the Regexp special variables `$&`, `$'`,
61
+ * <code>$\`</code>, `$1`, `$2`, …, `$`_n_ are updated accordingly.
62
+ *
63
+ * If PATTERN responds to #to_str, the matching is performed by byte
64
+ * comparison.
65
+ *
66
+ * @param [Regexp, #to_str] pattern
67
+ * @param [#to_int] offset
68
+ * @return [Integer, nil]
69
+ * @see #rindex */
70
+ VALUE
71
+ rb_u_string_index_m(int argc, VALUE *argv, VALUE self)
72
+ {
73
+ VALUE sub, rboffset;
74
+ long offset = 0;
75
+ if (rb_scan_args(argc, argv, "11", &sub, &rboffset) == 2)
76
+ offset = NUM2LONG(rboffset);
77
+
78
+ const struct rb_u_string *string = RVAL2USTRING(self);
79
+
80
+ const char *begin = rb_u_string_begin_from_offset(string, offset);
81
+ if (begin == NULL) {
82
+ if (TYPE(sub) == T_REGEXP)
83
+ rb_backref_set(Qnil);
84
+
85
+ return Qnil;
86
+ }
87
+
88
+ switch (TYPE(sub)) {
89
+ case T_REGEXP:
90
+ offset = rb_u_string_index_regexp(self, begin, sub, false);
91
+ break;
92
+ default: {
93
+ VALUE tmp = rb_check_string_type(sub);
94
+ if (NIL_P(tmp))
95
+ rb_u_raise(rb_eTypeError, "type mismatch: %s given",
96
+ rb_obj_classname(sub));
97
+
98
+ sub = tmp;
99
+ }
100
+ /* fall through */
101
+ case T_STRING:
102
+ offset = rb_u_string_index(self, sub, offset);
103
+ break;
104
+ }
105
+
106
+ if (offset < 0)
107
+ return Qnil;
108
+
109
+ return LONG2NUM(offset);
110
+ }