u 0.5.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,109 @@
1
+ #include "rb_includes.h"
2
+
3
+ #define CATEGORY2ID(type, symbol) /* expands to one switch case that returns the Symbol :symbol */ \
4
+ case U_GENERAL_CATEGORY_##type: { \
5
+ static ID id_##symbol; /* static, so the interned ID is kept between calls */ \
6
+ if (id_##symbol == 0) \
7
+ id_##symbol = rb_intern(#symbol); \
8
+ return ID2SYM(id_##symbol); \
9
+ }
10
+
11
+ static VALUE
12
+ category_to_symbol(enum u_general_category category) /* maps a general category value to its Ruby Symbol */
13
+ {
14
+ switch (category) {
15
+ CATEGORY2ID(OTHER_CONTROL, other_control)
16
+ CATEGORY2ID(OTHER_FORMAT, other_format)
17
+ CATEGORY2ID(OTHER_NOT_ASSIGNED, other_not_assigned)
18
+ CATEGORY2ID(OTHER_PRIVATE_USE, other_private_use)
19
+ CATEGORY2ID(OTHER_SURROGATE, other_surrogate)
20
+ CATEGORY2ID(LETTER_LOWERCASE, letter_lowercase)
21
+ CATEGORY2ID(LETTER_MODIFIER, letter_modifier)
22
+ CATEGORY2ID(LETTER_OTHER, letter_other)
23
+ CATEGORY2ID(LETTER_TITLECASE, letter_titlecase)
24
+ CATEGORY2ID(LETTER_UPPERCASE, letter_uppercase)
25
+ CATEGORY2ID(MARK_SPACING_COMBINING, mark_spacing_combining)
26
+ CATEGORY2ID(MARK_ENCLOSING, mark_enclosing)
27
+ CATEGORY2ID(MARK_NON_SPACING, mark_non_spacing)
28
+ CATEGORY2ID(NUMBER_DECIMAL, number_decimal)
29
+ CATEGORY2ID(NUMBER_LETTER, number_letter)
30
+ CATEGORY2ID(NUMBER_OTHER, number_other)
31
+ CATEGORY2ID(PUNCTUATION_CONNECTOR, punctuation_connector)
32
+ CATEGORY2ID(PUNCTUATION_DASH, punctuation_dash)
33
+ CATEGORY2ID(PUNCTUATION_CLOSE, punctuation_close)
34
+ CATEGORY2ID(PUNCTUATION_FINAL_QUOTE, punctuation_final_quote)
35
+ CATEGORY2ID(PUNCTUATION_INITIAL_QUOTE, punctuation_initial_quote)
36
+ CATEGORY2ID(PUNCTUATION_OTHER, punctuation_other)
37
+ CATEGORY2ID(PUNCTUATION_OPEN, punctuation_open)
38
+ CATEGORY2ID(SYMBOL_CURRENCY, symbol_currency)
39
+ CATEGORY2ID(SYMBOL_MODIFIER, symbol_modifier)
40
+ CATEGORY2ID(SYMBOL_MATH, symbol_math)
41
+ CATEGORY2ID(SYMBOL_OTHER, symbol_other)
42
+ CATEGORY2ID(SEPARATOR_LINE, separator_line)
43
+ CATEGORY2ID(SEPARATOR_PARAGRAPH, separator_paragraph)
44
+ CATEGORY2ID(SEPARATOR_SPACE, separator_space)
45
+ default: /* reached only when no CATEGORY2ID case above matched */
46
+ rb_u_raise(rb_eNotImpError, "unknown general category: %d", category);
47
+ }
48
+ }
49
+
50
+ /* Returns the general category of the characters of the receiver.
51
+ *
52
+ * The general category identifies what kind of symbol the character is.
53
+ *
54
+ * <table>
55
+ * <thead>
56
+ * <tr>
57
+ * <th>Category Major, minor</th>
58
+ * <th>Unicode Value</th>
59
+ * <th>Ruby Value</th>
60
+ * </tr>
61
+ * </thead>
62
+ * <tbody>
63
+ * <tr><td>Other, control</td><td>Cc</td><td>:other_control</td></tr>
64
+ * <tr><td>Other, format</td><td>Cf</td><td>:other_format</td></tr>
65
+ * <tr><td>Other, not assigned</td><td>Cn</td><td>:other_not_assigned</td></tr>
66
+ * <tr><td>Other, private use</td><td>Co</td><td>:other_private_use</td></tr>
67
+ * <tr><td>Other, surrogate</td><td>Cs</td><td>:other_surrogate</td></tr>
68
+ * <tr><td>Letter, lowercase</td><td>Ll</td><td>:letter_lowercase</td></tr>
69
+ * <tr><td>Letter, modifier</td><td>Lm</td><td>:letter_modifier</td></tr>
70
+ * <tr><td>Letter, other</td><td>Lo</td><td>:letter_other</td></tr>
71
+ * <tr><td>Letter, titlecase</td><td>Lt</td><td>:letter_titlecase</td></tr>
72
+ * <tr><td>Letter, uppercase</td><td>Lu</td><td>:letter_uppercase</td></tr>
73
+ * <tr><td>Mark, spacing combining</td><td>Mc</td><td>:mark_spacing_combining</td></tr>
74
+ * <tr><td>Mark, enclosing</td><td>Me</td><td>:mark_enclosing</td></tr>
75
+ * <tr><td>Mark, nonspacing</td><td>Mn</td><td>:mark_non_spacing</td></tr>
76
+ * <tr><td>Number, decimal digit</td><td>Nd</td><td>:number_decimal</td></tr>
77
+ * <tr><td>Number, letter</td><td>Nl</td><td>:number_letter</td></tr>
78
+ * <tr><td>Number, other</td><td>No</td><td>:number_other</td></tr>
79
+ * <tr><td>Punctuation, connector</td><td>Pc</td><td>:punctuation_connector</td></tr>
80
+ * <tr><td>Punctuation, dash</td><td>Pd</td><td>:punctuation_dash</td></tr>
81
+ * <tr><td>Punctuation, close</td><td>Pe</td><td>:punctuation_close</td></tr>
82
+ * <tr><td>Punctuation, final quote</td><td>Pf</td><td>:punctuation_final_quote</td></tr>
83
+ * <tr><td>Punctuation, initial quote</td><td>Pi</td><td>:punctuation_initial_quote</td></tr>
84
+ * <tr><td>Punctuation, other</td><td>Po</td><td>:punctuation_other</td></tr>
85
+ * <tr><td>Punctuation, open</td><td>Ps</td><td>:punctuation_open</td></tr>
86
+ * <tr><td>Symbol, currency</td><td>Sc</td><td>:symbol_currency</td></tr>
87
+ * <tr><td>Symbol, modifier</td><td>Sk</td><td>:symbol_modifier</td></tr>
88
+ * <tr><td>Symbol, math</td><td>Sm</td><td>:symbol_math</td></tr>
89
+ * <tr><td>Symbol, other</td><td>So</td><td>:symbol_other</td></tr>
90
+ * <tr><td>Separator, line</td><td>Zl</td><td>:separator_line</td></tr>
91
+ * <tr><td>Separator, paragraph</td><td>Zp</td><td>:separator_paragraph</td></tr>
92
+ * <tr><td>Separator, space</td><td>Zs</td><td>:separator_space</td></tr>
93
+ * </tbody>
94
+ * </table>
95
+ *
96
+ * @raise [ArgumentError] If the receiver contains two characters belonging to
97
+ * different general categories
98
+ * @raise [ArgumentError] If the receiver contains an incomplete UTF-8 sequence
99
+ * @raise [ArgumentError] If the receiver contains an invalid UTF-8 sequence
100
+ * @return [Symbol]
101
+ * @see http://www.unicode.org/notes/tn36/
102
+ * Unicode Technical Note #36: A Categorization of Unicode Characters */
103
+ VALUE
104
+ rb_u_string_general_category(VALUE self)
105
+ { /* delegates to _rb_u_string_property with the per-character lookup and the Symbol converter */
106
+ return _rb_u_string_property(self, "general category", U_GENERAL_CATEGORY_OTHER_NOT_ASSIGNED, /* presumably the value used for an empty receiver - TODO confirm in _rb_u_string_property */
107
+ (int (*)(uint32_t))u_char_general_category, /* NOTE(review): both callbacks are passed through cast function-pointer types (enum vs. int) - confirm this is safe on all targets */
108
+ (VALUE (*)(int))category_to_symbol);
109
+ }
@@ -0,0 +1,21 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload getbyte(index)
4
+ * @param [#to_int] index
5
+ * @return [Fixnum, nil] The byte at byte-index _i_, where _i_ = INDEX if
6
+ * INDEX ≥ 0, _i_ = {#bytesize} - abs(INDEX) otherwise, or nil if _i_ lies
7
+ * outside of [0, {#bytesize}) */
8
+ VALUE
9
+ rb_u_string_getbyte(VALUE self, VALUE rbindex)
10
+ {
11
+ const struct rb_u_string *string = RVAL2USTRING(self);
12
+ long index = NUM2LONG(rbindex);
13
+
14
+ if (index < 0) /* a negative index counts back from the end */
15
+ index += USTRING_LENGTH(string);
16
+
17
+ if (index < 0 || USTRING_LENGTH(string) <= index) /* still negative, or at/after the end: out of range */
18
+ return Qnil;
19
+
20
+ return INT2FIX((unsigned char)USTRING_STR(string)[index]); /* cast so the byte is returned as a non-negative value */
21
+ }
@@ -0,0 +1,21 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload graph?
4
+ *
5
+ * Returns true if the receiver contains only non-space “printable” characters.
6
+ *
7
+ * Non-space “printable” characters are those not in the general categories
8
+ * Other or Separator, space (Zs):
9
+ *
10
+ * * Other, control (Cc)
11
+ * * Other, format (Cf)
12
+ * * Other, not assigned (Cn)
13
+ * * Other, surrogate (Cs)
14
+ * * Separator, space (Zs)
15
+ *
16
+ * @return [Boolean] */
17
+ VALUE
18
+ rb_u_string_graph(VALUE self)
19
+ {
20
+ return _rb_u_character_test(self, u_char_isgraph); /* presumably applies u_char_isgraph to every character - TODO confirm in _rb_u_character_test */
21
+ }
@@ -0,0 +1,61 @@
1
+ #include "rb_includes.h"
2
+
3
+ #define BREAK2ID(value, symbol) /* expands to one switch case that returns the Symbol :symbol */ \
4
+ case U_GRAPHEME_BREAK_##value: { \
5
+ static ID id_##symbol; /* static, so the interned ID is kept between calls */ \
6
+ if (id_##symbol == 0) \
7
+ id_##symbol = rb_intern(#symbol); \
8
+ return ID2SYM(id_##symbol); \
9
+ }
10
+
11
+ static VALUE
12
+ break_to_symbol(enum u_grapheme_break value) /* maps a grapheme break value to its Ruby Symbol */
13
+ {
14
+ switch (value) {
15
+ BREAK2ID(CONTROL, control)
16
+ BREAK2ID(CR, cr)
17
+ BREAK2ID(EXTEND, extend)
18
+ BREAK2ID(L, l)
19
+ BREAK2ID(LF, lf)
20
+ BREAK2ID(LV, lv)
21
+ BREAK2ID(LVT, lvt)
22
+ BREAK2ID(OTHER, other)
23
+ BREAK2ID(PREPEND, prepend)
24
+ BREAK2ID(REGIONAL_INDICATOR, regional_indicator)
25
+ BREAK2ID(SPACINGMARK, spacingmark)
26
+ BREAK2ID(T, t)
27
+ BREAK2ID(V, v)
28
+ default: /* reached only when no BREAK2ID case above matched */
29
+ rb_u_raise(rb_eNotImpError, "unknown grapheme break: %d", value);
30
+ }
31
+ }
32
+
33
+ /* Returns the grapheme break property value of the characters of the receiver.
34
+ *
35
+ * The possible break values are
36
+ *
37
+ * * :control
38
+ * * :cr
39
+ * * :extend
40
+ * * :l
41
+ * * :lf
42
+ * * :lv
43
+ * * :lvt
44
+ * * :other
45
+ * * :prepend
46
+ * * :regional_indicator
47
+ * * :spacingmark
48
+ * * :t
49
+ * * :v
50
+ *
51
+ * @raise [ArgumentError] If the string consists of more than one break type
52
+ * @return [Symbol]
53
+ * @see http://www.unicode.org/reports/tr29/
54
+ * Unicode Standard Annex #29: Unicode Text Segmentation */
55
+ VALUE
56
+ rb_u_string_grapheme_break(VALUE self)
57
+ { /* delegates to _rb_u_string_property with the per-character lookup and the Symbol converter */
58
+ return _rb_u_string_property(self, "grapheme break", U_GRAPHEME_BREAK_OTHER, /* presumably the value used for an empty receiver - TODO confirm in _rb_u_string_property */
59
+ (int (*)(uint32_t))u_char_grapheme_break, /* NOTE(review): both callbacks are passed through cast function-pointer types (enum vs. int) - confirm this is safe on all targets */
60
+ (VALUE (*)(int))break_to_symbol);
61
+ }
@@ -0,0 +1,164 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ /* @overload gsub(pattern, replacement)
5
+ *
6
+ * Returns the receiver with all matches of PATTERN replaced by REPLACEMENT,
7
+ * inheriting any taint and untrust from the receiver and from REPLACEMENT.
8
+ *
9
+ * The REPLACEMENT is used as a specification for what to replace matches
10
+ * with:
11
+ *
12
+ * <table>
13
+ * <thead>
14
+ * <tr><th>Specification</th><th>Replacement</th></tr>
15
+ * </thead>
16
+ * <tbody>
17
+ * <tr>
18
+ * <td><code>\1</code>, <code>\2</code>, …, <code>\</code><em>n</em></td>
19
+ * <td>Numbered sub-match <em>n</em></td>
20
+ * </tr>
21
+ * <tr>
22
+ * <td><code>\k&lt;</code><em>name</em><code>></code></td>
23
+ * <td>Named sub-match <em>name</em></td>
24
+ * </tr>
25
+ * </tbody>
26
+ * </table>
27
+ *
28
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
29
+ * `$`_n_ are updated accordingly.
30
+ *
31
+ * @param [Regexp, #to_str] pattern
32
+ * @param [#to_str] replacement
33
+ * @return [U::String]
34
+ *
35
+ * @overload gsub(pattern, replacements)
36
+ *
37
+ * Returns the receiver with all matches of PATTERN replaced by
38
+ * REPLACEMENTS#[_match_], where _match_ is the matched substring, inheriting
39
+ * any taint and untrust from the receiver and from the
40
+ * REPLACEMENTS#[_match_]es, as well as any taint on REPLACEMENTS.
41
+ *
42
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
43
+ * `$`_n_ are updated accordingly.
44
+ *
45
+ * @param [Regexp, #to_str] pattern
46
+ * @param [#to_hash] replacements
47
+ * @raise [RuntimeError] If any replacement is the result being constructed
48
+ * @raise [Exception] Any error raised by REPLACEMENTS#default, if it gets
49
+ * called
50
+ * @return [U::String]
51
+ *
52
+ * @overload gsub(pattern){ |match| … }
53
+ *
54
+ * Returns the receiver with all matches of PATTERN replaced by the results
55
+ * of the given block, inheriting any taint and untrust from the receiver and
56
+ * from the results of the given block.
57
+ *
58
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
59
+ * `$`_n_ are updated accordingly.
60
+ *
61
+ * @param [Regexp, #to_str] pattern
62
+ * @yieldparam [U::String] match
63
+ * @yieldreturn [#to_str]
64
+ * @return [U::String]
65
+ *
66
+ * @overload gsub(pattern)
67
+ *
68
+ * Returns an Enumerator over the matches of PATTERN in the receiver.
69
+ *
70
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
71
+ * `$`_n_ will be updated accordingly.
72
+ *
73
+ * @param [Regexp, #to_str] pattern
74
+ * @return [Enumerator] */
75
+ VALUE
76
+ rb_u_string_gsub(int argc, VALUE *argv, VALUE self)
77
+ {
78
+ VALUE pattern, replacement;
79
+ VALUE replacements = Qnil; /* non-nil only when the second argument converts to a Hash */
80
+ bool use_block = false;
81
+ bool tainted = false;
82
+
83
+ if (argc == 1) { /* pattern only: block form, or an Enumerator when no block was given */
84
+ RETURN_ENUMERATOR(self, argc, argv);
85
+ use_block = true;
86
+ }
87
+
88
+ if (rb_scan_args(argc, argv, "11", &pattern, &replacement) == 2) {
89
+ replacements = rb_check_convert_type(replacement, T_HASH,
90
+ "Hash", "to_hash");
91
+ if (NIL_P(replacements)) /* not a Hash, so the replacement must be string-like */
92
+ StringValue(replacement);
93
+ if (OBJ_TAINTED(replacement))
94
+ tainted = true;
95
+ }
96
+
97
+ pattern = rb_u_pattern_argument(pattern, true);
98
+
99
+ VALUE str = rb_str_to_str(self);
100
+ long begin = rb_reg_search(pattern, str, 0, 0);
101
+ if (begin < 0) /* no match at all: the receiver itself is returned */
102
+ return self;
103
+
104
+ const char *base = RSTRING_PTR(str);
105
+ const char *p = base; /* start of the text not yet copied to the result */
106
+ const char *end = RSTRING_END(str);
107
+ VALUE substituted = rb_u_str_buf_new(RSTRING_LEN(str) + 30); /* source length plus some headroom */
108
+ do {
109
+ VALUE match = rb_backref_get();
110
+ struct re_registers *registers = RMATCH_REGS(match);
111
+ VALUE result;
112
+
113
+ if (use_block || !NIL_P(replacements)) {
114
+ if (use_block) {
115
+ VALUE ustr = rb_u_string_new_rb(rb_reg_nth_match(0, match));
116
+ result = rb_u_string_object_as_string(rb_yield(ustr));
117
+ } else { /* Hash form: look the matched text up in REPLACEMENTS */
118
+ VALUE ustr = rb_u_string_new_c(self,
119
+ base + registers->beg[0],
120
+ registers->end[0] - registers->beg[0]);
121
+ result = rb_u_string_object_as_string(rb_hash_aref(replacements, ustr));
122
+ }
123
+
124
+ if (result == substituted) /* the replacement must not be the buffer being built */
125
+ rb_u_raise(rb_eRuntimeError,
126
+ "result of block is string being built; please try not to cheat");
127
+ } else
128
+ result =
129
+ #ifdef HAVE_RB_REG_REGSUB4
130
+ rb_reg_regsub(replacement, str, registers, pattern);
131
+ #else
132
+ rb_reg_regsub(replacement, str, registers);
133
+ #endif
134
+
135
+ if (OBJ_TAINTED(result))
136
+ tainted = true;
137
+
138
+ const struct rb_u_string *value = RVAL2USTRING_ANY(result);
139
+
140
+ rb_str_buf_cat(substituted, p, registers->beg[0] - (p - base)); /* text between the previous match and this one */
141
+ rb_str_buf_cat(substituted, USTRING_STR(value), USTRING_LENGTH(value)); /* then the replacement */
142
+ OBJ_INFECT(substituted, result);
143
+
144
+ p = base + registers->end[0];
145
+ if (registers->beg[0] == registers->end[0]) /* empty match: step past one character */
146
+ p = u_next(p); /* NOTE(review): the skipped character is not copied here, and the next search below restarts at end[0], not at p - confirm empty-match handling */
147
+ if (p >= end)
148
+ break;
149
+
150
+ begin = rb_reg_search(pattern, str, registers->end[0], 0);
151
+ } while (begin >= 0);
152
+
153
+ if (p < end) /* copy whatever follows the last match */
154
+ rb_str_buf_cat(substituted, p, end - p);
155
+
156
+ rb_reg_search(pattern, str, end - p, 0); /* NOTE(review): result is unused, so this presumably only refreshes the match variables; the start offset is end - p - confirm it is the intended position */
157
+
158
+ RBASIC(substituted)->klass = rb_obj_class(str);
159
+ OBJ_INFECT(substituted, str);
160
+ if (tainted)
161
+ OBJ_TAINT(substituted);
162
+
163
+ return rb_u_string_new_rb(substituted);
164
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return [Fixnum] The hash value of the receiver’s content */
4
+ VALUE
5
+ rb_u_string_hash(VALUE self)
6
+ {
7
+ const struct rb_u_string *string = RVAL2USTRING(self);
8
+
9
+ return INT2FIX(rb_memhash(USTRING_STR(string), USTRING_LENGTH(string))); /* hash is computed over the receiver's raw content only; NOTE(review): confirm INT2FIX is right for rb_memhash's return type */
10
+ }
@@ -0,0 +1,9 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_to_inum.h"
3
+
4
+ /* @return [Integer] The result of {#to_i}(16) */
5
+ VALUE
6
+ rb_u_string_hex(VALUE self)
7
+ {
8
+ return rb_u_string_to_inum(self, 16, false); /* base 16; the false flag presumably selects lenient parsing - TODO confirm in rb_u_string_to_inum */
9
+ }
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload include?(substring)
4
+ * @param [#to_str] substring
5
+ * @return [Boolean] True if {#index}(SUBSTRING) ≠ nil */
6
+ VALUE
7
+ rb_u_string_include(VALUE self, VALUE substring)
8
+ {
9
+ return rb_u_string_index(self, substring, 0) != -1 ? Qtrue : Qfalse; /* searches from offset 0; rb_u_string_index returns -1 when nothing is found */
10
+ }
@@ -0,0 +1,110 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ /* TODO: Return VALUE.
+ *
+ * Searches the receiver for REGEX starting at the position BEGIN points to
+ * (backwards if REVERSE) and returns the offset of the match as computed by
+ * u_pointer_to_offset, or -1 if rb_reg_search reports no match. */
5
+ long
6
+ rb_u_string_index_regexp(VALUE self, const char *begin, VALUE regex, bool reverse)
7
+ {
8
+ const struct rb_u_string *string = RVAL2USTRING(self);
9
+ VALUE rbstring = rb_str_to_str(self);
10
+
11
+ const char *base = USTRING_STR(string);
12
+
13
+ long index = rb_reg_search(regex, rbstring,
14
+ rb_reg_adjust_startpos(regex, rbstring,
15
+ begin - base, /* start position as a byte distance from the start of the content */
16
+ reverse),
17
+ reverse);
18
+ if (index == -1)
19
+ return -1;
20
+
21
+ return u_pointer_to_offset(base, base + index); /* presumably converts the byte index to a character offset - TODO confirm */
22
+ }
23
+
24
+ /* TODO: Return VALUE.
+ *
+ * Looks for the content of RBSUBSTRING in the receiver, starting at the
+ * position rb_u_string_begin_from_offset yields for OFFSET, and returns the
+ * offset of the first occurrence, or -1 if there is none or OFFSET yields no
+ * position. */
25
+ long
26
+ rb_u_string_index(VALUE self, VALUE rbsubstring, long offset)
27
+ {
28
+ const struct rb_u_string *string = RVAL2USTRING(self);
29
+ const struct rb_u_string *substring = RVAL2USTRING_ANY(rbsubstring);
30
+
31
+ const char *begin = rb_u_string_begin_from_offset(string, offset);
32
+ if (begin == NULL)
33
+ return -1;
34
+
35
+ const char *end = USTRING_END(string);
36
+
37
+ long substring_length = USTRING_LENGTH(substring);
38
+ if (end - begin < substring_length) /* the remaining content is shorter than the substring */
39
+ return -1;
40
+ if (substring_length == 0) /* an empty substring matches right at the starting position */
41
+ return offset; /* NOTE(review): a negative OFFSET is returned unnormalised here, which callers testing for -1 or < 0 read as "not found" - confirm */
42
+
43
+ /* TODO: Should we really be using rb_memsearch? Why not something
44
+ * more Unicodey? */
45
+ long index = rb_u_memsearch(USTRING_STR(substring), substring_length,
46
+ begin,
47
+ end - begin);
48
+ if (index < 0)
49
+ return -1;
50
+
51
+ return offset + u_pointer_to_offset(begin, begin + index); /* NOTE(review): adds the raw OFFSET, so a negative OFFSET is not normalised before the addition - confirm */
52
+ }
53
+
54
+ /* @overload index(pattern, offset = 0)
55
+ *
56
+ * Returns the minimal index of the receiver where PATTERN matches, equal to or
57
+ * greater than _i_, where _i_ = OFFSET if OFFSET ≥ 0, _i_ = {#length} -
58
+ * abs(OFFSET) otherwise, or nil if there is no match.
59
+ *
60
+ * If PATTERN is a Regexp, the Regexp special variables `$&`, `$'`,
61
+ * <code>$\`</code>, `$1`, `$2`, …, `$`_n_ are updated accordingly.
62
+ *
63
+ * If PATTERN responds to #to_str, the matching is performed by byte
64
+ * comparison.
65
+ *
66
+ * @param [Regexp, #to_str] pattern
67
+ * @param [#to_int] offset
68
+ * @return [Integer, nil]
69
+ * @see #rindex */
70
+ VALUE
71
+ rb_u_string_index_m(int argc, VALUE *argv, VALUE self)
72
+ {
73
+ VALUE sub, rboffset;
74
+ long offset = 0; /* used when no offset argument is given */
75
+ if (rb_scan_args(argc, argv, "11", &sub, &rboffset) == 2)
76
+ offset = NUM2LONG(rboffset);
77
+
78
+ const struct rb_u_string *string = RVAL2USTRING(self);
79
+
80
+ const char *begin = rb_u_string_begin_from_offset(string, offset);
81
+ if (begin == NULL) { /* no starting position for this offset: no match */
82
+ if (TYPE(sub) == T_REGEXP) /* for a Regexp, also clear the last-match data */
83
+ rb_backref_set(Qnil);
84
+
85
+ return Qnil;
86
+ }
87
+
88
+ switch (TYPE(sub)) {
89
+ case T_REGEXP:
90
+ offset = rb_u_string_index_regexp(self, begin, sub, false);
91
+ break;
92
+ default: { /* neither Regexp nor String: try an implicit string conversion first */
93
+ VALUE tmp = rb_check_string_type(sub);
94
+ if (NIL_P(tmp))
95
+ rb_u_raise(rb_eTypeError, "type mismatch: %s given",
96
+ rb_obj_classname(sub));
97
+
98
+ sub = tmp;
99
+ }
100
+ /* fall through */
101
+ case T_STRING:
102
+ offset = rb_u_string_index(self, sub, offset);
103
+ break;
104
+ }
105
+
106
+ if (offset < 0) /* both helpers return -1 when nothing was found */
107
+ return Qnil;
108
+
109
+ return LONG2NUM(offset);
110
+ }