u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,13 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload soft_dotted?
4
+ * @return [Boolean] True if this {U::String} only contains soft-dotted
5
+ * characters
6
+ * @note Soft-dotted characters have the soft-dotted property and thus lose
7
+ * their dot if an accent is applied to them, for example, ‘i’ and ‘j’.
8
+ * @see http://unicode.org/review/pr-11.html Unicode Public Review Issue #11 */
9
+ VALUE
10
+ rb_u_string_soft_dotted(VALUE self)
11
+ {
12
+ return _rb_u_character_test(self, u_char_issoftdotted);
13
+ }
@@ -0,0 +1,24 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload space?
4
+ *
5
+ * Returns true if the receiver contains only “space” characters. Space
6
+ * characters are those in the general category Separator:
7
+ *
8
+ * * Separator, space (Zs)
9
+ * * Separator, line (Zl)
10
+ * * Separator, paragraph (Zp)
11
+ *
12
+ * such as ‘ ’, or a control character acting as such, namely
13
+ *
14
+ * * U+0009 CHARACTER TABULATION (HT)
15
+ * * U+000A LINE FEED (LF)
16
+ * * U+000C FORM FEED (FF)
17
+ * * U+000D CARRIAGE RETURN (CR)
18
+ *
19
+ * @return [Boolean] */
20
+ VALUE
21
+ rb_u_string_space(VALUE self)
22
+ {
23
+ return _rb_u_character_test(self, u_char_isspace);
24
+ }
@@ -0,0 +1,245 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ static VALUE
5
+ rb_u_string_split_trim(VALUE result, bool limit_given, int limit)
6
+ {
7
+ if (limit_given || limit != 0)
8
+ return result;
9
+
10
+ long length;
11
+ while ((length = RARRAY_LEN(result)) > 0 &&
12
+ USTRING_LENGTH(RVAL2USTRING(RARRAY_PTR(result)[length - 1])) == 0)
13
+ rb_ary_pop(result);
14
+
15
+ return result;
16
+ }
17
+
18
+ static VALUE
19
+ rb_u_string_split_rest(VALUE self, long offset, bool limit_given, int limit, VALUE result)
20
+ {
21
+ const struct rb_u_string *string = RVAL2USTRING(self);
22
+ long length = USTRING_LENGTH(string);
23
+
24
+ if (length > 0 && (limit_given || length > offset || limit < 0))
25
+ rb_ary_push(result,
26
+ length == offset ?
27
+ rb_u_string_new_empty(self) :
28
+ rb_u_string_new_subsequence(self,
29
+ offset,
30
+ length - offset));
31
+
32
+ return rb_u_string_split_trim(result, limit_given, limit);
33
+ }
34
+
35
+ static VALUE
36
+ rb_u_string_split_awk(VALUE self, bool limit_given, int limit)
37
+ {
38
+ VALUE result = rb_ary_new();
39
+
40
+ const struct rb_u_string *string = RVAL2USTRING(self);
41
+ const char *begin = USTRING_STR(string);
42
+ const char *p = begin;
43
+ const char *end = USTRING_END(string);
44
+ int i = 1;
45
+ while (p < end) {
46
+ const char *q;
47
+ while (p < end && u_char_isspace(u_decode(&q, p, end)))
48
+ p = q;
49
+
50
+ if (p == end || (limit_given && i >= limit))
51
+ break;
52
+ i++;
53
+
54
+ q = p;
55
+ const char *r;
56
+ while (q < end && !u_char_isspace(u_decode(&r, q, end)))
57
+ q = r;
58
+
59
+ rb_ary_push(result,
60
+ rb_u_string_new_subsequence(self,
61
+ p - begin,
62
+ q - p));
63
+ p = q;
64
+ }
65
+
66
+ return rb_u_string_split_rest(self, p - begin, limit_given, limit, result);
67
+ }
68
+
69
+ static VALUE
70
+ rb_u_string_split_string(VALUE self, VALUE rbseparator, bool limit_given, int limit)
71
+ {
72
+ const struct rb_u_string *string = RVAL2USTRING(self);
73
+ const struct rb_u_string *separator = RVAL2USTRING_ANY(rbseparator);
74
+
75
+ const char *begin = USTRING_STR(string);
76
+ const char *p = begin;
77
+ const char *end = USTRING_END(string);
78
+
79
+ const char *s_p = USTRING_STR(separator);
80
+ long s_len = USTRING_LENGTH(separator);
81
+
82
+ rb_u_validate(p, USTRING_LENGTH(string));
83
+ rb_u_validate(s_p, s_len);
84
+
85
+ VALUE result = rb_ary_new();
86
+
87
+ /* TODO: Better variable name. */
88
+ long offset;
89
+ for (int i = 1; (!limit_given || i < limit) && p < end; i++) {
90
+ if ((offset = rb_u_memsearch(s_p, s_len, p, end - p)) < 0)
91
+ break;
92
+ rb_ary_push(result, rb_u_string_new_subsequence(self, p - begin, offset));
93
+ p += offset + s_len;
94
+ }
95
+
96
+ return rb_u_string_split_rest(self, p - begin, limit_given, limit, result);
97
+ }
98
+
99
+ static void
100
+ rb_u_string_split_pattern_push_registers(VALUE self,
101
+ struct re_registers *registers,
102
+ VALUE result)
103
+ {
104
+ for (int i = 1; i < registers->num_regs; i++) {
105
+ if (registers->beg[i] == -1)
106
+ continue;
107
+ rb_ary_push(result,
108
+ registers->beg[i] == registers->end[i] ?
109
+ rb_u_string_new_empty(self) :
110
+ rb_u_string_new_subsequence(self,
111
+ registers->beg[i],
112
+ registers->end[i] - registers->beg[i]));
113
+ }
114
+ }
115
+
116
+ static VALUE
117
+ rb_u_string_split_pattern(VALUE self, VALUE pattern, bool limit_given, int limit)
118
+ {
119
+ VALUE str = rb_str_to_str(self);
120
+
121
+ const char *begin = RSTRING_PTR(str);
122
+ const char *p = begin;
123
+ const char *end = RSTRING_END(str);
124
+
125
+ VALUE result = rb_ary_new();
126
+
127
+ bool last_was_empty = false;
128
+
129
+ long start = 0;
130
+ /* TODO: Better variable name. */
131
+ long offset;
132
+ int i = 1;
133
+ while ((offset = rb_reg_search(pattern, str, start, 0)) >= 0) {
134
+ struct re_registers *registers = RMATCH_REGS(rb_backref_get());
135
+ if (start == offset && registers->beg[0] == registers->end[0]) {
136
+ if (begin == NULL) {
137
+ rb_ary_push(result, rb_u_string_new_empty(self));
138
+ break;
139
+ } else if (last_was_empty) {
140
+ const char *q;
141
+ u_decode(&q, p, end);
142
+ rb_ary_push(result,
143
+ rb_u_string_new_subsequence(self,
144
+ p - begin,
145
+ q - p));
146
+ } else {
147
+ if (begin + start == end)
148
+ start++;
149
+ else {
150
+ const char *q;
151
+ u_decode(&q, p, end);
152
+ start += q - p;
153
+ }
154
+ last_was_empty = true;
155
+ continue;
156
+ }
157
+ } else {
158
+ rb_ary_push(result,
159
+ rb_u_string_new_subsequence(self,
160
+ p - begin,
161
+ offset - (p - begin)));
162
+ start = registers->end[0];
163
+ }
164
+ last_was_empty = false;
165
+ p = begin + start;
166
+
167
+ rb_u_string_split_pattern_push_registers(self, registers, result);
168
+
169
+ i++;
170
+ if (limit_given && i == limit)
171
+ break;
172
+ }
173
+
174
+ return rb_u_string_split_rest(self, p - begin, limit_given, limit, result);
175
+ }
176
+
177
+ /* @overload split(pattern = $;, limit = 0)
178
+ *
179
+ * Returns the receiver split into LIMIT substrings separated by PATTERN,
180
+ * each inheriting any taint and untrust.
181
+ *
182
+ * If PATTERN = `$;` = nil or PATTERN = `' '`, splits according to AWK rules,
183
+ * that is, any {#space?} prefix is skipped, then substrings are separated by
184
+ * non-empty {#space?} substrings.
185
+ *
186
+ * If LIMIT < 0, then no limit is imposed and trailing {#empty?} substrings
187
+ * aren’t removed.
188
+ *
189
+ * If LIMIT = 0, then no limit is imposed and trailing {#empty?} substrings
190
+ * are removed.
191
+ *
192
+ * If LIMIT = 1, then, if {#length} = 0, the result will be empty, otherwise
193
+ * it will consist of the receiver only.
194
+ *
195
+ * If LIMIT > 1, then the receiver is split into at most LIMIT substrings.
196
+ *
197
+ * @param [Regexp, #to_str] pattern
198
+ * @param [#to_int] limit
199
+ * @return [Array<U::String>] */
200
+ VALUE
201
+ rb_u_string_split_m(int argc, VALUE *argv, VALUE self)
202
+ {
203
+ VALUE rbpattern, rblimit;
204
+ int limit = 0;
205
+ bool limit_given;
206
+
207
+ if (rb_scan_args(argc, argv, "02", &rbpattern, &rblimit) == 2)
208
+ limit = NUM2INT(rblimit);
209
+
210
+ const struct rb_u_string *string = RVAL2USTRING(self);
211
+
212
+ if (limit == 1) {
213
+ if (USTRING_LENGTH(string) == 0)
214
+ return rb_ary_new2(0);
215
+
216
+ return rb_ary_new3(1, self);
217
+ }
218
+
219
+ limit_given = !NIL_P(rblimit) && limit >= 0;
220
+
221
+ if (NIL_P(rbpattern) && NIL_P(rb_fs))
222
+ return rb_u_string_split_awk(self, limit_given, limit);
223
+ else if (NIL_P(rbpattern))
224
+ rbpattern = rb_fs;
225
+
226
+ if (TYPE(rbpattern) != T_STRING && !RTEST(rb_obj_is_kind_of(rbpattern, rb_cUString)))
227
+ return rb_u_string_split_pattern(self,
228
+ rb_u_pattern_argument(rbpattern, true),
229
+ limit_given,
230
+ limit);
231
+
232
+ const struct rb_u_string *pattern = RVAL2USTRING_ANY(rbpattern);
233
+ const char *p = USTRING_STR(pattern);
234
+ long length = USTRING_LENGTH(pattern);
235
+
236
+ if (length == 0)
237
+ return rb_u_string_split_pattern(self,
238
+ rb_reg_regcomp(rb_str_to_str(rbpattern)),
239
+ limit_given,
240
+ limit);
241
+ else if (length == 1 && *p == ' ')
242
+ return rb_u_string_split_awk(self, limit_given, limit);
243
+ else
244
+ return rb_u_string_split_string(self, rbpattern, limit_given, limit);
245
+ }
@@ -0,0 +1,75 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_string_internal_tr.h"
3
+
4
+ static long
5
+ rb_u_string_squeeze_loop(const struct rb_u_string *string, struct tr_table *table,
6
+ char *result)
7
+ {
8
+ long count = 0;
9
+
10
+ const char *p = USTRING_STR(string);
11
+ const char *end = USTRING_END(string);
12
+
13
+ uint32_t previous = U_N_CODEPOINTS;
14
+ char *base = result;
15
+ while (p < end) {
16
+ const char *q;
17
+ uint32_t c = u_decode(&q, p, end);
18
+ if (c != previous ||
19
+ (table != NULL && !tr_table_lookup(table, c))) {
20
+ long run = q - p;
21
+ if (base != NULL) {
22
+ memcpy(base, p, run);
23
+ base += run;
24
+ }
25
+ count += run;
26
+ previous = c;
27
+ }
28
+ p = q;
29
+ }
30
+
31
+ return count;
32
+ }
33
+
34
+ /* @overload squeeze(*sets)
35
+ *
36
+ * Returns the receiver, replacing any substrings of {#length} > 1 consisting
37
+ * of the same character _c_ with _c_, where _c_ is a member of the
38
+ * intersection of the character sets in SETS, inheriting any taint and
39
+ * untrust.
40
+ *
41
+ * If SETS is empty, then the set of all Unicode characters is used.
42
+ *
43
+ * The complement of all Unicode characters and a given set of characters may
44
+ * be specified by prefixing a non-empty set with ‘`^`’ (U+005E CIRCUMFLEX
45
+ * ACCENT).
46
+ *
47
+ * Any sequence of characters _a_-_b_ inside a set will expand to also
48
+ * include all characters whose code points lay between those of _a_ and _b_.
49
+ *
50
+ * @param [Array<U::String, #to_str>] sets
51
+ * @return [U::String] */
52
+ VALUE
53
+ rb_u_string_squeeze(int argc, VALUE *argv, VALUE self)
54
+ {
55
+ const struct rb_u_string *string = RVAL2USTRING(self);
56
+
57
+ if (USTRING_LENGTH(string) == 0)
58
+ return Qnil;
59
+
60
+ struct tr_table table;
61
+ if (argc > 0)
62
+ tr_table_initialize_from_strings(&table, argc, argv);
63
+
64
+ struct tr_table *table_pointer = (argc > 0) ? &table : NULL;
65
+
66
+ long count = rb_u_string_squeeze_loop(string, table_pointer, NULL);
67
+ if (count == 0)
68
+ return self;
69
+
70
+ char *remaining = ALLOC_N(char, count + 1);
71
+ rb_u_string_squeeze_loop(string, table_pointer, remaining);
72
+ remaining[count] = '\0';
73
+
74
+ return rb_u_string_new_c_own(self, remaining, count);
75
+ }
@@ -0,0 +1,31 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload start_with?(*prefixes)
4
+ * @param [Array] prefixes
5
+ * @return [Boolean] True if any element of PREFIXES that responds to #to_str
6
+ * is a byte-level prefix of the receiver */
7
+ VALUE
8
+ rb_u_string_start_with(int argc, VALUE *argv, VALUE self)
9
+ {
10
+ const struct rb_u_string *string = RVAL2USTRING(self);
11
+ const char *p = USTRING_STR(string);
12
+ long p_length = USTRING_LENGTH(string);
13
+
14
+ for (int i = 0; i < argc; i++) {
15
+ VALUE tmp = rb_u_string_check_type(argv[i]);
16
+ if (NIL_P(tmp))
17
+ continue;
18
+
19
+ const struct rb_u_string *other = RVAL2USTRING_ANY(tmp);
20
+ const char *q = USTRING_STR(other);
21
+ long q_length = USTRING_LENGTH(other);
22
+
23
+ if (p_length < q_length)
24
+ continue;
25
+
26
+ if (memcmp(p, q, q_length) == 0)
27
+ return Qtrue;
28
+ }
29
+
30
+ return Qfalse;
31
+ }
@@ -0,0 +1,36 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @return [U::String] The receiver with its maximum {#space?} prefix and
4
+ * suffix removed, inheriting any taint and untrust
5
+ * @see #lstrip
6
+ * @see #rstrip */
7
+ VALUE
8
+ rb_u_string_strip(VALUE self)
9
+ {
10
+ const struct rb_u_string *string = RVAL2USTRING(self);
11
+
12
+ const char *begin = USTRING_STR(string);
13
+ if (begin == NULL)
14
+ return self;
15
+
16
+ const char *end = USTRING_END(string);
17
+ const char *s = begin;
18
+ uint32_t c;
19
+ const char *t;
20
+ while (s < end && u_char_isspace(u_decode(&t, s, end)))
21
+ s = t;
22
+
23
+ t = end;
24
+ while (begin < t) {
25
+ const char *p;
26
+ c = u_decode_r(&p, begin, t);
27
+ if (c != '\0' && !u_char_isspace(c))
28
+ break;
29
+ t = p;
30
+ }
31
+
32
+ if (s == begin && t == end)
33
+ return self;
34
+
35
+ return rb_u_string_new_c(self, s, t - s);
36
+ }
@@ -0,0 +1,147 @@
1
+ #include "rb_includes.h"
2
+ #include "rb_u_re.h"
3
+
4
+ /* @overload sub(pattern, replacement)
5
+ *
6
+ * Returns the receiver with the first match of PATTERN replaced by
7
+ * REPLACEMENT, inheriting any taint and untrust from the receiver and from
8
+ * REPLACEMENT, or nil if there’s no match.
9
+ *
10
+ * The REPLACEMENT is used as a specification for what to replace matches
11
+ * with:
12
+ *
13
+ * <table>
14
+ * <thead>
15
+ * <tr><th>Specification</th><th>Replacement</th></tr>
16
+ * </thead>
17
+ * <tbody>
18
+ * <tr>
19
+ * <td><code>\1</code>, <code>\2</code>, …, <code>\</code><em>n</em></td>
20
+ * <td>Numbered sub-match <em>n</em></td>
21
+ * </tr>
22
+ * <tr>
23
+ * <td><code>\k&lt;</code><em>name</em><code>></code></td>
24
+ * <td>Named sub-match <em>name</em></td>
25
+ * </tr>
26
+ * </tbody>
27
+ * </table>
28
+ *
29
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
30
+ * `$`_n_ are updated accordingly.
31
+ *
32
+ * @param [Regexp, #to_str] pattern
33
+ * @param [#to_str] replacement
34
+ * @return [U::String, nil]
35
+ *
36
+ * @overload sub(pattern, replacements)
37
+ *
38
+ * Returns the receiver with the first match of PATTERN replaced by
39
+ * REPLACEMENTS#[_match_], where _match_ is the matched substring, inheriting
40
+ * any taint and untrust from the receiver, REPLACEMENTS, and
41
+ * REPLACEMENTS#[_match_], or nil if there’s no match.
42
+ *
43
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
44
+ * `$`_n_ are updated accordingly.
45
+ *
46
+ * @param [Regexp, #to_str] pattern
47
+ * @param [#to_hash] replacements
48
+ * @raise [Exception] Any error raised by REPLACEMENTS#default, if it gets
49
+ * called
50
+ * @return [U::String, nil]
51
+ *
52
+ * @overload sub(pattern){ |match| … }
53
+ *
54
+ * Returns the receiver with all instances of PATTERN replaced by the results
55
+ * of the given block, inheriting any taint and untrust from the receiver and
56
+ * from the results of the given block, or nil if there’s no match.
57
+ *
58
+ * The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`, `$2`, …,
59
+ * `$`_n_ are updated accordingly.
60
+ *
61
+ * @param [Regexp, #to_str] pattern
62
+ * @yieldparam [U::String] match
63
+ * @yieldreturn [#to_str]
64
+ * @return [U::String, nil] */
65
+ VALUE
66
+ rb_u_string_sub(int argc, VALUE *argv, VALUE self)
67
+ {
68
+ VALUE pattern, replacement;
69
+ VALUE replacements = Qnil;
70
+ bool use_block = false;
71
+ bool tainted = false;
72
+ bool untrusted = false;
73
+
74
+ if (argc == 1)
75
+ use_block = true;
76
+
77
+ if (rb_scan_args(argc, argv, "11", &pattern, &replacement) == 2) {
78
+ replacements = rb_check_convert_type(replacement, T_HASH,
79
+ "Hash", "to_hash");
80
+ if (NIL_P(replacements))
81
+ StringValue(replacement);
82
+ if (OBJ_TAINTED(replacement))
83
+ tainted = true;
84
+ if (OBJ_UNTRUSTED(replacement))
85
+ untrusted = true;
86
+ }
87
+
88
+ pattern = rb_u_pattern_argument(pattern, true);
89
+
90
+ VALUE str = rb_str_to_str(self);
91
+ long begin = rb_reg_search(pattern, str, 0, 0);
92
+ if (begin < 0)
93
+ return Qnil;
94
+
95
+ VALUE match = rb_backref_get();
96
+ struct re_registers *registers = RMATCH_REGS(match);
97
+ VALUE result;
98
+ if (use_block || !NIL_P(replacements)) {
99
+ if (use_block) {
100
+ VALUE ustr = rb_u_string_new_rb(rb_reg_nth_match(0, match));
101
+ result = rb_u_string_object_as_string(rb_yield(ustr));
102
+ } else {
103
+ VALUE ustr = rb_u_string_new_c(self,
104
+ RSTRING_PTR(str) + registers->beg[0],
105
+ registers->end[0] - registers->beg[0]);
106
+ result = rb_u_string_object_as_string(rb_hash_aref(replacements, ustr));
107
+ }
108
+ } else
109
+ result =
110
+ #ifdef HAVE_RB_REG_REGSUB4
111
+ rb_reg_regsub(replacement, str, registers, pattern);
112
+ #else
113
+ rb_reg_regsub(replacement, str, registers);
114
+ #endif
115
+
116
+ if (OBJ_TAINTED(result))
117
+ tainted = true;
118
+ if (OBJ_UNTRUSTED(result))
119
+ untrusted = true;
120
+
121
+ const struct rb_u_string *value = RVAL2USTRING_ANY(result);
122
+
123
+ size_t length = registers->beg[0] +
124
+ USTRING_LENGTH(value) +
125
+ (RSTRING_LEN(str) - registers->end[0]);
126
+ char *base = ALLOC_N(char, length + 1);
127
+ MEMCPY(base,
128
+ RSTRING_PTR(str),
129
+ char,
130
+ registers->beg[0]);
131
+ MEMCPY(base + registers->beg[0],
132
+ USTRING_STR(value),
133
+ char,
134
+ USTRING_LENGTH(value));
135
+ MEMCPY(base + registers->beg[0] + USTRING_LENGTH(value),
136
+ RSTRING_PTR(str) + registers->end[0],
137
+ char,
138
+ RSTRING_LEN(str) - registers->end[0]);
139
+ base[length] = '\0';
140
+
141
+ VALUE substituted = rb_u_string_new_c_own(self, base, length);
142
+ if (tainted)
143
+ OBJ_TAINT(substituted);
144
+ if (untrusted)
145
+ OBJ_UNTRUST(substituted);
146
+ return substituted;
147
+ }