u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316)
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,10 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload digit?
4
+ * @return [Boolean] True if the receiver contains only characters in the
5
+ * general category Number, decimal digit (Nd) */
6
+ VALUE
7
+ rb_u_string_digit(VALUE self)
8
+ {
9
+ return _rb_u_character_test(self, u_char_isdigit);
10
+ }
@@ -0,0 +1,13 @@
1
+ #include "rb_includes.h"
2
+
3
+ /* @overload downcase(locale = ENV['LC_CTYPE'])
4
+ * @param [#to_str] locale
5
+ * @return [U::String] The downcasing of the receiver according to the rules
6
+ * of the language of LOCALE, which may be empty to specifically use the
7
+ * default, language-independent, rules, inheriting any taint and
8
+ * untrust */
9
+ VALUE
10
+ rb_u_string_downcase(int argc, VALUE *argv, VALUE self)
11
+ {
12
+ return _rb_u_string_convert_locale(argc, argv, self, u_downcase, NULL);
13
+ }
@@ -0,0 +1,153 @@
1
+ #include <ctype.h>
2
+
3
+ #include "rb_includes.h"
4
+ #include "rb_u_buffer.h"
5
+
6
+ static inline bool
7
+ rb_u_string_dump_escape(VALUE buffer, unsigned char c)
8
+ {
9
+ const char *escape = NULL;
10
+
11
+ switch (c) {
12
+ case '"': escape = "\\\""; break;
13
+ case '\\': escape = "\\\\"; break;
14
+ case '\n': escape = "\\n"; break;
15
+ case '\r': escape = "\\r"; break;
16
+ case '\t': escape = "\\t"; break;
17
+ case '\f': escape = "\\f"; break;
18
+ case '\013': escape = "\\v"; break;
19
+ case '\010': escape = "\\b"; break;
20
+ case '\007': escape = "\\a"; break;
21
+ case '\033': escape = "\\e"; break;
22
+ default:
23
+ return false;
24
+ }
25
+
26
+ rb_u_buffer_append(buffer, escape, 2);
27
+
28
+ return true;
29
+ }
30
+
31
+ #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
32
+
33
+ static inline bool
34
+ rb_u_string_dump_hash(VALUE buffer, unsigned char c, const char *p, const char *end)
35
+ {
36
+ if (c != '#' || !IS_EVSTR(p + 1, end))
37
+ return false;
38
+
39
+ rb_u_buffer_append(buffer, "\\#", 2);
40
+
41
+ return true;
42
+ }
43
+
44
+ static inline bool
45
+ rb_u_string_dump_ascii_printable(VALUE buffer, unsigned char c)
46
+ {
47
+ if (c > 0x7f || !u_char_isprint(c))
48
+ return false;
49
+
50
+ rb_u_buffer_append_char(buffer, c);
51
+
52
+ return true;
53
+ }
54
+
55
+ #define REPLACEMENT_CHARACTER ((uint32_t)0xfffd)
56
+
57
+ static inline void
58
+ rb_u_string_dump_hex(VALUE buffer, unsigned char c)
59
+ {
60
+ char escaped[4 + 1];
61
+ int length = snprintf(escaped, sizeof(escaped), "\\x%02X", c);
62
+ rb_u_buffer_append(buffer, escaped, length);
63
+ }
64
+
65
+ static inline bool
66
+ rb_u_string_dump_codepoint(VALUE buffer, const char **p, const char *end)
67
+ {
68
+ const char *q;
69
+ uint32_t c = u_decode(&q, *p, end);
70
+ if (c == REPLACEMENT_CHARACTER && !u_valid(*p, q - *p, NULL)) {
71
+ for (const char *r = *p; r < q; r++)
72
+ rb_u_string_dump_hex(buffer, (unsigned char)*r);
73
+ /* -1, since we increase p inside the loop. */
74
+ *p = q - 1;
75
+ return true;
76
+ }
77
+ /* -1, since we increase p inside the loop. */
78
+ *p = q - 1;
79
+ char escaped[3 + sizeof(c) * CHAR_BIT + 2 + 1];
80
+ int length = snprintf(escaped, sizeof(escaped), "\\u{%x}", c);
81
+ rb_u_buffer_append(buffer, escaped, length);
82
+ return true;
83
+ }
84
+
85
+ /* Returns the receiver in a reader-friendly format, inheriting any taint and
86
+ * untrust.
87
+ *
88
+ * The reader-friendly format looks like “`"…".u`”. Inside the “…”, any
89
+ * {#print?} characters in the ASCII range are output as-is, the following
90
+ * special characters are escaped according to the following table:
91
+ *
92
+ * <table>
93
+ * <thead><tr><th>Character</th><th>Dumped Sequence</th></tr></thead>
94
+ * <tbody>
95
+ * <tr><td>U+0022 QUOTATION MARK</td><td><code>\"</code></td></tr>
96
+ * <tr><td>U+005C REVERSE SOLIDUS</td><td><code>\\</code></td></tr>
97
+ * <tr><td>U+000A LINE FEED (LF)</td><td><code>\n</code></td></tr>
98
+ * <tr><td>U+000D CARRIAGE RETURN (CR)</td><td><code>\r</code></td></tr>
99
+ * <tr><td>U+0009 CHARACTER TABULATION</td><td><code>\t</code></td></tr>
100
+ * <tr><td>U+000C FORM FEED (FF)</td><td><code>\f</code></td></tr>
101
+ * <tr><td>U+000B LINE TABULATION</td><td><code>\v</code></td></tr>
102
+ * <tr><td>U+0008 BACKSPACE</td><td><code>\b</code></td></tr>
103
+ * <tr><td>U+0007 BELL</td><td><code>\a</code></td></tr>
104
+ * <tr><td>U+001B ESCAPE</td><td><code>\e</code></td></tr>
105
+ * </tbody>
106
+ * </table>
107
+ *
108
+ * the following special sequences are also escaped:
109
+ *
110
+ * <table>
111
+ * <thead><tr><th>Character</th><th>Dumped Sequence</th></tr></thead>
112
+ * <tbody>
113
+ * <tr><td><code>#$</code></td><td><code>\#$</code></td></tr>
114
+ * <tr><td><code>#@</code></td><td><code>\#@</code></td></tr>
115
+ * <tr><td><code>#{</code></td><td><code>\#{</code></td></tr>
116
+ * </tbody>
117
+ * </table>
118
+ *
119
+ * any valid UTF-8 byte sequences are output as “`\u{`_n_`}`”, where _n_ is the
120
+ * lowercase hexadecimal representation of the code point encoded by the UTF-8
121
+ * sequence, and any other byte is output as “`\x`_n_”, where _n_ is the
122
+ * two-digit uppercase hexadecimal representation of the byte’s value.
123
+ *
124
+ * @return [U::String] */
125
+ VALUE
126
+ rb_u_string_dump(VALUE self)
127
+ {
128
+ const struct rb_u_string *string = RVAL2USTRING(self);
129
+ const char *p = USTRING_STR(string);
130
+ const char *end = USTRING_END(string);
131
+
132
+ VALUE buffer = rb_u_buffer_new_sized(7);
133
+
134
+ rb_u_buffer_append(buffer, "\"", 1);
135
+ while (p < end) {
136
+ unsigned char c = *p;
137
+
138
+ if (!rb_u_string_dump_escape(buffer, c) &&
139
+ !rb_u_string_dump_hash(buffer, c, p, end) &&
140
+ !rb_u_string_dump_ascii_printable(buffer, c) &&
141
+ !rb_u_string_dump_codepoint(buffer, &p, end))
142
+ rb_u_string_dump_hex(buffer, c);
143
+
144
+ p++;
145
+ }
146
+ rb_u_buffer_append(buffer, "\".u", 3);
147
+
148
+ VALUE result = rb_u_buffer_to_u_bang(buffer);
149
+
150
+ OBJ_INFECT(result, self);
151
+
152
+ return result;
153
+ }
@@ -0,0 +1,46 @@
1
+ #include "rb_includes.h"
2
+ #include "yield.h"
3
+
4
+ static void
5
+ each(VALUE self, struct yield *yield)
6
+ {
7
+ const struct rb_u_string *string = RVAL2USTRING(self);
8
+ const char *end = USTRING_END(string);
9
+ for (const char *p = USTRING_STR(string); p < end; p++)
10
+ yield_call(yield, INT2FIX(*p & 0xff));
11
+ }
12
+
13
+ UNUSED(static VALUE
14
+ size(VALUE self, UNUSED(VALUE args)))
15
+ {
16
+ return LONG2NUM(USTRING_LENGTH(RVAL2USTRING(self)));
17
+ }
18
+
19
+ /* @overload each_byte{ |byte| … }
20
+ *
21
+ * Enumerates the bytes in the receiver.
22
+ *
23
+ * @yieldparam [Fixnum] byte
24
+ * @return [self]
25
+ *
26
+ * @overload each_byte
27
+ *
28
+ * @return [Enumerator] An Enumerator over the bytes in the receiver
29
+ */
30
+ VALUE
31
+ rb_u_string_each_byte(VALUE self)
32
+ {
33
+ RETURN_SIZED_ENUMERATOR(self, 0, NULL, size);
34
+ struct yield y = YIELD_INIT;
35
+ each(self, &y);
36
+ return self;
37
+ }
38
+
39
+ /* @return [Array<Fixnum>] The bytes of the receiver. */
40
+ VALUE
41
+ rb_u_string_bytes(VALUE self)
42
+ {
43
+ struct yield_array y = YIELD_ARRAY_INIT;
44
+ each(self, &y.yield);
45
+ return y.array;
46
+ }
@@ -0,0 +1,49 @@
1
+ #include "rb_includes.h"
2
+ #include "yield.h"
3
+
4
+ static void
5
+ each(VALUE self, struct yield *yield)
6
+ {
7
+ const struct rb_u_string *s = RVAL2USTRING(self);
8
+ for (const char *p = USTRING_STR(s), *q, *end = USTRING_END(s); p < end; p = q) {
9
+ u_decode(&q, p, end);
10
+ yield_call(yield, rb_u_string_new_c(self, p, q - p));
11
+ }
12
+ }
13
+
14
+ UNUSED(static VALUE
15
+ size(VALUE self, UNUSED(VALUE args)))
16
+ {
17
+ const struct rb_u_string *string = RVAL2USTRING(self);
18
+ return UINT2NUM(u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string)));
19
+ }
20
+
21
+ /* @overload each_char{ |char| … }
22
+ *
23
+ * Enumerates the characters in the receiver, each inheriting any taint and
24
+ * untrust.
25
+ *
26
+ * @yieldparam [U::String] char
27
+ * @return [self]
28
+ *
29
+ * @overload each_char
30
+ *
31
+ * @return [Enumerator] An Enumerator over the characters in the receiver */
32
+ VALUE
33
+ rb_u_string_each_char(VALUE self)
34
+ {
35
+ RETURN_SIZED_ENUMERATOR(self, 0, NULL, size);
36
+ struct yield y = YIELD_INIT;
37
+ each(self, &y);
38
+ return self;
39
+ }
40
+
41
+ /* @return [Array<U::String>] The characters of the receiver, each inheriting
42
+ * any taint and untrust. */
43
+ VALUE
44
+ rb_u_string_chars(VALUE self)
45
+ {
46
+ struct yield_array y = YIELD_ARRAY_INIT;
47
+ each(self, &y.yield);
48
+ return y.array;
49
+ }
@@ -0,0 +1,45 @@
1
+ #include "rb_includes.h"
2
+ #include "yield.h"
3
+
4
+ static void
5
+ each(VALUE self, struct yield *yield)
6
+ {
7
+ const struct rb_u_string *s = RVAL2USTRING(self);
8
+ for (const char *p = USTRING_STR(s), *end = USTRING_END(s); p < end; )
9
+ yield_call(yield, UINT2NUM(u_decode(&p, p, end)));
10
+ }
11
+
12
+ UNUSED(static VALUE
13
+ size(VALUE self, UNUSED(VALUE args)))
14
+ {
15
+ const struct rb_u_string *string = RVAL2USTRING(self);
16
+ return UINT2NUM(u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string)));
17
+ }
18
+
19
+ /* @overload each_codepoint{ |codepoint| … }
20
+ *
21
+ * Enumerates the code points of the receiver.
22
+ *
23
+ * @yieldparam [Integer] codepoint
24
+ * @return [self]
25
+ *
26
+ * @overload each_codepoint
27
+ * @return [Enumerator] An Enumerator over the code points of the receiver
28
+ */
29
+ VALUE
30
+ rb_u_string_each_codepoint(VALUE self)
31
+ {
32
+ RETURN_SIZED_ENUMERATOR(self, 0, NULL, size);
33
+ struct yield y = YIELD_INIT;
34
+ each(self, &y);
35
+ return self;
36
+ }
37
+
38
+ /* @return [Array<Integer>] The code points of the receiver. */
39
+ VALUE
40
+ rb_u_string_codepoints(VALUE self)
41
+ {
42
+ struct yield_array y = YIELD_ARRAY_INIT;
43
+ each(self, &y.yield);
44
+ return y.array;
45
+ }
@@ -0,0 +1,36 @@
1
+ #include "rb_includes.h"
2
+
3
+ static void
4
+ each(const char *p, size_t n, VALUE *self)
5
+ {
6
+ rb_yield(rb_u_string_new_c(*self, p, n));
7
+ }
8
+
9
+ /* @overload each_grapheme_cluster{ |cluster| … }
10
+ *
11
+ * Enumerates the grapheme clusters in the receiver, each inheriting any
12
+ * taint and untrust.
13
+ *
14
+ * @yieldparam [U::String] cluster
15
+ * @return [self]
16
+ * @see http://www.unicode.org/reports/tr29/
17
+ * Unicode Standard Annex #29: Unicode Text Segmentation
18
+ *
19
+ * @overload each_grapheme_cluster
20
+ *
21
+ * @return [Enumerator] An Enumerator over the grapheme clusters in the
22
+ * receiver
23
+ * @see http://www.unicode.org/reports/tr29/
24
+ * Unicode Standard Annex #29: Unicode Text Segmentation */
25
+ VALUE
26
+ rb_u_string_each_grapheme_cluster(VALUE self)
27
+ {
28
+ RETURN_ENUMERATOR(self, 0, NULL);
29
+
30
+ const struct rb_u_string *string = RVAL2USTRING(self);
31
+ const char *p = USTRING_STR(string);
32
+ const char *end = USTRING_END(string);
33
+ size_t length = end - p;
34
+ u_grapheme_clusters(p, length, (u_substring_fn)each, &self);
35
+ return self;
36
+ }
@@ -0,0 +1,142 @@
1
+ #include "rb_includes.h"
2
+ #include "yield.h"
3
+
4
+ static void
5
+ rb_u_string_each_line_default(VALUE self, struct yield *yield)
6
+ {
7
+ const struct rb_u_string *string = RVAL2USTRING(self);
8
+
9
+ const char *begin = USTRING_STR(string);
10
+ const char *base = begin;
11
+ const char *p = begin;
12
+ const char *end = USTRING_END(string);
13
+
14
+ while (p < end) {
15
+ p = memchr(p, '\n', end - p);
16
+ if (p == NULL)
17
+ break;
18
+ p++;
19
+
20
+ yield_call(yield, rb_u_string_new_c(self, base, p - base));
21
+
22
+ base = p;
23
+ }
24
+
25
+ if (base != end)
26
+ yield_call(yield, rb_u_string_new_c(self, base, end - base));
27
+ }
28
+
29
+ static void
30
+ rb_u_string_each_line_separator(VALUE self, const struct rb_u_string *separator,
31
+ struct yield *yield)
32
+ {
33
+ const struct rb_u_string *string = RVAL2USTRING(self);
34
+
35
+ long separator_length = USTRING_LENGTH(separator);
36
+ const char *q;
37
+ uint32_t first = separator_length == 0 ?
38
+ '\n' :
39
+ u_decode(&q, USTRING_STR(separator), USTRING_END(separator));
40
+
41
+ const char *begin = USTRING_STR(string);
42
+ const char *base = begin;
43
+ const char *p = begin;
44
+ const char *end = USTRING_END(string);
45
+
46
+ while (p < end) {
47
+ uint32_t c = u_decode(&q, p, end);
48
+ again:
49
+ if (separator_length == 0 && c == first) {
50
+ p = q;
51
+ if (p < end) {
52
+ c = u_decode(&q, p, end);
53
+ if (c != first)
54
+ goto again;
55
+ }
56
+ while (p < end) {
57
+ if (u_decode(&q, p, end) != first)
58
+ break;
59
+ p = q;
60
+ }
61
+ }
62
+
63
+ if (c == first &&
64
+ (separator_length < 2 ||
65
+ (end - p >= separator_length &&
66
+ memcmp(USTRING_STR(separator), p, separator_length) == 0))) {
67
+ p += separator_length;
68
+ yield_call(yield, rb_u_string_new_c(self, base, p - base));
69
+ base = p;
70
+ } else
71
+ p = q;
72
+ }
73
+
74
+ if (base != end)
75
+ yield_call(yield, rb_u_string_new_c(self, base, end - base));
76
+ }
77
+
78
+ static void
79
+ each(int argc, VALUE *argv, VALUE self, struct yield *yield)
80
+ {
81
+ VALUE rs;
82
+ if (argc == 0)
83
+ rs = rb_rs;
84
+ else
85
+ rb_scan_args(argc, argv, "01", &rs);
86
+ if (NIL_P(rs)) {
87
+ yield_call(yield, self);
88
+ return;
89
+ }
90
+ const struct rb_u_string *separator = RVAL2USTRING_ANY(rs);
91
+ if (rs == rb_default_rs)
92
+ rb_u_string_each_line_default(self, yield);
93
+ else
94
+ rb_u_string_each_line_separator(self, separator, yield);
95
+ }
96
+
97
+ /* @overload each_line(separator = $/){ |lp| … }
98
+ *
99
+ * Enumerates the lines of the receiver, inheriting any taint and untrust.
100
+ *
101
+ * If SEPARATOR is nil, yields self. If SEPARATOR is {#empty?}, separates
102
+ * each line (paragraph) by two or more U+000A LINE FEED characters.
103
+ *
104
+ * @param [U::String, #to_str] separator
105
+ * @yieldparam [U::String, self] lp
106
+ * @return [self]
107
+ *
108
+ * @overload each_line(separator = $/)
109
+ *
110
+ * Returns an Enumerator over the lines of the receiver.
111
+ *
112
+ * If SEPARATOR is nil, self will be yielded. If SEPARATOR is {#empty?},
113
+ * separates each line (paragraph) by two or more U+000A LINE FEED
114
+ * characters.
115
+ *
116
+ * @param [U::String, #to_str] separator
117
+ * @return [Enumerator] */
118
+ VALUE
119
+ rb_u_string_each_line(int argc, VALUE *argv, VALUE self)
120
+ {
121
+ RETURN_ENUMERATOR(self, argc, argv);
122
+ struct yield y = YIELD_INIT;
123
+ each(argc, argv, self, &y);
124
+ return self;
125
+ }
126
+
127
+ /* @overload lines(separator = $/)
128
+ *
129
+ * Returns the lines of the receiver, inheriting any taint and untrust.
130
+ *
131
+ * If SEPARATOR is nil, yields self. If SEPARATOR is {#empty?}, separates
132
+ * each line (paragraph) by two or more U+000A LINE FEED characters.
133
+ *
134
+ * @param [U::String, #to_str] separator
135
+ * @return [Array<U::String>] */
136
+ VALUE
137
+ rb_u_string_lines(int argc, VALUE *argv, VALUE self)
138
+ {
139
+ struct yield_array y = YIELD_ARRAY_INIT;
140
+ each(argc, argv, self, &y.yield);
141
+ return y.array;
142
+ }