u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,443 @@
1
+ /* -*- coding: utf-8 -*- */
2
+
3
+ #include <ruby.h>
4
+ #include <stdarg.h>
5
+ #include <stdbool.h>
6
+ #include <stddef.h>
7
+ #include <stdint.h>
8
+ #include <limits.h>
9
+ #include "u.h"
10
+ #include "private.h"
11
+ #include "rb_private.h"
12
+ #include "rb_u_buffer.h"
13
+ #include "rb_u_string.h"
14
+
15
+
16
/* Yields the struct rb_u_buffer stored in OBJECT, after Check_Type() has
 * verified that OBJECT is a T_DATA.  OBJECT is evaluated twice. */
#define RVAL2RBUBUFFER(object)                                  \
        (Check_Type(object, T_DATA),                            \
         (struct rb_u_buffer *)DATA_PTR(object))

/* Wraps BUFFER in a new rb_cUBuffer instance with no mark function and
 * rb_u_buffer_free as its free function. */
#define UBUFFER2RVAL(buffer)                                    \
        Data_Wrap_Struct(rb_cUBuffer, NULL, rb_u_buffer_free, buffer)
21
+
22
+
23
/* Growable byte buffer backing U::Buffer. */
struct rb_u_buffer {
        /* Storage for the content; NULL right after a reset. */
        char *c;
        /* Number of bytes of C currently in use. */
        long length;
        /* Number of bytes C was last (re)allocated to hold. */
        long allocated;
        /* Size requested at creation; reapplied whenever the buffer is
         * reset. */
        long initially_allocated;
};
29
+
30
+
31
+ static VALUE rb_cUBuffer;
32
+
33
+
34
+ static void
35
+ rb_u_buffer_free(struct rb_u_buffer *buffer)
36
+ {
37
+ free(buffer->c);
38
+ free(buffer);
39
+ }
40
+
41
/* Returns BASE doubled as many times as needed to reach at least MINIMUM,
 * or BASE itself if it is already large enough.
 *
 * Returns -1 if MINIMUM is negative or so large that doubling it would not
 * fit in a long. */
static inline long
nearest_power(long base, long minimum)
{
        /* Compare against LONG_MAX / 2 instead of computing minimum * 2,
         * which would be signed overflow for large values. */
        if (minimum < 0 || minimum > LONG_MAX / 2)
                return -1;

        long n = base;

        /* A non-positive starting point never grows by doubling; start
         * from 1 when growth is actually required. */
        if (n < 1 && n < minimum)
                n = 1;

        /* n < minimum <= LONG_MAX / 2 here, so n * 2 cannot overflow. */
        while (n < minimum)
                n *= 2;

        return n;
}
53
+
54
+ static void
55
+ u_buffer_maybe_expand(struct rb_u_buffer *buffer, long additional)
56
+ {
57
+ if (buffer->length + additional < buffer->allocated)
58
+ return;
59
+
60
+ long allocate = nearest_power(1, buffer->length + additional);
61
+ if (allocate < 0)
62
+ rb_u_raise(rb_eNoMemError,
63
+ "buffer would be too large: %ld + %ld + 1 > %ld",
64
+ buffer->length, additional, LONG_MAX);
65
+ REALLOC_N(buffer->c, char, allocate);
66
+ buffer->allocated = allocate;
67
+ }
68
+
69
+ static void
70
+ rb_u_buffer_reset(struct rb_u_buffer *buffer)
71
+ {
72
+ buffer->c = NULL;
73
+ buffer->length = 0;
74
+ buffer->allocated = 0;
75
+
76
+ if (buffer->initially_allocated > 0)
77
+ u_buffer_maybe_expand(buffer, buffer->initially_allocated);
78
+ }
79
+
80
+ static VALUE
81
+ rb_u_buffer_create(long size)
82
+ {
83
+ struct rb_u_buffer *buffer = ALLOC(struct rb_u_buffer);
84
+
85
+ buffer->initially_allocated = size;
86
+
87
+ rb_u_buffer_reset(buffer);
88
+
89
+ return UBUFFER2RVAL(buffer);
90
+ }
91
+
92
+ static VALUE
93
+ rb_u_buffer_alloc(UNUSED(VALUE klass))
94
+ {
95
+ return rb_u_buffer_create(0);
96
+ }
97
+
98
+ VALUE
99
+ rb_u_buffer_new(void)
100
+ {
101
+ return rb_u_buffer_create(0);
102
+ }
103
+
104
+ VALUE
105
+ rb_u_buffer_new_sized(long size)
106
+ {
107
+ return rb_u_buffer_create(size);
108
+ }
109
+
110
+ VALUE
111
+ rb_u_buffer_append(VALUE self, const char *str, long length)
112
+ {
113
+ struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
114
+
115
+ u_buffer_maybe_expand(buffer, length);
116
+ memcpy(buffer->c + buffer->length, str, length);
117
+ buffer->length += length;
118
+
119
+ return self;
120
+ }
121
+
122
+ VALUE
123
+ rb_u_buffer_append_char(VALUE self, uint32_t c)
124
+ {
125
+ struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
126
+
127
+ u_buffer_maybe_expand(buffer, U_CHAR_MAX_BYTE_LENGTH);
128
+ buffer->length += u_char_to_u(c, buffer->c + buffer->length);
129
+
130
+ return self;
131
+ }
132
+
133
+ VALUE
134
+ rb_u_buffer_append_char_n(VALUE self, uint32_t c, long n)
135
+ {
136
+ if (n < 1)
137
+ return self;
138
+
139
+ struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
140
+
141
+ if (c < 128) {
142
+ u_buffer_maybe_expand(buffer, n);
143
+ memset(buffer->c + buffer->length, c & 0x7f, n);
144
+ buffer->length += n;
145
+ return self;
146
+ }
147
+
148
+ char buf[U_CHAR_MAX_BYTE_LENGTH];
149
+ int length = u_char_to_u(c, buf);
150
+ u_buffer_maybe_expand(buffer, length * n);
151
+ for (int i = 0; i < n; i++)
152
+ memcpy(buffer->c + buffer->length + length * i, buf, length);
153
+ buffer->length += length * n;
154
+
155
+ return self;
156
+ }
157
+
158
+ #pragma GCC diagnostic ignored "-Wformat-nonliteral"
159
+ VALUE
160
+ rb_u_buffer_append_printf(VALUE self, size_t needed, const char *format, ...)
161
+ {
162
+ struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
163
+
164
+ u_buffer_maybe_expand(buffer, needed);
165
+
166
+ va_list arguments;
167
+ va_start(arguments, format);
168
+ int length = vsnprintf(buffer->c + buffer->length, needed,
169
+ format, arguments);
170
+ va_end(arguments);
171
+
172
+ if (length < 0)
173
+ rb_sys_fail("system vsnprintf(3) failed");
174
+
175
+ if ((size_t)length >= needed)
176
+ rb_u_raise(rb_eNotImpError,
177
+ "format string buffer calculation is wrong: %s (%zu < %zu)",
178
+ format, needed, (size_t)length);
179
+
180
+ buffer->length += length;
181
+
182
+ return self;
183
+ }
184
+ #pragma GCC diagnostic warning "-Wformat-nonliteral"
185
+
186
+ /* @!visibility public
187
+ * @overload new(size = 128)
188
+ *
189
+ * Sets up a new buffer of SIZE bytes.
190
+ *
191
+ * @param [#to_int] size */
192
+ static VALUE
193
+ rb_u_buffer_initialize(int argc, VALUE *argv, VALUE self)
194
+ {
195
+ struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
196
+ VALUE rbsize;
197
+
198
+ rb_scan_args(argc, argv, "01", &rbsize);
199
+ long size = NIL_P(rbsize) ? 128 : NUM2LONG(rbsize);
200
+
201
+ u_buffer_maybe_expand(buffer, size);
202
+
203
+ return Qnil;
204
+ }
205
+
206
+ static VALUE
207
+ rb_u_buffer_initialize_copy(VALUE self, VALUE rboriginal)
208
+ {
209
+ struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
210
+ const struct rb_u_buffer *original = RVAL2RBUBUFFER(rboriginal);
211
+
212
+ if (buffer == original)
213
+ return self;
214
+
215
+ buffer->initially_allocated = original->initially_allocated;
216
+
217
+ rb_u_buffer_append(self, original->c, original->length);
218
+
219
+ OBJ_INFECT(self, rboriginal);
220
+
221
+ return self;
222
+ }
223
+
224
+ /* @overload <<(*parts)
225
+ *
226
+ * Append each _p_ in PARTS, append _q_ to the receiver, where _q_ =
227
+ * _p_#to_s, if _p_ is a U::Buffer, _q_ = _p_#chr, if _p_ is a Fixnum or
228
+ * Bignum, _q_ = _p_#to_str, if _p_ is a U::String or responds to #to_str.
229
+ *
230
+ * @param [U::Buffer, Fixnum, Bignum, U::String, #to_str] parts
231
+ * @raise [RangeError] If a _p_ is a Fixnum or Bignum and ¬_p_#chr#valid?
232
+ * @return [self] */
233
+ VALUE
234
+ rb_u_buffer_append_m(int argc, VALUE *argv, VALUE self)
235
+ {
236
+ need_at_least_n_arguments(argc, 1);
237
+
238
+ for (int i = 0; i < argc; i++)
239
+ if (RTEST(rb_obj_is_kind_of(argv[i], rb_cUBuffer))) {
240
+ const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(argv[i]);
241
+
242
+ rb_u_buffer_append(self, buffer->c, buffer->length);
243
+ OBJ_INFECT(self, argv[i]);
244
+ } else if (FIXNUM_P(argv[i]) || TYPE(argv[i]) == T_BIGNUM) {
245
+ uint32_t c = NUM2UINT(argv[i]);
246
+
247
+ /* TODO: This depends on an experimental API. Modify this once the API is
248
+ * stable. */
249
+ #if 0
250
+ if (rb_num_to_uint(argv[i], &c) != 0) {
251
+ if (FIXNUM_P(argv[i]))
252
+ rb_u_raise(rb_eRangeError,
253
+ "%ld out of char range",
254
+ FIX2LONG(argv[i]));
255
+ else
256
+ rb_u_raise(rb_eRangeError,
257
+ "Bignum out of char range");
258
+ }
259
+ #endif
260
+
261
+ if (!u_char_isvalid(c))
262
+ rb_u_raise(rb_eRangeError,
263
+ "invalid Unicode character: %u",
264
+ c);
265
+
266
+ rb_u_buffer_append_char(self, c);
267
+ } else {
268
+ const struct rb_u_string *string = RVAL2USTRING_ANY(argv[i]);
269
+
270
+ rb_u_buffer_append(self,
271
+ USTRING_STR(string),
272
+ USTRING_LENGTH(string));
273
+ OBJ_INFECT(self, argv[i]);
274
+ }
275
+
276
+ return self;
277
+ }
278
+
279
+ /* @return [U::String] A UTF-8-encoded string of the receiver’s content */
280
+ VALUE
281
+ rb_u_buffer_to_u(VALUE self)
282
+ {
283
+ const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
284
+
285
+ return rb_u_string_new_c(self, buffer->c, buffer->length);
286
+ }
287
+
288
+ /* @return [U::String] The UTF-8-encoded string of the receiver’s content after
289
+ * clearing it from the receiver
290
+ * @note This method differs from {#to_u} in that it doesn’t copy the result,
291
+ * so it’s generally faster; call it when you’re done building your
292
+ * {U::String}. */
293
+ VALUE
294
+ rb_u_buffer_to_u_bang(VALUE self)
295
+ {
296
+ struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
297
+
298
+ char *c = buffer->c;
299
+ long length = buffer->length;
300
+ rb_u_buffer_reset(buffer);
301
+
302
+ REALLOC_N(c, char, length + 1);
303
+ c[length] = '\0';
304
+
305
+ return rb_u_string_new_c_own(self, c, length);
306
+ }
307
+
308
+ /* @return [String] A UTF-8-encoded string of the receiver’s content */
309
+ VALUE
310
+ rb_u_buffer_to_s(VALUE self)
311
+ {
312
+ const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
313
+
314
+ VALUE result = rb_u_str_new(buffer->c, buffer->length);
315
+ OBJ_INFECT(result, self);
316
+ return result;
317
+ }
318
+
319
+ /* @return [Integer] The number of characters in the receiver */
320
+ VALUE
321
+ rb_u_buffer_length(VALUE self)
322
+ {
323
+ const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
324
+ return UINT2NUM(u_n_chars_n(buffer->c, buffer->length));
325
+ }
326
+
327
+ /* @return [Integer] The number of bytes required to represent the receiver */
328
+ VALUE
329
+ rb_u_buffer_bytesize(VALUE self)
330
+ {
331
+ const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
332
+ return UINT2NUM(buffer->length);
333
+ }
334
+
335
+ /* Returns the width of the receiver. The width is defined as the sum of the
336
+ * number of “cells” on a terminal or similar cell-based display that the
337
+ * characters in the string will require.
338
+ *
339
+ * Characters that are {U::String#wide?} have a width of 2. Characters that
340
+ * are {U::String#zero_width?} have a width of 0. Other characters have a
341
+ * width of 1.
342
+ *
343
+ * @return [Integer]
344
+ * @see http://www.unicode.org/reports/tr11/
345
+ * Unicode Standard Annex #11: East Asian Width */
346
+ VALUE
347
+ rb_u_buffer_width(VALUE self)
348
+ {
349
+ const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
350
+ return UINT2NUM(u_width_n(buffer->c, buffer->length));
351
+ }
352
+
353
+ /* @overload ==(other)
354
+ * @param [U::Buffer] other
355
+ * @return [Boolean] True if the receiver’s class and content equal those of
356
+ * OTHER */
357
+ VALUE
358
+ rb_u_buffer_eql(VALUE self, VALUE rbother)
359
+ {
360
+ if (self == rbother)
361
+ return Qtrue;
362
+
363
+ if (!RTEST(rb_obj_is_kind_of(rbother, rb_cUBuffer)))
364
+ return Qfalse;
365
+
366
+ const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
367
+ const struct rb_u_buffer *other = RVAL2RBUBUFFER(rbother);
368
+
369
+ return buffer->length == other->length &&
370
+ memcmp(buffer->c, other->c, other->length) == 0 ?
371
+ Qtrue : Qfalse;
372
+ }
373
+
374
+ /* @return [Fixnum] The hash value of the receiver’s content */
375
+ VALUE
376
+ rb_u_buffer_hash(VALUE self)
377
+ {
378
+ const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
379
+
380
+ return INT2FIX(rb_memhash(buffer->c, buffer->length));
381
+ }
382
+
383
+ /* Document-class: U::Buffer
384
+ *
385
+ * A buffer for building {U::String}s. Buffers should be used when you want to
386
+ * create U::Strings step-wise, for example, when joining them together, or
387
+ * reading some input. Create a new buffer with {#initialize}, optionally
388
+ * specifying an initial size. Then, {#append} (or {#<<}) content to it. You
389
+ * can also {#append_format}ted content. You can check the {#length}
390
+ * ({#size}), {#bytesize}, and {#width} of the buffer, which can be useful if
391
+ * you want to limit how much content you want to generate. Once all content
392
+ * has been appended, a buffer can be converted to a U::String via {#to_u} or
393
+ * {#to_u!} depending on whether you want to let the buffer keep its content or
394
+ * not. You can also convert it to a String with {#to_s}.
395
+ *
396
+ * @example Benchmarking U::String#+ Versus U::Buffer#append/U::Buffer#to_u!
397
+ * require 'benchmark'
398
+ * require 'u-1.0'
399
+ * Benchmark.bm do |x|
400
+ * x.report do
401
+ * a = ''.u
402
+ * 100000.times do
403
+ * a = a + 'a'
404
+ * end
405
+ * end
406
+ * x.report do
407
+ * b = U::Buffer.new
408
+ * 100000.times do
409
+ * b.append 'a'
410
+ * end
411
+ * a = b.to_u!
412
+ * end
413
+ * end
414
+ * # ⇒
415
+ * # user system total real
416
+ * # 3.560000 0.650000 4.210000 ( 4.726064)
417
+ * # 0.060000 0.000000 0.060000 ( 0.057134) */
418
+ void
419
+ Init_u_buffer(VALUE mU)
420
+ {
421
+ rb_cUBuffer = rb_define_class_under(mU, "Buffer", rb_cData);
422
+
423
+ rb_define_alloc_func(rb_cUBuffer, rb_u_buffer_alloc);
424
+ rb_define_private_method(rb_cUBuffer, "initialize", rb_u_buffer_initialize, -1);
425
+ rb_define_private_method(rb_cUBuffer, "initialize_copy", rb_u_buffer_initialize_copy, 1);
426
+
427
+ rb_define_method(rb_cUBuffer, "append", rb_u_buffer_append_m, -1);
428
+ rb_define_alias(rb_cUBuffer, "<<", "append");
429
+ rb_define_method(rb_cUBuffer, "append_format", rb_u_buffer_append_format_m, -1); /* in ext/u/rb_u_string_format.c */
430
+
431
+ rb_define_method(rb_cUBuffer, "to_u", rb_u_buffer_to_u, 0);
432
+ rb_define_method(rb_cUBuffer, "to_u!", rb_u_buffer_to_u_bang, 0);
433
+ rb_define_method(rb_cUBuffer, "to_s", rb_u_buffer_to_s, 0);
434
+
435
+ rb_define_method(rb_cUBuffer, "length", rb_u_buffer_length, 0);
436
+ rb_define_alias(rb_cUBuffer, "size", "length");
437
+ rb_define_method(rb_cUBuffer, "bytesize", rb_u_buffer_bytesize, 0);
438
+ rb_define_method(rb_cUBuffer, "width", rb_u_buffer_width, 0);
439
+
440
+ rb_define_method(rb_cUBuffer, "==", rb_u_buffer_eql, 1);
441
+ rb_define_alias(rb_cUBuffer, "eql?", "==");
442
+ rb_define_method(rb_cUBuffer, "hash", rb_u_buffer_hash, 0);
443
+ }
@@ -0,0 +1,24 @@
1
+ VALUE rb_u_buffer_new(void);
2
+ VALUE rb_u_buffer_new_sized(long size);
3
+
4
+ VALUE rb_u_buffer_append_printf(VALUE self, size_t needed,
5
+ const char *format, ...) PRINTF(3, 4);
6
+
7
+ VALUE rb_u_buffer_append(VALUE self, const char *str, long length);
8
+ VALUE rb_u_buffer_append_format(int argc, const VALUE *argv, VALUE self, VALUE format);
9
+ VALUE rb_u_buffer_append_format_m(int argc, const VALUE *argv, VALUE self);
10
+ VALUE rb_u_buffer_append_char(VALUE self, uint32_t c);
11
+ VALUE rb_u_buffer_append_char_n(VALUE self, uint32_t c, long n);
12
+
13
+ VALUE rb_u_buffer_append_m(int argc, VALUE *argv, VALUE self);
14
+ VALUE rb_u_buffer_bytesize(VALUE self);
15
+ VALUE rb_u_buffer_eql(VALUE self, VALUE rbother);
16
+ VALUE rb_u_buffer_hash(VALUE self);
17
+ VALUE rb_u_buffer_inspect(VALUE self);
18
+ VALUE rb_u_buffer_length(VALUE self);
19
+ VALUE rb_u_buffer_to_s(VALUE self);
20
+ VALUE rb_u_buffer_to_u(VALUE self);
21
+ VALUE rb_u_buffer_to_u_bang(VALUE self);
22
+ VALUE rb_u_buffer_width(VALUE self);
23
+
24
+ void Init_u_buffer(VALUE mU);