u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -1,38 +1,34 @@
1
- /*
2
- * contents: Private functions used by the UTF-8 character-encoding library.
3
- *
4
- * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
5
- */
6
-
7
1
  #include <ruby.h>
8
2
  #include <stdbool.h>
9
3
  #include <stddef.h>
10
4
  #include <stdint.h>
11
5
  #include <stdlib.h>
12
6
 
13
- #include "unicode.h"
7
+ #include "u.h"
14
8
 
15
9
  #include "private.h"
16
10
 
17
11
  /* Lookup C in the sorted TABLE using binary search. TABLE consists of N
18
12
  * entries, where each entry is SIZEOF_ENTRY bytes in size and the first
19
- * component is a unichar of size SIZEOF_CHAR. If C is found in TABLE, its
13
+ * component is a uint32_t of size SIZEOF_CHAR. If C is found in TABLE, its
20
14
  * index is stored in INDEX and true is returned. Otherwise, false is returned
21
15
  * and INDEX is left untouched. */
22
16
  bool
23
- binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index)
17
+ binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, uint32_t c, size_t *index)
24
18
  {
25
- #define ENTRY(index) ((unichar)(*(unichar *)((const char *)table + ((index) * sizeof_entry))) & char_mask)
19
+ #define ENTRY(index) (*(uint32_t *)(void *)((const char *)table + ((index) * sizeof_entry)) & char_mask)
26
20
 
27
- int begin = 0;
28
- int end = n - 1;
29
- int middle;
21
+ size_t begin = 0;
22
+ size_t end = n - 1;
23
+ size_t middle;
30
24
 
31
25
  /* This is ugly, but not all tables use unichars as their lookup
32
26
  * character. The casefold table, for example, uses uint16_t-sized
33
27
  * characters. To only get the interesting part of our table entry
34
28
  * we’ll have to mask the retrieved value. */
35
- int char_mask = (1 << (8 * sizeof_char)) - 1;
29
+ uint32_t char_mask = sizeof_char < sizeof(uint32_t) ?
30
+ ((uint32_t)1 << (CHAR_BIT * sizeof_char)) - 1 :
31
+ (uint32_t)-1;
36
32
 
37
33
  /* Drop out early if we know for certain that C can’t be in the
38
34
  * decomposition table. */
@@ -42,7 +38,7 @@ binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, si
42
38
  while (begin <= end) {
43
39
  middle = binary_search_middle_of(begin, end);
44
40
 
45
- unichar probe = ENTRY(middle);
41
+ uint32_t probe = ENTRY(middle);
46
42
  if (c < probe)
47
43
  end = middle - 1;
48
44
  else if (c > probe)
@@ -0,0 +1,58 @@
1
+ #define IS(category, class) (((unsigned int)1 << (category)) & (class))
2
+ #define OR(class, rest) (((unsigned int)1 << (class)) | (rest))
3
+
4
+ #define P_WITHIN_STR(p, end, use_end) \
5
+ ((use_end) ? (p) < (end) : *(p) != '\0')
6
+
7
+ #define lengthof(ary) (sizeof(ary) / sizeof((ary)[0]))
8
+
9
+ #if defined(_WIN32) || defined(__CYGWIN__)
10
+ # ifdef U_COMPILATION
11
+ # define U_EXTERN __declspec(dllexport) extern
12
+ # else
13
+ # define U_EXTERN __declspec(dllimport) extern
14
+ # endif
15
+ #elif __GNUC__ >= 4
16
+ # define U_EXTERN __attribute__((visibility("default"))) extern
17
+ #else
18
+ # define U_EXTERN extern
19
+ #endif
20
+
21
+ #if defined(__GNUC__) && __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
22
+ # define PRINTF(format_index, first_argument_index) \
23
+ __attribute__((format(printf, format_index, first_argument_index)))
24
+ # define UNUSED(u) \
25
+ __attribute__((__unused__)) u
26
+ #else
27
+ # define PRINTF(format, arguments)
28
+ # define UNUSED(u) u
29
+ #endif
30
+
31
+ #if defined(__GNUC__) && __GNUC__ > 2 && defined(__OPTIMIZE__)
32
+ # define BOOLEAN_EXPR(expr) __extension__({ \
33
+ int _boolean_var_; \
34
+ if (expr) \
35
+ _boolean_var_ = 1; \
36
+ else \
37
+ _boolean_var_ = 0; \
38
+ _boolean_var_; \
39
+ })
40
+ # define LIKELY(expr) (__builtin_expect(BOOLEAN_EXPR(expr), 1))
41
+ # define UNLIKELY(expr) (__builtin_expect(BOOLEAN_EXPR(expr), 0))
42
+ #else
43
+ # define LIKELY(expr) (expr)
44
+ # define UNLIKELY(expr) (expr)
45
+ #endif
46
+
47
/* Index halfway between BEGIN and END, rounded down.  Requires
 * BEGIN <= END.  The midpoint is computed as BEGIN plus half the distance to
 * END so that the result keeps the operands' own type: the previous
 * `(unsigned)(begin + end) >> 1` form truncated size_t indices to unsigned
 * int and could wrap when the sum overflowed.  Note that BEGIN is evaluated
 * twice, so pass plain variables, not expressions with side effects. */
#define binary_search_middle_of(begin, end) \
        ((begin) + (((end) - (begin)) >> 1))
49
+
50
+ #define unicode_table_lookup(table, c, index) \
51
+ binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
52
+
53
+ bool binary_search_unicode_table(const void *table,
54
+ size_t n,
55
+ size_t sizeof_entry,
56
+ size_t sizeof_char,
57
+ uint32_t c,
58
+ size_t *index);
@@ -0,0 +1,10 @@
1
+ #include <ruby.h>
2
+ #include <stdarg.h>
3
+ #include <stdbool.h>
4
+ #include <stddef.h>
5
+ #include <stdint.h>
6
+ #include "extconf.h"
7
+ #include "u.h"
8
+ #include "private.h"
9
+ #include "rb_private.h"
10
+ #include "rb_u_string.h"
@@ -0,0 +1,98 @@
1
+ #include "rb_includes.h"
2
+ #ifdef HAVE_RUBY_ENCODING_H
3
+ # include <ruby/encoding.h>
4
+ #endif
5
+ #include <errno.h>
6
+
7
#ifndef HAVE_RB_MEMHASH
/* Fallback used when HAVE_RB_MEMHASH is not defined: hash the LENGTH bytes
 * starting at STRING.
 *
 * Every byte is folded in as HASH * 65599 + BYTE, and the final value is
 * mixed with itself shifted right by five bits.  The arithmetic is carried
 * out on unsigned int so that long inputs wrap around modulo 2^N instead of
 * overflowing a signed int, which is undefined behavior.  The result is the
 * same value the signed version produced on wrapping implementations.
 *
 * Returns the hash as an int; a LENGTH of zero yields 0. */
int
rb_memhash(const char *string, long length)
{
        const char *end = string + length;
        unsigned int hash = 0;

        for (const char *p = string; p < end; p++)
                hash = hash * 65599u + (unsigned int)*p;

        /* Go through int for the shift so that the final mixing step keeps
         * the sign-propagating behavior of the original `hash >> 5`, but
         * do the addition itself unsigned to stay clear of overflow. */
        int folded = (int)hash;
        return (int)((unsigned int)folded + (unsigned int)(folded >> 5));
}
#endif
23
+
24
/* Thin wrapper around rb_memsearch() that hides its differing signatures:
 * when <ruby/encoding.h> is available the call additionally passes
 * rb_utf8_encoding(); otherwise the four arguments are forwarded as they
 * are.  The value returned by rb_memsearch() is returned unchanged. */
long
rb_u_memsearch(const void *a, long a_n, const void *b, long b_n)
{
#ifndef HAVE_RUBY_ENCODING_H
        return rb_memsearch(a, a_n, b, b_n);
#else
        return rb_memsearch(a, a_n, b, b_n, rb_utf8_encoding());
#endif
}
33
+
34
+ static VALUE PRINTF(1, 0)
35
+ format_message(const char *format, va_list args)
36
+ {
37
+ #ifdef HAVE_RUBY_ENCODING_H
38
+ return rb_enc_vsprintf(rb_utf8_encoding(), format, args);
39
+ #else
40
+ # ifdef HAVE_RB_VSPRINTF
41
+ return rb_vsprintf(format, args);
42
+ # else
43
+ char buf[2048];
44
+ int n = vsnprintf(buf, sizeof(buf), format, args);
45
+ return rb_str_new(buf, n);
46
+ # endif
47
+ #endif
48
+ }
49
+
50
+ void
51
+ rb_u_raise(VALUE exception, const char *format, ...)
52
+ {
53
+ va_list args;
54
+ va_start(args, format);
55
+ VALUE message = format_message(format, args);
56
+ va_end(args);
57
+ rb_exc_raise(rb_exc_new3(exception, message));
58
+ }
59
+
60
/* Raise a system-call error for the errno value NUMBER, with a message built
 * from the printf-style FORMAT and the arguments following it.
 *
 * With encoding support the message is formatted by format_message() and
 * passed to rb_syserr_fail_str() together with NUMBER.  Without it, the
 * message is formatted into a 2048-byte stack buffer, errno is set to NUMBER
 * and rb_sys_fail() is called with the buffer. */
void
rb_u_raise_errno(int number, const char *format, ...)
{
        va_list arguments;
        va_start(arguments, format);
#ifndef HAVE_RUBY_ENCODING_H
        char text[2048];
        vsnprintf(text, sizeof(text), format, arguments);
        /* Set errno only after formatting, right before it is consumed. */
        errno = number;
        va_end(arguments);
        rb_sys_fail(text);
#else
        VALUE text = format_message(format, arguments);
        va_end(arguments);
        rb_syserr_fail_str(number, text);
#endif
}
77
+
78
+ VALUE
79
+ rb_u_str_new(const char *string, long length)
80
+ {
81
+ #ifdef HAVE_RUBY_ENCODING_H
82
+ return rb_enc_str_new(string, length, rb_utf8_encoding());
83
+ #else
84
+ return rb_str_new(string, length);
85
+ #endif
86
+ }
87
+
88
+ VALUE
89
+ rb_u_str_buf_new(long length)
90
+ {
91
+ #ifdef HAVE_RUBY_ENCODING_H
92
+ VALUE buffer = rb_str_buf_new(length);
93
+ rb_enc_associate(buffer, rb_utf8_encoding());
94
+ return buffer;
95
+ #else
96
+ return rb_str_buf_new(length);
97
+ #endif
98
+ }
@@ -0,0 +1,67 @@
1
+ #if __GNUC__ >= 4
2
+ # define RB_U_NULL_TERMINATED(parameter) __attribute__((__sentinel__(parameter)))
3
+ #else
4
+ # define RB_U_NULL_TERMINATED(parameter)
5
+ #endif
6
+
7
+ void need_at_least_n_arguments(int argc, int n);
8
+
9
+ void need_m_to_n_arguments(int argc, int m, int n);
10
+
11
+ void *_rb_u_guarded_alloc(size_t n, ...) RB_U_NULL_TERMINATED(0);
12
+
13
+ int rb_u_char_to_u(uint32_t c, char *result);
14
+
15
+ void rb_u_validate(const char *string, long length);
16
+
17
+ VALUE _rb_u_character_test(VALUE string, bool (*test)(uint32_t));
18
+
19
+ VALUE _rb_u_string_test_locale(int argc, VALUE *argv, VALUE self,
20
+ size_t convert(char *, size_t, const char *, size_t,
21
+ const char *));
22
+
23
+ VALUE _rb_u_string_convert(VALUE self,
24
+ size_t convert(char *, size_t, const char *, size_t));
25
+ VALUE _rb_u_string_convert_locale(int argc, VALUE *argv, VALUE self,
26
+ size_t convert(char *, size_t, const char *,
27
+ size_t, const char *),
28
+ const char *lc_env);
29
+
30
+ VALUE _rb_u_string_property(VALUE self, const char *name,
31
+ int unknown, int property(uint32_t),
32
+ VALUE tosym(int));
33
+
34
+ enum u_normalization_form _rb_u_symbol_to_normalization_form(VALUE symbol);
35
+
36
+ VALUE rb_u_pattern_argument(VALUE pattern, bool quote);
37
+
38
+ long rb_u_string_index_regexp(VALUE self, const char *begin, VALUE regex, bool reverse);
39
+
40
+ #ifndef HAVE_RB_ERRINFO
41
+ # define rb_errinfo() (ruby_errinfo)
42
+ #endif
43
+
44
+ #ifndef HAVE_RB_MEMHASH
45
+ int rb_memhash(const char *string, long length);
46
+ #endif
47
+
48
+ #ifndef RETURN_SIZED_ENUMERATOR
49
+ # define RETURN_SIZED_ENUMERATOR(self, argc, argv, size) \
50
+ RETURN_ENUMERATOR(self, argc, argv)
51
+ #endif
52
+
53
+ #ifndef OBJ_UNTRUSTED
54
+ # define OBJ_UNTRUSTED(o) (false)
55
+ #endif
56
+
57
+ #ifndef OBJ_UNTRUST
58
+ # define OBJ_UNTRUST(o) do { } while (0)
59
+ #endif
60
+
61
+ long rb_u_memsearch(const void *a, long a_n, const void *b, long b_n);
62
+
63
+ NORETURN(void rb_u_raise(VALUE exception, const char *format, ...)) PRINTF(2, 3);
64
+ NORETURN(void rb_u_raise_errno(int number, const char *format, ...)) PRINTF(2, 3);
65
+
66
+ VALUE rb_u_str_new(const char *string, long length);
67
+ VALUE rb_u_str_buf_new(long length);
@@ -0,0 +1,251 @@
1
+ /* -*- coding: utf-8 -*- */
2
+
3
+ #include "extconf.h"
4
+ #include <errno.h>
5
+ #include <ruby.h>
6
+ #include <stdarg.h>
7
+ #include <stdbool.h>
8
+ #include <stddef.h>
9
+ #include <stdint.h>
10
+ #include <stdlib.h>
11
+ #include <limits.h>
12
+ #include "u.h"
13
+ #include "private.h"
14
+ #include "rb_private.h"
15
+ #include "rb_u_buffer.h"
16
+ #include "rb_u_string.h"
17
+
18
+ void
19
+ need_at_least_n_arguments(int argc, int n)
20
+ {
21
+ if (argc < n)
22
+ rb_u_raise(rb_eArgError,
23
+ "wrong number of arguments (%d for at least %d)",
24
+ argc, n);
25
+ }
26
+
27
+ void
28
+ need_m_to_n_arguments(int argc, int m, int n)
29
+ {
30
+ if (argc < m || argc > n)
31
+ rb_u_raise(rb_eArgError,
32
+ "wrong number of arguments (%d for %d..%d)",
33
+ argc, m, n);
34
+ }
35
+
36
+ struct guarded_alloc_closure {
37
+ void *result;
38
+ size_t n;
39
+ };
40
+
41
+ static VALUE
42
+ guarded_alloc(VALUE data)
43
+ {
44
+ struct guarded_alloc_closure *closure = (struct guarded_alloc_closure *)data;
45
+ closure->result = (void *)ALLOC_N(char, closure->n);
46
+ return Qnil;
47
+ }
48
+
49
+ void *
50
+ _rb_u_guarded_alloc(size_t n, ...)
51
+ {
52
+ struct guarded_alloc_closure closure = { NULL, n };
53
+ int error;
54
+ rb_protect(guarded_alloc, (VALUE)&closure, &error);
55
+ if (error == 0)
56
+ return closure.result;
57
+ va_list args;
58
+ va_start(args, n);
59
+ void *previous;
60
+ while ((previous = va_arg(args, void *)) != NULL)
61
+ free(previous);
62
+ va_end(args);
63
+ rb_exc_raise(rb_errinfo());
64
+ }
65
+
66
+ int
67
+ rb_u_char_to_u(uint32_t c, char *result)
68
+ {
69
+ if (!u_char_isvalid(c))
70
+ rb_u_raise(rb_eArgError, "not a Unicode character: %#04x", c);
71
+
72
+ return u_char_to_u(c, result);
73
+ }
74
+
75
+ void
76
+ rb_u_validate(const char *string, long length)
77
+ {
78
+ const char *end;
79
+ if (!u_valid(string, length, &end))
80
+ rb_u_raise(rb_eArgError,
81
+ "invalid byte sequence at byte %ld",
82
+ end - string);
83
+ }
84
+
85
+ VALUE
86
+ _rb_u_character_test(VALUE self, bool (*test)(uint32_t))
87
+ {
88
+ const struct rb_u_string *s = RVAL2USTRING(self);
89
+ for (const char *p = USTRING_STR(s), *end = USTRING_END(s); p < end; )
90
+ if (!test(u_decode(&p, p, end)))
91
+ return Qfalse;
92
+ return Qtrue;
93
+ }
94
+
95
+ VALUE
96
+ _rb_u_string_test_locale(int argc, VALUE *argv, VALUE self,
97
+ size_t convert(char *, size_t, const char *, size_t,
98
+ const char *))
99
+ {
100
+ const char *locale = NULL;
101
+
102
+ VALUE rblocale;
103
+ if (rb_scan_args(argc, argv, "01", &rblocale) == 1)
104
+ locale = StringValuePtr(rblocale);
105
+
106
+ const struct rb_u_string *string = RVAL2USTRING(self);
107
+
108
+ size_t nfd_n = u_normalize(NULL, 0,
109
+ USTRING_STR(string), USTRING_LENGTH(string),
110
+ U_NORMALIZATION_FORM_D);
111
+ char *nfd = ALLOC_N(char, nfd_n + 1);
112
+ nfd_n = u_normalize(nfd, nfd_n + 1,
113
+ USTRING_STR(string), USTRING_LENGTH(string),
114
+ U_NORMALIZATION_FORM_D);
115
+
116
+ size_t converted_n = convert(NULL, 0, nfd, nfd_n, locale);
117
+ char *converted = _rb_u_guarded_alloc(converted_n + 1, nfd, NULL);
118
+ convert(converted, converted_n + 1, nfd, nfd_n, locale);
119
+
120
+ VALUE result = converted_n == nfd_n &&
121
+ memcmp(converted, nfd, nfd_n) == 0 ? Qtrue : Qfalse;
122
+
123
+ free(converted);
124
+ free(nfd);
125
+
126
+ return result;
127
+ }
128
+
129
+ VALUE
130
+ _rb_u_string_convert(VALUE self,
131
+ size_t convert(char *, size_t, const char *, size_t))
132
+ {
133
+ const struct rb_u_string *string = RVAL2USTRING(self);
134
+
135
+ size_t n = convert(NULL, 0, USTRING_STR(string), USTRING_LENGTH(string));
136
+ char *converted = ALLOC_N(char, n + 1);
137
+ convert(converted, n + 1, USTRING_STR(string), USTRING_LENGTH(string));
138
+
139
+ return rb_u_string_new_c_own(self, converted, n);
140
+ }
141
+
142
/* Call CONVERT on STRING with LOCALE, writing at most M bytes to RESULT, and
 * return CONVERT's result.
 *
 * errno is cleared before the call; if it is non-zero afterwards, RESULT is
 * released and a system-call error is raised for that errno value, so the
 * function does not return in that case.
 *
 * The errno value is saved before RESULT is released, since releasing memory
 * is allowed to modify errno and would otherwise make the raised error
 * report the wrong cause.  RESULT is either NULL or memory obtained from
 * ALLOC_N() by the caller in this file, hence xfree(). */
static size_t
try_convert(char *result, size_t m, const struct rb_u_string *string,
            size_t convert(char *, size_t, const char *, size_t,
                           const char *), const char *locale)
{
        errno = 0;
        size_t n = convert(result, m, USTRING_STR(string), USTRING_LENGTH(string),
                           locale);
        if (errno != 0) {
                int saved_errno = errno;
                xfree(result);
                rb_u_raise_errno(saved_errno, "can’t apply conversion");
        }
        return n;
}
156
+
157
+ VALUE
158
+ _rb_u_string_convert_locale(int argc, VALUE *argv, VALUE self,
159
+ size_t convert(char *, size_t, const char *, size_t,
160
+ const char *),
161
+ const char *lc_env)
162
+ {
163
+ const char *locale = NULL;
164
+
165
+ VALUE rblocale;
166
+ if (rb_scan_args(argc, argv, "01", &rblocale) == 1)
167
+ locale = StringValuePtr(rblocale);
168
+ else if (lc_env != NULL) {
169
+ const char * const env[] = { "LC_ALL", lc_env, "LANG", NULL };
170
+ for (const char * const *p = env; *p != NULL; p++)
171
+ if ((locale = getenv(*p)) != NULL)
172
+ break;
173
+ }
174
+
175
+ const struct rb_u_string *string = RVAL2USTRING(self);
176
+
177
+ size_t n = try_convert(NULL, 0, string, convert, locale);
178
+ char *converted = ALLOC_N(char, n + 1);
179
+ size_t m = try_convert(converted, n + 1, string, convert, locale);
180
+ if (m < n) {
181
+ char *t = REALLOC_N(converted, char, m + 1);
182
+ if (t != NULL)
183
+ converted = t;
184
+ n = m;
185
+ }
186
+
187
+ return rb_u_string_new_c_own(self, converted, n);
188
+ }
189
+
190
+ VALUE
191
+ _rb_u_string_property(VALUE self, const char *name, int unknown,
192
+ int property(uint32_t), VALUE tosym(int))
193
+ {
194
+ const struct rb_u_string *string = RVAL2USTRING(self);
195
+ const char *p = USTRING_STR(string);
196
+ const char *end = USTRING_END(string);
197
+ if (p == end)
198
+ return tosym(unknown);
199
+ int first = property(u_decode(&p, p, end));
200
+ while (p < end) {
201
+ int value = property(u_decode(&p, p, end));
202
+ if (value != first)
203
+ rb_u_raise(rb_eArgError,
204
+ "string consists of characters with different %s values: :%s+, :%s",
205
+ name,
206
+ rb_id2name(SYM2ID(tosym(first))),
207
+ rb_id2name(SYM2ID(tosym(value))));
208
+ }
209
+ return tosym(first);
210
+ }
211
+
212
+ #define SYMBOL2MODE(symbol, mode, id) do { \
213
+ static ID id_##symbol; \
214
+ if (id_##symbol == 0) \
215
+ id_##symbol = rb_intern(#symbol); \
216
+ if (id == id_##symbol) \
217
+ return mode; \
218
+ } while (0)
219
+
220
+ enum u_normalization_form
221
+ _rb_u_symbol_to_normalization_form(VALUE symbol)
222
+ {
223
+ if (!SYMBOL_P(symbol)) {
224
+ VALUE inspected = rb_inspect(symbol);
225
+
226
+ rb_u_raise(rb_eTypeError,
227
+ "not a symbol: %s",
228
+ StringValuePtr(inspected));
229
+ }
230
+
231
+ ID id = SYM2ID(symbol);
232
+
233
+ SYMBOL2MODE(nfd, U_NORMALIZATION_FORM_D, id);
234
+ SYMBOL2MODE(nfc, U_NORMALIZATION_FORM_C, id);
235
+ SYMBOL2MODE(nfkd, U_NORMALIZATION_FORM_KD, id);
236
+ SYMBOL2MODE(nfkc, U_NORMALIZATION_FORM_KC, id);
237
+
238
+ rb_u_raise(rb_eArgError,
239
+ "unknown normalization form: :%s",
240
+ rb_id2name(SYM2ID(symbol)));
241
+ }
242
+
243
+ U_EXTERN void Init_u(void);
244
+ void
245
+ Init_u(void)
246
+ {
247
+ VALUE mU = rb_define_module("U");
248
+
249
+ Init_u_buffer(mU);
250
+ Init_u_string(mU);
251
+ }