libarchive-static 1.0.6 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. checksums.yaml +4 -4
  2. data/ext/extconf.rb +2 -9
  3. data/ext/libarchive-0.1.1/ext/archive_read_support_compression.c +6 -6
  4. data/ext/libarchive-0.1.1/ext/archive_read_support_compression.o +0 -0
  5. data/ext/libarchive-0.1.1/ext/archive_read_support_format.o +0 -0
  6. data/ext/libarchive-0.1.1/ext/archive_write_open_rb_str.c +1 -1
  7. data/ext/libarchive-0.1.1/ext/archive_write_open_rb_str.o +0 -0
  8. data/ext/libarchive-0.1.1/ext/archive_write_set_compression.c +5 -5
  9. data/ext/libarchive-0.1.1/ext/archive_write_set_compression.o +0 -0
  10. data/ext/libarchive-0.1.1/ext/config.h +23 -0
  11. data/ext/libarchive-0.1.1/ext/config.log +230 -0
  12. data/ext/libarchive-0.1.1/ext/config.status +671 -0
  13. data/ext/libarchive-0.1.1/ext/libarchive.c +1 -1
  14. data/ext/libarchive-0.1.1/ext/libarchive.o +0 -0
  15. data/ext/libarchive-0.1.1/ext/libarchive_archive.c +7 -7
  16. data/ext/libarchive-0.1.1/ext/libarchive_archive.o +0 -0
  17. data/ext/libarchive-0.1.1/ext/libarchive_entry.c +6 -0
  18. data/ext/libarchive-0.1.1/ext/libarchive_entry.o +0 -0
  19. data/ext/libarchive-0.1.1/ext/libarchive_reader.c +6 -4
  20. data/ext/libarchive-0.1.1/ext/libarchive_reader.o +0 -0
  21. data/ext/libarchive-0.1.1/ext/libarchive_ruby.so +0 -0
  22. data/ext/libarchive-0.1.1/ext/libarchive_win32.h +1 -1
  23. data/ext/libarchive-0.1.1/ext/libarchive_writer.c +2 -2
  24. data/ext/libarchive-0.1.1/ext/libarchive_writer.o +0 -0
  25. data/ext/libarchive-3.6.2/Makefile.in +16892 -0
  26. data/ext/libarchive-3.6.2/build/autoconf/ax_append_compile_flags.m4 +67 -0
  27. data/ext/libarchive-3.6.2/build/autoconf/ax_append_flag.m4 +71 -0
  28. data/ext/libarchive-3.6.2/build/autoconf/ax_check_compile_flag.m4 +74 -0
  29. data/ext/libarchive-3.6.2/build/autoconf/ax_require_defined.m4 +37 -0
  30. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/build/autoconf/check_stdcall_func.m4 +0 -0
  31. data/ext/libarchive-3.6.2/build/autoconf/compile +348 -0
  32. data/ext/libarchive-3.6.2/build/autoconf/config.guess +1754 -0
  33. data/ext/libarchive-3.6.2/build/autoconf/config.rpath +696 -0
  34. data/ext/libarchive-3.6.2/build/autoconf/config.sub +1890 -0
  35. data/ext/libarchive-3.6.2/build/autoconf/depcomp +791 -0
  36. data/ext/libarchive-3.6.2/build/autoconf/iconv.m4 +271 -0
  37. data/ext/libarchive-3.6.2/build/autoconf/install-sh +541 -0
  38. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/build/autoconf/la_uid_t.m4 +0 -0
  39. data/ext/libarchive-3.6.2/build/autoconf/lib-ld.m4 +109 -0
  40. data/ext/libarchive-3.6.2/build/autoconf/lib-link.m4 +777 -0
  41. data/ext/libarchive-3.6.2/build/autoconf/lib-prefix.m4 +224 -0
  42. data/ext/libarchive-3.6.2/build/autoconf/ltmain.sh +11251 -0
  43. data/ext/libarchive-3.6.2/build/autoconf/m4_ax_compile_check_sizeof.m4 +115 -0
  44. data/ext/libarchive-3.6.2/build/autoconf/missing +215 -0
  45. data/ext/libarchive-3.6.2/build/autoconf/test-driver +153 -0
  46. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/build/pkgconfig/libarchive.pc.in +4 -1
  47. data/ext/libarchive-3.6.2/config.h.in +1504 -0
  48. data/ext/libarchive-3.6.2/configure +25558 -0
  49. data/ext/libarchive-3.6.2/libarchive/archive.h +1212 -0
  50. data/ext/libarchive-3.6.2/libarchive/archive_acl.c +2097 -0
  51. data/ext/libarchive-3.6.2/libarchive/archive_acl_private.h +83 -0
  52. data/ext/libarchive-3.6.2/libarchive/archive_blake2.h +197 -0
  53. data/ext/libarchive-3.6.2/libarchive/archive_blake2_impl.h +161 -0
  54. data/ext/libarchive-3.6.2/libarchive/archive_blake2s_ref.c +369 -0
  55. data/ext/libarchive-3.6.2/libarchive/archive_blake2sp_ref.c +361 -0
  56. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_check_magic.c +63 -22
  57. data/ext/libarchive-3.6.2/libarchive/archive_cmdline.c +227 -0
  58. data/ext/libarchive-3.6.2/libarchive/archive_cmdline_private.h +47 -0
  59. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_crc32.h +17 -0
  60. data/ext/libarchive-3.6.2/libarchive/archive_cryptor.c +534 -0
  61. data/ext/libarchive-3.6.2/libarchive/archive_cryptor_private.h +188 -0
  62. data/ext/libarchive-3.6.2/libarchive/archive_digest.c +1505 -0
  63. data/ext/libarchive-3.6.2/libarchive/archive_digest_private.h +416 -0
  64. data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_darwin.c +559 -0
  65. data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_freebsd.c +712 -0
  66. data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_linux.c +760 -0
  67. data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_sunos.c +824 -0
  68. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_endian.h +48 -15
  69. data/ext/libarchive-3.6.2/libarchive/archive_entry.c +2149 -0
  70. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry.h +305 -106
  71. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_copy_bhfi.c +5 -4
  72. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_copy_stat.c +9 -3
  73. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_link_resolver.c +104 -62
  74. data/ext/libarchive-3.6.2/libarchive/archive_entry_locale.h +92 -0
  75. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_private.h +65 -49
  76. data/ext/libarchive-3.6.2/libarchive/archive_entry_sparse.c +156 -0
  77. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_stat.c +6 -6
  78. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_strmode.c +1 -1
  79. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_xattr.c +4 -6
  80. data/ext/libarchive-3.6.2/libarchive/archive_getdate.c +1165 -0
  81. data/ext/libarchive-3.6.2/libarchive/archive_getdate.h +39 -0
  82. data/ext/libarchive-3.6.2/libarchive/archive_hmac.c +334 -0
  83. data/ext/libarchive-3.6.2/libarchive/archive_hmac_private.h +117 -0
  84. data/ext/libarchive-3.6.2/libarchive/archive_match.c +1875 -0
  85. data/ext/libarchive-3.6.2/libarchive/archive_openssl_evp_private.h +53 -0
  86. data/ext/libarchive-3.6.2/libarchive/archive_openssl_hmac_private.h +54 -0
  87. data/ext/libarchive-3.6.2/libarchive/archive_options.c +218 -0
  88. data/ext/libarchive-3.6.2/libarchive/archive_options_private.h +51 -0
  89. data/ext/libarchive-3.6.2/libarchive/archive_pack_dev.c +337 -0
  90. data/ext/libarchive-3.6.2/libarchive/archive_pack_dev.h +49 -0
  91. data/ext/libarchive-3.6.2/libarchive/archive_pathmatch.c +463 -0
  92. data/ext/libarchive-3.6.2/libarchive/archive_pathmatch.h +52 -0
  93. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_platform.h +77 -9
  94. data/ext/libarchive-3.6.2/libarchive/archive_platform_acl.h +55 -0
  95. data/ext/libarchive-3.6.2/libarchive/archive_platform_xattr.h +47 -0
  96. data/ext/libarchive-3.6.2/libarchive/archive_ppmd7.c +1168 -0
  97. data/ext/libarchive-3.6.2/libarchive/archive_ppmd7_private.h +119 -0
  98. data/ext/libarchive-3.6.2/libarchive/archive_ppmd8.c +1287 -0
  99. data/ext/libarchive-3.6.2/libarchive/archive_ppmd8_private.h +148 -0
  100. data/ext/libarchive-3.6.2/libarchive/archive_ppmd_private.h +151 -0
  101. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_private.h +74 -18
  102. data/ext/libarchive-3.6.2/libarchive/archive_random.c +272 -0
  103. data/ext/libarchive-3.6.2/libarchive/archive_random_private.h +36 -0
  104. data/ext/libarchive-3.6.2/libarchive/archive_rb.c +709 -0
  105. data/ext/libarchive-3.6.2/libarchive/archive_rb.h +113 -0
  106. data/ext/libarchive-3.6.2/libarchive/archive_read.c +1756 -0
  107. data/ext/libarchive-3.6.2/libarchive/archive_read_add_passphrase.c +190 -0
  108. data/ext/libarchive-3.6.2/libarchive/archive_read_append_filter.c +204 -0
  109. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_data_into_fd.c +64 -18
  110. data/ext/libarchive-3.6.2/libarchive/archive_read_disk_entry_from_file.c +1086 -0
  111. data/ext/libarchive-3.6.2/libarchive/archive_read_disk_posix.c +2732 -0
  112. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_disk_private.h +40 -4
  113. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_disk_set_standard_lookup.c +21 -11
  114. data/ext/libarchive-3.6.2/libarchive/archive_read_disk_windows.c +2479 -0
  115. data/ext/libarchive-3.6.2/libarchive/archive_read_extract.c +60 -0
  116. data/ext/{libarchive-2.8.4/libarchive/archive_read_extract.c → libarchive-3.6.2/libarchive/archive_read_extract2.c} +34 -61
  117. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_open_fd.c +70 -49
  118. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_open_file.c +38 -23
  119. data/ext/libarchive-3.6.2/libarchive/archive_read_open_filename.c +586 -0
  120. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_open_memory.c +58 -28
  121. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_private.h +127 -59
  122. data/ext/libarchive-3.6.2/libarchive/archive_read_set_format.c +117 -0
  123. data/ext/libarchive-3.6.2/libarchive/archive_read_set_options.c +133 -0
  124. data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_all.c → libarchive-3.6.2/libarchive/archive_read_support_filter_all.c} +35 -10
  125. data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_by_code.c +83 -0
  126. data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_bzip2.c → libarchive-3.6.2/libarchive/archive_read_support_filter_bzip2.c} +38 -26
  127. data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_compress.c → libarchive-3.6.2/libarchive/archive_read_support_filter_compress.c} +52 -44
  128. data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_grzip.c +112 -0
  129. data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_gzip.c → libarchive-3.6.2/libarchive/archive_read_support_filter_gzip.c} +108 -37
  130. data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_lrzip.c +122 -0
  131. data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_lz4.c +742 -0
  132. data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_lzop.c +499 -0
  133. data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_none.c → libarchive-3.6.2/libarchive/archive_read_support_filter_none.c} +15 -3
  134. data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_program.c → libarchive-3.6.2/libarchive/archive_read_support_filter_program.c} +114 -77
  135. data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_rpm.c → libarchive-3.6.2/libarchive/archive_read_support_filter_rpm.c} +31 -31
  136. data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_uu.c → libarchive-3.6.2/libarchive/archive_read_support_filter_uu.c} +141 -85
  137. data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_xz.c → libarchive-3.6.2/libarchive/archive_read_support_filter_xz.c} +369 -284
  138. data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_zstd.c +297 -0
  139. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_7zip.c +3900 -0
  140. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_all.c +89 -0
  141. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_ar.c +126 -72
  142. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_by_code.c +92 -0
  143. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_cab.c +3228 -0
  144. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_cpio.c +1104 -0
  145. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_empty.c +14 -11
  146. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_iso9660.c +990 -541
  147. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_lha.c +2916 -0
  148. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_mtree.c +2150 -0
  149. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_rar.c +3797 -0
  150. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_rar5.c +4251 -0
  151. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_raw.c +38 -31
  152. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_tar.c +1157 -629
  153. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_warc.c +848 -0
  154. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_xar.c +439 -258
  155. data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_zip.c +4270 -0
  156. data/ext/libarchive-3.6.2/libarchive/archive_string.c +4240 -0
  157. data/ext/libarchive-3.6.2/libarchive/archive_string.h +243 -0
  158. data/ext/libarchive-3.6.2/libarchive/archive_string_composition.h +2292 -0
  159. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_string_sprintf.c +44 -16
  160. data/ext/libarchive-3.6.2/libarchive/archive_util.c +655 -0
  161. data/ext/libarchive-3.6.2/libarchive/archive_version_details.c +151 -0
  162. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_virtual.c +85 -16
  163. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_windows.c +214 -541
  164. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_windows.h +74 -106
  165. data/ext/libarchive-3.6.2/libarchive/archive_write.c +828 -0
  166. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter.c +72 -0
  167. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_b64encode.c +304 -0
  168. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_by_name.c +77 -0
  169. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_bzip2.c +401 -0
  170. data/ext/{libarchive-2.8.4/libarchive/archive_write_set_compression_compress.c → libarchive-3.6.2/libarchive/archive_write_add_filter_compress.c} +86 -131
  171. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_grzip.c +135 -0
  172. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_gzip.c +442 -0
  173. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_lrzip.c +197 -0
  174. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_lz4.c +700 -0
  175. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_lzop.c +478 -0
  176. data/ext/{libarchive-2.8.4/libarchive/archive_read_support_format_all.c → libarchive-3.6.2/libarchive/archive_write_add_filter_none.c} +11 -11
  177. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_program.c +391 -0
  178. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_uuencode.c +295 -0
  179. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_xz.c +545 -0
  180. data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_zstd.c +418 -0
  181. data/ext/libarchive-3.6.2/libarchive/archive_write_disk_posix.c +4711 -0
  182. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_disk_private.h +9 -2
  183. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_disk_set_standard_lookup.c +30 -29
  184. data/ext/libarchive-3.6.2/libarchive/archive_write_disk_windows.c +2842 -0
  185. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_fd.c +15 -10
  186. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_file.c +15 -9
  187. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_filename.c +128 -20
  188. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_memory.c +7 -18
  189. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_private.h +72 -29
  190. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format.c +56 -3
  191. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_7zip.c +2322 -0
  192. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_ar.c +54 -34
  193. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_by_name.c +20 -2
  194. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio.c +11 -0
  195. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio_binary.c +610 -0
  196. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio_newc.c +457 -0
  197. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio_odc.c +500 -0
  198. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_filter_by_ext.c +142 -0
  199. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_gnutar.c +755 -0
  200. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_iso9660.c +8165 -0
  201. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_mtree.c +2217 -0
  202. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_pax.c +1049 -387
  203. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_private.h +42 -0
  204. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_raw.c +125 -0
  205. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_shar.c +62 -47
  206. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_ustar.c +279 -108
  207. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_v7tar.c +638 -0
  208. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_warc.c +453 -0
  209. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_xar.c +3259 -0
  210. data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_zip.c +1704 -0
  211. data/ext/libarchive-3.6.2/libarchive/archive_write_set_options.c +130 -0
  212. data/ext/libarchive-3.6.2/libarchive/archive_write_set_passphrase.c +95 -0
  213. data/ext/libarchive-3.6.2/libarchive/archive_xxhash.h +48 -0
  214. data/ext/libarchive-3.6.2/libarchive/config_freebsd.h +271 -0
  215. data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/filter_fork.h +10 -5
  216. data/ext/{libarchive-2.8.4/libarchive/filter_fork.c → libarchive-3.6.2/libarchive/filter_fork_posix.c} +98 -19
  217. data/ext/libarchive-3.6.2/libarchive/filter_fork_windows.c +236 -0
  218. data/ext/libarchive-3.6.2/libarchive/xxhash.c +525 -0
  219. data/ext/libarchive-static-makefile +144 -80
  220. data/ext/libarchive-static-wrapper-makefile +1 -1
  221. data/ext/zlib-1.2.13/Makefile.in +404 -0
  222. data/ext/{zlib-1.2.5 → zlib-1.2.13}/adler32.c +51 -34
  223. data/ext/{zlib-1.2.5 → zlib-1.2.13}/compress.c +27 -21
  224. data/ext/zlib-1.2.13/configure +922 -0
  225. data/ext/zlib-1.2.13/crc32.c +1125 -0
  226. data/ext/zlib-1.2.13/crc32.h +9446 -0
  227. data/ext/{zlib-1.2.5 → zlib-1.2.13}/deflate.c +842 -459
  228. data/ext/{zlib-1.2.5 → zlib-1.2.13}/deflate.h +37 -33
  229. data/ext/{zlib-1.2.5 → zlib-1.2.13}/gzclose.c +0 -0
  230. data/ext/{zlib-1.2.5 → zlib-1.2.13}/gzguts.h +103 -16
  231. data/ext/{zlib-1.2.5 → zlib-1.2.13}/gzlib.c +155 -53
  232. data/ext/zlib-1.2.13/gzread.c +650 -0
  233. data/ext/zlib-1.2.13/gzwrite.c +677 -0
  234. data/ext/{zlib-1.2.5 → zlib-1.2.13}/infback.c +24 -12
  235. data/ext/{zlib-1.2.5 → zlib-1.2.13}/inffast.c +49 -66
  236. data/ext/{zlib-1.2.5 → zlib-1.2.13}/inffast.h +0 -0
  237. data/ext/{zlib-1.2.5 → zlib-1.2.13}/inffixed.h +3 -3
  238. data/ext/{zlib-1.2.5 → zlib-1.2.13}/inflate.c +209 -94
  239. data/ext/{zlib-1.2.5 → zlib-1.2.13}/inflate.h +9 -5
  240. data/ext/{zlib-1.2.5 → zlib-1.2.13}/inftrees.c +24 -50
  241. data/ext/{zlib-1.2.5 → zlib-1.2.13}/inftrees.h +1 -1
  242. data/ext/{zlib-1.2.5 → zlib-1.2.13}/trees.c +135 -198
  243. data/ext/{zlib-1.2.5 → zlib-1.2.13}/trees.h +0 -0
  244. data/ext/zlib-1.2.13/uncompr.c +93 -0
  245. data/ext/{zlib-1.2.5 → zlib-1.2.13}/zconf.h +182 -63
  246. data/ext/{zlib-1.2.5 → zlib-1.2.13}/zlib.h +617 -295
  247. data/ext/{zlib-1.2.5 → zlib-1.2.13}/zutil.c +50 -41
  248. data/ext/{zlib-1.2.5 → zlib-1.2.13}/zutil.h +83 -82
  249. metadata +241 -133
  250. data/ext/libarchive-0.1.1/libarchive.c +0 -1762
  251. data/ext/libarchive-2.8.4/Makefile.in +0 -7076
  252. data/ext/libarchive-2.8.4/build/autoconf/compile +0 -143
  253. data/ext/libarchive-2.8.4/build/autoconf/config.guess +0 -1502
  254. data/ext/libarchive-2.8.4/build/autoconf/config.sub +0 -1708
  255. data/ext/libarchive-2.8.4/build/autoconf/depcomp +0 -630
  256. data/ext/libarchive-2.8.4/build/autoconf/install-sh +0 -291
  257. data/ext/libarchive-2.8.4/build/autoconf/ltmain.sh +0 -8406
  258. data/ext/libarchive-2.8.4/build/autoconf/missing +0 -376
  259. data/ext/libarchive-2.8.4/config.h.in +0 -772
  260. data/ext/libarchive-2.8.4/configure +0 -17916
  261. data/ext/libarchive-2.8.4/libarchive/archive.h +0 -741
  262. data/ext/libarchive-2.8.4/libarchive/archive_entry.c +0 -2202
  263. data/ext/libarchive-2.8.4/libarchive/archive_hash.h +0 -281
  264. data/ext/libarchive-2.8.4/libarchive/archive_read.c +0 -1249
  265. data/ext/libarchive-2.8.4/libarchive/archive_read_disk.c +0 -198
  266. data/ext/libarchive-2.8.4/libarchive/archive_read_disk_entry_from_file.c +0 -570
  267. data/ext/libarchive-2.8.4/libarchive/archive_read_open_filename.c +0 -272
  268. data/ext/libarchive-2.8.4/libarchive/archive_read_support_format_cpio.c +0 -777
  269. data/ext/libarchive-2.8.4/libarchive/archive_read_support_format_mtree.c +0 -1304
  270. data/ext/libarchive-2.8.4/libarchive/archive_read_support_format_zip.c +0 -903
  271. data/ext/libarchive-2.8.4/libarchive/archive_string.c +0 -453
  272. data/ext/libarchive-2.8.4/libarchive/archive_string.h +0 -148
  273. data/ext/libarchive-2.8.4/libarchive/archive_util.c +0 -391
  274. data/ext/libarchive-2.8.4/libarchive/archive_write.c +0 -466
  275. data/ext/libarchive-2.8.4/libarchive/archive_write_disk.c +0 -2628
  276. data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_bzip2.c +0 -408
  277. data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_gzip.c +0 -477
  278. data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_none.c +0 -257
  279. data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_program.c +0 -347
  280. data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_xz.c +0 -438
  281. data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_cpio.c +0 -344
  282. data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_cpio_newc.c +0 -295
  283. data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_mtree.c +0 -1050
  284. data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_zip.c +0 -667
  285. data/ext/libarchive-2.8.4/libarchive/config_freebsd.h +0 -154
  286. data/ext/libarchive-2.8.4/libarchive/filter_fork_windows.c +0 -113
  287. data/ext/zlib-1.2.5/Makefile.in +0 -257
  288. data/ext/zlib-1.2.5/configure +0 -596
  289. data/ext/zlib-1.2.5/crc32.c +0 -442
  290. data/ext/zlib-1.2.5/crc32.h +0 -441
  291. data/ext/zlib-1.2.5/example.c +0 -565
  292. data/ext/zlib-1.2.5/gzread.c +0 -653
  293. data/ext/zlib-1.2.5/gzwrite.c +0 -531
  294. data/ext/zlib-1.2.5/minigzip.c +0 -440
  295. data/ext/zlib-1.2.5/uncompr.c +0 -59
@@ -0,0 +1,4240 @@
1
+ /*-
2
+ * Copyright (c) 2003-2011 Tim Kientzle
3
+ * Copyright (c) 2011-2012 Michihiro NAKAJIMA
4
+ * All rights reserved.
5
+ *
6
+ * Redistribution and use in source and binary forms, with or without
7
+ * modification, are permitted provided that the following conditions
8
+ * are met:
9
+ * 1. Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * 2. Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
16
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18
+ * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
19
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
+ */
26
+
27
+ #include "archive_platform.h"
28
+ __FBSDID("$FreeBSD: head/lib/libarchive/archive_string.c 201095 2009-12-28 02:33:22Z kientzle $");
29
+
30
+ /*
31
+ * Basic resizable string support, to simplify manipulating arbitrary-sized
32
+ * strings while minimizing heap activity.
33
+ *
34
+ * In particular, the buffer used by a string object is only grown, it
35
+ * never shrinks, so you can clear and reuse the same string object
36
+ * without incurring additional memory allocations.
37
+ */
38
+
39
+ #ifdef HAVE_ERRNO_H
40
+ #include <errno.h>
41
+ #endif
42
+ #ifdef HAVE_ICONV_H
43
+ #include <iconv.h>
44
+ #endif
45
+ #ifdef HAVE_LANGINFO_H
46
+ #include <langinfo.h>
47
+ #endif
48
+ #ifdef HAVE_LOCALCHARSET_H
49
+ #include <localcharset.h>
50
+ #endif
51
+ #ifdef HAVE_STDLIB_H
52
+ #include <stdlib.h>
53
+ #endif
54
+ #ifdef HAVE_STRING_H
55
+ #include <string.h>
56
+ #endif
57
+ #ifdef HAVE_WCHAR_H
58
+ #include <wchar.h>
59
+ #endif
60
+ #if defined(_WIN32) && !defined(__CYGWIN__)
61
+ #include <windows.h>
62
+ #include <locale.h>
63
+ #endif
64
+
65
+ #include "archive_endian.h"
66
+ #include "archive_private.h"
67
+ #include "archive_string.h"
68
+ #include "archive_string_composition.h"
69
+
70
+ #if !defined(HAVE_WMEMCPY) && !defined(wmemcpy)
71
+ #define wmemcpy(a,b,i) (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t))
72
+ #endif
73
+
74
+ #if !defined(HAVE_WMEMMOVE) && !defined(wmemmove)
75
+ #define wmemmove(a,b,i) (wchar_t *)memmove((a), (b), (i) * sizeof(wchar_t))
76
+ #endif
77
+
78
+ #undef max
79
+ #define max(a, b) ((a)>(b)?(a):(b))
80
+
81
+ struct archive_string_conv {
82
+ struct archive_string_conv *next;
83
+ char *from_charset;
84
+ char *to_charset;
85
+ unsigned from_cp;
86
+ unsigned to_cp;
87
+ /* Set 1 if from_charset and to_charset are the same. */
88
+ int same;
89
+ int flag;
90
+ #define SCONV_TO_CHARSET 1 /* MBS is being converted to specified
91
+ * charset. */
92
+ #define SCONV_FROM_CHARSET (1<<1) /* MBS is being converted from
93
+ * specified charset. */
94
+ #define SCONV_BEST_EFFORT (1<<2) /* Copy at least ASCII code. */
95
+ #define SCONV_WIN_CP (1<<3) /* Use Windows API for converting
96
+ * MBS. */
97
+ #define SCONV_UTF8_LIBARCHIVE_2 (1<<4) /* Incorrect UTF-8 made by libarchive
98
+ * 2.x in the wrong assumption. */
99
+ #define SCONV_NORMALIZATION_C (1<<6) /* Need normalization to be Form C.
100
+ * Before UTF-8 characters are actually
101
+ * processed. */
102
+ #define SCONV_NORMALIZATION_D (1<<7) /* Need normalization to be Form D.
103
+ * Before UTF-8 characters are actually
104
+ * processed.
105
+ * Currently this only for MAC OS X. */
106
+ #define SCONV_TO_UTF8 (1<<8) /* "to charset" side is UTF-8. */
107
+ #define SCONV_FROM_UTF8 (1<<9) /* "from charset" side is UTF-8. */
108
+ #define SCONV_TO_UTF16BE (1<<10) /* "to charset" side is UTF-16BE. */
109
+ #define SCONV_FROM_UTF16BE (1<<11) /* "from charset" side is UTF-16BE. */
110
+ #define SCONV_TO_UTF16LE (1<<12) /* "to charset" side is UTF-16LE. */
111
+ #define SCONV_FROM_UTF16LE (1<<13) /* "from charset" side is UTF-16LE. */
112
+ #define SCONV_TO_UTF16 (SCONV_TO_UTF16BE | SCONV_TO_UTF16LE)
113
+ #define SCONV_FROM_UTF16 (SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE)
114
+
115
+ #if HAVE_ICONV
116
+ iconv_t cd;
117
+ iconv_t cd_w;/* Use at archive_mstring on
118
+ * Windows. */
119
+ #endif
120
+ /* A temporary buffer for normalization. */
121
+ struct archive_string utftmp;
122
+ int (*converter[2])(struct archive_string *, const void *, size_t,
123
+ struct archive_string_conv *);
124
+ int nconverter;
125
+ };
126
+
127
+ #define CP_C_LOCALE 0 /* "C" locale only for this file. */
128
+ #define CP_UTF16LE 1200
129
+ #define CP_UTF16BE 1201
130
+
131
+ #define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF)
132
+ #define IS_LOW_SURROGATE_LA(uc) ((uc) >= 0xDC00 && (uc) <= 0xDFFF)
133
+ #define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF)
134
+ #define UNICODE_MAX 0x10FFFF
135
+ #define UNICODE_R_CHAR 0xFFFD /* Replacement character. */
136
+ /* Set U+FFFD(Replacement character) in UTF-8. */
137
+ static const char utf8_replacement_char[] = {0xef, 0xbf, 0xbd};
138
+
139
+ static struct archive_string_conv *find_sconv_object(struct archive *,
140
+ const char *, const char *);
141
+ static void add_sconv_object(struct archive *, struct archive_string_conv *);
142
+ static struct archive_string_conv *create_sconv_object(const char *,
143
+ const char *, unsigned, int);
144
+ static void free_sconv_object(struct archive_string_conv *);
145
+ static struct archive_string_conv *get_sconv_object(struct archive *,
146
+ const char *, const char *, int);
147
+ static unsigned make_codepage_from_charset(const char *);
148
+ static unsigned get_current_codepage(void);
149
+ static unsigned get_current_oemcp(void);
150
+ static size_t mbsnbytes(const void *, size_t);
151
+ static size_t utf16nbytes(const void *, size_t);
152
+ #if defined(_WIN32) && !defined(__CYGWIN__)
153
+ static int archive_wstring_append_from_mbs_in_codepage(
154
+ struct archive_wstring *, const char *, size_t,
155
+ struct archive_string_conv *);
156
+ static int archive_string_append_from_wcs_in_codepage(struct archive_string *,
157
+ const wchar_t *, size_t, struct archive_string_conv *);
158
+ static int is_big_endian(void);
159
+ static int strncat_in_codepage(struct archive_string *, const void *,
160
+ size_t, struct archive_string_conv *);
161
+ static int win_strncat_from_utf16be(struct archive_string *, const void *,
162
+ size_t, struct archive_string_conv *);
163
+ static int win_strncat_from_utf16le(struct archive_string *, const void *,
164
+ size_t, struct archive_string_conv *);
165
+ static int win_strncat_to_utf16be(struct archive_string *, const void *,
166
+ size_t, struct archive_string_conv *);
167
+ static int win_strncat_to_utf16le(struct archive_string *, const void *,
168
+ size_t, struct archive_string_conv *);
169
+ #endif
170
+ static int best_effort_strncat_from_utf16be(struct archive_string *,
171
+ const void *, size_t, struct archive_string_conv *);
172
+ static int best_effort_strncat_from_utf16le(struct archive_string *,
173
+ const void *, size_t, struct archive_string_conv *);
174
+ static int best_effort_strncat_to_utf16be(struct archive_string *,
175
+ const void *, size_t, struct archive_string_conv *);
176
+ static int best_effort_strncat_to_utf16le(struct archive_string *,
177
+ const void *, size_t, struct archive_string_conv *);
178
+ #if defined(HAVE_ICONV)
179
+ static int iconv_strncat_in_locale(struct archive_string *, const void *,
180
+ size_t, struct archive_string_conv *);
181
+ #endif
182
+ static int best_effort_strncat_in_locale(struct archive_string *,
183
+ const void *, size_t, struct archive_string_conv *);
184
+ static int _utf8_to_unicode(uint32_t *, const char *, size_t);
185
+ static int utf8_to_unicode(uint32_t *, const char *, size_t);
186
+ static inline uint32_t combine_surrogate_pair(uint32_t, uint32_t);
187
+ static int cesu8_to_unicode(uint32_t *, const char *, size_t);
188
+ static size_t unicode_to_utf8(char *, size_t, uint32_t);
189
+ static int utf16_to_unicode(uint32_t *, const char *, size_t, int);
190
+ static size_t unicode_to_utf16be(char *, size_t, uint32_t);
191
+ static size_t unicode_to_utf16le(char *, size_t, uint32_t);
192
+ static int strncat_from_utf8_libarchive2(struct archive_string *,
193
+ const void *, size_t, struct archive_string_conv *);
194
+ static int strncat_from_utf8_to_utf8(struct archive_string *, const void *,
195
+ size_t, struct archive_string_conv *);
196
+ static int archive_string_normalize_C(struct archive_string *, const void *,
197
+ size_t, struct archive_string_conv *);
198
+ static int archive_string_normalize_D(struct archive_string *, const void *,
199
+ size_t, struct archive_string_conv *);
200
+ static int archive_string_append_unicode(struct archive_string *,
201
+ const void *, size_t, struct archive_string_conv *);
202
+
203
+ static struct archive_string *
204
+ archive_string_append(struct archive_string *as, const char *p, size_t s)
205
+ {
206
+ if (archive_string_ensure(as, as->length + s + 1) == NULL)
207
+ return (NULL);
208
+ if (s)
209
+ memmove(as->s + as->length, p, s);
210
+ as->length += s;
211
+ as->s[as->length] = 0;
212
+ return (as);
213
+ }
214
+
215
+ static struct archive_wstring *
216
+ archive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s)
217
+ {
218
+ if (archive_wstring_ensure(as, as->length + s + 1) == NULL)
219
+ return (NULL);
220
+ if (s)
221
+ wmemmove(as->s + as->length, p, s);
222
+ as->length += s;
223
+ as->s[as->length] = 0;
224
+ return (as);
225
+ }
226
+
/*
 * Externally visible wrapper around archive_string_append(): append `s'
 * bytes from `p' to `as'.  Returns `as', or NULL on allocation failure.
 */
struct archive_string *
archive_array_append(struct archive_string *as, const char *p, size_t s)
{
	struct archive_string *result;

	result = archive_string_append(as, p, s);
	return (result);
}
232
+
233
+ void
234
+ archive_string_concat(struct archive_string *dest, struct archive_string *src)
235
+ {
236
+ if (archive_string_append(dest, src->s, src->length) == NULL)
237
+ __archive_errx(1, "Out of memory");
238
+ }
239
+
240
+ void
241
+ archive_wstring_concat(struct archive_wstring *dest,
242
+ struct archive_wstring *src)
243
+ {
244
+ if (archive_wstring_append(dest, src->s, src->length) == NULL)
245
+ __archive_errx(1, "Out of memory");
246
+ }
247
+
248
+ void
249
+ archive_string_free(struct archive_string *as)
250
+ {
251
+ as->length = 0;
252
+ as->buffer_length = 0;
253
+ free(as->s);
254
+ as->s = NULL;
255
+ }
256
+
257
+ void
258
+ archive_wstring_free(struct archive_wstring *as)
259
+ {
260
+ as->length = 0;
261
+ as->buffer_length = 0;
262
+ free(as->s);
263
+ as->s = NULL;
264
+ }
265
+
/*
 * Make sure `as' can hold at least `s' wide characters.  The request is
 * converted to bytes and handed to archive_string_ensure().
 *
 * Returns `as' on success.  On failure the string is wiped, errno is set
 * to ENOMEM and NULL is returned, exactly as archive_string_ensure()
 * does.
 */
struct archive_wstring *
archive_wstring_ensure(struct archive_wstring *as, size_t s)
{
	/*
	 * s * sizeof(wchar_t) must not wrap around: a wrapped value would
	 * make a far too small buffer look big enough to the callers.
	 */
	if (s > (size_t)-1 / sizeof(wchar_t)) {
		archive_wstring_free(as);
		errno = ENOMEM;
		return (NULL);
	}
	return (struct archive_wstring *)
	    archive_string_ensure((struct archive_string *)as,
		s * sizeof(wchar_t));
}
273
+
274
+ /* Returns NULL on any allocation failure. */
275
+ struct archive_string *
276
+ archive_string_ensure(struct archive_string *as, size_t s)
277
+ {
278
+ char *p;
279
+ size_t new_length;
280
+
281
+ /* If buffer is already big enough, don't reallocate. */
282
+ if (as->s && (s <= as->buffer_length))
283
+ return (as);
284
+
285
+ /*
286
+ * Growing the buffer at least exponentially ensures that
287
+ * append operations are always linear in the number of
288
+ * characters appended. Using a smaller growth rate for
289
+ * larger buffers reduces memory waste somewhat at the cost of
290
+ * a larger constant factor.
291
+ */
292
+ if (as->buffer_length < 32)
293
+ /* Start with a minimum 32-character buffer. */
294
+ new_length = 32;
295
+ else if (as->buffer_length < 8192)
296
+ /* Buffers under 8k are doubled for speed. */
297
+ new_length = as->buffer_length + as->buffer_length;
298
+ else {
299
+ /* Buffers 8k and over grow by at least 25% each time. */
300
+ new_length = as->buffer_length + as->buffer_length / 4;
301
+ /* Be safe: If size wraps, fail. */
302
+ if (new_length < as->buffer_length) {
303
+ /* On failure, wipe the string and return NULL. */
304
+ archive_string_free(as);
305
+ errno = ENOMEM;/* Make sure errno has ENOMEM. */
306
+ return (NULL);
307
+ }
308
+ }
309
+ /*
310
+ * The computation above is a lower limit to how much we'll
311
+ * grow the buffer. In any case, we have to grow it enough to
312
+ * hold the request.
313
+ */
314
+ if (new_length < s)
315
+ new_length = s;
316
+ /* Now we can reallocate the buffer. */
317
+ p = (char *)realloc(as->s, new_length);
318
+ if (p == NULL) {
319
+ /* On failure, wipe the string and return NULL. */
320
+ archive_string_free(as);
321
+ errno = ENOMEM;/* Make sure errno has ENOMEM. */
322
+ return (NULL);
323
+ }
324
+
325
+ as->s = p;
326
+ as->buffer_length = new_length;
327
+ return (as);
328
+ }
329
+
/*
 * Append at most `n' bytes of the string `_p' to `as', stopping early at
 * the first NUL byte.  An allocation failure is reported through
 * __archive_errx().
 *
 * TODO: See if there's a way to avoid scanning the source string twice,
 * then measure whether it actually helps (arguments are almost always
 * pretty short, so such an optimization might not pay off).
 */
struct archive_string *
archive_strncat(struct archive_string *as, const void *_p, size_t n)
{
	const char *src = (const char *)_p;
	size_t len;

	/* Like strlen(src), but never looks at or beyond src[n]. */
	for (len = 0; len < n && src[len] != '\0'; len++)
		;

	as = archive_string_append(as, src, len);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
356
+
/*
 * Append at most `n' wide characters of `p' to `as', stopping early at
 * the first L'\0'.  An allocation failure is reported through
 * __archive_errx().
 */
struct archive_wstring *
archive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n)
{
	size_t len;

	/* Like wcslen(p), but never looks at or beyond p[n]. */
	for (len = 0; len < n && p[len] != L'\0'; len++)
		;

	as = archive_wstring_append(as, p, len);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
374
+
/*
 * Append the NUL-terminated string `p' to `as'.
 *
 * This is archive_strncat() with a limit of 0x1000000 (16MB) bytes, on
 * the expectation that no caller passes a longer source string; bytes
 * beyond that limit are not appended.
 * TODO: Review all uses of strcat in the source and try to replace them
 * with strncat().
 */
struct archive_string *
archive_strcat(struct archive_string *as, const void *p)
{
	const size_t limit = 0x1000000;

	return archive_strncat(as, p, limit);
}
386
+
/*
 * Append the L'\0'-terminated wide string `p' to `as'.  This is
 * archive_wstrncat() with a limit of 0x1000000 wide characters.
 */
struct archive_wstring *
archive_wstrcat(struct archive_wstring *as, const wchar_t *p)
{
	const size_t limit = 0x1000000;

	return archive_wstrncat(as, p, limit);
}
393
+
/*
 * Append the single byte `c' to `as'.  An allocation failure is reported
 * through __archive_errx().
 */
struct archive_string *
archive_strappend_char(struct archive_string *as, char c)
{
	as = archive_string_append(as, &c, 1);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
401
+
/*
 * Append the single wide character `c' to `as'.  An allocation failure
 * is reported through __archive_errx().
 */
struct archive_wstring *
archive_wstrappend_wchar(struct archive_wstring *as, wchar_t c)
{
	as = archive_wstring_append(as, &c, 1);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
409
+
/*
 * Pick the character set name to hand to iconv.
 *
 * A non-empty `charset' is returned unchanged.  Otherwise the name of the
 * current locale's character set is looked up:
 *  - On FreeBSD the empty name "" already selects the encoding of the
 *    current locale, so no lookup would be necessary there.
 *  - iconv on Mac OS 10.6 doesn't seem to handle "" correctly; on that
 *    system nl_langinfo() has to be called explicitly to get the right
 *    name.  Not sure about other platforms.
 *
 * NOTE: GNU libiconv does not recognize the character-set name that
 * nl_langinfo(CODESET) returns on some platforms, so locale_charset()
 * is preferred over nl_langinfo(CODESET) for GNU libiconv.
 */
static const char *
default_iconv_charset(const char *charset) {
	if (charset == NULL || charset[0] == '\0') {
#if HAVE_LOCALE_CHARSET && !defined(__APPLE__)
		/* locale_charset() is broken on Mac OS */
		return locale_charset();
#elif HAVE_NL_LANGINFO
		return nl_langinfo(CODESET);
#else
		return "";
#endif
	}
	return charset;
}
436
+
437
+ #if defined(_WIN32) && !defined(__CYGWIN__)
438
+
/*
 * Convert MBS to WCS (Windows build).
 * This is archive_wstring_append_from_mbs_in_codepage() called without a
 * conversion object.
 * Note: returns -1 if conversion fails.
 */
int
archive_wstring_append_from_mbs(struct archive_wstring *dest,
    const char *p, size_t len)
{
	struct archive_string_conv *no_sconv = NULL;

	return archive_wstring_append_from_mbs_in_codepage(dest, p, len,
	    no_sconv);
}
449
+
450
+ static int
451
+ archive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest,
452
+ const char *s, size_t length, struct archive_string_conv *sc)
453
+ {
454
+ int count, ret = 0;
455
+ UINT from_cp;
456
+
457
+ if (sc != NULL)
458
+ from_cp = sc->from_cp;
459
+ else
460
+ from_cp = get_current_codepage();
461
+
462
+ if (from_cp == CP_C_LOCALE) {
463
+ /*
464
+ * "C" locale special processing.
465
+ */
466
+ wchar_t *ws;
467
+ const unsigned char *mp;
468
+
469
+ if (NULL == archive_wstring_ensure(dest,
470
+ dest->length + length + 1))
471
+ return (-1);
472
+
473
+ ws = dest->s + dest->length;
474
+ mp = (const unsigned char *)s;
475
+ count = 0;
476
+ while (count < (int)length && *mp) {
477
+ *ws++ = (wchar_t)*mp++;
478
+ count++;
479
+ }
480
+ } else if (sc != NULL &&
481
+ (sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) {
482
+ /*
483
+ * Normalize UTF-8 and UTF-16BE and convert it directly
484
+ * to UTF-16 as wchar_t.
485
+ */
486
+ struct archive_string u16;
487
+ int saved_flag = sc->flag;/* save current flag. */
488
+
489
+ if (is_big_endian())
490
+ sc->flag |= SCONV_TO_UTF16BE;
491
+ else
492
+ sc->flag |= SCONV_TO_UTF16LE;
493
+
494
+ if (sc->flag & SCONV_FROM_UTF16) {
495
+ /*
496
+ * UTF-16BE/LE NFD ===> UTF-16 NFC
497
+ * UTF-16BE/LE NFC ===> UTF-16 NFD
498
+ */
499
+ count = (int)utf16nbytes(s, length);
500
+ } else {
501
+ /*
502
+ * UTF-8 NFD ===> UTF-16 NFC
503
+ * UTF-8 NFC ===> UTF-16 NFD
504
+ */
505
+ count = (int)mbsnbytes(s, length);
506
+ }
507
+ u16.s = (char *)dest->s;
508
+ u16.length = dest->length << 1;;
509
+ u16.buffer_length = dest->buffer_length;
510
+ if (sc->flag & SCONV_NORMALIZATION_C)
511
+ ret = archive_string_normalize_C(&u16, s, count, sc);
512
+ else
513
+ ret = archive_string_normalize_D(&u16, s, count, sc);
514
+ dest->s = (wchar_t *)u16.s;
515
+ dest->length = u16.length >> 1;
516
+ dest->buffer_length = u16.buffer_length;
517
+ sc->flag = saved_flag;/* restore the saved flag. */
518
+ return (ret);
519
+ } else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) {
520
+ count = (int)utf16nbytes(s, length);
521
+ count >>= 1; /* to be WCS length */
522
+ /* Allocate memory for WCS. */
523
+ if (NULL == archive_wstring_ensure(dest,
524
+ dest->length + count + 1))
525
+ return (-1);
526
+ wmemcpy(dest->s + dest->length, (const wchar_t *)s, count);
527
+ if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) {
528
+ uint16_t *u16 = (uint16_t *)(dest->s + dest->length);
529
+ int b;
530
+ for (b = 0; b < count; b++) {
531
+ uint16_t val = archive_le16dec(u16+b);
532
+ archive_be16enc(u16+b, val);
533
+ }
534
+ } else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) {
535
+ uint16_t *u16 = (uint16_t *)(dest->s + dest->length);
536
+ int b;
537
+ for (b = 0; b < count; b++) {
538
+ uint16_t val = archive_be16dec(u16+b);
539
+ archive_le16enc(u16+b, val);
540
+ }
541
+ }
542
+ } else {
543
+ DWORD mbflag;
544
+ size_t buffsize;
545
+
546
+ if (sc == NULL)
547
+ mbflag = 0;
548
+ else if (sc->flag & SCONV_FROM_CHARSET) {
549
+ /* Do not trust the length which comes from
550
+ * an archive file. */
551
+ length = mbsnbytes(s, length);
552
+ mbflag = 0;
553
+ } else
554
+ mbflag = MB_PRECOMPOSED;
555
+
556
+ buffsize = dest->length + length + 1;
557
+ do {
558
+ /* Allocate memory for WCS. */
559
+ if (NULL == archive_wstring_ensure(dest, buffsize))
560
+ return (-1);
561
+ /* Convert MBS to WCS. */
562
+ count = MultiByteToWideChar(from_cp,
563
+ mbflag, s, (int)length, dest->s + dest->length,
564
+ (int)(dest->buffer_length >> 1) -1);
565
+ if (count == 0 &&
566
+ GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
567
+ /* Expand the WCS buffer. */
568
+ buffsize = dest->buffer_length << 1;
569
+ continue;
570
+ }
571
+ if (count == 0 && length != 0)
572
+ ret = -1;
573
+ break;
574
+ } while (1);
575
+ }
576
+ dest->length += count;
577
+ dest->s[dest->length] = L'\0';
578
+ return (ret);
579
+ }
580
+
581
+ #else
582
+
583
+ /*
584
+ * Convert MBS to WCS.
585
+ * Note: returns -1 if conversion fails.
586
+ */
587
+ int
588
+ archive_wstring_append_from_mbs(struct archive_wstring *dest,
589
+ const char *p, size_t len)
590
+ {
591
+ size_t r;
592
+ int ret_val = 0;
593
+ /*
594
+ * No single byte will be more than one wide character,
595
+ * so this length estimate will always be big enough.
596
+ */
597
+ // size_t wcs_length = len;
598
+ size_t mbs_length = len;
599
+ const char *mbs = p;
600
+ wchar_t *wcs;
601
+ #if HAVE_MBRTOWC
602
+ mbstate_t shift_state;
603
+
604
+ memset(&shift_state, 0, sizeof(shift_state));
605
+ #endif
606
+ /*
607
+ * As we decided to have wcs_length == mbs_length == len
608
+ * we can use len here instead of wcs_length
609
+ */
610
+ if (NULL == archive_wstring_ensure(dest, dest->length + len + 1))
611
+ return (-1);
612
+ wcs = dest->s + dest->length;
613
+ /*
614
+ * We cannot use mbsrtowcs/mbstowcs here because those may convert
615
+ * extra MBS when strlen(p) > len and one wide character consists of
616
+ * multi bytes.
617
+ */
618
+ while (*mbs && mbs_length > 0) {
619
+ /*
620
+ * The buffer we allocated is always big enough.
621
+ * Keep this code path in a comment if we decide to choose
622
+ * smaller wcs_length in the future
623
+ */
624
+ /*
625
+ if (wcs_length == 0) {
626
+ dest->length = wcs - dest->s;
627
+ dest->s[dest->length] = L'\0';
628
+ wcs_length = mbs_length;
629
+ if (NULL == archive_wstring_ensure(dest,
630
+ dest->length + wcs_length + 1))
631
+ return (-1);
632
+ wcs = dest->s + dest->length;
633
+ }
634
+ */
635
+ #if HAVE_MBRTOWC
636
+ r = mbrtowc(wcs, mbs, mbs_length, &shift_state);
637
+ #else
638
+ r = mbtowc(wcs, mbs, mbs_length);
639
+ #endif
640
+ if (r == (size_t)-1 || r == (size_t)-2) {
641
+ ret_val = -1;
642
+ break;
643
+ }
644
+ if (r == 0 || r > mbs_length)
645
+ break;
646
+ wcs++;
647
+ // wcs_length--;
648
+ mbs += r;
649
+ mbs_length -= r;
650
+ }
651
+ dest->length = wcs - dest->s;
652
+ dest->s[dest->length] = L'\0';
653
+ return (ret_val);
654
+ }
655
+
656
+ #endif
657
+
658
+ #if defined(_WIN32) && !defined(__CYGWIN__)
659
+
/*
 * WCS ==> MBS (Windows build).
 * Note: returns -1 if conversion fails.
 *
 * This is archive_string_append_from_wcs_in_codepage() called without a
 * conversion object; Win32 builds convert with WideCharToMultiByte from
 * the Windows API.
 * (Maybe Cygwin should too?  WideCharToMultiByte will know a lot more
 * about local character encodings than the wcrtomb() wrapper is going
 * to know.)
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	struct archive_string_conv *no_sconv = NULL;

	return archive_string_append_from_wcs_in_codepage(as, w, len,
	    no_sconv);
}
675
+
676
+ static int
677
+ archive_string_append_from_wcs_in_codepage(struct archive_string *as,
678
+ const wchar_t *ws, size_t len, struct archive_string_conv *sc)
679
+ {
680
+ BOOL defchar_used, *dp;
681
+ int count, ret = 0;
682
+ UINT to_cp;
683
+ int wslen = (int)len;
684
+
685
+ if (sc != NULL)
686
+ to_cp = sc->to_cp;
687
+ else
688
+ to_cp = get_current_codepage();
689
+
690
+ if (to_cp == CP_C_LOCALE) {
691
+ /*
692
+ * "C" locale special processing.
693
+ */
694
+ const wchar_t *wp = ws;
695
+ char *p;
696
+
697
+ if (NULL == archive_string_ensure(as,
698
+ as->length + wslen +1))
699
+ return (-1);
700
+ p = as->s + as->length;
701
+ count = 0;
702
+ defchar_used = 0;
703
+ while (count < wslen && *wp) {
704
+ if (*wp > 255) {
705
+ *p++ = '?';
706
+ wp++;
707
+ defchar_used = 1;
708
+ } else
709
+ *p++ = (char)*wp++;
710
+ count++;
711
+ }
712
+ } else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) {
713
+ uint16_t *u16;
714
+
715
+ if (NULL ==
716
+ archive_string_ensure(as, as->length + len * 2 + 2))
717
+ return (-1);
718
+ u16 = (uint16_t *)(as->s + as->length);
719
+ count = 0;
720
+ defchar_used = 0;
721
+ if (sc->flag & SCONV_TO_UTF16BE) {
722
+ while (count < (int)len && *ws) {
723
+ archive_be16enc(u16+count, *ws);
724
+ ws++;
725
+ count++;
726
+ }
727
+ } else {
728
+ while (count < (int)len && *ws) {
729
+ archive_le16enc(u16+count, *ws);
730
+ ws++;
731
+ count++;
732
+ }
733
+ }
734
+ count <<= 1; /* to be byte size */
735
+ } else {
736
+ /* Make sure the MBS buffer has plenty to set. */
737
+ if (NULL ==
738
+ archive_string_ensure(as, as->length + len * 2 + 1))
739
+ return (-1);
740
+ do {
741
+ defchar_used = 0;
742
+ if (to_cp == CP_UTF8 || sc == NULL)
743
+ dp = NULL;
744
+ else
745
+ dp = &defchar_used;
746
+ count = WideCharToMultiByte(to_cp, 0, ws, wslen,
747
+ as->s + as->length,
748
+ (int)as->buffer_length - (int)as->length - 1, NULL, dp);
749
+ if (count == 0 &&
750
+ GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
751
+ /* Expand the MBS buffer and retry. */
752
+ if (NULL == archive_string_ensure(as,
753
+ as->buffer_length + len))
754
+ return (-1);
755
+ continue;
756
+ }
757
+ if (count == 0)
758
+ ret = -1;
759
+ break;
760
+ } while (1);
761
+ }
762
+ as->length += count;
763
+ as->s[as->length] = '\0';
764
+ return (defchar_used?-1:ret);
765
+ }
766
+
767
+ #elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB)
768
+
769
+ /*
770
+ * Translates a wide character string into current locale character set
771
+ * and appends to the archive_string. Note: returns -1 if conversion
772
+ * fails.
773
+ */
774
+ int
775
+ archive_string_append_from_wcs(struct archive_string *as,
776
+ const wchar_t *w, size_t len)
777
+ {
778
+ /* We cannot use the standard wcstombs() here because it
779
+ * cannot tell us how big the output buffer should be. So
780
+ * I've built a loop around wcrtomb() or wctomb() that
781
+ * converts a character at a time and resizes the string as
782
+ * needed. We prefer wcrtomb() when it's available because
783
+ * it's thread-safe. */
784
+ int n, ret_val = 0;
785
+ char *p;
786
+ char *end;
787
+ #if HAVE_WCRTOMB
788
+ mbstate_t shift_state;
789
+
790
+ memset(&shift_state, 0, sizeof(shift_state));
791
+ #else
792
+ /* Clear the shift state before starting. */
793
+ wctomb(NULL, L'\0');
794
+ #endif
795
+ /*
796
+ * Allocate buffer for MBS.
797
+ * We need this allocation here since it is possible that
798
+ * as->s is still NULL.
799
+ */
800
+ if (archive_string_ensure(as, as->length + len + 1) == NULL)
801
+ return (-1);
802
+
803
+ p = as->s + as->length;
804
+ end = as->s + as->buffer_length - MB_CUR_MAX -1;
805
+ while (*w != L'\0' && len > 0) {
806
+ if (p >= end) {
807
+ as->length = p - as->s;
808
+ as->s[as->length] = '\0';
809
+ /* Re-allocate buffer for MBS. */
810
+ if (archive_string_ensure(as,
811
+ as->length + max(len * 2,
812
+ (size_t)MB_CUR_MAX) + 1) == NULL)
813
+ return (-1);
814
+ p = as->s + as->length;
815
+ end = as->s + as->buffer_length - MB_CUR_MAX -1;
816
+ }
817
+ #if HAVE_WCRTOMB
818
+ n = wcrtomb(p, *w++, &shift_state);
819
+ #else
820
+ n = wctomb(p, *w++);
821
+ #endif
822
+ if (n == -1) {
823
+ if (errno == EILSEQ) {
824
+ /* Skip an illegal wide char. */
825
+ *p++ = '?';
826
+ ret_val = -1;
827
+ } else {
828
+ ret_val = -1;
829
+ break;
830
+ }
831
+ } else
832
+ p += n;
833
+ len--;
834
+ }
835
+ as->length = p - as->s;
836
+ as->s[as->length] = '\0';
837
+ return (ret_val);
838
+ }
839
+
840
+ #else /* HAVE_WCTOMB || HAVE_WCRTOMB */
841
+
/*
 * TODO: Test if __STDC_ISO_10646__ is defined.
 * Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion
 * one character at a time.  This variant is built when a non-Windows
 * platform has neither of them: no conversion is attempted, errno is set
 * to ENOSYS and -1 is returned.
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	/* None of the arguments is used. */
	(void)len;
	(void)w;
	(void)as;

	errno = ENOSYS;
	return (-1);
}
858
+
859
+ #endif /* HAVE_WCTOMB || HAVE_WCRTOMB */
860
+
861
+ /*
862
+ * Find a string conversion object by a pair of 'from' charset name
863
+ * and 'to' charset name from an archive object.
864
+ * Return NULL if not found.
865
+ */
866
+ static struct archive_string_conv *
867
+ find_sconv_object(struct archive *a, const char *fc, const char *tc)
868
+ {
869
+ struct archive_string_conv *sc;
870
+
871
+ if (a == NULL)
872
+ return (NULL);
873
+
874
+ for (sc = a->sconv; sc != NULL; sc = sc->next) {
875
+ if (strcmp(sc->from_charset, fc) == 0 &&
876
+ strcmp(sc->to_charset, tc) == 0)
877
+ break;
878
+ }
879
+ return (sc);
880
+ }
881
+
882
+ /*
883
+ * Register a string object to an archive object.
884
+ */
885
+ static void
886
+ add_sconv_object(struct archive *a, struct archive_string_conv *sc)
887
+ {
888
+ struct archive_string_conv **psc;
889
+
890
+ /* Add a new sconv to sconv list. */
891
+ psc = &(a->sconv);
892
+ while (*psc != NULL)
893
+ psc = &((*psc)->next);
894
+ *psc = sc;
895
+ }
896
+
897
+ static void
898
+ add_converter(struct archive_string_conv *sc, int (*converter)
899
+ (struct archive_string *, const void *, size_t,
900
+ struct archive_string_conv *))
901
+ {
902
+ if (sc == NULL || sc->nconverter >= 2)
903
+ __archive_errx(1, "Programming error");
904
+ sc->converter[sc->nconverter++] = converter;
905
+ }
906
+
907
+ static void
908
+ setup_converter(struct archive_string_conv *sc)
909
+ {
910
+
911
+ /* Reset. */
912
+ sc->nconverter = 0;
913
+
914
+ /*
915
+ * Perform special sequence for the incorrect UTF-8 filenames
916
+ * made by libarchive2.x.
917
+ */
918
+ if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) {
919
+ add_converter(sc, strncat_from_utf8_libarchive2);
920
+ return;
921
+ }
922
+
923
+ /*
924
+ * Convert a string to UTF-16BE/LE.
925
+ */
926
+ if (sc->flag & SCONV_TO_UTF16) {
927
+ /*
928
+ * If the current locale is UTF-8, we can translate
929
+ * a UTF-8 string into a UTF-16BE string.
930
+ */
931
+ if (sc->flag & SCONV_FROM_UTF8) {
932
+ add_converter(sc, archive_string_append_unicode);
933
+ return;
934
+ }
935
+
936
+ #if defined(_WIN32) && !defined(__CYGWIN__)
937
+ if (sc->flag & SCONV_WIN_CP) {
938
+ if (sc->flag & SCONV_TO_UTF16BE)
939
+ add_converter(sc, win_strncat_to_utf16be);
940
+ else
941
+ add_converter(sc, win_strncat_to_utf16le);
942
+ return;
943
+ }
944
+ #endif
945
+
946
+ #if defined(HAVE_ICONV)
947
+ if (sc->cd != (iconv_t)-1) {
948
+ add_converter(sc, iconv_strncat_in_locale);
949
+ return;
950
+ }
951
+ #endif
952
+
953
+ if (sc->flag & SCONV_BEST_EFFORT) {
954
+ if (sc->flag & SCONV_TO_UTF16BE)
955
+ add_converter(sc,
956
+ best_effort_strncat_to_utf16be);
957
+ else
958
+ add_converter(sc,
959
+ best_effort_strncat_to_utf16le);
960
+ } else
961
+ /* Make sure we have no converter. */
962
+ sc->nconverter = 0;
963
+ return;
964
+ }
965
+
966
+ /*
967
+ * Convert a string from UTF-16BE/LE.
968
+ */
969
+ if (sc->flag & SCONV_FROM_UTF16) {
970
+ /*
971
+ * At least we should normalize a UTF-16BE string.
972
+ */
973
+ if (sc->flag & SCONV_NORMALIZATION_D)
974
+ add_converter(sc,archive_string_normalize_D);
975
+ else if (sc->flag & SCONV_NORMALIZATION_C)
976
+ add_converter(sc, archive_string_normalize_C);
977
+
978
+ if (sc->flag & SCONV_TO_UTF8) {
979
+ /*
980
+ * If the current locale is UTF-8, we can translate
981
+ * a UTF-16BE/LE string into a UTF-8 string directly.
982
+ */
983
+ if (!(sc->flag &
984
+ (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))
985
+ add_converter(sc,
986
+ archive_string_append_unicode);
987
+ return;
988
+ }
989
+
990
+ #if defined(_WIN32) && !defined(__CYGWIN__)
991
+ if (sc->flag & SCONV_WIN_CP) {
992
+ if (sc->flag & SCONV_FROM_UTF16BE)
993
+ add_converter(sc, win_strncat_from_utf16be);
994
+ else
995
+ add_converter(sc, win_strncat_from_utf16le);
996
+ return;
997
+ }
998
+ #endif
999
+
1000
+ #if defined(HAVE_ICONV)
1001
+ if (sc->cd != (iconv_t)-1) {
1002
+ add_converter(sc, iconv_strncat_in_locale);
1003
+ return;
1004
+ }
1005
+ #endif
1006
+
1007
+ if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))
1008
+ == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))
1009
+ add_converter(sc, best_effort_strncat_from_utf16be);
1010
+ else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))
1011
+ == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))
1012
+ add_converter(sc, best_effort_strncat_from_utf16le);
1013
+ else
1014
+ /* Make sure we have no converter. */
1015
+ sc->nconverter = 0;
1016
+ return;
1017
+ }
1018
+
1019
+ if (sc->flag & SCONV_FROM_UTF8) {
1020
+ /*
1021
+ * At least we should normalize a UTF-8 string.
1022
+ */
1023
+ if (sc->flag & SCONV_NORMALIZATION_D)
1024
+ add_converter(sc,archive_string_normalize_D);
1025
+ else if (sc->flag & SCONV_NORMALIZATION_C)
1026
+ add_converter(sc, archive_string_normalize_C);
1027
+
1028
+ /*
1029
+ * Copy UTF-8 string with a check of CESU-8.
1030
+ * Apparently, iconv does not check surrogate pairs in UTF-8
1031
+ * when both from-charset and to-charset are UTF-8, and then
1032
+ * we use our UTF-8 copy code.
1033
+ */
1034
+ if (sc->flag & SCONV_TO_UTF8) {
1035
+ /*
1036
+ * If the current locale is UTF-8, we can translate
1037
+ * a UTF-16BE string into a UTF-8 string directly.
1038
+ */
1039
+ if (!(sc->flag &
1040
+ (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))
1041
+ add_converter(sc, strncat_from_utf8_to_utf8);
1042
+ return;
1043
+ }
1044
+ }
1045
+
1046
+ #if defined(_WIN32) && !defined(__CYGWIN__)
1047
+ /*
1048
+ * On Windows we can use Windows API for a string conversion.
1049
+ */
1050
+ if (sc->flag & SCONV_WIN_CP) {
1051
+ add_converter(sc, strncat_in_codepage);
1052
+ return;
1053
+ }
1054
+ #endif
1055
+
1056
+ #if HAVE_ICONV
1057
+ if (sc->cd != (iconv_t)-1) {
1058
+ add_converter(sc, iconv_strncat_in_locale);
1059
+ /*
1060
+ * iconv generally does not support UTF-8-MAC and so
1061
+ * we have to the output of iconv from NFC to NFD if
1062
+ * need.
1063
+ */
1064
+ if ((sc->flag & SCONV_FROM_CHARSET) &&
1065
+ (sc->flag & SCONV_TO_UTF8)) {
1066
+ if (sc->flag & SCONV_NORMALIZATION_D)
1067
+ add_converter(sc, archive_string_normalize_D);
1068
+ }
1069
+ return;
1070
+ }
1071
+ #endif
1072
+
1073
+ /*
1074
+ * Try conversion in the best effort or no conversion.
1075
+ */
1076
+ if ((sc->flag & SCONV_BEST_EFFORT) || sc->same)
1077
+ add_converter(sc, best_effort_strncat_in_locale);
1078
+ else
1079
+ /* Make sure we have no converter. */
1080
+ sc->nconverter = 0;
1081
+ }
1082
+
1083
/*
 * Map a charset name onto its canonical spelling.
 *
 * Only the names referenced by create_sconv_object() are recognized:
 * UTF-8, UTF-16BE, UTF-16LE and CP932 (matched case-insensitively for
 * ASCII letters, and UTF names also without the hyphen).  Any other name,
 * a NULL or empty name, and any name longer than 15 characters is returned
 * unchanged.
 */
static const char *
canonical_charset_name(const char *charset)
{
	char upper[16];
	size_t i, len;

	if (charset == NULL || charset[0] == '\0')
		return (charset);
	len = strlen(charset);
	if (len > 15)
		return (charset);

	/* Make an upper-case copy (ASCII letters only). */
	for (i = 0; i < len; i++) {
		char ch = charset[i];

		if (ch >= 'a' && ch <= 'z')
			ch -= 'a' - 'A';
		upper[i] = ch;
	}
	upper[len] = '\0';

	if (strcmp(upper, "UTF-8") == 0 || strcmp(upper, "UTF8") == 0)
		return ("UTF-8");
	if (strcmp(upper, "UTF-16BE") == 0 || strcmp(upper, "UTF16BE") == 0)
		return ("UTF-16BE");
	if (strcmp(upper, "UTF-16LE") == 0 || strcmp(upper, "UTF16LE") == 0)
		return ("UTF-16LE");
	if (strcmp(upper, "CP932") == 0)
		return ("CP932");

	/* Not one of ours; hand the caller's string back. */
	return (charset);
}
1122
+
1123
+ /*
1124
+ * Create a string conversion object.
1125
+ */
1126
+ static struct archive_string_conv *
1127
+ create_sconv_object(const char *fc, const char *tc,
1128
+ unsigned current_codepage, int flag)
1129
+ {
1130
+ struct archive_string_conv *sc;
1131
+
1132
+ sc = calloc(1, sizeof(*sc));
1133
+ if (sc == NULL)
1134
+ return (NULL);
1135
+ sc->next = NULL;
1136
+ sc->from_charset = strdup(fc);
1137
+ if (sc->from_charset == NULL) {
1138
+ free(sc);
1139
+ return (NULL);
1140
+ }
1141
+ sc->to_charset = strdup(tc);
1142
+ if (sc->to_charset == NULL) {
1143
+ free(sc->from_charset);
1144
+ free(sc);
1145
+ return (NULL);
1146
+ }
1147
+ archive_string_init(&sc->utftmp);
1148
+
1149
+ if (flag & SCONV_TO_CHARSET) {
1150
+ /*
1151
+ * Convert characters from the current locale charset to
1152
+ * a specified charset.
1153
+ */
1154
+ sc->from_cp = current_codepage;
1155
+ sc->to_cp = make_codepage_from_charset(tc);
1156
+ #if defined(_WIN32) && !defined(__CYGWIN__)
1157
+ if (IsValidCodePage(sc->to_cp))
1158
+ flag |= SCONV_WIN_CP;
1159
+ #endif
1160
+ } else if (flag & SCONV_FROM_CHARSET) {
1161
+ /*
1162
+ * Convert characters from a specified charset to
1163
+ * the current locale charset.
1164
+ */
1165
+ sc->to_cp = current_codepage;
1166
+ sc->from_cp = make_codepage_from_charset(fc);
1167
+ #if defined(_WIN32) && !defined(__CYGWIN__)
1168
+ if (IsValidCodePage(sc->from_cp))
1169
+ flag |= SCONV_WIN_CP;
1170
+ #endif
1171
+ }
1172
+
1173
+ /*
1174
+ * Check if "from charset" and "to charset" are the same.
1175
+ */
1176
+ if (strcmp(fc, tc) == 0 ||
1177
+ (sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp))
1178
+ sc->same = 1;
1179
+ else
1180
+ sc->same = 0;
1181
+
1182
+ /*
1183
+ * Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE.
1184
+ */
1185
+ if (strcmp(tc, "UTF-8") == 0)
1186
+ flag |= SCONV_TO_UTF8;
1187
+ else if (strcmp(tc, "UTF-16BE") == 0)
1188
+ flag |= SCONV_TO_UTF16BE;
1189
+ else if (strcmp(tc, "UTF-16LE") == 0)
1190
+ flag |= SCONV_TO_UTF16LE;
1191
+ if (strcmp(fc, "UTF-8") == 0)
1192
+ flag |= SCONV_FROM_UTF8;
1193
+ else if (strcmp(fc, "UTF-16BE") == 0)
1194
+ flag |= SCONV_FROM_UTF16BE;
1195
+ else if (strcmp(fc, "UTF-16LE") == 0)
1196
+ flag |= SCONV_FROM_UTF16LE;
1197
+ #if defined(_WIN32) && !defined(__CYGWIN__)
1198
+ if (sc->to_cp == CP_UTF8)
1199
+ flag |= SCONV_TO_UTF8;
1200
+ else if (sc->to_cp == CP_UTF16BE)
1201
+ flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP;
1202
+ else if (sc->to_cp == CP_UTF16LE)
1203
+ flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP;
1204
+ if (sc->from_cp == CP_UTF8)
1205
+ flag |= SCONV_FROM_UTF8;
1206
+ else if (sc->from_cp == CP_UTF16BE)
1207
+ flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP;
1208
+ else if (sc->from_cp == CP_UTF16LE)
1209
+ flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP;
1210
+ #endif
1211
+
1212
+ /*
1213
+ * Set a flag for Unicode NFD. Usually iconv cannot correctly
1214
+ * handle it. So we have to translate NFD characters to NFC ones
1215
+ * ourselves before iconv handles. Another reason is to prevent
1216
+ * that the same sight of two filenames, one is NFC and other
1217
+ * is NFD, would be in its directory.
1218
+ * On Mac OS X, although its filesystem layer automatically
1219
+ * convert filenames to NFD, it would be useful for filename
1220
+ * comparing to find out the same filenames that we normalize
1221
+ * that to be NFD ourselves.
1222
+ */
1223
+ if ((flag & SCONV_FROM_CHARSET) &&
1224
+ (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) {
1225
+ #if defined(__APPLE__)
1226
+ if (flag & SCONV_TO_UTF8)
1227
+ flag |= SCONV_NORMALIZATION_D;
1228
+ else
1229
+ #endif
1230
+ flag |= SCONV_NORMALIZATION_C;
1231
+ }
1232
+ #if defined(__APPLE__)
1233
+ /*
1234
+ * In case writing an archive file, make sure that a filename
1235
+ * going to be passed to iconv is a Unicode NFC string since
1236
+ * a filename in HFS Plus filesystem is a Unicode NFD one and
1237
+ * iconv cannot handle it with "UTF-8" charset. It is simpler
1238
+ * than a use of "UTF-8-MAC" charset.
1239
+ */
1240
+ if ((flag & SCONV_TO_CHARSET) &&
1241
+ (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
1242
+ !(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
1243
+ flag |= SCONV_NORMALIZATION_C;
1244
+ /*
1245
+ * In case reading an archive file. make sure that a filename
1246
+ * will be passed to users is a Unicode NFD string in order to
1247
+ * correctly compare the filename with other one which comes
1248
+ * from HFS Plus filesystem.
1249
+ */
1250
+ if ((flag & SCONV_FROM_CHARSET) &&
1251
+ !(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
1252
+ (flag & SCONV_TO_UTF8))
1253
+ flag |= SCONV_NORMALIZATION_D;
1254
+ #endif
1255
+
1256
+ #if defined(HAVE_ICONV)
1257
+ sc->cd_w = (iconv_t)-1;
1258
+ /*
1259
+ * Create an iconv object.
1260
+ */
1261
+ if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) &&
1262
+ (flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) ||
1263
+ (flag & SCONV_WIN_CP)) {
1264
+ /* This case we won't use iconv. */
1265
+ sc->cd = (iconv_t)-1;
1266
+ } else {
1267
+ sc->cd = iconv_open(tc, fc);
1268
+ if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) {
1269
+ /*
1270
+ * Unfortunately, all of iconv implements do support
1271
+ * "CP932" character-set, so we should use "SJIS"
1272
+ * instead if iconv_open failed.
1273
+ */
1274
+ if (strcmp(tc, "CP932") == 0)
1275
+ sc->cd = iconv_open("SJIS", fc);
1276
+ else if (strcmp(fc, "CP932") == 0)
1277
+ sc->cd = iconv_open(tc, "SJIS");
1278
+ }
1279
+ #if defined(_WIN32) && !defined(__CYGWIN__)
1280
+ /*
1281
+ * archive_mstring on Windows directly convert multi-bytes
1282
+ * into archive_wstring in order not to depend on locale
1283
+ * so that you can do a I18N programming. This will be
1284
+ * used only in archive_mstring_copy_mbs_len_l so far.
1285
+ */
1286
+ if (flag & SCONV_FROM_CHARSET) {
1287
+ sc->cd_w = iconv_open("UTF-8", fc);
1288
+ if (sc->cd_w == (iconv_t)-1 &&
1289
+ (sc->flag & SCONV_BEST_EFFORT)) {
1290
+ if (strcmp(fc, "CP932") == 0)
1291
+ sc->cd_w = iconv_open("UTF-8", "SJIS");
1292
+ }
1293
+ }
1294
+ #endif /* _WIN32 && !__CYGWIN__ */
1295
+ }
1296
+ #endif /* HAVE_ICONV */
1297
+
1298
+ sc->flag = flag;
1299
+
1300
+ /*
1301
+ * Set up converters.
1302
+ */
1303
+ setup_converter(sc);
1304
+
1305
+ return (sc);
1306
+ }
1307
+
1308
+ /*
1309
+ * Free a string conversion object.
1310
+ */
1311
+ static void
1312
+ free_sconv_object(struct archive_string_conv *sc)
1313
+ {
1314
+ free(sc->from_charset);
1315
+ free(sc->to_charset);
1316
+ archive_string_free(&sc->utftmp);
1317
+ #if HAVE_ICONV
1318
+ if (sc->cd != (iconv_t)-1)
1319
+ iconv_close(sc->cd);
1320
+ if (sc->cd_w != (iconv_t)-1)
1321
+ iconv_close(sc->cd_w);
1322
+ #endif
1323
+ free(sc);
1324
+ }
1325
+
1326
+ #if defined(_WIN32) && !defined(__CYGWIN__)
1327
/*
 * Parse a string consisting solely of decimal digits.
 *
 * Returns the parsed value, 0 for an empty string, or (unsigned)-1 when
 * the string contains a non-digit character or the value does not fit in
 * an unsigned int (instead of silently wrapping around).
 */
static unsigned
my_atoi(const char *p)
{
	const unsigned failed = (unsigned)-1;
	unsigned cp = 0;

	for (; *p != '\0'; p++) {
		unsigned digit;

		if (*p < '0' || *p > '9')
			return (failed);
		digit = (unsigned)(*p - '0');
		/* Would cp * 10 + digit exceed the unsigned range? */
		if (cp > (failed - digit) / 10)
			return (failed);
		cp = cp * 10 + digit;
	}
	return (cp);
}
1342
+
1343
+ /*
1344
+ * Translate Charset name (as used by iconv) into CodePage (as used by Windows)
1345
+ * Return -1 if failed.
1346
+ *
1347
+ * Note: This translation code may be insufficient.
1348
+ */
1349
+ static struct charset {
1350
+ const char *name;
1351
+ unsigned cp;
1352
+ } charsets[] = {
1353
+ /* MUST BE SORTED! */
1354
+ {"ASCII", 1252},
1355
+ {"ASMO-708", 708},
1356
+ {"BIG5", 950},
1357
+ {"CHINESE", 936},
1358
+ {"CP367", 1252},
1359
+ {"CP819", 1252},
1360
+ {"CP1025", 21025},
1361
+ {"DOS-720", 720},
1362
+ {"DOS-862", 862},
1363
+ {"EUC-CN", 51936},
1364
+ {"EUC-JP", 51932},
1365
+ {"EUC-KR", 949},
1366
+ {"EUCCN", 51936},
1367
+ {"EUCJP", 51932},
1368
+ {"EUCKR", 949},
1369
+ {"GB18030", 54936},
1370
+ {"GB2312", 936},
1371
+ {"HEBREW", 1255},
1372
+ {"HZ-GB-2312", 52936},
1373
+ {"IBM273", 20273},
1374
+ {"IBM277", 20277},
1375
+ {"IBM278", 20278},
1376
+ {"IBM280", 20280},
1377
+ {"IBM284", 20284},
1378
+ {"IBM285", 20285},
1379
+ {"IBM290", 20290},
1380
+ {"IBM297", 20297},
1381
+ {"IBM367", 1252},
1382
+ {"IBM420", 20420},
1383
+ {"IBM423", 20423},
1384
+ {"IBM424", 20424},
1385
+ {"IBM819", 1252},
1386
+ {"IBM871", 20871},
1387
+ {"IBM880", 20880},
1388
+ {"IBM905", 20905},
1389
+ {"IBM924", 20924},
1390
+ {"ISO-8859-1", 28591},
1391
+ {"ISO-8859-13", 28603},
1392
+ {"ISO-8859-15", 28605},
1393
+ {"ISO-8859-2", 28592},
1394
+ {"ISO-8859-3", 28593},
1395
+ {"ISO-8859-4", 28594},
1396
+ {"ISO-8859-5", 28595},
1397
+ {"ISO-8859-6", 28596},
1398
+ {"ISO-8859-7", 28597},
1399
+ {"ISO-8859-8", 28598},
1400
+ {"ISO-8859-9", 28599},
1401
+ {"ISO8859-1", 28591},
1402
+ {"ISO8859-13", 28603},
1403
+ {"ISO8859-15", 28605},
1404
+ {"ISO8859-2", 28592},
1405
+ {"ISO8859-3", 28593},
1406
+ {"ISO8859-4", 28594},
1407
+ {"ISO8859-5", 28595},
1408
+ {"ISO8859-6", 28596},
1409
+ {"ISO8859-7", 28597},
1410
+ {"ISO8859-8", 28598},
1411
+ {"ISO8859-9", 28599},
1412
+ {"JOHAB", 1361},
1413
+ {"KOI8-R", 20866},
1414
+ {"KOI8-U", 21866},
1415
+ {"KS_C_5601-1987", 949},
1416
+ {"LATIN1", 1252},
1417
+ {"LATIN2", 28592},
1418
+ {"MACINTOSH", 10000},
1419
+ {"SHIFT-JIS", 932},
1420
+ {"SHIFT_JIS", 932},
1421
+ {"SJIS", 932},
1422
+ {"US", 1252},
1423
+ {"US-ASCII", 1252},
1424
+ {"UTF-16", 1200},
1425
+ {"UTF-16BE", 1201},
1426
+ {"UTF-16LE", 1200},
1427
+ {"UTF-8", CP_UTF8},
1428
+ {"X-EUROPA", 29001},
1429
+ {"X-MAC-ARABIC", 10004},
1430
+ {"X-MAC-CE", 10029},
1431
+ {"X-MAC-CHINESEIMP", 10008},
1432
+ {"X-MAC-CHINESETRAD", 10002},
1433
+ {"X-MAC-CROATIAN", 10082},
1434
+ {"X-MAC-CYRILLIC", 10007},
1435
+ {"X-MAC-GREEK", 10006},
1436
+ {"X-MAC-HEBREW", 10005},
1437
+ {"X-MAC-ICELANDIC", 10079},
1438
+ {"X-MAC-JAPANESE", 10001},
1439
+ {"X-MAC-KOREAN", 10003},
1440
+ {"X-MAC-ROMANIAN", 10010},
1441
+ {"X-MAC-THAI", 10021},
1442
+ {"X-MAC-TURKISH", 10081},
1443
+ {"X-MAC-UKRAINIAN", 10017},
1444
+ };
1445
+ static unsigned
1446
+ make_codepage_from_charset(const char *charset)
1447
+ {
1448
+ char cs[16];
1449
+ char *p;
1450
+ unsigned cp;
1451
+ int a, b;
1452
+
1453
+ if (charset == NULL || strlen(charset) > 15)
1454
+ return -1;
1455
+
1456
+ /* Copy name to uppercase. */
1457
+ p = cs;
1458
+ while (*charset) {
1459
+ char c = *charset++;
1460
+ if (c >= 'a' && c <= 'z')
1461
+ c -= 'a' - 'A';
1462
+ *p++ = c;
1463
+ }
1464
+ *p++ = '\0';
1465
+ cp = -1;
1466
+
1467
+ /* Look it up in the table first, so that we can easily
1468
+ * override CP367, which we map to 1252 instead of 367. */
1469
+ a = 0;
1470
+ b = sizeof(charsets)/sizeof(charsets[0]);
1471
+ while (b > a) {
1472
+ int c = (b + a) / 2;
1473
+ int r = strcmp(charsets[c].name, cs);
1474
+ if (r < 0)
1475
+ a = c + 1;
1476
+ else if (r > 0)
1477
+ b = c;
1478
+ else
1479
+ return charsets[c].cp;
1480
+ }
1481
+
1482
+ /* If it's not in the table, try to parse it. */
1483
+ switch (*cs) {
1484
+ case 'C':
1485
+ if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') {
1486
+ cp = my_atoi(cs + 2);
1487
+ } else if (strcmp(cs, "CP_ACP") == 0)
1488
+ cp = get_current_codepage();
1489
+ else if (strcmp(cs, "CP_OEMCP") == 0)
1490
+ cp = get_current_oemcp();
1491
+ break;
1492
+ case 'I':
1493
+ if (cs[1] == 'B' && cs[2] == 'M' &&
1494
+ cs[3] >= '0' && cs[3] <= '9') {
1495
+ cp = my_atoi(cs + 3);
1496
+ }
1497
+ break;
1498
+ case 'W':
1499
+ if (strncmp(cs, "WINDOWS-", 8) == 0) {
1500
+ cp = my_atoi(cs + 8);
1501
+ if (cp != 874 && (cp < 1250 || cp > 1258))
1502
+ cp = -1;/* This may invalid code. */
1503
+ }
1504
+ break;
1505
+ }
1506
+ return (cp);
1507
+ }
1508
+
1509
+ /*
1510
+ * Return ANSI Code Page of current locale set by setlocale().
1511
+ */
1512
+ static unsigned
1513
+ get_current_codepage(void)
1514
+ {
1515
+ char *locale, *p;
1516
+ unsigned cp;
1517
+
1518
+ locale = setlocale(LC_CTYPE, NULL);
1519
+ if (locale == NULL)
1520
+ return (GetACP());
1521
+ if (locale[0] == 'C' && locale[1] == '\0')
1522
+ return (CP_C_LOCALE);
1523
+ p = strrchr(locale, '.');
1524
+ if (p == NULL)
1525
+ return (GetACP());
1526
+ if (strcmp(p+1, "utf8") == 0)
1527
+ return CP_UTF8;
1528
+ cp = my_atoi(p+1);
1529
+ if ((int)cp <= 0)
1530
+ return (GetACP());
1531
+ return (cp);
1532
+ }
1533
+
1534
/*
 * Translation table between Locale Name and ACP/OEMCP.
 *
 * Each row gives the ANSI code page (acp) and OEM code page (ocp) for one
 * locale name.  The table is scanned from the top and the first matching
 * row wins, so the order of the rows matters.  It is terminated by a row
 * whose acp is 0 and whose locale is NULL.
 *
 * Rows that were exact duplicates of an earlier row (and therefore could
 * never be reached by a first-match scan) have been dropped.
 */
static struct {
	unsigned acp;
	unsigned ocp;
	const char *locale;
} acp_ocp_map[] = {
	{  950,  950, "Chinese_Taiwan" },
	{  936,  936, "Chinese_People's Republic of China" },
	{ 1250,  852, "Czech_Czech Republic" },
	{ 1252,  850, "Danish_Denmark" },
	{ 1252,  850, "Dutch_Netherlands" },
	{ 1252,  850, "Dutch_Belgium" },
	{ 1252,  437, "English_United States" },
	{ 1252,  850, "English_Australia" },
	{ 1252,  850, "English_Canada" },
	{ 1252,  850, "English_New Zealand" },
	{ 1252,  850, "English_United Kingdom" },
	{ 1252,  850, "Finnish_Finland" },
	{ 1252,  850, "French_France" },
	{ 1252,  850, "French_Belgium" },
	{ 1252,  850, "French_Canada" },
	{ 1252,  850, "French_Switzerland" },
	{ 1252,  850, "German_Germany" },
	{ 1252,  850, "German_Austria" },
	{ 1252,  850, "German_Switzerland" },
	{ 1253,  737, "Greek_Greece" },
	{ 1250,  852, "Hungarian_Hungary" },
	{ 1252,  850, "Icelandic_Iceland" },
	{ 1252,  850, "Italian_Italy" },
	{ 1252,  850, "Italian_Switzerland" },
	{  932,  932, "Japanese_Japan" },
	{  949,  949, "Korean_Korea" },
	{ 1252,  850, "Norwegian (BokmOl)_Norway" },
	{ 1252,  850, "Norwegian-Nynorsk_Norway" },
	{ 1250,  852, "Polish_Poland" },
	{ 1252,  850, "Portuguese_Portugal" },
	{ 1252,  850, "Portuguese_Brazil" },
	{ 1251,  866, "Russian_Russia" },
	{ 1250,  852, "Slovak_Slovakia" },
	{ 1252,  850, "Spanish_Spain" },
	{ 1252,  850, "Spanish_Mexico" },
	{ 1252,  850, "Swedish_Sweden" },
	{ 1254,  857, "Turkish_Turkey" },
	{ 0, 0, NULL}
};
1585
+
1586
+ /*
1587
+ * Return OEM Code Page of current locale set by setlocale().
1588
+ */
1589
+ static unsigned
1590
+ get_current_oemcp(void)
1591
+ {
1592
+ int i;
1593
+ char *locale, *p;
1594
+ size_t len;
1595
+
1596
+ locale = setlocale(LC_CTYPE, NULL);
1597
+ if (locale == NULL)
1598
+ return (GetOEMCP());
1599
+ if (locale[0] == 'C' && locale[1] == '\0')
1600
+ return (CP_C_LOCALE);
1601
+
1602
+ p = strrchr(locale, '.');
1603
+ if (p == NULL)
1604
+ return (GetOEMCP());
1605
+ len = p - locale;
1606
+ for (i = 0; acp_ocp_map[i].acp; i++) {
1607
+ if (strncmp(acp_ocp_map[i].locale, locale, len) == 0)
1608
+ return (acp_ocp_map[i].ocp);
1609
+ }
1610
+ return (GetOEMCP());
1611
+ }
1612
+ #else
1613
+
1614
/*
 * POSIX platforms do not use CodePages, so the functions below only
 * report "unknown".
 */

/* Always returns (unsigned)-1, meaning the code page is unknown. */
static unsigned
get_current_codepage(void)
{
	const unsigned unknown = -1;

	return (unknown);
}
1623
/*
 * Always returns (unsigned)-1, meaning the code page is unknown;
 * the charset name is ignored.
 */
static unsigned
make_codepage_from_charset(const char *charset)
{
	const unsigned unknown = -1;

	(void)charset; /* UNUSED */
	return (unknown);
}
1629
/* Always returns (unsigned)-1, meaning the OEM code page is unknown. */
static unsigned
get_current_oemcp(void)
{
	const unsigned unknown = -1;

	return (unknown);
}
1634
+
1635
+ #endif /* defined(_WIN32) && !defined(__CYGWIN__) */
1636
+
1637
+ /*
1638
+ * Return a string conversion object.
1639
+ */
1640
+ static struct archive_string_conv *
1641
+ get_sconv_object(struct archive *a, const char *fc, const char *tc, int flag)
1642
+ {
1643
+ struct archive_string_conv *sc;
1644
+ unsigned current_codepage;
1645
+
1646
+ /* Check if we have made the sconv object. */
1647
+ sc = find_sconv_object(a, fc, tc);
1648
+ if (sc != NULL)
1649
+ return (sc);
1650
+
1651
+ if (a == NULL)
1652
+ current_codepage = get_current_codepage();
1653
+ else
1654
+ current_codepage = a->current_codepage;
1655
+
1656
+ sc = create_sconv_object(canonical_charset_name(fc),
1657
+ canonical_charset_name(tc), current_codepage, flag);
1658
+ if (sc == NULL) {
1659
+ if (a != NULL)
1660
+ archive_set_error(a, ENOMEM,
1661
+ "Could not allocate memory for "
1662
+ "a string conversion object");
1663
+ return (NULL);
1664
+ }
1665
+
1666
+ /*
1667
+ * If there is no converter for current string conversion object,
1668
+ * we cannot handle this conversion.
1669
+ */
1670
+ if (sc->nconverter == 0) {
1671
+ if (a != NULL) {
1672
+ #if HAVE_ICONV
1673
+ archive_set_error(a, ARCHIVE_ERRNO_MISC,
1674
+ "iconv_open failed : Cannot handle ``%s''",
1675
+ (flag & SCONV_TO_CHARSET)?tc:fc);
1676
+ #else
1677
+ archive_set_error(a, ARCHIVE_ERRNO_MISC,
1678
+ "A character-set conversion not fully supported "
1679
+ "on this platform");
1680
+ #endif
1681
+ }
1682
+ /* Failed; free a sconv object. */
1683
+ free_sconv_object(sc);
1684
+ return (NULL);
1685
+ }
1686
+
1687
+ /*
1688
+ * Success!
1689
+ */
1690
+ if (a != NULL)
1691
+ add_sconv_object(a, sc);
1692
+ return (sc);
1693
+ }
1694
+
1695
+ static const char *
1696
+ get_current_charset(struct archive *a)
1697
+ {
1698
+ const char *cur_charset;
1699
+
1700
+ if (a == NULL)
1701
+ cur_charset = default_iconv_charset("");
1702
+ else {
1703
+ cur_charset = default_iconv_charset(a->current_code);
1704
+ if (a->current_code == NULL) {
1705
+ a->current_code = strdup(cur_charset);
1706
+ a->current_codepage = get_current_codepage();
1707
+ a->current_oemcp = get_current_oemcp();
1708
+ }
1709
+ }
1710
+ return (cur_charset);
1711
+ }
1712
+
1713
+ /*
1714
+ * Make and Return a string conversion object.
1715
+ * Return NULL if the platform does not support the specified conversion
1716
+ * and best_effort is 0.
1717
+ * If best_effort is set, A string conversion object must be returned
1718
+ * unless memory allocation for the object fails, but the conversion
1719
+ * might fail when non-ASCII code is found.
1720
+ */
1721
+ struct archive_string_conv *
1722
+ archive_string_conversion_to_charset(struct archive *a, const char *charset,
1723
+ int best_effort)
1724
+ {
1725
+ int flag = SCONV_TO_CHARSET;
1726
+
1727
+ if (best_effort)
1728
+ flag |= SCONV_BEST_EFFORT;
1729
+ return (get_sconv_object(a, get_current_charset(a), charset, flag));
1730
+ }
1731
+
1732
+ struct archive_string_conv *
1733
+ archive_string_conversion_from_charset(struct archive *a, const char *charset,
1734
+ int best_effort)
1735
+ {
1736
+ int flag = SCONV_FROM_CHARSET;
1737
+
1738
+ if (best_effort)
1739
+ flag |= SCONV_BEST_EFFORT;
1740
+ return (get_sconv_object(a, charset, get_current_charset(a), flag));
1741
+ }
1742
+
1743
+ /*
1744
+ * archive_string_default_conversion_*_archive() are provided for Windows
1745
+ * platform because other archiver application use CP_OEMCP for
1746
+ * MultiByteToWideChar() and WideCharToMultiByte() for the filenames
1747
+ * in tar or zip files. But mbstowcs/wcstombs(CRT) usually use CP_ACP
1748
+ * unless you use setlocale(LC_ALL, ".OCP")(specify CP_OEMCP).
1749
+ * So we should make a string conversion between CP_ACP and CP_OEMCP
1750
+ * for compatibility.
1751
+ */
1752
+ #if defined(_WIN32) && !defined(__CYGWIN__)
1753
+ struct archive_string_conv *
1754
+ archive_string_default_conversion_for_read(struct archive *a)
1755
+ {
1756
+ const char *cur_charset = get_current_charset(a);
1757
+ char oemcp[16];
1758
+
1759
+ /* NOTE: a check of cur_charset is unneeded but we need
1760
+ * that get_current_charset() has been surely called at
1761
+ * this time whatever C compiler optimized. */
1762
+ if (cur_charset != NULL &&
1763
+ (a->current_codepage == CP_C_LOCALE ||
1764
+ a->current_codepage == a->current_oemcp))
1765
+ return (NULL);/* no conversion. */
1766
+
1767
+ _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);
1768
+ /* Make sure a null termination must be set. */
1769
+ oemcp[sizeof(oemcp)-1] = '\0';
1770
+ return (get_sconv_object(a, oemcp, cur_charset,
1771
+ SCONV_FROM_CHARSET));
1772
+ }
1773
+
1774
+ struct archive_string_conv *
1775
+ archive_string_default_conversion_for_write(struct archive *a)
1776
+ {
1777
+ const char *cur_charset = get_current_charset(a);
1778
+ char oemcp[16];
1779
+
1780
+ /* NOTE: a check of cur_charset is unneeded but we need
1781
+ * that get_current_charset() has been surely called at
1782
+ * this time whatever C compiler optimized. */
1783
+ if (cur_charset != NULL &&
1784
+ (a->current_codepage == CP_C_LOCALE ||
1785
+ a->current_codepage == a->current_oemcp))
1786
+ return (NULL);/* no conversion. */
1787
+
1788
+ _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);
1789
+ /* Make sure a null termination must be set. */
1790
+ oemcp[sizeof(oemcp)-1] = '\0';
1791
+ return (get_sconv_object(a, cur_charset, oemcp,
1792
+ SCONV_TO_CHARSET));
1793
+ }
1794
+ #else
1795
/*
 * Non-Windows platforms have no default conversion for reading;
 * this always returns NULL.
 */
struct archive_string_conv *
archive_string_default_conversion_for_read(struct archive *a)
{
	struct archive_string_conv *none = NULL;

	(void)a; /* UNUSED */
	return (none);
}
1801
+
1802
/*
 * Non-Windows platforms have no default conversion for writing;
 * this always returns NULL.
 */
struct archive_string_conv *
archive_string_default_conversion_for_write(struct archive *a)
{
	struct archive_string_conv *none = NULL;

	(void)a; /* UNUSED */
	return (none);
}
1808
+ #endif
1809
+
1810
+ /*
1811
+ * Dispose of all character conversion objects in the archive object.
1812
+ */
1813
+ void
1814
+ archive_string_conversion_free(struct archive *a)
1815
+ {
1816
+ struct archive_string_conv *sc;
1817
+ struct archive_string_conv *sc_next;
1818
+
1819
+ for (sc = a->sconv; sc != NULL; sc = sc_next) {
1820
+ sc_next = sc->next;
1821
+ free_sconv_object(sc);
1822
+ }
1823
+ a->sconv = NULL;
1824
+ free(a->current_code);
1825
+ a->current_code = NULL;
1826
+ }
1827
+
1828
+ /*
1829
+ * Return a conversion charset name.
1830
+ */
1831
+ const char *
1832
+ archive_string_conversion_charset_name(struct archive_string_conv *sc)
1833
+ {
1834
+ if (sc->flag & SCONV_TO_CHARSET)
1835
+ return (sc->to_charset);
1836
+ else
1837
+ return (sc->from_charset);
1838
+ }
1839
+
1840
+ /*
1841
+ * Change the behavior of a string conversion.
1842
+ */
1843
+ void
1844
+ archive_string_conversion_set_opt(struct archive_string_conv *sc, int opt)
1845
+ {
1846
+ switch (opt) {
1847
+ /*
1848
+ * A filename in UTF-8 was made with libarchive 2.x in a wrong
1849
+ * assumption that wchar_t was Unicode.
1850
+ * This option enables simulating the assumption in order to read
1851
+ * that filename correctly.
1852
+ */
1853
+ case SCONV_SET_OPT_UTF8_LIBARCHIVE2X:
1854
+ #if (defined(_WIN32) && !defined(__CYGWIN__)) \
1855
+ || defined(__STDC_ISO_10646__) || defined(__APPLE__)
1856
+ /*
1857
+ * Nothing to do for it since wchar_t on these platforms
1858
+ * is really Unicode.
1859
+ */
1860
+ (void)sc; /* UNUSED */
1861
+ #else
1862
+ if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) {
1863
+ sc->flag |= SCONV_UTF8_LIBARCHIVE_2;
1864
+ /* Set up string converters. */
1865
+ setup_converter(sc);
1866
+ }
1867
+ #endif
1868
+ break;
1869
+ case SCONV_SET_OPT_NORMALIZATION_C:
1870
+ if ((sc->flag & SCONV_NORMALIZATION_C) == 0) {
1871
+ sc->flag |= SCONV_NORMALIZATION_C;
1872
+ sc->flag &= ~SCONV_NORMALIZATION_D;
1873
+ /* Set up string converters. */
1874
+ setup_converter(sc);
1875
+ }
1876
+ break;
1877
+ case SCONV_SET_OPT_NORMALIZATION_D:
1878
+ #if defined(HAVE_ICONV)
1879
+ /*
1880
+ * If iconv will take the string, do not change the
1881
+ * setting of the normalization.
1882
+ */
1883
+ if (!(sc->flag & SCONV_WIN_CP) &&
1884
+ (sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
1885
+ !(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
1886
+ break;
1887
+ #endif
1888
+ if ((sc->flag & SCONV_NORMALIZATION_D) == 0) {
1889
+ sc->flag |= SCONV_NORMALIZATION_D;
1890
+ sc->flag &= ~SCONV_NORMALIZATION_C;
1891
+ /* Set up string converters. */
1892
+ setup_converter(sc);
1893
+ }
1894
+ break;
1895
+ default:
1896
+ break;
1897
+ }
1898
+ }
1899
+
1900
+ /*
1901
+ *
1902
+ * Copy one archive_string to another in locale conversion.
1903
+ *
1904
+ * archive_strncat_l();
1905
+ * archive_strncpy_l();
1906
+ *
1907
+ */
1908
+
1909
/*
 * Return the length in bytes of the multi-byte string at _p, looking at
 * no more than the first n bytes.  Like strlen(), except that it never
 * examines bytes at or beyond offset n.  A NULL pointer counts as an
 * empty string.
 */
static size_t
mbsnbytes(const void *_p, size_t n)
{
	const char *str = (const char *)_p;
	size_t len;

	if (str == NULL)
		return (0);

	for (len = 0; len < n && str[len] != '\0'; len++)
		continue;
	return (len);
}
1928
+
1929
/*
 * Return the length in bytes of the UTF-16 string at _p, looking at no
 * more than the first n bytes.  The string ends at the first 16-bit unit
 * whose two bytes are both zero; an odd trailing byte of n is ignored.
 * A NULL pointer counts as an empty string.
 */
static size_t
utf16nbytes(const void *_p, size_t n)
{
	const char *unit = (const char *)_p;
	const size_t max_units = n >> 1;
	size_t units;

	if (unit == NULL)
		return (0);

	for (units = 0; units < max_units; units++, unit += 2) {
		if (unit[0] == 0 && unit[1] == 0)
			break;
	}
	return (units << 1);
}
1949
+
1950
+ int
1951
+ archive_strncpy_l(struct archive_string *as, const void *_p, size_t n,
1952
+ struct archive_string_conv *sc)
1953
+ {
1954
+ as->length = 0;
1955
+ return (archive_strncat_l(as, _p, n, sc));
1956
+ }
1957
+
1958
+ int
1959
+ archive_strncat_l(struct archive_string *as, const void *_p, size_t n,
1960
+ struct archive_string_conv *sc)
1961
+ {
1962
+ const void *s;
1963
+ size_t length = 0;
1964
+ int i, r = 0, r2;
1965
+
1966
+ if (_p != NULL && n > 0) {
1967
+ if (sc != NULL && (sc->flag & SCONV_FROM_UTF16))
1968
+ length = utf16nbytes(_p, n);
1969
+ else
1970
+ length = mbsnbytes(_p, n);
1971
+ }
1972
+
1973
+ /* We must allocate memory even if there is no data for conversion
1974
+ * or copy. This simulates archive_string_append behavior. */
1975
+ if (length == 0) {
1976
+ int tn = 1;
1977
+ if (sc != NULL && (sc->flag & SCONV_TO_UTF16))
1978
+ tn = 2;
1979
+ if (archive_string_ensure(as, as->length + tn) == NULL)
1980
+ return (-1);
1981
+ as->s[as->length] = 0;
1982
+ if (tn == 2)
1983
+ as->s[as->length+1] = 0;
1984
+ return (0);
1985
+ }
1986
+
1987
+ /*
1988
+ * If sc is NULL, we just make a copy.
1989
+ */
1990
+ if (sc == NULL) {
1991
+ if (archive_string_append(as, _p, length) == NULL)
1992
+ return (-1);/* No memory */
1993
+ return (0);
1994
+ }
1995
+
1996
+ s = _p;
1997
+ i = 0;
1998
+ if (sc->nconverter > 1) {
1999
+ sc->utftmp.length = 0;
2000
+ r2 = sc->converter[0](&(sc->utftmp), s, length, sc);
2001
+ if (r2 != 0 && errno == ENOMEM)
2002
+ return (r2);
2003
+ if (r > r2)
2004
+ r = r2;
2005
+ s = sc->utftmp.s;
2006
+ length = sc->utftmp.length;
2007
+ ++i;
2008
+ }
2009
+ r2 = sc->converter[i](as, s, length, sc);
2010
+ if (r > r2)
2011
+ r = r2;
2012
+ return (r);
2013
+ }
2014
+
2015
+ #if HAVE_ICONV
2016
+
2017
+ /*
2018
+ * Return -1 if conversion fails.
2019
+ */
2020
+ static int
2021
+ iconv_strncat_in_locale(struct archive_string *as, const void *_p,
2022
+ size_t length, struct archive_string_conv *sc)
2023
+ {
2024
+ ICONV_CONST char *itp;
2025
+ size_t remaining;
2026
+ iconv_t cd;
2027
+ char *outp;
2028
+ size_t avail, bs;
2029
+ int return_value = 0; /* success */
2030
+ int to_size, from_size;
2031
+
2032
+ if (sc->flag & SCONV_TO_UTF16)
2033
+ to_size = 2;
2034
+ else
2035
+ to_size = 1;
2036
+ if (sc->flag & SCONV_FROM_UTF16)
2037
+ from_size = 2;
2038
+ else
2039
+ from_size = 1;
2040
+
2041
+ if (archive_string_ensure(as, as->length + length*2+to_size) == NULL)
2042
+ return (-1);
2043
+
2044
+ cd = sc->cd;
2045
+ itp = (char *)(uintptr_t)_p;
2046
+ remaining = length;
2047
+ outp = as->s + as->length;
2048
+ avail = as->buffer_length - as->length - to_size;
2049
+ while (remaining >= (size_t)from_size) {
2050
+ size_t result = iconv(cd, &itp, &remaining, &outp, &avail);
2051
+
2052
+ if (result != (size_t)-1)
2053
+ break; /* Conversion completed. */
2054
+
2055
+ if (errno == EILSEQ || errno == EINVAL) {
2056
+ /*
2057
+ * If an output charset is UTF-8 or UTF-16BE/LE,
2058
+ * unknown character should be U+FFFD
2059
+ * (replacement character).
2060
+ */
2061
+ if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) {
2062
+ size_t rbytes;
2063
+ if (sc->flag & SCONV_TO_UTF8)
2064
+ rbytes = sizeof(utf8_replacement_char);
2065
+ else
2066
+ rbytes = 2;
2067
+
2068
+ if (avail < rbytes) {
2069
+ as->length = outp - as->s;
2070
+ bs = as->buffer_length +
2071
+ (remaining * to_size) + rbytes;
2072
+ if (NULL ==
2073
+ archive_string_ensure(as, bs))
2074
+ return (-1);
2075
+ outp = as->s + as->length;
2076
+ avail = as->buffer_length
2077
+ - as->length - to_size;
2078
+ }
2079
+ if (sc->flag & SCONV_TO_UTF8)
2080
+ memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char));
2081
+ else if (sc->flag & SCONV_TO_UTF16BE)
2082
+ archive_be16enc(outp, UNICODE_R_CHAR);
2083
+ else
2084
+ archive_le16enc(outp, UNICODE_R_CHAR);
2085
+ outp += rbytes;
2086
+ avail -= rbytes;
2087
+ } else {
2088
+ /* Skip the illegal input bytes. */
2089
+ *outp++ = '?';
2090
+ avail--;
2091
+ }
2092
+ itp += from_size;
2093
+ remaining -= from_size;
2094
+ return_value = -1; /* failure */
2095
+ } else {
2096
+ /* E2BIG no output buffer,
2097
+ * Increase an output buffer. */
2098
+ as->length = outp - as->s;
2099
+ bs = as->buffer_length + remaining * 2;
2100
+ if (NULL == archive_string_ensure(as, bs))
2101
+ return (-1);
2102
+ outp = as->s + as->length;
2103
+ avail = as->buffer_length - as->length - to_size;
2104
+ }
2105
+ }
2106
+ as->length = outp - as->s;
2107
+ as->s[as->length] = 0;
2108
+ if (to_size == 2)
2109
+ as->s[as->length+1] = 0;
2110
+ return (return_value);
2111
+ }
2112
+
2113
+ #endif /* HAVE_ICONV */
2114
+
2115
+
2116
+ #if defined(_WIN32) && !defined(__CYGWIN__)
2117
+
2118
+ /*
2119
+ * Translate a string from a some CodePage to an another CodePage by
2120
+ * Windows APIs, and copy the result. Return -1 if conversion fails.
2121
+ */
2122
+ static int
2123
+ strncat_in_codepage(struct archive_string *as,
2124
+ const void *_p, size_t length, struct archive_string_conv *sc)
2125
+ {
2126
+ const char *s = (const char *)_p;
2127
+ struct archive_wstring aws;
2128
+ size_t l;
2129
+ int r, saved_flag;
2130
+
2131
+ archive_string_init(&aws);
2132
+ saved_flag = sc->flag;
2133
+ sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C);
2134
+ r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc);
2135
+ sc->flag = saved_flag;
2136
+ if (r != 0) {
2137
+ archive_wstring_free(&aws);
2138
+ if (errno != ENOMEM)
2139
+ archive_string_append(as, s, length);
2140
+ return (-1);
2141
+ }
2142
+
2143
+ l = as->length;
2144
+ r = archive_string_append_from_wcs_in_codepage(
2145
+ as, aws.s, aws.length, sc);
2146
+ if (r != 0 && errno != ENOMEM && l == as->length)
2147
+ archive_string_append(as, s, length);
2148
+ archive_wstring_free(&aws);
2149
+ return (r);
2150
+ }
2151
+
2152
+ /*
2153
+ * Test whether MBS ==> WCS is okay.
2154
+ */
2155
+ static int
2156
+ invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
2157
+ {
2158
+ const char *p = (const char *)_p;
2159
+ unsigned codepage;
2160
+ DWORD mbflag = MB_ERR_INVALID_CHARS;
2161
+
2162
+ if (sc->flag & SCONV_FROM_CHARSET)
2163
+ codepage = sc->to_cp;
2164
+ else
2165
+ codepage = sc->from_cp;
2166
+
2167
+ if (codepage == CP_C_LOCALE)
2168
+ return (0);
2169
+ if (codepage != CP_UTF8)
2170
+ mbflag |= MB_PRECOMPOSED;
2171
+
2172
+ if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0)
2173
+ return (-1); /* Invalid */
2174
+ return (0); /* Okay */
2175
+ }
2176
+
2177
+ #else
2178
+
2179
/*
 * Test whether MBS ==> WCS is okay.
 *
 * Walks the first `n` bytes of `_p` one multibyte character at a time.
 * Returns -1 as soon as a character is invalid or incomplete, 0 otherwise.
 * Scanning stops early at an embedded NUL character.
 */
static int
invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
{
	const char *cursor = (const char *)_p;
	size_t left = n;
#if HAVE_MBRTOWC
	mbstate_t state;

	memset(&state, 0, sizeof(state));
#else
	/* Clear the shift state before starting. */
	mbtowc(NULL, NULL, 0);
#endif
	(void)sc; /* UNUSED */

	while (left != 0) {
		wchar_t wc;
		size_t used;

#if HAVE_MBRTOWC
		used = mbrtowc(&wc, cursor, left, &state);
#else
		used = mbtowc(&wc, cursor, left);
#endif
		if (used == (size_t)-1 || used == (size_t)-2)
			return (-1);/* Invalid. */
		if (used == 0)
			break;
		cursor += used;
		left -= used;
	}
	return (0); /* All okay. */
}
2214
+
2215
+ #endif /* defined(_WIN32) && !defined(__CYGWIN__) */
2216
+
2217
+ /*
2218
+ * Basically returns -1 because we cannot make a conversion of charset
2219
+ * without iconv but in some cases this would return 0.
2220
+ * Returns 0 if all copied characters are ASCII.
2221
+ * Returns 0 if both from-locale and to-locale are the same and those
2222
+ * can be WCS with no error.
2223
+ */
2224
+ static int
2225
+ best_effort_strncat_in_locale(struct archive_string *as, const void *_p,
2226
+ size_t length, struct archive_string_conv *sc)
2227
+ {
2228
+ size_t remaining;
2229
+ const uint8_t *itp;
2230
+ int return_value = 0; /* success */
2231
+
2232
+ /*
2233
+ * If both from-locale and to-locale is the same, this makes a copy.
2234
+ * And then this checks all copied MBS can be WCS if so returns 0.
2235
+ */
2236
+ if (sc->same) {
2237
+ if (archive_string_append(as, _p, length) == NULL)
2238
+ return (-1);/* No memory */
2239
+ return (invalid_mbs(_p, length, sc));
2240
+ }
2241
+
2242
+ /*
2243
+ * If a character is ASCII, this just copies it. If not, this
2244
+ * assigns '?' character instead but in UTF-8 locale this assigns
2245
+ * byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD,
2246
+ * a Replacement Character in Unicode.
2247
+ */
2248
+
2249
+ remaining = length;
2250
+ itp = (const uint8_t *)_p;
2251
+ while (*itp && remaining > 0) {
2252
+ if (*itp > 127) {
2253
+ // Non-ASCII: Substitute with suitable replacement
2254
+ if (sc->flag & SCONV_TO_UTF8) {
2255
+ if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) {
2256
+ __archive_errx(1, "Out of memory");
2257
+ }
2258
+ } else {
2259
+ archive_strappend_char(as, '?');
2260
+ }
2261
+ return_value = -1;
2262
+ } else {
2263
+ archive_strappend_char(as, *itp);
2264
+ }
2265
+ ++itp;
2266
+ }
2267
+ return (return_value);
2268
+ }
2269
+
2270
+
2271
+ /*
2272
+ * Unicode conversion functions.
2273
+ * - UTF-8 <===> UTF-8 in removing surrogate pairs.
2274
+ * - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs.
2275
+ * - UTF-8 made by libarchive 2.x ===> UTF-8.
2276
+ * - UTF-16BE <===> UTF-8.
2277
+ *
2278
+ */
2279
+
2280
+ /*
2281
+ * Utility to convert a single UTF-8 sequence.
2282
+ *
2283
+ * Usually return used bytes, return used byte in negative value when
2284
+ * a unicode character is replaced with U+FFFD.
2285
+ * See also http://unicode.org/review/pr-121.html Public Review Issue #121
2286
+ * Recommended Practice for Replacement Characters.
2287
+ */
2288
+ static int
2289
+ _utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
2290
+ {
2291
+ static const char utf8_count[256] = {
2292
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */
2293
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */
2294
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */
2295
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */
2296
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */
2297
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */
2298
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */
2299
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */
2300
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */
2301
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */
2302
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */
2303
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */
2304
+ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */
2305
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */
2306
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */
2307
+ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */
2308
+ };
2309
+ int ch, i;
2310
+ int cnt;
2311
+ uint32_t wc;
2312
+
2313
+ /* Sanity check. */
2314
+ if (n == 0)
2315
+ return (0);
2316
+ /*
2317
+ * Decode 1-4 bytes depending on the value of the first byte.
2318
+ */
2319
+ ch = (unsigned char)*s;
2320
+ if (ch == 0)
2321
+ return (0); /* Standard: return 0 for end-of-string. */
2322
+ cnt = utf8_count[ch];
2323
+
2324
+ /* Invalid sequence or there are not plenty bytes. */
2325
+ if ((int)n < cnt) {
2326
+ cnt = (int)n;
2327
+ for (i = 1; i < cnt; i++) {
2328
+ if ((s[i] & 0xc0) != 0x80) {
2329
+ cnt = i;
2330
+ break;
2331
+ }
2332
+ }
2333
+ goto invalid_sequence;
2334
+ }
2335
+
2336
+ /* Make a Unicode code point from a single UTF-8 sequence. */
2337
+ switch (cnt) {
2338
+ case 1: /* 1 byte sequence. */
2339
+ *pwc = ch & 0x7f;
2340
+ return (cnt);
2341
+ case 2: /* 2 bytes sequence. */
2342
+ if ((s[1] & 0xc0) != 0x80) {
2343
+ cnt = 1;
2344
+ goto invalid_sequence;
2345
+ }
2346
+ *pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f);
2347
+ return (cnt);
2348
+ case 3: /* 3 bytes sequence. */
2349
+ if ((s[1] & 0xc0) != 0x80) {
2350
+ cnt = 1;
2351
+ goto invalid_sequence;
2352
+ }
2353
+ if ((s[2] & 0xc0) != 0x80) {
2354
+ cnt = 2;
2355
+ goto invalid_sequence;
2356
+ }
2357
+ wc = ((ch & 0x0f) << 12)
2358
+ | ((s[1] & 0x3f) << 6)
2359
+ | (s[2] & 0x3f);
2360
+ if (wc < 0x800)
2361
+ goto invalid_sequence;/* Overlong sequence. */
2362
+ break;
2363
+ case 4: /* 4 bytes sequence. */
2364
+ if ((s[1] & 0xc0) != 0x80) {
2365
+ cnt = 1;
2366
+ goto invalid_sequence;
2367
+ }
2368
+ if ((s[2] & 0xc0) != 0x80) {
2369
+ cnt = 2;
2370
+ goto invalid_sequence;
2371
+ }
2372
+ if ((s[3] & 0xc0) != 0x80) {
2373
+ cnt = 3;
2374
+ goto invalid_sequence;
2375
+ }
2376
+ wc = ((ch & 0x07) << 18)
2377
+ | ((s[1] & 0x3f) << 12)
2378
+ | ((s[2] & 0x3f) << 6)
2379
+ | (s[3] & 0x3f);
2380
+ if (wc < 0x10000)
2381
+ goto invalid_sequence;/* Overlong sequence. */
2382
+ break;
2383
+ default: /* Others are all invalid sequence. */
2384
+ if (ch == 0xc0 || ch == 0xc1)
2385
+ cnt = 2;
2386
+ else if (ch >= 0xf5 && ch <= 0xf7)
2387
+ cnt = 4;
2388
+ else if (ch >= 0xf8 && ch <= 0xfb)
2389
+ cnt = 5;
2390
+ else if (ch == 0xfc || ch == 0xfd)
2391
+ cnt = 6;
2392
+ else
2393
+ cnt = 1;
2394
+ if ((int)n < cnt)
2395
+ cnt = (int)n;
2396
+ for (i = 1; i < cnt; i++) {
2397
+ if ((s[i] & 0xc0) != 0x80) {
2398
+ cnt = i;
2399
+ break;
2400
+ }
2401
+ }
2402
+ goto invalid_sequence;
2403
+ }
2404
+
2405
+ /* The code point larger than 0x10FFFF is not legal
2406
+ * Unicode values. */
2407
+ if (wc > UNICODE_MAX)
2408
+ goto invalid_sequence;
2409
+ /* Correctly gets a Unicode, returns used bytes. */
2410
+ *pwc = wc;
2411
+ return (cnt);
2412
+ invalid_sequence:
2413
+ *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
2414
+ return (cnt * -1);
2415
+ }
2416
+
2417
/*
 * Decode one UTF-8 sequence, additionally rejecting surrogates.
 *
 * Same contract as _utf8_to_unicode(), except that a well-formed 3-byte
 * sequence encoding a surrogate code point yields -3.  In that case *pwc
 * still holds the surrogate value rather than U+FFFD.
 */
static int
utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int used = _utf8_to_unicode(pwc, s, n);

	/* Any of Surrogate pair is not legal Unicode values. */
	return ((used == 3 && IS_SURROGATE_PAIR_LA(*pwc)) ? -3 : used);
}
2428
+
2429
/*
 * Build a code point from a high surrogate (uc) and a low surrogate
 * (uc2).  The arguments are not validated.
 */
static inline uint32_t
combine_surrogate_pair(uint32_t uc, uint32_t uc2)
{
	const uint32_t high_part = (uc - 0xD800) * 0x400;
	const uint32_t low_part = uc2 - 0xDC00;

	return (high_part + low_part + 0x10000);
}
2438
+
2439
+ /*
2440
+ * Convert a single UTF-8/CESU-8 sequence to a Unicode code point in
2441
+ * removing surrogate pairs.
2442
+ *
2443
+ * CESU-8: The Compatibility Encoding Scheme for UTF-16.
2444
+ *
2445
+ * Usually return used bytes, return used byte in negative value when
2446
+ * a unicode character is replaced with U+FFFD.
2447
+ */
2448
+ static int
2449
+ cesu8_to_unicode(uint32_t *pwc, const char *s, size_t n)
2450
+ {
2451
+ uint32_t wc = 0;
2452
+ int cnt;
2453
+
2454
+ cnt = _utf8_to_unicode(&wc, s, n);
2455
+ if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) {
2456
+ uint32_t wc2 = 0;
2457
+ if (n - 3 < 3) {
2458
+ /* Invalid byte sequence. */
2459
+ goto invalid_sequence;
2460
+ }
2461
+ cnt = _utf8_to_unicode(&wc2, s+3, n-3);
2462
+ if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) {
2463
+ /* Invalid byte sequence. */
2464
+ goto invalid_sequence;
2465
+ }
2466
+ wc = combine_surrogate_pair(wc, wc2);
2467
+ cnt = 6;
2468
+ } else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) {
2469
+ /* Invalid byte sequence. */
2470
+ goto invalid_sequence;
2471
+ }
2472
+ *pwc = wc;
2473
+ return (cnt);
2474
+ invalid_sequence:
2475
+ *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
2476
+ if (cnt > 0)
2477
+ cnt *= -1;
2478
+ return (cnt);
2479
+ }
2480
+
2481
+ /*
2482
+ * Convert a Unicode code point to a single UTF-8 sequence.
2483
+ *
2484
+ * NOTE:This function does not check if the Unicode is legal or not.
2485
+ * Please you definitely check it before calling this.
2486
+ */
2487
+ static size_t
2488
+ unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
2489
+ {
2490
+ char *_p = p;
2491
+
2492
+ /* Invalid Unicode char maps to Replacement character */
2493
+ if (uc > UNICODE_MAX)
2494
+ uc = UNICODE_R_CHAR;
2495
+ /* Translate code point to UTF8 */
2496
+ if (uc <= 0x7f) {
2497
+ if (remaining == 0)
2498
+ return (0);
2499
+ *p++ = (char)uc;
2500
+ } else if (uc <= 0x7ff) {
2501
+ if (remaining < 2)
2502
+ return (0);
2503
+ *p++ = 0xc0 | ((uc >> 6) & 0x1f);
2504
+ *p++ = 0x80 | (uc & 0x3f);
2505
+ } else if (uc <= 0xffff) {
2506
+ if (remaining < 3)
2507
+ return (0);
2508
+ *p++ = 0xe0 | ((uc >> 12) & 0x0f);
2509
+ *p++ = 0x80 | ((uc >> 6) & 0x3f);
2510
+ *p++ = 0x80 | (uc & 0x3f);
2511
+ } else {
2512
+ if (remaining < 4)
2513
+ return (0);
2514
+ *p++ = 0xf0 | ((uc >> 18) & 0x07);
2515
+ *p++ = 0x80 | ((uc >> 12) & 0x3f);
2516
+ *p++ = 0x80 | ((uc >> 6) & 0x3f);
2517
+ *p++ = 0x80 | (uc & 0x3f);
2518
+ }
2519
+ return (p - _p);
2520
+ }
2521
+
2522
/* Decode one UTF-16BE sequence; see utf16_to_unicode() for the contract. */
static int
utf16be_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 1;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}
2527
+
2528
/* Decode one UTF-16LE sequence; see utf16_to_unicode() for the contract. */
static int
utf16le_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 0;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}
2533
+
2534
+ static int
2535
+ utf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be)
2536
+ {
2537
+ const char *utf16 = s;
2538
+ unsigned uc;
2539
+
2540
+ if (n == 0)
2541
+ return (0);
2542
+ if (n == 1) {
2543
+ /* set the Replacement Character instead. */
2544
+ *pwc = UNICODE_R_CHAR;
2545
+ return (-1);
2546
+ }
2547
+
2548
+ if (be)
2549
+ uc = archive_be16dec(utf16);
2550
+ else
2551
+ uc = archive_le16dec(utf16);
2552
+ utf16 += 2;
2553
+
2554
+ /* If this is a surrogate pair, assemble the full code point.*/
2555
+ if (IS_HIGH_SURROGATE_LA(uc)) {
2556
+ unsigned uc2;
2557
+
2558
+ if (n >= 4) {
2559
+ if (be)
2560
+ uc2 = archive_be16dec(utf16);
2561
+ else
2562
+ uc2 = archive_le16dec(utf16);
2563
+ } else
2564
+ uc2 = 0;
2565
+ if (IS_LOW_SURROGATE_LA(uc2)) {
2566
+ uc = combine_surrogate_pair(uc, uc2);
2567
+ utf16 += 2;
2568
+ } else {
2569
+ /* Undescribed code point should be U+FFFD
2570
+ * (replacement character). */
2571
+ *pwc = UNICODE_R_CHAR;
2572
+ return (-2);
2573
+ }
2574
+ }
2575
+
2576
+ /*
2577
+ * Surrogate pair values(0xd800 through 0xdfff) are only
2578
+ * used by UTF-16, so, after above calculation, the code
2579
+ * must not be surrogate values, and Unicode has no codes
2580
+ * larger than 0x10ffff. Thus, those are not legal Unicode
2581
+ * values.
2582
+ */
2583
+ if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) {
2584
+ /* Undescribed code point should be U+FFFD
2585
+ * (replacement character). */
2586
+ *pwc = UNICODE_R_CHAR;
2587
+ return (((int)(utf16 - s)) * -1);
2588
+ }
2589
+ *pwc = uc;
2590
+ return ((int)(utf16 - s));
2591
+ }
2592
+
2593
/*
 * Encode `uc` as big-endian UTF-16 at `p`.
 *
 * Returns the number of bytes written (2, or 4 for a surrogate pair), or
 * 0 when fewer than that many bytes are available in `remaining`.
 */
static size_t
unicode_to_utf16be(char *p, size_t remaining, uint32_t uc)
{
	if (uc <= 0xffff) {
		/* Fits in a single 16-bit unit. */
		if (remaining < 2)
			return (0);
		archive_be16enc(p, uc);
		return (2);
	}

	/* Does not fit in one 16-bit unit; convert it to a surrogate pair. */
	if (remaining < 4)
		return (0);
	uc -= 0x10000;
	archive_be16enc(p, ((uc >> 10) & 0x3ff) + 0xD800);
	archive_be16enc(p+2, (uc & 0x3ff) + 0xDC00);
	return (4);
}
2614
+
2615
/*
 * Encode `uc` as little-endian UTF-16 at `p`.
 *
 * Returns the number of bytes written (2, or 4 for a surrogate pair), or
 * 0 when fewer than that many bytes are available in `remaining`.
 */
static size_t
unicode_to_utf16le(char *p, size_t remaining, uint32_t uc)
{
	if (uc <= 0xffff) {
		/* Fits in a single 16-bit unit. */
		if (remaining < 2)
			return (0);
		archive_le16enc(p, uc);
		return (2);
	}

	/* Does not fit in one 16-bit unit; convert it to a surrogate pair. */
	if (remaining < 4)
		return (0);
	uc -= 0x10000;
	archive_le16enc(p, ((uc >> 10) & 0x3ff) + 0xD800);
	archive_le16enc(p+2, (uc & 0x3ff) + 0xDC00);
	return (4);
}
2636
+
2637
+ /*
2638
+ * Copy UTF-8 string in checking surrogate pair.
2639
+ * If any surrogate pair are found, it would be canonicalized.
2640
+ */
2641
+ static int
2642
+ strncat_from_utf8_to_utf8(struct archive_string *as, const void *_p,
2643
+ size_t len, struct archive_string_conv *sc)
2644
+ {
2645
+ const char *s;
2646
+ char *p, *endp;
2647
+ int n, ret = 0;
2648
+
2649
+ (void)sc; /* UNUSED */
2650
+
2651
+ if (archive_string_ensure(as, as->length + len + 1) == NULL)
2652
+ return (-1);
2653
+
2654
+ s = (const char *)_p;
2655
+ p = as->s + as->length;
2656
+ endp = as->s + as->buffer_length -1;
2657
+ do {
2658
+ uint32_t uc;
2659
+ const char *ss = s;
2660
+ size_t w;
2661
+
2662
+ /*
2663
+ * Forward byte sequence until a conversion of that is needed.
2664
+ */
2665
+ while ((n = utf8_to_unicode(&uc, s, len)) > 0) {
2666
+ s += n;
2667
+ len -= n;
2668
+ }
2669
+ if (ss < s) {
2670
+ if (p + (s - ss) > endp) {
2671
+ as->length = p - as->s;
2672
+ if (archive_string_ensure(as,
2673
+ as->buffer_length + len + 1) == NULL)
2674
+ return (-1);
2675
+ p = as->s + as->length;
2676
+ endp = as->s + as->buffer_length -1;
2677
+ }
2678
+
2679
+ memcpy(p, ss, s - ss);
2680
+ p += s - ss;
2681
+ }
2682
+
2683
+ /*
2684
+ * If n is negative, current byte sequence needs a replacement.
2685
+ */
2686
+ if (n < 0) {
2687
+ if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) {
2688
+ /* Current byte sequence may be CESU-8. */
2689
+ n = cesu8_to_unicode(&uc, s, len);
2690
+ }
2691
+ if (n < 0) {
2692
+ ret = -1;
2693
+ n *= -1;/* Use a replaced unicode character. */
2694
+ }
2695
+
2696
+ /* Rebuild UTF-8 byte sequence. */
2697
+ while ((w = unicode_to_utf8(p, endp - p, uc)) == 0) {
2698
+ as->length = p - as->s;
2699
+ if (archive_string_ensure(as,
2700
+ as->buffer_length + len + 1) == NULL)
2701
+ return (-1);
2702
+ p = as->s + as->length;
2703
+ endp = as->s + as->buffer_length -1;
2704
+ }
2705
+ p += w;
2706
+ s += n;
2707
+ len -= n;
2708
+ }
2709
+ } while (n > 0);
2710
+ as->length = p - as->s;
2711
+ as->s[as->length] = '\0';
2712
+ return (ret);
2713
+ }
2714
+
2715
+ static int
2716
+ archive_string_append_unicode(struct archive_string *as, const void *_p,
2717
+ size_t len, struct archive_string_conv *sc)
2718
+ {
2719
+ const char *s;
2720
+ char *p, *endp;
2721
+ uint32_t uc;
2722
+ size_t w;
2723
+ int n, ret = 0, ts, tm;
2724
+ int (*parse)(uint32_t *, const char *, size_t);
2725
+ size_t (*unparse)(char *, size_t, uint32_t);
2726
+
2727
+ if (sc->flag & SCONV_TO_UTF16BE) {
2728
+ unparse = unicode_to_utf16be;
2729
+ ts = 2;
2730
+ } else if (sc->flag & SCONV_TO_UTF16LE) {
2731
+ unparse = unicode_to_utf16le;
2732
+ ts = 2;
2733
+ } else if (sc->flag & SCONV_TO_UTF8) {
2734
+ unparse = unicode_to_utf8;
2735
+ ts = 1;
2736
+ } else {
2737
+ /*
2738
+ * This case is going to be converted to another
2739
+ * character-set through iconv.
2740
+ */
2741
+ if (sc->flag & SCONV_FROM_UTF16BE) {
2742
+ unparse = unicode_to_utf16be;
2743
+ ts = 2;
2744
+ } else if (sc->flag & SCONV_FROM_UTF16LE) {
2745
+ unparse = unicode_to_utf16le;
2746
+ ts = 2;
2747
+ } else {
2748
+ unparse = unicode_to_utf8;
2749
+ ts = 1;
2750
+ }
2751
+ }
2752
+
2753
+ if (sc->flag & SCONV_FROM_UTF16BE) {
2754
+ parse = utf16be_to_unicode;
2755
+ tm = 1;
2756
+ } else if (sc->flag & SCONV_FROM_UTF16LE) {
2757
+ parse = utf16le_to_unicode;
2758
+ tm = 1;
2759
+ } else {
2760
+ parse = cesu8_to_unicode;
2761
+ tm = ts;
2762
+ }
2763
+
2764
+ if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
2765
+ return (-1);
2766
+
2767
+ s = (const char *)_p;
2768
+ p = as->s + as->length;
2769
+ endp = as->s + as->buffer_length - ts;
2770
+ while ((n = parse(&uc, s, len)) != 0) {
2771
+ if (n < 0) {
2772
+ /* Use a replaced unicode character. */
2773
+ n *= -1;
2774
+ ret = -1;
2775
+ }
2776
+ s += n;
2777
+ len -= n;
2778
+ while ((w = unparse(p, endp - p, uc)) == 0) {
2779
+ /* There is not enough output buffer so
2780
+ * we have to expand it. */
2781
+ as->length = p - as->s;
2782
+ if (archive_string_ensure(as,
2783
+ as->buffer_length + len * tm + ts) == NULL)
2784
+ return (-1);
2785
+ p = as->s + as->length;
2786
+ endp = as->s + as->buffer_length - ts;
2787
+ }
2788
+ p += w;
2789
+ }
2790
+ as->length = p - as->s;
2791
+ as->s[as->length] = '\0';
2792
+ if (ts == 2)
2793
+ as->s[as->length+1] = '\0';
2794
+ return (ret);
2795
+ }
2796
+
2797
+ /*
2798
+ * The following constants for Hangul composition come from
2799
+ * Unicode Standard Annex #15 http://unicode.org/reports/tr15/
2800
+ */
2801
+ #define HC_SBASE 0xAC00 /* base of the composed syllables */
2802
+ #define HC_LBASE 0x1100 /* subtracted from a code point to get LIndex */
2803
+ #define HC_VBASE 0x1161 /* subtracted from a code point to get VIndex */
2804
+ #define HC_TBASE 0x11A7 /* subtracted from a code point to get TIndex */
2805
+ #define HC_LCOUNT 19 /* exclusive upper bound of LIndex */
2806
+ #define HC_VCOUNT 21 /* exclusive upper bound of VIndex */
2807
+ #define HC_TCOUNT 28 /* exclusive upper bound of TIndex */
2808
+ #define HC_NCOUNT (HC_VCOUNT * HC_TCOUNT)
2809
+ #define HC_SCOUNT (HC_LCOUNT * HC_NCOUNT) /* exclusive upper bound of SIndex */
2810
+
2811
+ static uint32_t
2812
+ get_nfc(uint32_t uc, uint32_t uc2)
2813
+ {
2814
+ int t, b;
2815
+
2816
+ t = 0;
2817
+ b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1;
2818
+ while (b >= t) {
2819
+ int m = (t + b) / 2;
2820
+ if (u_composition_table[m].cp1 < uc)
2821
+ t = m + 1;
2822
+ else if (u_composition_table[m].cp1 > uc)
2823
+ b = m - 1;
2824
+ else if (u_composition_table[m].cp2 < uc2)
2825
+ t = m + 1;
2826
+ else if (u_composition_table[m].cp2 > uc2)
2827
+ b = m - 1;
2828
+ else
2829
+ return (u_composition_table[m].nfc);
2830
+ }
2831
+ return (0);
2832
+ }
2833
+
2834
+ #define FDC_MAX 10 /* The maximum number of Following Decomposable
2835
+ * Characters. COLLECT_CPS() stops and sets ret to -1 at this count. */
2836
+
2837
+ /*
2838
+ * Update first code point: store new_uc in uc and clear ucptr so that WRITE_UC() re-encodes uc instead of copying source bytes.
2839
+ */
2840
+ #define UPDATE_UC(new_uc) do { \
2841
+ uc = new_uc; \
2842
+ ucptr = NULL; \
2843
+ } while (0)
2844
+
2845
+ /*
2846
+ * Replace first code point with second code point, taking over its source pointer (uc2ptr) and its byte length (n2) as well.
2847
+ */
2848
+ #define REPLACE_UC_WITH_UC2() do { \
2849
+ uc = uc2; \
2850
+ ucptr = uc2ptr; \
2851
+ n = n2; \
2852
+ } while (0)
2853
+
2854
+ #define EXPAND_BUFFER() do { /* grow as, then recompute p and endp; returns -1 from the enclosing function on failure */ \
2855
+ as->length = p - as->s; \
2856
+ if (archive_string_ensure(as, \
2857
+ as->buffer_length + len * tm + ts) == NULL)\
2858
+ return (-1); \
2859
+ p = as->s + as->length; \
2860
+ endp = as->s + as->buffer_length - ts; \
2861
+ } while (0)
2862
+
2863
+ #define UNPARSE(p, endp, uc) do { /* encode uc at p with unparse(), expanding the buffer while unparse() returns 0 */ \
2864
+ while ((w = unparse(p, (endp) - (p), uc)) == 0) {\
2865
+ EXPAND_BUFFER(); \
2866
+ } \
2867
+ p += w; \
2868
+ } while (0)
2869
+
2870
+ /*
2871
+ * Write first code point.
2872
+ * If the code point has not been changed from its original code (ucptr is non-NULL),
2873
+ * this just copies its n bytes from its original buffer pointer.
2874
+ * If not, this re-encodes it with unparse() via UNPARSE() and copies it.
2875
+ */
2876
+ #define WRITE_UC() do { \
2877
+ if (ucptr) { \
2878
+ if (p + n > endp) \
2879
+ EXPAND_BUFFER(); \
2880
+ switch (n) { \
2881
+ case 4: \
2882
+ *p++ = *ucptr++; \
2883
+ /* FALL THROUGH */ \
2884
+ case 3: \
2885
+ *p++ = *ucptr++; \
2886
+ /* FALL THROUGH */ \
2887
+ case 2: \
2888
+ *p++ = *ucptr++; \
2889
+ /* FALL THROUGH */ \
2890
+ case 1: \
2891
+ *p++ = *ucptr; \
2892
+ break; \
2893
+ } \
2894
+ ucptr = NULL; \
2895
+ } else { \
2896
+ UNPARSE(p, endp, uc); \
2897
+ } \
2898
+ } while (0)
2899
+
2900
+ /*
2901
+ * Collect following decomposable code points into ucx[]/ccx[] beginning at index start; stops on a parse result <= 0, on the combining-class test below, or when FDC_MAX entries are held (ret is then set to -1).
2902
+ */
2903
+ #define COLLECT_CPS(start) do { \
2904
+ int _i; \
2905
+ for (_i = start; _i < FDC_MAX ; _i++) { \
2906
+ nx = parse(&ucx[_i], s, len); \
2907
+ if (nx <= 0) \
2908
+ break; \
2909
+ cx = CCC(ucx[_i]); \
2910
+ if (cl >= cx && cl != 228 && cx != 228)\
2911
+ break; \
2912
+ s += nx; \
2913
+ len -= nx; \
2914
+ cl = cx; \
2915
+ ccx[_i] = cx; \
2916
+ } \
2917
+ if (_i >= FDC_MAX) { \
2918
+ ret = -1; \
2919
+ ucx_size = FDC_MAX; \
2920
+ } else \
2921
+ ucx_size = _i; \
2922
+ } while (0)
2923
+
2924
+ /*
2925
+ * Normalize UTF-8/UTF-16BE/UTF-16LE characters to Form C and append the result to as.
2926
+ * Returns 0 on success, or -1 if an input sequence was replaced, more than FDC_MAX code points had to be collected, or the buffer could not be grown.
2927
+ * TODO: Convert composition exclusions, which are never converted
2928
+ * from NFC,NFD,NFKC and NFKD, to Form C.
2929
+ */
2930
+ static int
2931
+ archive_string_normalize_C(struct archive_string *as, const void *_p,
2932
+ size_t len, struct archive_string_conv *sc)
2933
+ {
2934
+ const char *s = (const char *)_p;
2935
+ char *p, *endp;
2936
+ uint32_t uc, uc2;
2937
+ size_t w;
2938
+ int always_replace, n, n2, ret = 0, spair, ts, tm;
2939
+ int (*parse)(uint32_t *, const char *, size_t);
2940
+ size_t (*unparse)(char *, size_t, uint32_t);
2941
+
2942
+ always_replace = 1; /* cleared below when source bytes can be copied through unchanged */
2943
+ ts = 1;/* text size. */
2944
+ if (sc->flag & SCONV_TO_UTF16BE) {
2945
+ unparse = unicode_to_utf16be;
2946
+ ts = 2;
2947
+ if (sc->flag & SCONV_FROM_UTF16BE)
2948
+ always_replace = 0;
2949
+ } else if (sc->flag & SCONV_TO_UTF16LE) {
2950
+ unparse = unicode_to_utf16le;
2951
+ ts = 2;
2952
+ if (sc->flag & SCONV_FROM_UTF16LE)
2953
+ always_replace = 0;
2954
+ } else if (sc->flag & SCONV_TO_UTF8) {
2955
+ unparse = unicode_to_utf8;
2956
+ if (sc->flag & SCONV_FROM_UTF8)
2957
+ always_replace = 0;
2958
+ } else {
2959
+ /*
2960
+ * This case is going to be converted to another
2961
+ * character-set through iconv.
2962
+ */
2963
+ always_replace = 0;
2964
+ if (sc->flag & SCONV_FROM_UTF16BE) {
2965
+ unparse = unicode_to_utf16be;
2966
+ ts = 2;
2967
+ } else if (sc->flag & SCONV_FROM_UTF16LE) {
2968
+ unparse = unicode_to_utf16le;
2969
+ ts = 2;
2970
+ } else {
2971
+ unparse = unicode_to_utf8;
2972
+ }
2973
+ }
2974
+
2975
+ if (sc->flag & SCONV_FROM_UTF16BE) {
2976
+ parse = utf16be_to_unicode;
2977
+ tm = 1;
2978
+ spair = 4;/* surrogate pair size in UTF-16. */
2979
+ } else if (sc->flag & SCONV_FROM_UTF16LE) {
2980
+ parse = utf16le_to_unicode;
2981
+ tm = 1;
2982
+ spair = 4;/* surrogate pair size in UTF-16. */
2983
+ } else {
2984
+ parse = cesu8_to_unicode;
2985
+ tm = ts;
2986
+ spair = 6;/* surrogate pair size in UTF-8. */
2987
+ }
2988
+
2989
+ if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
2990
+ return (-1);
2991
+
2992
+ p = as->s + as->length;
2993
+ endp = as->s + as->buffer_length - ts;
2994
+ while ((n = parse(&uc, s, len)) != 0) {
2995
+ const char *ucptr, *uc2ptr;
2996
+
2997
+ if (n < 0) {
2998
+ /* Use a replaced unicode character. */
2999
+ UNPARSE(p, endp, uc);
3000
+ s += n*-1;
3001
+ len -= n*-1;
3002
+ ret = -1;
3003
+ continue;
3004
+ } else if (n == spair || always_replace)
3005
+ /* uc is converted from a surrogate pair.
3006
+ * this should be treated as a changed code. */
3007
+ ucptr = NULL;
3008
+ else
3009
+ ucptr = s;
3010
+ s += n;
3011
+ len -= n;
3012
+
3013
+ /* Read second code point. */
3014
+ while ((n2 = parse(&uc2, s, len)) > 0) {
3015
+ uint32_t ucx[FDC_MAX];
3016
+ int ccx[FDC_MAX];
3017
+ int cl, cx, i, nx, ucx_size;
3018
+ int LIndex,SIndex;
3019
+ uint32_t nfc;
3020
+
3021
+ if (n2 == spair || always_replace)
3022
+ /* uc2 is converted from a surrogate pair.
3023
+ * this should be treated as a changed code. */
3024
+ uc2ptr = NULL;
3025
+ else
3026
+ uc2ptr = s;
3027
+ s += n2;
3028
+ len -= n2;
3029
+
3030
+ /*
3031
+ * If current second code point is out of decomposable
3032
+ * code points, finding compositions is unneeded.
3033
+ */
3034
+ if (!IS_DECOMPOSABLE_BLOCK(uc2)) {
3035
+ WRITE_UC();
3036
+ REPLACE_UC_WITH_UC2();
3037
+ continue;
3038
+ }
3039
+
3040
+ /*
3041
+ * Try to combine current code points.
3042
+ */
3043
+ /*
3044
+ * We have to combine Hangul characters according to
3045
+ * http://unicode.org/reports/tr15/#Hangul
3046
+ */
3047
+ if (0 <= (LIndex = uc - HC_LBASE) &&
3048
+ LIndex < HC_LCOUNT) {
3049
+ /*
3050
+ * Hangul Composition.
3051
+ * 1. Two current code points are L and V.
3052
+ */
3053
+ int VIndex = uc2 - HC_VBASE;
3054
+ if (0 <= VIndex && VIndex < HC_VCOUNT) {
3055
+ /* Make syllable of form LV. */
3056
+ UPDATE_UC(HC_SBASE +
3057
+ (LIndex * HC_VCOUNT + VIndex) *
3058
+ HC_TCOUNT);
3059
+ } else {
3060
+ WRITE_UC();
3061
+ REPLACE_UC_WITH_UC2();
3062
+ }
3063
+ continue;
3064
+ } else if (0 <= (SIndex = uc - HC_SBASE) &&
3065
+ SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) {
3066
+ /*
3067
+ * Hangul Composition.
3068
+ * 2. Two current code points are LV and T.
3069
+ */
3070
+ int TIndex = uc2 - HC_TBASE;
3071
+ if (0 < TIndex && TIndex < HC_TCOUNT) { /* TIndex 0 is deliberately excluded */
3072
+ /* Make syllable of form LVT. */
3073
+ UPDATE_UC(uc + TIndex);
3074
+ } else {
3075
+ WRITE_UC();
3076
+ REPLACE_UC_WITH_UC2();
3077
+ }
3078
+ continue;
3079
+ } else if ((nfc = get_nfc(uc, uc2)) != 0) {
3080
+ /* A composition to current code points
3081
+ * is found. */
3082
+ UPDATE_UC(nfc);
3083
+ continue;
3084
+ } else if ((cl = CCC(uc2)) == 0) {
3085
+ /* Clearly 'uc2' the second code point is not
3086
+ * a decomposable code. */
3087
+ WRITE_UC();
3088
+ REPLACE_UC_WITH_UC2();
3089
+ continue;
3090
+ }
3091
+
3092
+ /*
3093
+ * Collect following decomposable code points.
3094
+ */
3095
+ cx = 0;
3096
+ ucx[0] = uc2;
3097
+ ccx[0] = cl;
3098
+ COLLECT_CPS(1);
3099
+
3100
+ /*
3101
+ * Find a composed code in the collected code points.
3102
+ */
3103
+ i = 1;
3104
+ while (i < ucx_size) {
3105
+ int j;
3106
+
3107
+ if ((nfc = get_nfc(uc, ucx[i])) == 0) {
3108
+ i++;
3109
+ continue;
3110
+ }
3111
+
3112
+ /*
3113
+ * nfc is composed of uc and ucx[i].
3114
+ */
3115
+ UPDATE_UC(nfc);
3116
+
3117
+ /*
3118
+ * Remove ucx[i] by shifting
3119
+ * following code points.
3120
+ */
3121
+ for (j = i; j+1 < ucx_size; j++) {
3122
+ ucx[j] = ucx[j+1];
3123
+ ccx[j] = ccx[j+1];
3124
+ }
3125
+ ucx_size --;
3126
+
3127
+ /*
3128
+ * Collect following code points blocked
3129
+ * by ucx[i] the removed code point.
3130
+ */
3131
+ if (ucx_size > 0 && i == ucx_size &&
3132
+ nx > 0 && cx == cl) {
3133
+ cl = ccx[ucx_size-1];
3134
+ COLLECT_CPS(ucx_size);
3135
+ }
3136
+ /*
3137
+ * Restart finding a composed code with
3138
+ * the updated uc from the top of the
3139
+ * collected code points.
3140
+ */
3141
+ i = 0;
3142
+ }
3143
+
3144
+ /*
3145
+ * Apparently the current code points are not
3146
+ * decomposed characters or already composed.
3147
+ */
3148
+ WRITE_UC();
3149
+ for (i = 0; i < ucx_size; i++)
3150
+ UNPARSE(p, endp, ucx[i]);
3151
+
3152
+ /*
3153
+ * Flush out remaining canonical combining characters.
3154
+ */
3155
+ if (nx > 0 && cx == cl && len > 0) {
3156
+ while ((nx = parse(&ucx[0], s, len))
3157
+ > 0) {
3158
+ cx = CCC(ucx[0]);
3159
+ if (cl > cx)
3160
+ break;
3161
+ s += nx;
3162
+ len -= nx;
3163
+ cl = cx;
3164
+ UNPARSE(p, endp, ucx[0]);
3165
+ }
3166
+ }
3167
+ break; /* leave the inner loop; the outer loop parses the next first code point */
3168
+ }
3169
+ if (n2 < 0) {
3170
+ WRITE_UC();
3171
+ /* Use a replaced unicode character. */
3172
+ UNPARSE(p, endp, uc2);
3173
+ s += n2*-1;
3174
+ len -= n2*-1;
3175
+ ret = -1;
3176
+ continue;
3177
+ } else if (n2 == 0) {
3178
+ WRITE_UC();
3179
+ break;
3180
+ }
3181
+ }
3182
+ as->length = p - as->s;
3183
+ as->s[as->length] = '\0';
3184
+ if (ts == 2)
3185
+ as->s[as->length+1] = '\0';
3186
+ return (ret);
3187
+ }
3188
+
3189
+ static int
3190
+ get_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc)
3191
+ {
3192
+ int t, b;
3193
+
3194
+ /*
3195
+ * These are not converted to NFD on Mac OS.
3196
+ */
3197
+ if ((uc >= 0x2000 && uc <= 0x2FFF) ||
3198
+ (uc >= 0xF900 && uc <= 0xFAFF) ||
3199
+ (uc >= 0x2F800 && uc <= 0x2FAFF))
3200
+ return (0);
3201
+ /*
3202
+ * Those code points are not converted to NFD on Mac OS.
3203
+ * I do not know the reason because it is undocumented.
3204
+ * NFC NFD
3205
+ * 1109A ==> 11099 110BA
3206
+ * 1109C ==> 1109B 110BA
3207
+ * 110AB ==> 110A5 110BA
3208
+ */
3209
+ if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB)
3210
+ return (0);
3211
+
3212
+ t = 0;
3213
+ b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1;
3214
+ while (b >= t) {
3215
+ int m = (t + b) / 2;
3216
+ if (u_decomposition_table[m].nfc < uc)
3217
+ t = m + 1;
3218
+ else if (u_decomposition_table[m].nfc > uc)
3219
+ b = m - 1;
3220
+ else {
3221
+ *cp1 = u_decomposition_table[m].cp1;
3222
+ *cp2 = u_decomposition_table[m].cp2;
3223
+ return (1);
3224
+ }
3225
+ }
3226
+ return (0);
3227
+ }
3228
+
3229
/*
 * Overwrite the current code point with `cp` and drop the pointer to its
 * original encoding.  Expands to statements that use the caller's local
 * variables `uc` and `ucptr`, which must be in scope.
 */
#define REPLACE_UC_WITH(cp)	do {		\
	ucptr = NULL;				\
	uc = (cp);				\
} while (0)
3233
+
3234
/*
 * Normalize Unicode text to Form D (decomposed) and append the result
 * to `as`.
 *
 * The input encoding (UTF-16BE, UTF-16LE, or otherwise UTF-8/CESU-8) and
 * the output encoding are both selected from sc->flag.
 *
 *   as  - destination string; grown up front with archive_string_ensure().
 *   _p  - input bytes.
 *   len - number of input bytes.
 *   sc  - conversion object; only sc->flag is read here.
 *
 * Returns 0 on success.  Returns -1 if the initial buffer allocation
 * fails, or if the parser reported an invalid sequence (the code point
 * the parser supplied is still written and processing continues).
 * NOTE(review): the UNPARSE()/WRITE_UC() macros are defined outside this
 * view; they presumably use `w`, `unparse`, `ts` and may grow the buffer
 * or return on allocation failure -- confirm against their definitions.
 */
static int
archive_string_normalize_D(struct archive_string *as, const void *_p,
    size_t len, struct archive_string_conv *sc)
{
	const char *s = (const char *)_p;
	char *p, *endp;
	uint32_t uc, uc2;
	size_t w;
	int always_replace, n, n2, ret = 0, spair, ts, tm;
	int (*parse)(uint32_t *, const char *, size_t);
	size_t (*unparse)(char *, size_t, uint32_t);

	/*
	 * Choose the output encoder.  always_replace stays 1 unless the
	 * input and output encodings are the same (or the output will be
	 * handed to iconv); when it is 1 every code point is treated as
	 * changed (ucptr is forced to NULL below).
	 */
	always_replace = 1;
	ts = 1;/* text size. */
	if (sc->flag & SCONV_TO_UTF16BE) {
		unparse = unicode_to_utf16be;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16BE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF16LE) {
		unparse = unicode_to_utf16le;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16LE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF8) {
		unparse = unicode_to_utf8;
		if (sc->flag & SCONV_FROM_UTF8)
			always_replace = 0;
	} else {
		/*
		 * This case is going to be converted to another
		 * character-set through iconv.
		 */
		always_replace = 0;
		if (sc->flag & SCONV_FROM_UTF16BE) {
			unparse = unicode_to_utf16be;
			ts = 2;
		} else if (sc->flag & SCONV_FROM_UTF16LE) {
			unparse = unicode_to_utf16le;
			ts = 2;
		} else {
			unparse = unicode_to_utf8;
		}
	}

	/*
	 * Choose the input decoder.  tm is the multiplier applied to len
	 * when sizing the initial output buffer; spair is the byte length
	 * the decoder reports for a surrogate pair.
	 */
	if (sc->flag & SCONV_FROM_UTF16BE) {
		parse = utf16be_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else if (sc->flag & SCONV_FROM_UTF16LE) {
		parse = utf16le_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else {
		parse = cesu8_to_unicode;
		tm = ts;
		spair = 6;/* surrogate pair size in UTF-8. */
	}

	if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
		return (-1);

	/* endp leaves room for the ts-byte terminator written at the end. */
	p = as->s + as->length;
	endp = as->s + as->buffer_length - ts;
	while ((n = parse(&uc, s, len)) != 0) {
		const char *ucptr;
		uint32_t cp1, cp2;
		int SIndex;
		/* Pending trailing code points, kept ordered by ccc. */
		struct {
			uint32_t uc;
			int ccc;
		} fdc[FDC_MAX];
		int fdi, fdj;
		int ccc;

		/*
		 * Re-entered via goto with (uc, n) taken from (uc2, n2) when
		 * a look-ahead code point has been parsed but not consumed.
		 */
check_first_code:
		if (n < 0) {
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc);
			s += n*-1;
			len -= n*-1;
			ret = -1;
			continue;
		} else if (n == spair || always_replace)
			/* uc is converted from a surrogate pair.
			 * this should be treated as a changed code. */
			ucptr = NULL;
		else
			ucptr = s;
		s += n;
		len -= n;

		/* Hangul Decomposition. */
		if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) {
			int L = HC_LBASE + SIndex / HC_NCOUNT;
			int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT;
			int T = HC_TBASE + SIndex % HC_TCOUNT;

			REPLACE_UC_WITH(L);
			WRITE_UC();
			REPLACE_UC_WITH(V);
			WRITE_UC();
			/* T == HC_TBASE means the syllable has no trailing part. */
			if (T != HC_TBASE) {
				REPLACE_UC_WITH(T);
				WRITE_UC();
			}
			continue;
		}
		/* A code point with a non-zero CCC is written out as is. */
		if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) {
			WRITE_UC();
			continue;
		}

		/*
		 * Decompose uc repeatedly: each step pushes the second
		 * component onto the front of fdc[] and continues with the
		 * first component as the new uc.
		 */
		fdi = 0;
		while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) {
			int k;

			for (k = fdi; k > 0; k--)
				fdc[k] = fdc[k-1];
			fdc[0].ccc = CCC(cp2);
			fdc[0].uc = cp2;
			fdi++;
			REPLACE_UC_WITH(cp1);
		}

		/* Read following code points. */
		while ((n2 = parse(&uc2, s, len)) > 0 &&
		    (ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) {
			int j, k;

			s += n2;
			len -= n2;
			/*
			 * Insert before the first entry whose ccc is strictly
			 * greater, so entries with equal ccc keep input order.
			 */
			for (j = 0; j < fdi; j++) {
				if (fdc[j].ccc > ccc)
					break;
			}
			if (j < fdi) {
				for (k = fdi; k > j; k--)
					fdc[k] = fdc[k-1];
				fdc[j].ccc = ccc;
				fdc[j].uc = uc2;
			} else {
				fdc[fdi].ccc = ccc;
				fdc[fdi].uc = uc2;
			}
			fdi++;
		}

		/* Emit the base code point, then the ordered trailing ones. */
		WRITE_UC();
		for (fdj = 0; fdj < fdi; fdj++) {
			REPLACE_UC_WITH(fdc[fdj].uc);
			WRITE_UC();
		}

		/* n2 == 0: the look-ahead hit the end of the input. */
		if (n2 == 0)
			break;
		/*
		 * uc2 was parsed but not consumed (s/len were not advanced
		 * for it); process it as the next first code point.
		 */
		REPLACE_UC_WITH(uc2);
		n = n2;
		goto check_first_code;
	}
	as->length = p - as->s;
	as->s[as->length] = '\0';
	/* 2-byte encodings get a 2-byte terminator. */
	if (ts == 2)
		as->s[as->length+1] = '\0';
	return (ret);
}
3403
+
3404
+ /*
3405
+ * libarchive 2.x made incorrect UTF-8 strings in the wrong assumption
3406
+ * that WCS is Unicode. It is true for several platforms but some are false.
3407
+ * And then people who did not use UTF-8 locale on the non Unicode WCS
3408
+ * platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those
3409
+ * now cannot get right filename from libarchive 3.x and later since we
3410
+ * fixed the wrong assumption and it is incompatible to older its versions.
3411
+ * So we provide special option, "compat-2x.x", for resolving it.
3412
+ * That option enable the string conversion of libarchive 2.x.
3413
+ *
3414
+ * Translates the wrong UTF-8 string made by libarchive 2.x into current
3415
+ * locale character set and appends to the archive_string.
3416
+ * Note: returns -1 if conversion fails.
3417
+ */
3418
+ static int
3419
+ strncat_from_utf8_libarchive2(struct archive_string *as,
3420
+ const void *_p, size_t len, struct archive_string_conv *sc)
3421
+ {
3422
+ const char *s;
3423
+ int n;
3424
+ char *p;
3425
+ char *end;
3426
+ uint32_t unicode;
3427
+ #if HAVE_WCRTOMB
3428
+ mbstate_t shift_state;
3429
+
3430
+ memset(&shift_state, 0, sizeof(shift_state));
3431
+ #else
3432
+ /* Clear the shift state before starting. */
3433
+ wctomb(NULL, L'\0');
3434
+ #endif
3435
+ (void)sc; /* UNUSED */
3436
+ /*
3437
+ * Allocate buffer for MBS.
3438
+ * We need this allocation here since it is possible that
3439
+ * as->s is still NULL.
3440
+ */
3441
+ if (archive_string_ensure(as, as->length + len + 1) == NULL)
3442
+ return (-1);
3443
+
3444
+ s = (const char *)_p;
3445
+ p = as->s + as->length;
3446
+ end = as->s + as->buffer_length - MB_CUR_MAX -1;
3447
+ while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) {
3448
+ wchar_t wc;
3449
+
3450
+ if (p >= end) {
3451
+ as->length = p - as->s;
3452
+ /* Re-allocate buffer for MBS. */
3453
+ if (archive_string_ensure(as,
3454
+ as->length + max(len * 2,
3455
+ (size_t)MB_CUR_MAX) + 1) == NULL)
3456
+ return (-1);
3457
+ p = as->s + as->length;
3458
+ end = as->s + as->buffer_length - MB_CUR_MAX -1;
3459
+ }
3460
+
3461
+ /*
3462
+ * As libarchive 2.x, translates the UTF-8 characters into
3463
+ * wide-characters in the assumption that WCS is Unicode.
3464
+ */
3465
+ if (n < 0) {
3466
+ n *= -1;
3467
+ wc = L'?';
3468
+ } else
3469
+ wc = (wchar_t)unicode;
3470
+
3471
+ s += n;
3472
+ len -= n;
3473
+ /*
3474
+ * Translates the wide-character into the current locale MBS.
3475
+ */
3476
+ #if HAVE_WCRTOMB
3477
+ n = (int)wcrtomb(p, wc, &shift_state);
3478
+ #else
3479
+ n = (int)wctomb(p, wc);
3480
+ #endif
3481
+ if (n == -1)
3482
+ return (-1);
3483
+ p += n;
3484
+ }
3485
+ as->length = p - as->s;
3486
+ as->s[as->length] = '\0';
3487
+ return (0);
3488
+ }
3489
+
3490
+
3491
+ /*
3492
+ * Conversion functions between current locale dependent MBS and UTF-16BE.
3493
+ * strncat_from_utf16be() : UTF-16BE --> MBS
3494
+ * strncat_to_utf16be() : MBS --> UTF16BE
3495
+ */
3496
+
3497
+ #if defined(_WIN32) && !defined(__CYGWIN__)
3498
+
3499
+ /*
3500
+ * Convert a UTF-16BE/LE string to current locale and copy the result.
3501
+ * Return -1 if conversion fails.
3502
+ */
3503
+ static int
3504
+ win_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes,
3505
+ struct archive_string_conv *sc, int be)
3506
+ {
3507
+ struct archive_string tmp;
3508
+ const char *u16;
3509
+ int ll;
3510
+ BOOL defchar;
3511
+ char *mbs;
3512
+ size_t mbs_size, b;
3513
+ int ret = 0;
3514
+
3515
+ bytes &= ~1;
3516
+ if (archive_string_ensure(as, as->length + bytes +1) == NULL)
3517
+ return (-1);
3518
+
3519
+ mbs = as->s + as->length;
3520
+ mbs_size = as->buffer_length - as->length -1;
3521
+
3522
+ if (sc->to_cp == CP_C_LOCALE) {
3523
+ /*
3524
+ * "C" locale special process.
3525
+ */
3526
+ u16 = _p;
3527
+ ll = 0;
3528
+ for (b = 0; b < bytes; b += 2) {
3529
+ uint16_t val;
3530
+ if (be)
3531
+ val = archive_be16dec(u16+b);
3532
+ else
3533
+ val = archive_le16dec(u16+b);
3534
+ if (val > 255) {
3535
+ *mbs++ = '?';
3536
+ ret = -1;
3537
+ } else
3538
+ *mbs++ = (char)(val&0xff);
3539
+ ll++;
3540
+ }
3541
+ as->length += ll;
3542
+ as->s[as->length] = '\0';
3543
+ return (ret);
3544
+ }
3545
+
3546
+ archive_string_init(&tmp);
3547
+ if (be) {
3548
+ if (is_big_endian()) {
3549
+ u16 = _p;
3550
+ } else {
3551
+ if (archive_string_ensure(&tmp, bytes+2) == NULL)
3552
+ return (-1);
3553
+ memcpy(tmp.s, _p, bytes);
3554
+ for (b = 0; b < bytes; b += 2) {
3555
+ uint16_t val = archive_be16dec(tmp.s+b);
3556
+ archive_le16enc(tmp.s+b, val);
3557
+ }
3558
+ u16 = tmp.s;
3559
+ }
3560
+ } else {
3561
+ if (!is_big_endian()) {
3562
+ u16 = _p;
3563
+ } else {
3564
+ if (archive_string_ensure(&tmp, bytes+2) == NULL)
3565
+ return (-1);
3566
+ memcpy(tmp.s, _p, bytes);
3567
+ for (b = 0; b < bytes; b += 2) {
3568
+ uint16_t val = archive_le16dec(tmp.s+b);
3569
+ archive_be16enc(tmp.s+b, val);
3570
+ }
3571
+ u16 = tmp.s;
3572
+ }
3573
+ }
3574
+
3575
+ do {
3576
+ defchar = 0;
3577
+ ll = WideCharToMultiByte(sc->to_cp, 0,
3578
+ (LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size,
3579
+ NULL, &defchar);
3580
+ /* Exit loop if we succeeded */
3581
+ if (ll != 0 ||
3582
+ GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
3583
+ break;
3584
+ }
3585
+ /* Else expand buffer and loop to try again. */
3586
+ ll = WideCharToMultiByte(sc->to_cp, 0,
3587
+ (LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL);
3588
+ if (archive_string_ensure(as, ll +1) == NULL)
3589
+ return (-1);
3590
+ mbs = as->s + as->length;
3591
+ mbs_size = as->buffer_length - as->length -1;
3592
+ } while (1);
3593
+ archive_string_free(&tmp);
3594
+ as->length += ll;
3595
+ as->s[as->length] = '\0';
3596
+ if (ll == 0 || defchar)
3597
+ ret = -1;
3598
+ return (ret);
3599
+ }
3600
+
3601
/* UTF-16BE front end for win_strncat_from_utf16(). */
static int
win_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 1;

	return (win_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3607
+
3608
/* UTF-16LE front end for win_strncat_from_utf16(). */
static int
win_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 0;

	return (win_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3614
+
3615
/*
 * Report the host byte order: returns non-zero when a 16-bit 1 stored in
 * memory decodes as 1 through the big-endian decoder, zero otherwise.
 */
static int
is_big_endian(void)
{
	uint16_t probe;

	probe = 1;
	if (archive_be16dec(&probe) == 1)
		return (1);
	return (0);
}
3622
+
3623
+ /*
3624
+ * Convert a current locale string to UTF-16BE/LE and copy the result.
3625
+ * Return -1 if conversion fails.
3626
+ */
3627
+ static int
3628
+ win_strncat_to_utf16(struct archive_string *as16, const void *_p,
3629
+ size_t length, struct archive_string_conv *sc, int bigendian)
3630
+ {
3631
+ const char *s = (const char *)_p;
3632
+ char *u16;
3633
+ size_t count, avail;
3634
+
3635
+ if (archive_string_ensure(as16,
3636
+ as16->length + (length + 1) * 2) == NULL)
3637
+ return (-1);
3638
+
3639
+ u16 = as16->s + as16->length;
3640
+ avail = as16->buffer_length - 2;
3641
+ if (sc->from_cp == CP_C_LOCALE) {
3642
+ /*
3643
+ * "C" locale special process.
3644
+ */
3645
+ count = 0;
3646
+ while (count < length && *s) {
3647
+ if (bigendian)
3648
+ archive_be16enc(u16, *s);
3649
+ else
3650
+ archive_le16enc(u16, *s);
3651
+ u16 += 2;
3652
+ s++;
3653
+ count++;
3654
+ }
3655
+ as16->length += count << 1;
3656
+ as16->s[as16->length] = 0;
3657
+ as16->s[as16->length+1] = 0;
3658
+ return (0);
3659
+ }
3660
+ do {
3661
+ count = MultiByteToWideChar(sc->from_cp,
3662
+ MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1);
3663
+ /* Exit loop if we succeeded */
3664
+ if (count != 0 ||
3665
+ GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
3666
+ break;
3667
+ }
3668
+ /* Expand buffer and try again */
3669
+ count = MultiByteToWideChar(sc->from_cp,
3670
+ MB_PRECOMPOSED, s, (int)length, NULL, 0);
3671
+ if (archive_string_ensure(as16, (count +1) * 2)
3672
+ == NULL)
3673
+ return (-1);
3674
+ u16 = as16->s + as16->length;
3675
+ avail = as16->buffer_length - 2;
3676
+ } while (1);
3677
+ as16->length += count * 2;
3678
+ as16->s[as16->length] = 0;
3679
+ as16->s[as16->length+1] = 0;
3680
+ if (count == 0)
3681
+ return (-1);
3682
+
3683
+ if (is_big_endian()) {
3684
+ if (!bigendian) {
3685
+ while (count > 0) {
3686
+ uint16_t v = archive_be16dec(u16);
3687
+ archive_le16enc(u16, v);
3688
+ u16 += 2;
3689
+ count--;
3690
+ }
3691
+ }
3692
+ } else {
3693
+ if (bigendian) {
3694
+ while (count > 0) {
3695
+ uint16_t v = archive_le16dec(u16);
3696
+ archive_be16enc(u16, v);
3697
+ u16 += 2;
3698
+ count--;
3699
+ }
3700
+ }
3701
+ }
3702
+ return (0);
3703
+ }
3704
+
3705
/* UTF-16BE front end for win_strncat_to_utf16(). */
static int
win_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int want_big_endian = 1;

	return (win_strncat_to_utf16(as16, _p, length, sc, want_big_endian));
}
3711
+
3712
/* UTF-16LE front end for win_strncat_to_utf16(). */
static int
win_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int want_big_endian = 0;

	return (win_strncat_to_utf16(as16, _p, length, sc, want_big_endian));
}
3718
+
3719
+ #endif /* _WIN32 && !__CYGWIN__ */
3720
+
3721
+ /*
3722
+ * Do the best effort for conversions.
3723
+ * We cannot handle the UTF-16BE character set without iconv,
3724
+ * but there is still a chance if the string consists only of ASCII
3725
+ * characters or the current locale is UTF-8.
3726
+ */
3727
+
3728
+ /*
3729
+ * Convert a UTF-16BE string to current locale and copy the result.
3730
+ * Return -1 if conversion fails.
3731
+ */
3732
+ static int
3733
+ best_effort_strncat_from_utf16(struct archive_string *as, const void *_p,
3734
+ size_t bytes, struct archive_string_conv *sc, int be)
3735
+ {
3736
+ const char *utf16 = (const char *)_p;
3737
+ char *mbs;
3738
+ uint32_t uc;
3739
+ int n, ret;
3740
+
3741
+ (void)sc; /* UNUSED */
3742
+ /*
3743
+ * Other case, we should do the best effort.
3744
+ * If all character are ASCII(<0x7f), we can convert it.
3745
+ * if not , we set a alternative character and return -1.
3746
+ */
3747
+ ret = 0;
3748
+ if (archive_string_ensure(as, as->length + bytes +1) == NULL)
3749
+ return (-1);
3750
+ mbs = as->s + as->length;
3751
+
3752
+ while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) {
3753
+ if (n < 0) {
3754
+ n *= -1;
3755
+ ret = -1;
3756
+ }
3757
+ bytes -= n;
3758
+ utf16 += n;
3759
+
3760
+ if (uc > 127) {
3761
+ /* We cannot handle it. */
3762
+ *mbs++ = '?';
3763
+ ret = -1;
3764
+ } else
3765
+ *mbs++ = (char)uc;
3766
+ }
3767
+ as->length = mbs - as->s;
3768
+ as->s[as->length] = '\0';
3769
+ return (ret);
3770
+ }
3771
+
3772
/* UTF-16BE front end for best_effort_strncat_from_utf16(). */
static int
best_effort_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 1;

	return (best_effort_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3778
+
3779
/* UTF-16LE front end for best_effort_strncat_from_utf16(). */
static int
best_effort_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 0;

	return (best_effort_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3785
+
3786
+ /*
3787
+ * Convert a current locale string to UTF-16BE/LE and copy the result.
3788
+ * Return -1 if conversion fails.
3789
+ */
3790
+ static int
3791
+ best_effort_strncat_to_utf16(struct archive_string *as16, const void *_p,
3792
+ size_t length, struct archive_string_conv *sc, int bigendian)
3793
+ {
3794
+ const char *s = (const char *)_p;
3795
+ char *utf16;
3796
+ size_t remaining;
3797
+ int ret;
3798
+
3799
+ (void)sc; /* UNUSED */
3800
+ /*
3801
+ * Other case, we should do the best effort.
3802
+ * If all character are ASCII(<0x7f), we can convert it.
3803
+ * if not , we set a alternative character and return -1.
3804
+ */
3805
+ ret = 0;
3806
+ remaining = length;
3807
+
3808
+ if (archive_string_ensure(as16,
3809
+ as16->length + (length + 1) * 2) == NULL)
3810
+ return (-1);
3811
+
3812
+ utf16 = as16->s + as16->length;
3813
+ while (remaining--) {
3814
+ unsigned c = *s++;
3815
+ if (c > 127) {
3816
+ /* We cannot handle it. */
3817
+ c = UNICODE_R_CHAR;
3818
+ ret = -1;
3819
+ }
3820
+ if (bigendian)
3821
+ archive_be16enc(utf16, c);
3822
+ else
3823
+ archive_le16enc(utf16, c);
3824
+ utf16 += 2;
3825
+ }
3826
+ as16->length = utf16 - as16->s;
3827
+ as16->s[as16->length] = 0;
3828
+ as16->s[as16->length+1] = 0;
3829
+ return (ret);
3830
+ }
3831
+
3832
/* UTF-16BE front end for best_effort_strncat_to_utf16(). */
static int
best_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int want_big_endian = 1;

	return (best_effort_strncat_to_utf16(as16, _p, length, sc,
	    want_big_endian));
}
3838
+
3839
/* UTF-16LE front end for best_effort_strncat_to_utf16(). */
static int
best_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int want_big_endian = 0;

	return (best_effort_strncat_to_utf16(as16, _p, length, sc,
	    want_big_endian));
}
3845
+
3846
+
3847
+ /*
3848
+ * Multistring operations.
3849
+ */
3850
+
3851
+ void
3852
+ archive_mstring_clean(struct archive_mstring *aes)
3853
+ {
3854
+ archive_wstring_free(&(aes->aes_wcs));
3855
+ archive_string_free(&(aes->aes_mbs));
3856
+ archive_string_free(&(aes->aes_utf8));
3857
+ archive_string_free(&(aes->aes_mbs_in_locale));
3858
+ aes->aes_set = 0;
3859
+ }
3860
+
3861
+ void
3862
+ archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src)
3863
+ {
3864
+ dest->aes_set = src->aes_set;
3865
+ archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs));
3866
+ archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8));
3867
+ archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs));
3868
+ }
3869
+
3870
+ int
3871
+ archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes,
3872
+ const char **p)
3873
+ {
3874
+ struct archive_string_conv *sc;
3875
+ int r;
3876
+
3877
+ /* If we already have a UTF8 form, return that immediately. */
3878
+ if (aes->aes_set & AES_SET_UTF8) {
3879
+ *p = aes->aes_utf8.s;
3880
+ return (0);
3881
+ }
3882
+
3883
+ *p = NULL;
3884
+ /* Try converting WCS to MBS first if MBS does not exist yet. */
3885
+ if ((aes->aes_set & AES_SET_MBS) == 0) {
3886
+ const char *pm; /* unused */
3887
+ archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */
3888
+ }
3889
+ if (aes->aes_set & AES_SET_MBS) {
3890
+ sc = archive_string_conversion_to_charset(a, "UTF-8", 1);
3891
+ if (sc == NULL)
3892
+ return (-1);/* Couldn't allocate memory for sc. */
3893
+ r = archive_strncpy_l(&(aes->aes_utf8), aes->aes_mbs.s,
3894
+ aes->aes_mbs.length, sc);
3895
+ if (a == NULL)
3896
+ free_sconv_object(sc);
3897
+ if (r == 0) {
3898
+ aes->aes_set |= AES_SET_UTF8;
3899
+ *p = aes->aes_utf8.s;
3900
+ return (0);/* success. */
3901
+ } else
3902
+ return (-1);/* failure. */
3903
+ }
3904
+ return (0);/* success. */
3905
+ }
3906
+
3907
+ int
3908
+ archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes,
3909
+ const char **p)
3910
+ {
3911
+ struct archive_string_conv *sc;
3912
+ int r, ret = 0;
3913
+
3914
+ /* If we already have an MBS form, return that immediately. */
3915
+ if (aes->aes_set & AES_SET_MBS) {
3916
+ *p = aes->aes_mbs.s;
3917
+ return (ret);
3918
+ }
3919
+
3920
+ *p = NULL;
3921
+ /* If there's a WCS form, try converting with the native locale. */
3922
+ if (aes->aes_set & AES_SET_WCS) {
3923
+ archive_string_empty(&(aes->aes_mbs));
3924
+ r = archive_string_append_from_wcs(&(aes->aes_mbs),
3925
+ aes->aes_wcs.s, aes->aes_wcs.length);
3926
+ *p = aes->aes_mbs.s;
3927
+ if (r == 0) {
3928
+ aes->aes_set |= AES_SET_MBS;
3929
+ return (ret);
3930
+ } else
3931
+ ret = -1;
3932
+ }
3933
+
3934
+ /* If there's a UTF-8 form, try converting with the native locale. */
3935
+ if (aes->aes_set & AES_SET_UTF8) {
3936
+ archive_string_empty(&(aes->aes_mbs));
3937
+ sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
3938
+ if (sc == NULL)
3939
+ return (-1);/* Couldn't allocate memory for sc. */
3940
+ r = archive_strncpy_l(&(aes->aes_mbs),
3941
+ aes->aes_utf8.s, aes->aes_utf8.length, sc);
3942
+ if (a == NULL)
3943
+ free_sconv_object(sc);
3944
+ *p = aes->aes_mbs.s;
3945
+ if (r == 0) {
3946
+ aes->aes_set |= AES_SET_MBS;
3947
+ ret = 0;/* success; overwrite previous error. */
3948
+ } else
3949
+ ret = -1;/* failure. */
3950
+ }
3951
+ return (ret);
3952
+ }
3953
+
3954
+ int
3955
+ archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes,
3956
+ const wchar_t **wp)
3957
+ {
3958
+ int r, ret = 0;
3959
+
3960
+ (void)a;/* UNUSED */
3961
+ /* Return WCS form if we already have it. */
3962
+ if (aes->aes_set & AES_SET_WCS) {
3963
+ *wp = aes->aes_wcs.s;
3964
+ return (ret);
3965
+ }
3966
+
3967
+ *wp = NULL;
3968
+ /* Try converting UTF8 to MBS first if MBS does not exist yet. */
3969
+ if ((aes->aes_set & AES_SET_MBS) == 0) {
3970
+ const char *p; /* unused */
3971
+ archive_mstring_get_mbs(a, aes, &p); /* ignore errors, we'll handle it later */
3972
+ }
3973
+ /* Try converting MBS to WCS using native locale. */
3974
+ if (aes->aes_set & AES_SET_MBS) {
3975
+ archive_wstring_empty(&(aes->aes_wcs));
3976
+ r = archive_wstring_append_from_mbs(&(aes->aes_wcs),
3977
+ aes->aes_mbs.s, aes->aes_mbs.length);
3978
+ if (r == 0) {
3979
+ aes->aes_set |= AES_SET_WCS;
3980
+ *wp = aes->aes_wcs.s;
3981
+ } else
3982
+ ret = -1;/* failure. */
3983
+ }
3984
+ return (ret);
3985
+ }
3986
+
3987
+ int
3988
+ archive_mstring_get_mbs_l(struct archive *a, struct archive_mstring *aes,
3989
+ const char **p, size_t *length, struct archive_string_conv *sc)
3990
+ {
3991
+ int ret = 0;
3992
+ #if defined(_WIN32) && !defined(__CYGWIN__)
3993
+ int r;
3994
+
3995
+ /*
3996
+ * Internationalization programming on Windows must use Wide
3997
+ * characters because Windows platform cannot make locale UTF-8.
3998
+ */
3999
+ if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) {
4000
+ archive_string_empty(&(aes->aes_mbs_in_locale));
4001
+ r = archive_string_append_from_wcs_in_codepage(
4002
+ &(aes->aes_mbs_in_locale), aes->aes_wcs.s,
4003
+ aes->aes_wcs.length, sc);
4004
+ if (r == 0) {
4005
+ *p = aes->aes_mbs_in_locale.s;
4006
+ if (length != NULL)
4007
+ *length = aes->aes_mbs_in_locale.length;
4008
+ return (0);
4009
+ } else if (errno == ENOMEM)
4010
+ return (-1);
4011
+ else
4012
+ ret = -1;
4013
+ }
4014
+ #endif
4015
+
4016
+ /* If there is not an MBS form but there is a WCS or UTF8 form, try converting
4017
+ * with the native locale to be used for translating it to specified
4018
+ * character-set. */
4019
+ if ((aes->aes_set & AES_SET_MBS) == 0) {
4020
+ const char *pm; /* unused */
4021
+ archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */
4022
+ }
4023
+ /* If we already have an MBS form, use it to be translated to
4024
+ * specified character-set. */
4025
+ if (aes->aes_set & AES_SET_MBS) {
4026
+ if (sc == NULL) {
4027
+ /* Conversion is unneeded. */
4028
+ *p = aes->aes_mbs.s;
4029
+ if (length != NULL)
4030
+ *length = aes->aes_mbs.length;
4031
+ return (0);
4032
+ }
4033
+ ret = archive_strncpy_l(&(aes->aes_mbs_in_locale),
4034
+ aes->aes_mbs.s, aes->aes_mbs.length, sc);
4035
+ *p = aes->aes_mbs_in_locale.s;
4036
+ if (length != NULL)
4037
+ *length = aes->aes_mbs_in_locale.length;
4038
+ } else {
4039
+ *p = NULL;
4040
+ if (length != NULL)
4041
+ *length = 0;
4042
+ }
4043
+ return (ret);
4044
+ }
4045
+
4046
+ int
4047
+ archive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs)
4048
+ {
4049
+ if (mbs == NULL) {
4050
+ aes->aes_set = 0;
4051
+ return (0);
4052
+ }
4053
+ return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs)));
4054
+ }
4055
+
4056
+ int
4057
+ archive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs,
4058
+ size_t len)
4059
+ {
4060
+ if (mbs == NULL) {
4061
+ aes->aes_set = 0;
4062
+ return (0);
4063
+ }
4064
+ aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */
4065
+ archive_strncpy(&(aes->aes_mbs), mbs, len);
4066
+ archive_string_empty(&(aes->aes_utf8));
4067
+ archive_wstring_empty(&(aes->aes_wcs));
4068
+ return (0);
4069
+ }
4070
+
4071
/*
 * Set `aes` from the NUL-terminated wide string `wcs`; a NULL pointer is
 * passed on with a length of 0.  Returns the result of
 * archive_mstring_copy_wcs_len().
 */
int
archive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs)
{
	size_t len = 0;

	if (wcs != NULL)
		len = wcslen(wcs);
	return archive_mstring_copy_wcs_len(aes, wcs, len);
}
4077
+
4078
+ int
4079
+ archive_mstring_copy_utf8(struct archive_mstring *aes, const char *utf8)
4080
+ {
4081
+ if (utf8 == NULL) {
4082
+ aes->aes_set = 0;
4083
+ return (0);
4084
+ }
4085
+ aes->aes_set = AES_SET_UTF8;
4086
+ archive_string_empty(&(aes->aes_mbs));
4087
+ archive_string_empty(&(aes->aes_wcs));
4088
+ archive_strncpy(&(aes->aes_utf8), utf8, strlen(utf8));
4089
+ return (int)strlen(utf8);
4090
+ }
4091
+
4092
+ int
4093
+ archive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs,
4094
+ size_t len)
4095
+ {
4096
+ if (wcs == NULL) {
4097
+ aes->aes_set = 0;
4098
+ return (0);
4099
+ }
4100
+ aes->aes_set = AES_SET_WCS; /* Only WCS form set. */
4101
+ archive_string_empty(&(aes->aes_mbs));
4102
+ archive_string_empty(&(aes->aes_utf8));
4103
+ archive_wstrncpy(&(aes->aes_wcs), wcs, len);
4104
+ return (0);
4105
+ }
4106
+
4107
/*
 * Set `aes` from `len` bytes of `mbs`, translating with the conversion
 * object `sc`.
 *
 * All cached forms are emptied first.  On non-Windows builds the text is
 * converted into the MBS form.  On Windows builds it is stored as MBS
 * when sc is NULL, and otherwise converted into the WCS form (going
 * through UTF-8 with iconv when sc->cd_w is usable).
 *
 * Returns 0 on success (also for a NULL `mbs`, which just marks the
 * multistring as unset); otherwise the failing conversion's status, or
 * -1 if the plain append fails.
 */
int
archive_mstring_copy_mbs_len_l(struct archive_mstring *aes,
    const char *mbs, size_t len, struct archive_string_conv *sc)
{
	int r;

	if (mbs == NULL) {
		aes->aes_set = 0;
		return (0);
	}
	/* Drop every cached form before storing the new value. */
	archive_string_empty(&(aes->aes_mbs));
	archive_wstring_empty(&(aes->aes_wcs));
	archive_string_empty(&(aes->aes_utf8));
#if defined(_WIN32) && !defined(__CYGWIN__)
	/*
	 * Internationalization programming on Windows must use Wide
	 * characters because Windows platform cannot make locale UTF-8.
	 */
	if (sc == NULL) {
		/* No conversion requested: keep the bytes as the MBS form. */
		if (archive_string_append(&(aes->aes_mbs),
			mbs, mbsnbytes(mbs, len)) == NULL) {
			aes->aes_set = 0;
			r = -1;
		} else {
			aes->aes_set = AES_SET_MBS;
			r = 0;
		}
#if defined(HAVE_ICONV)
	} else if (sc != NULL && sc->cd_w != (iconv_t)-1) {
		/*
		 * This case happens only when MultiByteToWideChar() cannot
		 * handle sc->from_cp, and we have to iconv in order to
		 * translate character-set to wchar_t,UTF-16.
		 */
		iconv_t cd = sc->cd;
		unsigned from_cp;
		int flag;

		/*
		 * Translate multi-bytes from some character-set to UTF-8.
		 * sc->cd is temporarily swapped for sc->cd_w and restored
		 * right after the call.
		 */
		sc->cd = sc->cd_w;
		r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc);
		sc->cd = cd;
		if (r != 0) {
			aes->aes_set = 0;
			return (r);
		}
		aes->aes_set = AES_SET_UTF8;

		/*
		 * Append the UTF-8 string into wstring.
		 * sc->flag and sc->from_cp are overridden for this call
		 * only and restored afterwards.
		 */
		flag = sc->flag;
		sc->flag &= ~(SCONV_NORMALIZATION_C
		    | SCONV_TO_UTF16| SCONV_FROM_UTF16);
		from_cp = sc->from_cp;
		sc->from_cp = CP_UTF8;
		r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
			aes->aes_utf8.s, aes->aes_utf8.length, sc);
		sc->flag = flag;
		sc->from_cp = from_cp;
		/* On failure the UTF-8 form stays marked as set. */
		if (r == 0)
			aes->aes_set |= AES_SET_WCS;
#endif
	} else {
		/* Convert straight from the source code page to WCS. */
		r = archive_wstring_append_from_mbs_in_codepage(
		    &(aes->aes_wcs), mbs, len, sc);
		if (r == 0)
			aes->aes_set = AES_SET_WCS;
		else
			aes->aes_set = 0;
	}
#else
	r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc);
	if (r == 0)
		aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */
	else
		aes->aes_set = 0;
#endif
	return (r);
}
4189
+
4190
+ /*
4191
+ * The 'update' form tries to proactively update all forms of
4192
+ * this string (WCS and MBS) and returns an error if any of
4193
+ * them fail. This is used by the 'pax' handler, for instance,
4194
+ * to detect and report character-conversion failures early while
4195
+ * still allowing clients to get potentially useful values from
4196
+ * the more tolerant lazy conversions. (get_mbs and get_wcs will
4197
+ * strive to give the user something useful, so you can get hopefully
4198
+ * usable values even if some of the character conversions are failing.)
4199
+ */
4200
+ int
4201
+ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
4202
+ const char *utf8)
4203
+ {
4204
+ struct archive_string_conv *sc;
4205
+ int r;
4206
+
4207
+ if (utf8 == NULL) {
4208
+ aes->aes_set = 0;
4209
+ return (0); /* Succeeded in clearing everything. */
4210
+ }
4211
+
4212
+ /* Save the UTF8 string. */
4213
+ archive_strcpy(&(aes->aes_utf8), utf8);
4214
+
4215
+ /* Empty the mbs and wcs strings. */
4216
+ archive_string_empty(&(aes->aes_mbs));
4217
+ archive_wstring_empty(&(aes->aes_wcs));
4218
+
4219
+ aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */
4220
+
4221
+ /* Try converting UTF-8 to MBS, return false on failure. */
4222
+ sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
4223
+ if (sc == NULL)
4224
+ return (-1);/* Couldn't allocate memory for sc. */
4225
+ r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
4226
+ if (a == NULL)
4227
+ free_sconv_object(sc);
4228
+ if (r != 0)
4229
+ return (-1);
4230
+ aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */
4231
+
4232
+ /* Try converting MBS to WCS, return false on failure. */
4233
+ if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s,
4234
+ aes->aes_mbs.length))
4235
+ return (-1);
4236
+ aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
4237
+
4238
+ /* All conversions succeeded. */
4239
+ return (0);
4240
+ }