libarchive-static 1.0.6 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/extconf.rb +2 -9
- data/ext/libarchive-0.1.1/ext/archive_read_support_compression.c +6 -6
- data/ext/libarchive-0.1.1/ext/archive_read_support_compression.o +0 -0
- data/ext/libarchive-0.1.1/ext/archive_read_support_format.o +0 -0
- data/ext/libarchive-0.1.1/ext/archive_write_open_rb_str.c +1 -1
- data/ext/libarchive-0.1.1/ext/archive_write_open_rb_str.o +0 -0
- data/ext/libarchive-0.1.1/ext/archive_write_set_compression.c +5 -5
- data/ext/libarchive-0.1.1/ext/archive_write_set_compression.o +0 -0
- data/ext/libarchive-0.1.1/ext/config.h +23 -0
- data/ext/libarchive-0.1.1/ext/config.log +230 -0
- data/ext/libarchive-0.1.1/ext/config.status +671 -0
- data/ext/libarchive-0.1.1/ext/libarchive.c +1 -1
- data/ext/libarchive-0.1.1/ext/libarchive.o +0 -0
- data/ext/libarchive-0.1.1/ext/libarchive_archive.c +7 -7
- data/ext/libarchive-0.1.1/ext/libarchive_archive.o +0 -0
- data/ext/libarchive-0.1.1/ext/libarchive_entry.c +6 -0
- data/ext/libarchive-0.1.1/ext/libarchive_entry.o +0 -0
- data/ext/libarchive-0.1.1/ext/libarchive_reader.c +6 -4
- data/ext/libarchive-0.1.1/ext/libarchive_reader.o +0 -0
- data/ext/libarchive-0.1.1/ext/libarchive_ruby.so +0 -0
- data/ext/libarchive-0.1.1/ext/libarchive_win32.h +1 -1
- data/ext/libarchive-0.1.1/ext/libarchive_writer.c +2 -2
- data/ext/libarchive-0.1.1/ext/libarchive_writer.o +0 -0
- data/ext/libarchive-3.6.2/Makefile.in +16892 -0
- data/ext/libarchive-3.6.2/build/autoconf/ax_append_compile_flags.m4 +67 -0
- data/ext/libarchive-3.6.2/build/autoconf/ax_append_flag.m4 +71 -0
- data/ext/libarchive-3.6.2/build/autoconf/ax_check_compile_flag.m4 +74 -0
- data/ext/libarchive-3.6.2/build/autoconf/ax_require_defined.m4 +37 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/build/autoconf/check_stdcall_func.m4 +0 -0
- data/ext/libarchive-3.6.2/build/autoconf/compile +348 -0
- data/ext/libarchive-3.6.2/build/autoconf/config.guess +1754 -0
- data/ext/libarchive-3.6.2/build/autoconf/config.rpath +696 -0
- data/ext/libarchive-3.6.2/build/autoconf/config.sub +1890 -0
- data/ext/libarchive-3.6.2/build/autoconf/depcomp +791 -0
- data/ext/libarchive-3.6.2/build/autoconf/iconv.m4 +271 -0
- data/ext/libarchive-3.6.2/build/autoconf/install-sh +541 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/build/autoconf/la_uid_t.m4 +0 -0
- data/ext/libarchive-3.6.2/build/autoconf/lib-ld.m4 +109 -0
- data/ext/libarchive-3.6.2/build/autoconf/lib-link.m4 +777 -0
- data/ext/libarchive-3.6.2/build/autoconf/lib-prefix.m4 +224 -0
- data/ext/libarchive-3.6.2/build/autoconf/ltmain.sh +11251 -0
- data/ext/libarchive-3.6.2/build/autoconf/m4_ax_compile_check_sizeof.m4 +115 -0
- data/ext/libarchive-3.6.2/build/autoconf/missing +215 -0
- data/ext/libarchive-3.6.2/build/autoconf/test-driver +153 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/build/pkgconfig/libarchive.pc.in +4 -1
- data/ext/libarchive-3.6.2/config.h.in +1504 -0
- data/ext/libarchive-3.6.2/configure +25558 -0
- data/ext/libarchive-3.6.2/libarchive/archive.h +1212 -0
- data/ext/libarchive-3.6.2/libarchive/archive_acl.c +2097 -0
- data/ext/libarchive-3.6.2/libarchive/archive_acl_private.h +83 -0
- data/ext/libarchive-3.6.2/libarchive/archive_blake2.h +197 -0
- data/ext/libarchive-3.6.2/libarchive/archive_blake2_impl.h +161 -0
- data/ext/libarchive-3.6.2/libarchive/archive_blake2s_ref.c +369 -0
- data/ext/libarchive-3.6.2/libarchive/archive_blake2sp_ref.c +361 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_check_magic.c +63 -22
- data/ext/libarchive-3.6.2/libarchive/archive_cmdline.c +227 -0
- data/ext/libarchive-3.6.2/libarchive/archive_cmdline_private.h +47 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_crc32.h +17 -0
- data/ext/libarchive-3.6.2/libarchive/archive_cryptor.c +534 -0
- data/ext/libarchive-3.6.2/libarchive/archive_cryptor_private.h +188 -0
- data/ext/libarchive-3.6.2/libarchive/archive_digest.c +1505 -0
- data/ext/libarchive-3.6.2/libarchive/archive_digest_private.h +416 -0
- data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_darwin.c +559 -0
- data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_freebsd.c +712 -0
- data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_linux.c +760 -0
- data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_sunos.c +824 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_endian.h +48 -15
- data/ext/libarchive-3.6.2/libarchive/archive_entry.c +2149 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry.h +305 -106
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_copy_bhfi.c +5 -4
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_copy_stat.c +9 -3
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_link_resolver.c +104 -62
- data/ext/libarchive-3.6.2/libarchive/archive_entry_locale.h +92 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_private.h +65 -49
- data/ext/libarchive-3.6.2/libarchive/archive_entry_sparse.c +156 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_stat.c +6 -6
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_strmode.c +1 -1
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_xattr.c +4 -6
- data/ext/libarchive-3.6.2/libarchive/archive_getdate.c +1165 -0
- data/ext/libarchive-3.6.2/libarchive/archive_getdate.h +39 -0
- data/ext/libarchive-3.6.2/libarchive/archive_hmac.c +334 -0
- data/ext/libarchive-3.6.2/libarchive/archive_hmac_private.h +117 -0
- data/ext/libarchive-3.6.2/libarchive/archive_match.c +1875 -0
- data/ext/libarchive-3.6.2/libarchive/archive_openssl_evp_private.h +53 -0
- data/ext/libarchive-3.6.2/libarchive/archive_openssl_hmac_private.h +54 -0
- data/ext/libarchive-3.6.2/libarchive/archive_options.c +218 -0
- data/ext/libarchive-3.6.2/libarchive/archive_options_private.h +51 -0
- data/ext/libarchive-3.6.2/libarchive/archive_pack_dev.c +337 -0
- data/ext/libarchive-3.6.2/libarchive/archive_pack_dev.h +49 -0
- data/ext/libarchive-3.6.2/libarchive/archive_pathmatch.c +463 -0
- data/ext/libarchive-3.6.2/libarchive/archive_pathmatch.h +52 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_platform.h +77 -9
- data/ext/libarchive-3.6.2/libarchive/archive_platform_acl.h +55 -0
- data/ext/libarchive-3.6.2/libarchive/archive_platform_xattr.h +47 -0
- data/ext/libarchive-3.6.2/libarchive/archive_ppmd7.c +1168 -0
- data/ext/libarchive-3.6.2/libarchive/archive_ppmd7_private.h +119 -0
- data/ext/libarchive-3.6.2/libarchive/archive_ppmd8.c +1287 -0
- data/ext/libarchive-3.6.2/libarchive/archive_ppmd8_private.h +148 -0
- data/ext/libarchive-3.6.2/libarchive/archive_ppmd_private.h +151 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_private.h +74 -18
- data/ext/libarchive-3.6.2/libarchive/archive_random.c +272 -0
- data/ext/libarchive-3.6.2/libarchive/archive_random_private.h +36 -0
- data/ext/libarchive-3.6.2/libarchive/archive_rb.c +709 -0
- data/ext/libarchive-3.6.2/libarchive/archive_rb.h +113 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read.c +1756 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_add_passphrase.c +190 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_append_filter.c +204 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_data_into_fd.c +64 -18
- data/ext/libarchive-3.6.2/libarchive/archive_read_disk_entry_from_file.c +1086 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_disk_posix.c +2732 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_disk_private.h +40 -4
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_disk_set_standard_lookup.c +21 -11
- data/ext/libarchive-3.6.2/libarchive/archive_read_disk_windows.c +2479 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_extract.c +60 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_extract.c → libarchive-3.6.2/libarchive/archive_read_extract2.c} +34 -61
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_open_fd.c +70 -49
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_open_file.c +38 -23
- data/ext/libarchive-3.6.2/libarchive/archive_read_open_filename.c +586 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_open_memory.c +58 -28
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_private.h +127 -59
- data/ext/libarchive-3.6.2/libarchive/archive_read_set_format.c +117 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_set_options.c +133 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_all.c → libarchive-3.6.2/libarchive/archive_read_support_filter_all.c} +35 -10
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_by_code.c +83 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_bzip2.c → libarchive-3.6.2/libarchive/archive_read_support_filter_bzip2.c} +38 -26
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_compress.c → libarchive-3.6.2/libarchive/archive_read_support_filter_compress.c} +52 -44
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_grzip.c +112 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_gzip.c → libarchive-3.6.2/libarchive/archive_read_support_filter_gzip.c} +108 -37
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_lrzip.c +122 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_lz4.c +742 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_lzop.c +499 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_none.c → libarchive-3.6.2/libarchive/archive_read_support_filter_none.c} +15 -3
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_program.c → libarchive-3.6.2/libarchive/archive_read_support_filter_program.c} +114 -77
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_rpm.c → libarchive-3.6.2/libarchive/archive_read_support_filter_rpm.c} +31 -31
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_uu.c → libarchive-3.6.2/libarchive/archive_read_support_filter_uu.c} +141 -85
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_xz.c → libarchive-3.6.2/libarchive/archive_read_support_filter_xz.c} +369 -284
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_zstd.c +297 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_7zip.c +3900 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_all.c +89 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_ar.c +126 -72
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_by_code.c +92 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_cab.c +3228 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_cpio.c +1104 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_empty.c +14 -11
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_iso9660.c +990 -541
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_lha.c +2916 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_mtree.c +2150 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_rar.c +3797 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_rar5.c +4251 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_raw.c +38 -31
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_tar.c +1157 -629
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_warc.c +848 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_xar.c +439 -258
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_zip.c +4270 -0
- data/ext/libarchive-3.6.2/libarchive/archive_string.c +4240 -0
- data/ext/libarchive-3.6.2/libarchive/archive_string.h +243 -0
- data/ext/libarchive-3.6.2/libarchive/archive_string_composition.h +2292 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_string_sprintf.c +44 -16
- data/ext/libarchive-3.6.2/libarchive/archive_util.c +655 -0
- data/ext/libarchive-3.6.2/libarchive/archive_version_details.c +151 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_virtual.c +85 -16
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_windows.c +214 -541
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_windows.h +74 -106
- data/ext/libarchive-3.6.2/libarchive/archive_write.c +828 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter.c +72 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_b64encode.c +304 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_by_name.c +77 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_bzip2.c +401 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_write_set_compression_compress.c → libarchive-3.6.2/libarchive/archive_write_add_filter_compress.c} +86 -131
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_grzip.c +135 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_gzip.c +442 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_lrzip.c +197 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_lz4.c +700 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_lzop.c +478 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_format_all.c → libarchive-3.6.2/libarchive/archive_write_add_filter_none.c} +11 -11
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_program.c +391 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_uuencode.c +295 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_xz.c +545 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_zstd.c +418 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_disk_posix.c +4711 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_disk_private.h +9 -2
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_disk_set_standard_lookup.c +30 -29
- data/ext/libarchive-3.6.2/libarchive/archive_write_disk_windows.c +2842 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_fd.c +15 -10
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_file.c +15 -9
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_filename.c +128 -20
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_memory.c +7 -18
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_private.h +72 -29
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format.c +56 -3
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_7zip.c +2322 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_ar.c +54 -34
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_by_name.c +20 -2
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio.c +11 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio_binary.c +610 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio_newc.c +457 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio_odc.c +500 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_filter_by_ext.c +142 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_gnutar.c +755 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_iso9660.c +8165 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_mtree.c +2217 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_pax.c +1049 -387
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_private.h +42 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_raw.c +125 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_shar.c +62 -47
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_ustar.c +279 -108
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_v7tar.c +638 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_warc.c +453 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_xar.c +3259 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_zip.c +1704 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_options.c +130 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_passphrase.c +95 -0
- data/ext/libarchive-3.6.2/libarchive/archive_xxhash.h +48 -0
- data/ext/libarchive-3.6.2/libarchive/config_freebsd.h +271 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/filter_fork.h +10 -5
- data/ext/{libarchive-2.8.4/libarchive/filter_fork.c → libarchive-3.6.2/libarchive/filter_fork_posix.c} +98 -19
- data/ext/libarchive-3.6.2/libarchive/filter_fork_windows.c +236 -0
- data/ext/libarchive-3.6.2/libarchive/xxhash.c +525 -0
- data/ext/libarchive-static-makefile +144 -80
- data/ext/libarchive-static-wrapper-makefile +1 -1
- data/ext/zlib-1.2.13/Makefile.in +404 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/adler32.c +51 -34
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/compress.c +27 -21
- data/ext/zlib-1.2.13/configure +922 -0
- data/ext/zlib-1.2.13/crc32.c +1125 -0
- data/ext/zlib-1.2.13/crc32.h +9446 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/deflate.c +842 -459
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/deflate.h +37 -33
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/gzclose.c +0 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/gzguts.h +103 -16
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/gzlib.c +155 -53
- data/ext/zlib-1.2.13/gzread.c +650 -0
- data/ext/zlib-1.2.13/gzwrite.c +677 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/infback.c +24 -12
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inffast.c +49 -66
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inffast.h +0 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inffixed.h +3 -3
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inflate.c +209 -94
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inflate.h +9 -5
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inftrees.c +24 -50
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inftrees.h +1 -1
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/trees.c +135 -198
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/trees.h +0 -0
- data/ext/zlib-1.2.13/uncompr.c +93 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/zconf.h +182 -63
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/zlib.h +617 -295
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/zutil.c +50 -41
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/zutil.h +83 -82
- metadata +241 -133
- data/ext/libarchive-0.1.1/libarchive.c +0 -1762
- data/ext/libarchive-2.8.4/Makefile.in +0 -7076
- data/ext/libarchive-2.8.4/build/autoconf/compile +0 -143
- data/ext/libarchive-2.8.4/build/autoconf/config.guess +0 -1502
- data/ext/libarchive-2.8.4/build/autoconf/config.sub +0 -1708
- data/ext/libarchive-2.8.4/build/autoconf/depcomp +0 -630
- data/ext/libarchive-2.8.4/build/autoconf/install-sh +0 -291
- data/ext/libarchive-2.8.4/build/autoconf/ltmain.sh +0 -8406
- data/ext/libarchive-2.8.4/build/autoconf/missing +0 -376
- data/ext/libarchive-2.8.4/config.h.in +0 -772
- data/ext/libarchive-2.8.4/configure +0 -17916
- data/ext/libarchive-2.8.4/libarchive/archive.h +0 -741
- data/ext/libarchive-2.8.4/libarchive/archive_entry.c +0 -2202
- data/ext/libarchive-2.8.4/libarchive/archive_hash.h +0 -281
- data/ext/libarchive-2.8.4/libarchive/archive_read.c +0 -1249
- data/ext/libarchive-2.8.4/libarchive/archive_read_disk.c +0 -198
- data/ext/libarchive-2.8.4/libarchive/archive_read_disk_entry_from_file.c +0 -570
- data/ext/libarchive-2.8.4/libarchive/archive_read_open_filename.c +0 -272
- data/ext/libarchive-2.8.4/libarchive/archive_read_support_format_cpio.c +0 -777
- data/ext/libarchive-2.8.4/libarchive/archive_read_support_format_mtree.c +0 -1304
- data/ext/libarchive-2.8.4/libarchive/archive_read_support_format_zip.c +0 -903
- data/ext/libarchive-2.8.4/libarchive/archive_string.c +0 -453
- data/ext/libarchive-2.8.4/libarchive/archive_string.h +0 -148
- data/ext/libarchive-2.8.4/libarchive/archive_util.c +0 -391
- data/ext/libarchive-2.8.4/libarchive/archive_write.c +0 -466
- data/ext/libarchive-2.8.4/libarchive/archive_write_disk.c +0 -2628
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_bzip2.c +0 -408
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_gzip.c +0 -477
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_none.c +0 -257
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_program.c +0 -347
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_xz.c +0 -438
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_cpio.c +0 -344
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_cpio_newc.c +0 -295
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_mtree.c +0 -1050
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_zip.c +0 -667
- data/ext/libarchive-2.8.4/libarchive/config_freebsd.h +0 -154
- data/ext/libarchive-2.8.4/libarchive/filter_fork_windows.c +0 -113
- data/ext/zlib-1.2.5/Makefile.in +0 -257
- data/ext/zlib-1.2.5/configure +0 -596
- data/ext/zlib-1.2.5/crc32.c +0 -442
- data/ext/zlib-1.2.5/crc32.h +0 -441
- data/ext/zlib-1.2.5/example.c +0 -565
- data/ext/zlib-1.2.5/gzread.c +0 -653
- data/ext/zlib-1.2.5/gzwrite.c +0 -531
- data/ext/zlib-1.2.5/minigzip.c +0 -440
- data/ext/zlib-1.2.5/uncompr.c +0 -59
@@ -0,0 +1,4240 @@
|
|
1
|
+
/*-
|
2
|
+
* Copyright (c) 2003-2011 Tim Kientzle
|
3
|
+
* Copyright (c) 2011-2012 Michihiro NAKAJIMA
|
4
|
+
* All rights reserved.
|
5
|
+
*
|
6
|
+
* Redistribution and use in source and binary forms, with or without
|
7
|
+
* modification, are permitted provided that the following conditions
|
8
|
+
* are met:
|
9
|
+
* 1. Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* 2. Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
|
16
|
+
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
17
|
+
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
18
|
+
* IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
|
19
|
+
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
20
|
+
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
21
|
+
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
22
|
+
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
23
|
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
24
|
+
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25
|
+
*/
|
26
|
+
|
27
|
+
#include "archive_platform.h"
|
28
|
+
__FBSDID("$FreeBSD: head/lib/libarchive/archive_string.c 201095 2009-12-28 02:33:22Z kientzle $");
|
29
|
+
|
30
|
+
/*
|
31
|
+
* Basic resizable string support, to simplify manipulating arbitrary-sized
|
32
|
+
* strings while minimizing heap activity.
|
33
|
+
*
|
34
|
+
* In particular, the buffer used by a string object is only grown, it
|
35
|
+
* never shrinks, so you can clear and reuse the same string object
|
36
|
+
* without incurring additional memory allocations.
|
37
|
+
*/
|
38
|
+
|
39
|
+
#ifdef HAVE_ERRNO_H
|
40
|
+
#include <errno.h>
|
41
|
+
#endif
|
42
|
+
#ifdef HAVE_ICONV_H
|
43
|
+
#include <iconv.h>
|
44
|
+
#endif
|
45
|
+
#ifdef HAVE_LANGINFO_H
|
46
|
+
#include <langinfo.h>
|
47
|
+
#endif
|
48
|
+
#ifdef HAVE_LOCALCHARSET_H
|
49
|
+
#include <localcharset.h>
|
50
|
+
#endif
|
51
|
+
#ifdef HAVE_STDLIB_H
|
52
|
+
#include <stdlib.h>
|
53
|
+
#endif
|
54
|
+
#ifdef HAVE_STRING_H
|
55
|
+
#include <string.h>
|
56
|
+
#endif
|
57
|
+
#ifdef HAVE_WCHAR_H
|
58
|
+
#include <wchar.h>
|
59
|
+
#endif
|
60
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
61
|
+
#include <windows.h>
|
62
|
+
#include <locale.h>
|
63
|
+
#endif
|
64
|
+
|
65
|
+
#include "archive_endian.h"
|
66
|
+
#include "archive_private.h"
|
67
|
+
#include "archive_string.h"
|
68
|
+
#include "archive_string_composition.h"
|
69
|
+
|
70
|
+
#if !defined(HAVE_WMEMCPY) && !defined(wmemcpy)
|
71
|
+
#define wmemcpy(a,b,i) (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t))
|
72
|
+
#endif
|
73
|
+
|
74
|
+
#if !defined(HAVE_WMEMMOVE) && !defined(wmemmove)
|
75
|
+
#define wmemmove(a,b,i) (wchar_t *)memmove((a), (b), (i) * sizeof(wchar_t))
|
76
|
+
#endif
|
77
|
+
|
78
|
+
#undef max
|
79
|
+
#define max(a, b) ((a)>(b)?(a):(b))
|
80
|
+
|
81
|
+
struct archive_string_conv {
|
82
|
+
struct archive_string_conv *next;
|
83
|
+
char *from_charset;
|
84
|
+
char *to_charset;
|
85
|
+
unsigned from_cp;
|
86
|
+
unsigned to_cp;
|
87
|
+
/* Set 1 if from_charset and to_charset are the same. */
|
88
|
+
int same;
|
89
|
+
int flag;
|
90
|
+
#define SCONV_TO_CHARSET 1 /* MBS is being converted to specified
|
91
|
+
* charset. */
|
92
|
+
#define SCONV_FROM_CHARSET (1<<1) /* MBS is being converted from
|
93
|
+
* specified charset. */
|
94
|
+
#define SCONV_BEST_EFFORT (1<<2) /* Copy at least ASCII code. */
|
95
|
+
#define SCONV_WIN_CP (1<<3) /* Use Windows API for converting
|
96
|
+
* MBS. */
|
97
|
+
#define SCONV_UTF8_LIBARCHIVE_2 (1<<4) /* Incorrect UTF-8 made by libarchive
|
98
|
+
* 2.x in the wrong assumption. */
|
99
|
+
#define SCONV_NORMALIZATION_C (1<<6) /* Need normalization to be Form C.
|
100
|
+
* Before UTF-8 characters are actually
|
101
|
+
* processed. */
|
102
|
+
#define SCONV_NORMALIZATION_D (1<<7) /* Need normalization to be Form D.
|
103
|
+
* Before UTF-8 characters are actually
|
104
|
+
* processed.
|
105
|
+
* Currently this only for MAC OS X. */
|
106
|
+
#define SCONV_TO_UTF8 (1<<8) /* "to charset" side is UTF-8. */
|
107
|
+
#define SCONV_FROM_UTF8 (1<<9) /* "from charset" side is UTF-8. */
|
108
|
+
#define SCONV_TO_UTF16BE (1<<10) /* "to charset" side is UTF-16BE. */
|
109
|
+
#define SCONV_FROM_UTF16BE (1<<11) /* "from charset" side is UTF-16BE. */
|
110
|
+
#define SCONV_TO_UTF16LE (1<<12) /* "to charset" side is UTF-16LE. */
|
111
|
+
#define SCONV_FROM_UTF16LE (1<<13) /* "from charset" side is UTF-16LE. */
|
112
|
+
#define SCONV_TO_UTF16 (SCONV_TO_UTF16BE | SCONV_TO_UTF16LE)
|
113
|
+
#define SCONV_FROM_UTF16 (SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE)
|
114
|
+
|
115
|
+
#if HAVE_ICONV
|
116
|
+
iconv_t cd;
|
117
|
+
iconv_t cd_w;/* Use at archive_mstring on
|
118
|
+
* Windows. */
|
119
|
+
#endif
|
120
|
+
/* A temporary buffer for normalization. */
|
121
|
+
struct archive_string utftmp;
|
122
|
+
int (*converter[2])(struct archive_string *, const void *, size_t,
|
123
|
+
struct archive_string_conv *);
|
124
|
+
int nconverter;
|
125
|
+
};
|
126
|
+
|
127
|
+
#define CP_C_LOCALE 0 /* "C" locale only for this file. */
|
128
|
+
#define CP_UTF16LE 1200
|
129
|
+
#define CP_UTF16BE 1201
|
130
|
+
|
131
|
+
#define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF)
|
132
|
+
#define IS_LOW_SURROGATE_LA(uc) ((uc) >= 0xDC00 && (uc) <= 0xDFFF)
|
133
|
+
#define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF)
|
134
|
+
#define UNICODE_MAX 0x10FFFF
|
135
|
+
#define UNICODE_R_CHAR 0xFFFD /* Replacement character. */
|
136
|
+
/*
 * U+FFFD (REPLACEMENT CHARACTER) encoded in UTF-8: EF BF BD.
 * Not NUL-terminated; the array is exactly three bytes long.
 *
 * Hexadecimal-escape character constants are used instead of the integer
 * constants 0xef/0xbf/0xbd: those exceed CHAR_MAX where plain char is
 * signed, so initializing a char from them relies on an
 * implementation-defined conversion and triggers constant-conversion
 * warnings.  The stored bytes are the same.
 */
static const char utf8_replacement_char[] = {'\xef', '\xbf', '\xbd'};
|
138
|
+
|
139
|
+
static struct archive_string_conv *find_sconv_object(struct archive *,
|
140
|
+
const char *, const char *);
|
141
|
+
static void add_sconv_object(struct archive *, struct archive_string_conv *);
|
142
|
+
static struct archive_string_conv *create_sconv_object(const char *,
|
143
|
+
const char *, unsigned, int);
|
144
|
+
static void free_sconv_object(struct archive_string_conv *);
|
145
|
+
static struct archive_string_conv *get_sconv_object(struct archive *,
|
146
|
+
const char *, const char *, int);
|
147
|
+
static unsigned make_codepage_from_charset(const char *);
|
148
|
+
static unsigned get_current_codepage(void);
|
149
|
+
static unsigned get_current_oemcp(void);
|
150
|
+
static size_t mbsnbytes(const void *, size_t);
|
151
|
+
static size_t utf16nbytes(const void *, size_t);
|
152
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
153
|
+
static int archive_wstring_append_from_mbs_in_codepage(
|
154
|
+
struct archive_wstring *, const char *, size_t,
|
155
|
+
struct archive_string_conv *);
|
156
|
+
static int archive_string_append_from_wcs_in_codepage(struct archive_string *,
|
157
|
+
const wchar_t *, size_t, struct archive_string_conv *);
|
158
|
+
static int is_big_endian(void);
|
159
|
+
static int strncat_in_codepage(struct archive_string *, const void *,
|
160
|
+
size_t, struct archive_string_conv *);
|
161
|
+
static int win_strncat_from_utf16be(struct archive_string *, const void *,
|
162
|
+
size_t, struct archive_string_conv *);
|
163
|
+
static int win_strncat_from_utf16le(struct archive_string *, const void *,
|
164
|
+
size_t, struct archive_string_conv *);
|
165
|
+
static int win_strncat_to_utf16be(struct archive_string *, const void *,
|
166
|
+
size_t, struct archive_string_conv *);
|
167
|
+
static int win_strncat_to_utf16le(struct archive_string *, const void *,
|
168
|
+
size_t, struct archive_string_conv *);
|
169
|
+
#endif
|
170
|
+
static int best_effort_strncat_from_utf16be(struct archive_string *,
|
171
|
+
const void *, size_t, struct archive_string_conv *);
|
172
|
+
static int best_effort_strncat_from_utf16le(struct archive_string *,
|
173
|
+
const void *, size_t, struct archive_string_conv *);
|
174
|
+
static int best_effort_strncat_to_utf16be(struct archive_string *,
|
175
|
+
const void *, size_t, struct archive_string_conv *);
|
176
|
+
static int best_effort_strncat_to_utf16le(struct archive_string *,
|
177
|
+
const void *, size_t, struct archive_string_conv *);
|
178
|
+
#if defined(HAVE_ICONV)
|
179
|
+
static int iconv_strncat_in_locale(struct archive_string *, const void *,
|
180
|
+
size_t, struct archive_string_conv *);
|
181
|
+
#endif
|
182
|
+
static int best_effort_strncat_in_locale(struct archive_string *,
|
183
|
+
const void *, size_t, struct archive_string_conv *);
|
184
|
+
static int _utf8_to_unicode(uint32_t *, const char *, size_t);
|
185
|
+
static int utf8_to_unicode(uint32_t *, const char *, size_t);
|
186
|
+
static inline uint32_t combine_surrogate_pair(uint32_t, uint32_t);
|
187
|
+
static int cesu8_to_unicode(uint32_t *, const char *, size_t);
|
188
|
+
static size_t unicode_to_utf8(char *, size_t, uint32_t);
|
189
|
+
static int utf16_to_unicode(uint32_t *, const char *, size_t, int);
|
190
|
+
static size_t unicode_to_utf16be(char *, size_t, uint32_t);
|
191
|
+
static size_t unicode_to_utf16le(char *, size_t, uint32_t);
|
192
|
+
static int strncat_from_utf8_libarchive2(struct archive_string *,
|
193
|
+
const void *, size_t, struct archive_string_conv *);
|
194
|
+
static int strncat_from_utf8_to_utf8(struct archive_string *, const void *,
|
195
|
+
size_t, struct archive_string_conv *);
|
196
|
+
static int archive_string_normalize_C(struct archive_string *, const void *,
|
197
|
+
size_t, struct archive_string_conv *);
|
198
|
+
static int archive_string_normalize_D(struct archive_string *, const void *,
|
199
|
+
size_t, struct archive_string_conv *);
|
200
|
+
static int archive_string_append_unicode(struct archive_string *,
|
201
|
+
const void *, size_t, struct archive_string_conv *);
|
202
|
+
|
203
|
+
static struct archive_string *
|
204
|
+
archive_string_append(struct archive_string *as, const char *p, size_t s)
|
205
|
+
{
|
206
|
+
if (archive_string_ensure(as, as->length + s + 1) == NULL)
|
207
|
+
return (NULL);
|
208
|
+
if (s)
|
209
|
+
memmove(as->s + as->length, p, s);
|
210
|
+
as->length += s;
|
211
|
+
as->s[as->length] = 0;
|
212
|
+
return (as);
|
213
|
+
}
|
214
|
+
|
215
|
+
static struct archive_wstring *
|
216
|
+
archive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s)
|
217
|
+
{
|
218
|
+
if (archive_wstring_ensure(as, as->length + s + 1) == NULL)
|
219
|
+
return (NULL);
|
220
|
+
if (s)
|
221
|
+
wmemmove(as->s + as->length, p, s);
|
222
|
+
as->length += s;
|
223
|
+
as->s[as->length] = 0;
|
224
|
+
return (as);
|
225
|
+
}
|
226
|
+
|
227
|
+
/*
 * Non-static entry point for archive_string_append(): appends `s` bytes
 * from `p` to `as`.  Returns `as`, or NULL if the append failed.
 */
struct archive_string *
archive_array_append(struct archive_string *as, const char *p, size_t s)
{
	struct archive_string *result;

	result = archive_string_append(as, p, s);
	return (result);
}
|
232
|
+
|
233
|
+
void
|
234
|
+
archive_string_concat(struct archive_string *dest, struct archive_string *src)
|
235
|
+
{
|
236
|
+
if (archive_string_append(dest, src->s, src->length) == NULL)
|
237
|
+
__archive_errx(1, "Out of memory");
|
238
|
+
}
|
239
|
+
|
240
|
+
void
|
241
|
+
archive_wstring_concat(struct archive_wstring *dest,
|
242
|
+
struct archive_wstring *src)
|
243
|
+
{
|
244
|
+
if (archive_wstring_append(dest, src->s, src->length) == NULL)
|
245
|
+
__archive_errx(1, "Out of memory");
|
246
|
+
}
|
247
|
+
|
248
|
+
void
|
249
|
+
archive_string_free(struct archive_string *as)
|
250
|
+
{
|
251
|
+
as->length = 0;
|
252
|
+
as->buffer_length = 0;
|
253
|
+
free(as->s);
|
254
|
+
as->s = NULL;
|
255
|
+
}
|
256
|
+
|
257
|
+
void
|
258
|
+
archive_wstring_free(struct archive_wstring *as)
|
259
|
+
{
|
260
|
+
as->length = 0;
|
261
|
+
as->buffer_length = 0;
|
262
|
+
free(as->s);
|
263
|
+
as->s = NULL;
|
264
|
+
}
|
265
|
+
|
266
|
+
/*
 * Make sure `as` can hold at least `s` wide characters.
 *
 * The request is converted to a byte count and forwarded to
 * archive_string_ensure().  Returns `as` on success.  On failure the
 * string is wiped, errno is set to ENOMEM and NULL is returned.
 */
struct archive_wstring *
archive_wstring_ensure(struct archive_wstring *as, size_t s)
{
	/*
	 * Reject requests whose byte size would wrap around size_t;
	 * otherwise a huge `s` could turn into a tiny allocation that
	 * still reports success.  Fail the same way archive_string_ensure()
	 * does: wipe the string and set ENOMEM.
	 */
	if (s > ((size_t)-1) / sizeof(wchar_t)) {
		archive_wstring_free(as);
		errno = ENOMEM;
		return (NULL);
	}
	return (struct archive_wstring *)
	    archive_string_ensure((struct archive_string *)as,
	    s * sizeof(wchar_t));
}
|
273
|
+
|
274
|
+
/* Returns NULL on any allocation failure. */
|
275
|
+
struct archive_string *
|
276
|
+
archive_string_ensure(struct archive_string *as, size_t s)
|
277
|
+
{
|
278
|
+
char *p;
|
279
|
+
size_t new_length;
|
280
|
+
|
281
|
+
/* If buffer is already big enough, don't reallocate. */
|
282
|
+
if (as->s && (s <= as->buffer_length))
|
283
|
+
return (as);
|
284
|
+
|
285
|
+
/*
|
286
|
+
* Growing the buffer at least exponentially ensures that
|
287
|
+
* append operations are always linear in the number of
|
288
|
+
* characters appended. Using a smaller growth rate for
|
289
|
+
* larger buffers reduces memory waste somewhat at the cost of
|
290
|
+
* a larger constant factor.
|
291
|
+
*/
|
292
|
+
if (as->buffer_length < 32)
|
293
|
+
/* Start with a minimum 32-character buffer. */
|
294
|
+
new_length = 32;
|
295
|
+
else if (as->buffer_length < 8192)
|
296
|
+
/* Buffers under 8k are doubled for speed. */
|
297
|
+
new_length = as->buffer_length + as->buffer_length;
|
298
|
+
else {
|
299
|
+
/* Buffers 8k and over grow by at least 25% each time. */
|
300
|
+
new_length = as->buffer_length + as->buffer_length / 4;
|
301
|
+
/* Be safe: If size wraps, fail. */
|
302
|
+
if (new_length < as->buffer_length) {
|
303
|
+
/* On failure, wipe the string and return NULL. */
|
304
|
+
archive_string_free(as);
|
305
|
+
errno = ENOMEM;/* Make sure errno has ENOMEM. */
|
306
|
+
return (NULL);
|
307
|
+
}
|
308
|
+
}
|
309
|
+
/*
|
310
|
+
* The computation above is a lower limit to how much we'll
|
311
|
+
* grow the buffer. In any case, we have to grow it enough to
|
312
|
+
* hold the request.
|
313
|
+
*/
|
314
|
+
if (new_length < s)
|
315
|
+
new_length = s;
|
316
|
+
/* Now we can reallocate the buffer. */
|
317
|
+
p = (char *)realloc(as->s, new_length);
|
318
|
+
if (p == NULL) {
|
319
|
+
/* On failure, wipe the string and return NULL. */
|
320
|
+
archive_string_free(as);
|
321
|
+
errno = ENOMEM;/* Make sure errno has ENOMEM. */
|
322
|
+
return (NULL);
|
323
|
+
}
|
324
|
+
|
325
|
+
as->s = p;
|
326
|
+
as->buffer_length = new_length;
|
327
|
+
return (as);
|
328
|
+
}
|
329
|
+
|
330
|
+
/*
 * Append at most `n` bytes of the NUL-terminated string `_p` to `as`.
 *
 * The source is scanned once to find its bounded length and then copied
 * by archive_string_append().  A failed append is reported through
 * __archive_errx().
 *
 * TODO: See if the double pass over the source can be avoided, and
 * measure whether that helps; arguments are almost always short.
 */
struct archive_string *
archive_strncat(struct archive_string *as, const void *_p, size_t n)
{
	const char *src = (const char *)_p;
	size_t len;

	/* Bounded strlen(): stop at the terminator or after n bytes. */
	for (len = 0; len < n && src[len] != '\0'; len++)
		continue;

	as = archive_string_append(as, src, len);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
|
356
|
+
|
357
|
+
/*
 * Append at most `n` wide characters of the zero-terminated wide string
 * `p` to `as`.  A failed append is reported through __archive_errx().
 */
struct archive_wstring *
archive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n)
{
	size_t len;

	/* Bounded wcslen(): stop at the terminator or after n characters. */
	for (len = 0; len < n && p[len] != 0; len++)
		continue;

	as = archive_wstring_append(as, p, len);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
|
374
|
+
|
375
|
+
/*
 * Append the NUL-terminated string `p` to `as`.
 *
 * Implemented as archive_strncat() with a 16MB cap, on the assumption
 * that no caller passes a longer source; anything beyond the cap is not
 * copied.
 *
 * TODO: Review all uses of strcat in the source and try to replace
 * them with strncat().
 */
struct archive_string *
archive_strcat(struct archive_string *as, const void *p)
{
	const size_t limit = 0x1000000;

	return (archive_strncat(as, p, limit));
}
|
386
|
+
|
387
|
+
/*
 * Append the zero-terminated wide string `p` to `as`.
 * Implemented as archive_wstrncat() with a cap of 0x1000000 wide
 * characters; anything beyond the cap is not copied.
 */
struct archive_wstring *
archive_wstrcat(struct archive_wstring *as, const wchar_t *p)
{
	const size_t limit = 0x1000000;

	return (archive_wstrncat(as, p, limit));
}
|
393
|
+
|
394
|
+
/*
 * Append the single byte `c` to `as`.
 * A failed append is reported through __archive_errx().
 */
struct archive_string *
archive_strappend_char(struct archive_string *as, char c)
{
	struct archive_string *result;

	result = archive_string_append(as, &c, 1);
	if (result == NULL)
		__archive_errx(1, "Out of memory");
	return (result);
}
|
401
|
+
|
402
|
+
/*
 * Append the single wide character `c` to `as`.
 * A failed append is reported through __archive_errx().
 */
struct archive_wstring *
archive_wstrappend_wchar(struct archive_wstring *as, wchar_t c)
{
	struct archive_wstring *result;

	result = archive_wstring_append(as, &c, 1);
	if (result == NULL)
		__archive_errx(1, "Out of memory");
	return (result);
}
|
409
|
+
|
410
|
+
/*
 * Pick the character-set name to hand to iconv.
 *
 * A non-empty `charset` supplied by the caller is returned unchanged.
 * Otherwise the name of the current locale's character set is looked up:
 *  - with locale_charset() when it is available (not on Mac OS, where it
 *    is broken); GNU libiconv does not recognize some of the names that
 *    nl_langinfo(CODESET) returns, so locale_charset() is preferred;
 *  - else with nl_langinfo(CODESET), which iconv on Mac OS 10.6 needs
 *    because it does not handle the empty name correctly;
 *  - else as the empty name "", which on FreeBSD selects the encoding
 *    of the current locale.
 */
static const char *
default_iconv_charset(const char *charset) {
	const int caller_supplied = (charset != NULL && *charset != '\0');

	if (!caller_supplied) {
#if HAVE_LOCALE_CHARSET && !defined(__APPLE__)
		/* locale_charset() is broken on Mac OS */
		charset = locale_charset();
#elif HAVE_NL_LANGINFO
		charset = nl_langinfo(CODESET);
#else
		charset = "";
#endif
	}
	return charset;
}
|
436
|
+
|
437
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
438
|
+
|
439
|
+
/*
 * Convert a multibyte string to wide characters and append it to `dest`.
 * Delegates to archive_wstring_append_from_mbs_in_codepage() without a
 * conversion object.
 * Note: returns -1 if conversion fails.
 */
int
archive_wstring_append_from_mbs(struct archive_wstring *dest,
    const char *p, size_t len)
{
	int status;

	status = archive_wstring_append_from_mbs_in_codepage(dest, p, len,
	    NULL);
	return (status);
}
|
449
|
+
|
450
|
+
static int
|
451
|
+
archive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest,
|
452
|
+
const char *s, size_t length, struct archive_string_conv *sc)
|
453
|
+
{
|
454
|
+
int count, ret = 0;
|
455
|
+
UINT from_cp;
|
456
|
+
|
457
|
+
if (sc != NULL)
|
458
|
+
from_cp = sc->from_cp;
|
459
|
+
else
|
460
|
+
from_cp = get_current_codepage();
|
461
|
+
|
462
|
+
if (from_cp == CP_C_LOCALE) {
|
463
|
+
/*
|
464
|
+
* "C" locale special processing.
|
465
|
+
*/
|
466
|
+
wchar_t *ws;
|
467
|
+
const unsigned char *mp;
|
468
|
+
|
469
|
+
if (NULL == archive_wstring_ensure(dest,
|
470
|
+
dest->length + length + 1))
|
471
|
+
return (-1);
|
472
|
+
|
473
|
+
ws = dest->s + dest->length;
|
474
|
+
mp = (const unsigned char *)s;
|
475
|
+
count = 0;
|
476
|
+
while (count < (int)length && *mp) {
|
477
|
+
*ws++ = (wchar_t)*mp++;
|
478
|
+
count++;
|
479
|
+
}
|
480
|
+
} else if (sc != NULL &&
|
481
|
+
(sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) {
|
482
|
+
/*
|
483
|
+
* Normalize UTF-8 and UTF-16BE and convert it directly
|
484
|
+
* to UTF-16 as wchar_t.
|
485
|
+
*/
|
486
|
+
struct archive_string u16;
|
487
|
+
int saved_flag = sc->flag;/* save current flag. */
|
488
|
+
|
489
|
+
if (is_big_endian())
|
490
|
+
sc->flag |= SCONV_TO_UTF16BE;
|
491
|
+
else
|
492
|
+
sc->flag |= SCONV_TO_UTF16LE;
|
493
|
+
|
494
|
+
if (sc->flag & SCONV_FROM_UTF16) {
|
495
|
+
/*
|
496
|
+
* UTF-16BE/LE NFD ===> UTF-16 NFC
|
497
|
+
* UTF-16BE/LE NFC ===> UTF-16 NFD
|
498
|
+
*/
|
499
|
+
count = (int)utf16nbytes(s, length);
|
500
|
+
} else {
|
501
|
+
/*
|
502
|
+
* UTF-8 NFD ===> UTF-16 NFC
|
503
|
+
* UTF-8 NFC ===> UTF-16 NFD
|
504
|
+
*/
|
505
|
+
count = (int)mbsnbytes(s, length);
|
506
|
+
}
|
507
|
+
u16.s = (char *)dest->s;
|
508
|
+
u16.length = dest->length << 1;;
|
509
|
+
u16.buffer_length = dest->buffer_length;
|
510
|
+
if (sc->flag & SCONV_NORMALIZATION_C)
|
511
|
+
ret = archive_string_normalize_C(&u16, s, count, sc);
|
512
|
+
else
|
513
|
+
ret = archive_string_normalize_D(&u16, s, count, sc);
|
514
|
+
dest->s = (wchar_t *)u16.s;
|
515
|
+
dest->length = u16.length >> 1;
|
516
|
+
dest->buffer_length = u16.buffer_length;
|
517
|
+
sc->flag = saved_flag;/* restore the saved flag. */
|
518
|
+
return (ret);
|
519
|
+
} else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) {
|
520
|
+
count = (int)utf16nbytes(s, length);
|
521
|
+
count >>= 1; /* to be WCS length */
|
522
|
+
/* Allocate memory for WCS. */
|
523
|
+
if (NULL == archive_wstring_ensure(dest,
|
524
|
+
dest->length + count + 1))
|
525
|
+
return (-1);
|
526
|
+
wmemcpy(dest->s + dest->length, (const wchar_t *)s, count);
|
527
|
+
if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) {
|
528
|
+
uint16_t *u16 = (uint16_t *)(dest->s + dest->length);
|
529
|
+
int b;
|
530
|
+
for (b = 0; b < count; b++) {
|
531
|
+
uint16_t val = archive_le16dec(u16+b);
|
532
|
+
archive_be16enc(u16+b, val);
|
533
|
+
}
|
534
|
+
} else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) {
|
535
|
+
uint16_t *u16 = (uint16_t *)(dest->s + dest->length);
|
536
|
+
int b;
|
537
|
+
for (b = 0; b < count; b++) {
|
538
|
+
uint16_t val = archive_be16dec(u16+b);
|
539
|
+
archive_le16enc(u16+b, val);
|
540
|
+
}
|
541
|
+
}
|
542
|
+
} else {
|
543
|
+
DWORD mbflag;
|
544
|
+
size_t buffsize;
|
545
|
+
|
546
|
+
if (sc == NULL)
|
547
|
+
mbflag = 0;
|
548
|
+
else if (sc->flag & SCONV_FROM_CHARSET) {
|
549
|
+
/* Do not trust the length which comes from
|
550
|
+
* an archive file. */
|
551
|
+
length = mbsnbytes(s, length);
|
552
|
+
mbflag = 0;
|
553
|
+
} else
|
554
|
+
mbflag = MB_PRECOMPOSED;
|
555
|
+
|
556
|
+
buffsize = dest->length + length + 1;
|
557
|
+
do {
|
558
|
+
/* Allocate memory for WCS. */
|
559
|
+
if (NULL == archive_wstring_ensure(dest, buffsize))
|
560
|
+
return (-1);
|
561
|
+
/* Convert MBS to WCS. */
|
562
|
+
count = MultiByteToWideChar(from_cp,
|
563
|
+
mbflag, s, (int)length, dest->s + dest->length,
|
564
|
+
(int)(dest->buffer_length >> 1) -1);
|
565
|
+
if (count == 0 &&
|
566
|
+
GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
|
567
|
+
/* Expand the WCS buffer. */
|
568
|
+
buffsize = dest->buffer_length << 1;
|
569
|
+
continue;
|
570
|
+
}
|
571
|
+
if (count == 0 && length != 0)
|
572
|
+
ret = -1;
|
573
|
+
break;
|
574
|
+
} while (1);
|
575
|
+
}
|
576
|
+
dest->length += count;
|
577
|
+
dest->s[dest->length] = L'\0';
|
578
|
+
return (ret);
|
579
|
+
}
|
580
|
+
|
581
|
+
#else
|
582
|
+
|
583
|
+
/*
|
584
|
+
* Convert MBS to WCS.
|
585
|
+
* Note: returns -1 if conversion fails.
|
586
|
+
*/
|
587
|
+
int
|
588
|
+
archive_wstring_append_from_mbs(struct archive_wstring *dest,
|
589
|
+
const char *p, size_t len)
|
590
|
+
{
|
591
|
+
size_t r;
|
592
|
+
int ret_val = 0;
|
593
|
+
/*
|
594
|
+
* No single byte will be more than one wide character,
|
595
|
+
* so this length estimate will always be big enough.
|
596
|
+
*/
|
597
|
+
// size_t wcs_length = len;
|
598
|
+
size_t mbs_length = len;
|
599
|
+
const char *mbs = p;
|
600
|
+
wchar_t *wcs;
|
601
|
+
#if HAVE_MBRTOWC
|
602
|
+
mbstate_t shift_state;
|
603
|
+
|
604
|
+
memset(&shift_state, 0, sizeof(shift_state));
|
605
|
+
#endif
|
606
|
+
/*
|
607
|
+
* As we decided to have wcs_length == mbs_length == len
|
608
|
+
* we can use len here instead of wcs_length
|
609
|
+
*/
|
610
|
+
if (NULL == archive_wstring_ensure(dest, dest->length + len + 1))
|
611
|
+
return (-1);
|
612
|
+
wcs = dest->s + dest->length;
|
613
|
+
/*
|
614
|
+
* We cannot use mbsrtowcs/mbstowcs here because those may convert
|
615
|
+
* extra MBS when strlen(p) > len and one wide character consists of
|
616
|
+
* multi bytes.
|
617
|
+
*/
|
618
|
+
while (*mbs && mbs_length > 0) {
|
619
|
+
/*
|
620
|
+
* The buffer we allocated is always big enough.
|
621
|
+
* Keep this code path in a comment if we decide to choose
|
622
|
+
* smaller wcs_length in the future
|
623
|
+
*/
|
624
|
+
/*
|
625
|
+
if (wcs_length == 0) {
|
626
|
+
dest->length = wcs - dest->s;
|
627
|
+
dest->s[dest->length] = L'\0';
|
628
|
+
wcs_length = mbs_length;
|
629
|
+
if (NULL == archive_wstring_ensure(dest,
|
630
|
+
dest->length + wcs_length + 1))
|
631
|
+
return (-1);
|
632
|
+
wcs = dest->s + dest->length;
|
633
|
+
}
|
634
|
+
*/
|
635
|
+
#if HAVE_MBRTOWC
|
636
|
+
r = mbrtowc(wcs, mbs, mbs_length, &shift_state);
|
637
|
+
#else
|
638
|
+
r = mbtowc(wcs, mbs, mbs_length);
|
639
|
+
#endif
|
640
|
+
if (r == (size_t)-1 || r == (size_t)-2) {
|
641
|
+
ret_val = -1;
|
642
|
+
break;
|
643
|
+
}
|
644
|
+
if (r == 0 || r > mbs_length)
|
645
|
+
break;
|
646
|
+
wcs++;
|
647
|
+
// wcs_length--;
|
648
|
+
mbs += r;
|
649
|
+
mbs_length -= r;
|
650
|
+
}
|
651
|
+
dest->length = wcs - dest->s;
|
652
|
+
dest->s[dest->length] = L'\0';
|
653
|
+
return (ret_val);
|
654
|
+
}
|
655
|
+
|
656
|
+
#endif
|
657
|
+
|
658
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
659
|
+
|
660
|
+
/*
 * WCS ==> MBS.
 * Convert a wide string to multibyte characters and append it to `as`.
 * Delegates to archive_string_append_from_wcs_in_codepage() without a
 * conversion object.
 * Note: returns -1 if conversion fails.
 *
 * Win32 builds use WideCharToMultiByte from the Windows API.
 * (Maybe Cygwin should too?  WideCharToMultiByte will know a
 * lot more about local character encodings than the wcrtomb()
 * wrapper is going to know.)
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	int status;

	status = archive_string_append_from_wcs_in_codepage(as, w, len, NULL);
	return (status);
}
|
675
|
+
|
676
|
+
static int
|
677
|
+
archive_string_append_from_wcs_in_codepage(struct archive_string *as,
|
678
|
+
const wchar_t *ws, size_t len, struct archive_string_conv *sc)
|
679
|
+
{
|
680
|
+
BOOL defchar_used, *dp;
|
681
|
+
int count, ret = 0;
|
682
|
+
UINT to_cp;
|
683
|
+
int wslen = (int)len;
|
684
|
+
|
685
|
+
if (sc != NULL)
|
686
|
+
to_cp = sc->to_cp;
|
687
|
+
else
|
688
|
+
to_cp = get_current_codepage();
|
689
|
+
|
690
|
+
if (to_cp == CP_C_LOCALE) {
|
691
|
+
/*
|
692
|
+
* "C" locale special processing.
|
693
|
+
*/
|
694
|
+
const wchar_t *wp = ws;
|
695
|
+
char *p;
|
696
|
+
|
697
|
+
if (NULL == archive_string_ensure(as,
|
698
|
+
as->length + wslen +1))
|
699
|
+
return (-1);
|
700
|
+
p = as->s + as->length;
|
701
|
+
count = 0;
|
702
|
+
defchar_used = 0;
|
703
|
+
while (count < wslen && *wp) {
|
704
|
+
if (*wp > 255) {
|
705
|
+
*p++ = '?';
|
706
|
+
wp++;
|
707
|
+
defchar_used = 1;
|
708
|
+
} else
|
709
|
+
*p++ = (char)*wp++;
|
710
|
+
count++;
|
711
|
+
}
|
712
|
+
} else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) {
|
713
|
+
uint16_t *u16;
|
714
|
+
|
715
|
+
if (NULL ==
|
716
|
+
archive_string_ensure(as, as->length + len * 2 + 2))
|
717
|
+
return (-1);
|
718
|
+
u16 = (uint16_t *)(as->s + as->length);
|
719
|
+
count = 0;
|
720
|
+
defchar_used = 0;
|
721
|
+
if (sc->flag & SCONV_TO_UTF16BE) {
|
722
|
+
while (count < (int)len && *ws) {
|
723
|
+
archive_be16enc(u16+count, *ws);
|
724
|
+
ws++;
|
725
|
+
count++;
|
726
|
+
}
|
727
|
+
} else {
|
728
|
+
while (count < (int)len && *ws) {
|
729
|
+
archive_le16enc(u16+count, *ws);
|
730
|
+
ws++;
|
731
|
+
count++;
|
732
|
+
}
|
733
|
+
}
|
734
|
+
count <<= 1; /* to be byte size */
|
735
|
+
} else {
|
736
|
+
/* Make sure the MBS buffer has plenty to set. */
|
737
|
+
if (NULL ==
|
738
|
+
archive_string_ensure(as, as->length + len * 2 + 1))
|
739
|
+
return (-1);
|
740
|
+
do {
|
741
|
+
defchar_used = 0;
|
742
|
+
if (to_cp == CP_UTF8 || sc == NULL)
|
743
|
+
dp = NULL;
|
744
|
+
else
|
745
|
+
dp = &defchar_used;
|
746
|
+
count = WideCharToMultiByte(to_cp, 0, ws, wslen,
|
747
|
+
as->s + as->length,
|
748
|
+
(int)as->buffer_length - (int)as->length - 1, NULL, dp);
|
749
|
+
if (count == 0 &&
|
750
|
+
GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
|
751
|
+
/* Expand the MBS buffer and retry. */
|
752
|
+
if (NULL == archive_string_ensure(as,
|
753
|
+
as->buffer_length + len))
|
754
|
+
return (-1);
|
755
|
+
continue;
|
756
|
+
}
|
757
|
+
if (count == 0)
|
758
|
+
ret = -1;
|
759
|
+
break;
|
760
|
+
} while (1);
|
761
|
+
}
|
762
|
+
as->length += count;
|
763
|
+
as->s[as->length] = '\0';
|
764
|
+
return (defchar_used?-1:ret);
|
765
|
+
}
|
766
|
+
|
767
|
+
#elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB)
|
768
|
+
|
769
|
+
/*
|
770
|
+
* Translates a wide character string into current locale character set
|
771
|
+
* and appends to the archive_string. Note: returns -1 if conversion
|
772
|
+
* fails.
|
773
|
+
*/
|
774
|
+
int
|
775
|
+
archive_string_append_from_wcs(struct archive_string *as,
|
776
|
+
const wchar_t *w, size_t len)
|
777
|
+
{
|
778
|
+
/* We cannot use the standard wcstombs() here because it
|
779
|
+
* cannot tell us how big the output buffer should be. So
|
780
|
+
* I've built a loop around wcrtomb() or wctomb() that
|
781
|
+
* converts a character at a time and resizes the string as
|
782
|
+
* needed. We prefer wcrtomb() when it's available because
|
783
|
+
* it's thread-safe. */
|
784
|
+
int n, ret_val = 0;
|
785
|
+
char *p;
|
786
|
+
char *end;
|
787
|
+
#if HAVE_WCRTOMB
|
788
|
+
mbstate_t shift_state;
|
789
|
+
|
790
|
+
memset(&shift_state, 0, sizeof(shift_state));
|
791
|
+
#else
|
792
|
+
/* Clear the shift state before starting. */
|
793
|
+
wctomb(NULL, L'\0');
|
794
|
+
#endif
|
795
|
+
/*
|
796
|
+
* Allocate buffer for MBS.
|
797
|
+
* We need this allocation here since it is possible that
|
798
|
+
* as->s is still NULL.
|
799
|
+
*/
|
800
|
+
if (archive_string_ensure(as, as->length + len + 1) == NULL)
|
801
|
+
return (-1);
|
802
|
+
|
803
|
+
p = as->s + as->length;
|
804
|
+
end = as->s + as->buffer_length - MB_CUR_MAX -1;
|
805
|
+
while (*w != L'\0' && len > 0) {
|
806
|
+
if (p >= end) {
|
807
|
+
as->length = p - as->s;
|
808
|
+
as->s[as->length] = '\0';
|
809
|
+
/* Re-allocate buffer for MBS. */
|
810
|
+
if (archive_string_ensure(as,
|
811
|
+
as->length + max(len * 2,
|
812
|
+
(size_t)MB_CUR_MAX) + 1) == NULL)
|
813
|
+
return (-1);
|
814
|
+
p = as->s + as->length;
|
815
|
+
end = as->s + as->buffer_length - MB_CUR_MAX -1;
|
816
|
+
}
|
817
|
+
#if HAVE_WCRTOMB
|
818
|
+
n = wcrtomb(p, *w++, &shift_state);
|
819
|
+
#else
|
820
|
+
n = wctomb(p, *w++);
|
821
|
+
#endif
|
822
|
+
if (n == -1) {
|
823
|
+
if (errno == EILSEQ) {
|
824
|
+
/* Skip an illegal wide char. */
|
825
|
+
*p++ = '?';
|
826
|
+
ret_val = -1;
|
827
|
+
} else {
|
828
|
+
ret_val = -1;
|
829
|
+
break;
|
830
|
+
}
|
831
|
+
} else
|
832
|
+
p += n;
|
833
|
+
len--;
|
834
|
+
}
|
835
|
+
as->length = p - as->s;
|
836
|
+
as->s[as->length] = '\0';
|
837
|
+
return (ret_val);
|
838
|
+
}
|
839
|
+
|
840
|
+
#else /* HAVE_WCTOMB || HAVE_WCRTOMB */
|
841
|
+
|
842
|
+
/*
 * Stub used when the platform offers neither the Windows API nor ISO C
 * wcrtomb()/wctomb().  No conversion is attempted: the arguments are
 * ignored, errno is set to ENOSYS and -1 is returned.
 *
 * TODO: Test if __STDC_ISO_10646__ is defined.
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	/* All parameters are intentionally unused. */
	(void)len;
	(void)w;
	(void)as;

	errno = ENOSYS;
	return (-1);
}
|
858
|
+
|
859
|
+
#endif /* HAVE_WCTOMB || HAVE_WCRTOMB */
|
860
|
+
|
861
|
+
/*
|
862
|
+
* Find a string conversion object by a pair of 'from' charset name
|
863
|
+
* and 'to' charset name from an archive object.
|
864
|
+
* Return NULL if not found.
|
865
|
+
*/
|
866
|
+
static struct archive_string_conv *
|
867
|
+
find_sconv_object(struct archive *a, const char *fc, const char *tc)
|
868
|
+
{
|
869
|
+
struct archive_string_conv *sc;
|
870
|
+
|
871
|
+
if (a == NULL)
|
872
|
+
return (NULL);
|
873
|
+
|
874
|
+
for (sc = a->sconv; sc != NULL; sc = sc->next) {
|
875
|
+
if (strcmp(sc->from_charset, fc) == 0 &&
|
876
|
+
strcmp(sc->to_charset, tc) == 0)
|
877
|
+
break;
|
878
|
+
}
|
879
|
+
return (sc);
|
880
|
+
}
|
881
|
+
|
882
|
+
/*
|
883
|
+
* Register a string object to an archive object.
|
884
|
+
*/
|
885
|
+
static void
|
886
|
+
add_sconv_object(struct archive *a, struct archive_string_conv *sc)
|
887
|
+
{
|
888
|
+
struct archive_string_conv **psc;
|
889
|
+
|
890
|
+
/* Add a new sconv to sconv list. */
|
891
|
+
psc = &(a->sconv);
|
892
|
+
while (*psc != NULL)
|
893
|
+
psc = &((*psc)->next);
|
894
|
+
*psc = sc;
|
895
|
+
}
|
896
|
+
|
897
|
+
static void
|
898
|
+
add_converter(struct archive_string_conv *sc, int (*converter)
|
899
|
+
(struct archive_string *, const void *, size_t,
|
900
|
+
struct archive_string_conv *))
|
901
|
+
{
|
902
|
+
if (sc == NULL || sc->nconverter >= 2)
|
903
|
+
__archive_errx(1, "Programming error");
|
904
|
+
sc->converter[sc->nconverter++] = converter;
|
905
|
+
}
|
906
|
+
|
907
|
+
static void
|
908
|
+
setup_converter(struct archive_string_conv *sc)
|
909
|
+
{
|
910
|
+
|
911
|
+
/* Reset. */
|
912
|
+
sc->nconverter = 0;
|
913
|
+
|
914
|
+
/*
|
915
|
+
* Perform special sequence for the incorrect UTF-8 filenames
|
916
|
+
* made by libarchive2.x.
|
917
|
+
*/
|
918
|
+
if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) {
|
919
|
+
add_converter(sc, strncat_from_utf8_libarchive2);
|
920
|
+
return;
|
921
|
+
}
|
922
|
+
|
923
|
+
/*
|
924
|
+
* Convert a string to UTF-16BE/LE.
|
925
|
+
*/
|
926
|
+
if (sc->flag & SCONV_TO_UTF16) {
|
927
|
+
/*
|
928
|
+
* If the current locale is UTF-8, we can translate
|
929
|
+
* a UTF-8 string into a UTF-16BE string.
|
930
|
+
*/
|
931
|
+
if (sc->flag & SCONV_FROM_UTF8) {
|
932
|
+
add_converter(sc, archive_string_append_unicode);
|
933
|
+
return;
|
934
|
+
}
|
935
|
+
|
936
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
937
|
+
if (sc->flag & SCONV_WIN_CP) {
|
938
|
+
if (sc->flag & SCONV_TO_UTF16BE)
|
939
|
+
add_converter(sc, win_strncat_to_utf16be);
|
940
|
+
else
|
941
|
+
add_converter(sc, win_strncat_to_utf16le);
|
942
|
+
return;
|
943
|
+
}
|
944
|
+
#endif
|
945
|
+
|
946
|
+
#if defined(HAVE_ICONV)
|
947
|
+
if (sc->cd != (iconv_t)-1) {
|
948
|
+
add_converter(sc, iconv_strncat_in_locale);
|
949
|
+
return;
|
950
|
+
}
|
951
|
+
#endif
|
952
|
+
|
953
|
+
if (sc->flag & SCONV_BEST_EFFORT) {
|
954
|
+
if (sc->flag & SCONV_TO_UTF16BE)
|
955
|
+
add_converter(sc,
|
956
|
+
best_effort_strncat_to_utf16be);
|
957
|
+
else
|
958
|
+
add_converter(sc,
|
959
|
+
best_effort_strncat_to_utf16le);
|
960
|
+
} else
|
961
|
+
/* Make sure we have no converter. */
|
962
|
+
sc->nconverter = 0;
|
963
|
+
return;
|
964
|
+
}
|
965
|
+
|
966
|
+
/*
|
967
|
+
* Convert a string from UTF-16BE/LE.
|
968
|
+
*/
|
969
|
+
if (sc->flag & SCONV_FROM_UTF16) {
|
970
|
+
/*
|
971
|
+
* At least we should normalize a UTF-16BE string.
|
972
|
+
*/
|
973
|
+
if (sc->flag & SCONV_NORMALIZATION_D)
|
974
|
+
add_converter(sc,archive_string_normalize_D);
|
975
|
+
else if (sc->flag & SCONV_NORMALIZATION_C)
|
976
|
+
add_converter(sc, archive_string_normalize_C);
|
977
|
+
|
978
|
+
if (sc->flag & SCONV_TO_UTF8) {
|
979
|
+
/*
|
980
|
+
* If the current locale is UTF-8, we can translate
|
981
|
+
* a UTF-16BE/LE string into a UTF-8 string directly.
|
982
|
+
*/
|
983
|
+
if (!(sc->flag &
|
984
|
+
(SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))
|
985
|
+
add_converter(sc,
|
986
|
+
archive_string_append_unicode);
|
987
|
+
return;
|
988
|
+
}
|
989
|
+
|
990
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
991
|
+
if (sc->flag & SCONV_WIN_CP) {
|
992
|
+
if (sc->flag & SCONV_FROM_UTF16BE)
|
993
|
+
add_converter(sc, win_strncat_from_utf16be);
|
994
|
+
else
|
995
|
+
add_converter(sc, win_strncat_from_utf16le);
|
996
|
+
return;
|
997
|
+
}
|
998
|
+
#endif
|
999
|
+
|
1000
|
+
#if defined(HAVE_ICONV)
|
1001
|
+
if (sc->cd != (iconv_t)-1) {
|
1002
|
+
add_converter(sc, iconv_strncat_in_locale);
|
1003
|
+
return;
|
1004
|
+
}
|
1005
|
+
#endif
|
1006
|
+
|
1007
|
+
if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))
|
1008
|
+
== (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))
|
1009
|
+
add_converter(sc, best_effort_strncat_from_utf16be);
|
1010
|
+
else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))
|
1011
|
+
== (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))
|
1012
|
+
add_converter(sc, best_effort_strncat_from_utf16le);
|
1013
|
+
else
|
1014
|
+
/* Make sure we have no converter. */
|
1015
|
+
sc->nconverter = 0;
|
1016
|
+
return;
|
1017
|
+
}
|
1018
|
+
|
1019
|
+
if (sc->flag & SCONV_FROM_UTF8) {
|
1020
|
+
/*
|
1021
|
+
* At least we should normalize a UTF-8 string.
|
1022
|
+
*/
|
1023
|
+
if (sc->flag & SCONV_NORMALIZATION_D)
|
1024
|
+
add_converter(sc,archive_string_normalize_D);
|
1025
|
+
else if (sc->flag & SCONV_NORMALIZATION_C)
|
1026
|
+
add_converter(sc, archive_string_normalize_C);
|
1027
|
+
|
1028
|
+
/*
|
1029
|
+
* Copy UTF-8 string with a check of CESU-8.
|
1030
|
+
* Apparently, iconv does not check surrogate pairs in UTF-8
|
1031
|
+
* when both from-charset and to-charset are UTF-8, and then
|
1032
|
+
* we use our UTF-8 copy code.
|
1033
|
+
*/
|
1034
|
+
if (sc->flag & SCONV_TO_UTF8) {
|
1035
|
+
/*
|
1036
|
+
* If the current locale is UTF-8, we can translate
|
1037
|
+
* a UTF-16BE string into a UTF-8 string directly.
|
1038
|
+
*/
|
1039
|
+
if (!(sc->flag &
|
1040
|
+
(SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))
|
1041
|
+
add_converter(sc, strncat_from_utf8_to_utf8);
|
1042
|
+
return;
|
1043
|
+
}
|
1044
|
+
}
|
1045
|
+
|
1046
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1047
|
+
/*
|
1048
|
+
* On Windows we can use Windows API for a string conversion.
|
1049
|
+
*/
|
1050
|
+
if (sc->flag & SCONV_WIN_CP) {
|
1051
|
+
add_converter(sc, strncat_in_codepage);
|
1052
|
+
return;
|
1053
|
+
}
|
1054
|
+
#endif
|
1055
|
+
|
1056
|
+
#if HAVE_ICONV
|
1057
|
+
if (sc->cd != (iconv_t)-1) {
|
1058
|
+
add_converter(sc, iconv_strncat_in_locale);
|
1059
|
+
/*
|
1060
|
+
* iconv generally does not support UTF-8-MAC and so
|
1061
|
+
* we have to the output of iconv from NFC to NFD if
|
1062
|
+
* need.
|
1063
|
+
*/
|
1064
|
+
if ((sc->flag & SCONV_FROM_CHARSET) &&
|
1065
|
+
(sc->flag & SCONV_TO_UTF8)) {
|
1066
|
+
if (sc->flag & SCONV_NORMALIZATION_D)
|
1067
|
+
add_converter(sc, archive_string_normalize_D);
|
1068
|
+
}
|
1069
|
+
return;
|
1070
|
+
}
|
1071
|
+
#endif
|
1072
|
+
|
1073
|
+
/*
|
1074
|
+
* Try conversion in the best effort or no conversion.
|
1075
|
+
*/
|
1076
|
+
if ((sc->flag & SCONV_BEST_EFFORT) || sc->same)
|
1077
|
+
add_converter(sc, best_effort_strncat_in_locale);
|
1078
|
+
else
|
1079
|
+
/* Make sure we have no converter. */
|
1080
|
+
sc->nconverter = 0;
|
1081
|
+
}
|
1082
|
+
|
1083
|
+
/*
 * Return the canonical spelling of a charset name.
 *
 * Only the names referenced in create_sconv_object() are recognized:
 * UTF-8, UTF-16BE, UTF-16LE (each with or without the hyphen) and CP932,
 * compared case-insensitively for ASCII letters.  NULL, the empty string,
 * names longer than 15 characters and unrecognized names are returned
 * unchanged.
 */
static const char *
canonical_charset_name(const char *charset)
{
	static const struct {
		const char *alias;
		const char *canonical;
	} known[] = {
		{ "UTF-8",	"UTF-8" },
		{ "UTF8",	"UTF-8" },
		{ "UTF-16BE",	"UTF-16BE" },
		{ "UTF16BE",	"UTF-16BE" },
		{ "UTF-16LE",	"UTF-16LE" },
		{ "UTF16LE",	"UTF-16LE" },
		{ "CP932",	"CP932" },
	};
	char upper[16];
	size_t i, n;

	if (charset == NULL || charset[0] == '\0')
		return (charset);
	n = strlen(charset);
	if (n > 15)
		return (charset);	/* Would not fit in upper[]. */

	/* Copy name to uppercase (ASCII letters only). */
	for (i = 0; i < n; i++) {
		char c = charset[i];

		if (c >= 'a' && c <= 'z')
			c -= 'a' - 'A';
		upper[i] = c;
	}
	upper[n] = '\0';

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(upper, known[i].alias) == 0)
			return (known[i].canonical);
	}
	return (charset);
}
|
1122
|
+
|
1123
|
+
/*
|
1124
|
+
* Create a string conversion object.
|
1125
|
+
*/
|
1126
|
+
static struct archive_string_conv *
|
1127
|
+
create_sconv_object(const char *fc, const char *tc,
|
1128
|
+
unsigned current_codepage, int flag)
|
1129
|
+
{
|
1130
|
+
struct archive_string_conv *sc;
|
1131
|
+
|
1132
|
+
sc = calloc(1, sizeof(*sc));
|
1133
|
+
if (sc == NULL)
|
1134
|
+
return (NULL);
|
1135
|
+
sc->next = NULL;
|
1136
|
+
sc->from_charset = strdup(fc);
|
1137
|
+
if (sc->from_charset == NULL) {
|
1138
|
+
free(sc);
|
1139
|
+
return (NULL);
|
1140
|
+
}
|
1141
|
+
sc->to_charset = strdup(tc);
|
1142
|
+
if (sc->to_charset == NULL) {
|
1143
|
+
free(sc->from_charset);
|
1144
|
+
free(sc);
|
1145
|
+
return (NULL);
|
1146
|
+
}
|
1147
|
+
archive_string_init(&sc->utftmp);
|
1148
|
+
|
1149
|
+
if (flag & SCONV_TO_CHARSET) {
|
1150
|
+
/*
|
1151
|
+
* Convert characters from the current locale charset to
|
1152
|
+
* a specified charset.
|
1153
|
+
*/
|
1154
|
+
sc->from_cp = current_codepage;
|
1155
|
+
sc->to_cp = make_codepage_from_charset(tc);
|
1156
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1157
|
+
if (IsValidCodePage(sc->to_cp))
|
1158
|
+
flag |= SCONV_WIN_CP;
|
1159
|
+
#endif
|
1160
|
+
} else if (flag & SCONV_FROM_CHARSET) {
|
1161
|
+
/*
|
1162
|
+
* Convert characters from a specified charset to
|
1163
|
+
* the current locale charset.
|
1164
|
+
*/
|
1165
|
+
sc->to_cp = current_codepage;
|
1166
|
+
sc->from_cp = make_codepage_from_charset(fc);
|
1167
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1168
|
+
if (IsValidCodePage(sc->from_cp))
|
1169
|
+
flag |= SCONV_WIN_CP;
|
1170
|
+
#endif
|
1171
|
+
}
|
1172
|
+
|
1173
|
+
/*
|
1174
|
+
* Check if "from charset" and "to charset" are the same.
|
1175
|
+
*/
|
1176
|
+
if (strcmp(fc, tc) == 0 ||
|
1177
|
+
(sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp))
|
1178
|
+
sc->same = 1;
|
1179
|
+
else
|
1180
|
+
sc->same = 0;
|
1181
|
+
|
1182
|
+
/*
|
1183
|
+
* Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE.
|
1184
|
+
*/
|
1185
|
+
if (strcmp(tc, "UTF-8") == 0)
|
1186
|
+
flag |= SCONV_TO_UTF8;
|
1187
|
+
else if (strcmp(tc, "UTF-16BE") == 0)
|
1188
|
+
flag |= SCONV_TO_UTF16BE;
|
1189
|
+
else if (strcmp(tc, "UTF-16LE") == 0)
|
1190
|
+
flag |= SCONV_TO_UTF16LE;
|
1191
|
+
if (strcmp(fc, "UTF-8") == 0)
|
1192
|
+
flag |= SCONV_FROM_UTF8;
|
1193
|
+
else if (strcmp(fc, "UTF-16BE") == 0)
|
1194
|
+
flag |= SCONV_FROM_UTF16BE;
|
1195
|
+
else if (strcmp(fc, "UTF-16LE") == 0)
|
1196
|
+
flag |= SCONV_FROM_UTF16LE;
|
1197
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1198
|
+
if (sc->to_cp == CP_UTF8)
|
1199
|
+
flag |= SCONV_TO_UTF8;
|
1200
|
+
else if (sc->to_cp == CP_UTF16BE)
|
1201
|
+
flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP;
|
1202
|
+
else if (sc->to_cp == CP_UTF16LE)
|
1203
|
+
flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP;
|
1204
|
+
if (sc->from_cp == CP_UTF8)
|
1205
|
+
flag |= SCONV_FROM_UTF8;
|
1206
|
+
else if (sc->from_cp == CP_UTF16BE)
|
1207
|
+
flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP;
|
1208
|
+
else if (sc->from_cp == CP_UTF16LE)
|
1209
|
+
flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP;
|
1210
|
+
#endif
|
1211
|
+
|
1212
|
+
/*
|
1213
|
+
* Set a flag for Unicode NFD. Usually iconv cannot correctly
|
1214
|
+
* handle it. So we have to translate NFD characters to NFC ones
|
1215
|
+
* ourselves before iconv handles. Another reason is to prevent
|
1216
|
+
* that the same sight of two filenames, one is NFC and other
|
1217
|
+
* is NFD, would be in its directory.
|
1218
|
+
* On Mac OS X, although its filesystem layer automatically
|
1219
|
+
* convert filenames to NFD, it would be useful for filename
|
1220
|
+
* comparing to find out the same filenames that we normalize
|
1221
|
+
* that to be NFD ourselves.
|
1222
|
+
*/
|
1223
|
+
if ((flag & SCONV_FROM_CHARSET) &&
|
1224
|
+
(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) {
|
1225
|
+
#if defined(__APPLE__)
|
1226
|
+
if (flag & SCONV_TO_UTF8)
|
1227
|
+
flag |= SCONV_NORMALIZATION_D;
|
1228
|
+
else
|
1229
|
+
#endif
|
1230
|
+
flag |= SCONV_NORMALIZATION_C;
|
1231
|
+
}
|
1232
|
+
#if defined(__APPLE__)
|
1233
|
+
/*
|
1234
|
+
* In case writing an archive file, make sure that a filename
|
1235
|
+
* going to be passed to iconv is a Unicode NFC string since
|
1236
|
+
* a filename in HFS Plus filesystem is a Unicode NFD one and
|
1237
|
+
* iconv cannot handle it with "UTF-8" charset. It is simpler
|
1238
|
+
* than a use of "UTF-8-MAC" charset.
|
1239
|
+
*/
|
1240
|
+
if ((flag & SCONV_TO_CHARSET) &&
|
1241
|
+
(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
|
1242
|
+
!(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
|
1243
|
+
flag |= SCONV_NORMALIZATION_C;
|
1244
|
+
/*
|
1245
|
+
* In case reading an archive file. make sure that a filename
|
1246
|
+
* will be passed to users is a Unicode NFD string in order to
|
1247
|
+
* correctly compare the filename with other one which comes
|
1248
|
+
* from HFS Plus filesystem.
|
1249
|
+
*/
|
1250
|
+
if ((flag & SCONV_FROM_CHARSET) &&
|
1251
|
+
!(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
|
1252
|
+
(flag & SCONV_TO_UTF8))
|
1253
|
+
flag |= SCONV_NORMALIZATION_D;
|
1254
|
+
#endif
|
1255
|
+
|
1256
|
+
#if defined(HAVE_ICONV)
|
1257
|
+
sc->cd_w = (iconv_t)-1;
|
1258
|
+
/*
|
1259
|
+
* Create an iconv object.
|
1260
|
+
*/
|
1261
|
+
if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) &&
|
1262
|
+
(flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) ||
|
1263
|
+
(flag & SCONV_WIN_CP)) {
|
1264
|
+
/* This case we won't use iconv. */
|
1265
|
+
sc->cd = (iconv_t)-1;
|
1266
|
+
} else {
|
1267
|
+
sc->cd = iconv_open(tc, fc);
|
1268
|
+
if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) {
|
1269
|
+
/*
|
1270
|
+
* Unfortunately, all of iconv implements do support
|
1271
|
+
* "CP932" character-set, so we should use "SJIS"
|
1272
|
+
* instead if iconv_open failed.
|
1273
|
+
*/
|
1274
|
+
if (strcmp(tc, "CP932") == 0)
|
1275
|
+
sc->cd = iconv_open("SJIS", fc);
|
1276
|
+
else if (strcmp(fc, "CP932") == 0)
|
1277
|
+
sc->cd = iconv_open(tc, "SJIS");
|
1278
|
+
}
|
1279
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1280
|
+
/*
|
1281
|
+
* archive_mstring on Windows directly convert multi-bytes
|
1282
|
+
* into archive_wstring in order not to depend on locale
|
1283
|
+
* so that you can do a I18N programming. This will be
|
1284
|
+
* used only in archive_mstring_copy_mbs_len_l so far.
|
1285
|
+
*/
|
1286
|
+
if (flag & SCONV_FROM_CHARSET) {
|
1287
|
+
sc->cd_w = iconv_open("UTF-8", fc);
|
1288
|
+
if (sc->cd_w == (iconv_t)-1 &&
|
1289
|
+
(sc->flag & SCONV_BEST_EFFORT)) {
|
1290
|
+
if (strcmp(fc, "CP932") == 0)
|
1291
|
+
sc->cd_w = iconv_open("UTF-8", "SJIS");
|
1292
|
+
}
|
1293
|
+
}
|
1294
|
+
#endif /* _WIN32 && !__CYGWIN__ */
|
1295
|
+
}
|
1296
|
+
#endif /* HAVE_ICONV */
|
1297
|
+
|
1298
|
+
sc->flag = flag;
|
1299
|
+
|
1300
|
+
/*
|
1301
|
+
* Set up converters.
|
1302
|
+
*/
|
1303
|
+
setup_converter(sc);
|
1304
|
+
|
1305
|
+
return (sc);
|
1306
|
+
}
|
1307
|
+
|
1308
|
+
/*
|
1309
|
+
* Free a string conversion object.
|
1310
|
+
*/
|
1311
|
+
static void
|
1312
|
+
free_sconv_object(struct archive_string_conv *sc)
|
1313
|
+
{
|
1314
|
+
free(sc->from_charset);
|
1315
|
+
free(sc->to_charset);
|
1316
|
+
archive_string_free(&sc->utftmp);
|
1317
|
+
#if HAVE_ICONV
|
1318
|
+
if (sc->cd != (iconv_t)-1)
|
1319
|
+
iconv_close(sc->cd);
|
1320
|
+
if (sc->cd_w != (iconv_t)-1)
|
1321
|
+
iconv_close(sc->cd_w);
|
1322
|
+
#endif
|
1323
|
+
free(sc);
|
1324
|
+
}
|
1325
|
+
|
1326
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1327
|
+
/*
 * Parse a string consisting solely of decimal digits.
 * Returns the parsed value, 0 for an empty string, or (unsigned)-1 as
 * soon as a non-digit character is encountered.
 */
static unsigned
my_atoi(const char *p)
{
	unsigned value = 0;

	for (; *p != '\0'; p++) {
		if (*p < '0' || *p > '9')
			return ((unsigned)-1);
		value = value * 10 + (unsigned)(*p - '0');
	}
	return (value);
}
|
1342
|
+
|
1343
|
+
/*
|
1344
|
+
* Translate Charset name (as used by iconv) into CodePage (as used by Windows)
|
1345
|
+
* Return -1 if failed.
|
1346
|
+
*
|
1347
|
+
* Note: This translation code may be insufficient.
|
1348
|
+
*/
|
1349
|
+
static struct charset {
|
1350
|
+
const char *name;
|
1351
|
+
unsigned cp;
|
1352
|
+
} charsets[] = {
|
1353
|
+
/* MUST BE SORTED! */
|
1354
|
+
{"ASCII", 1252},
|
1355
|
+
{"ASMO-708", 708},
|
1356
|
+
{"BIG5", 950},
|
1357
|
+
{"CHINESE", 936},
|
1358
|
+
{"CP367", 1252},
|
1359
|
+
{"CP819", 1252},
|
1360
|
+
{"CP1025", 21025},
|
1361
|
+
{"DOS-720", 720},
|
1362
|
+
{"DOS-862", 862},
|
1363
|
+
{"EUC-CN", 51936},
|
1364
|
+
{"EUC-JP", 51932},
|
1365
|
+
{"EUC-KR", 949},
|
1366
|
+
{"EUCCN", 51936},
|
1367
|
+
{"EUCJP", 51932},
|
1368
|
+
{"EUCKR", 949},
|
1369
|
+
{"GB18030", 54936},
|
1370
|
+
{"GB2312", 936},
|
1371
|
+
{"HEBREW", 1255},
|
1372
|
+
{"HZ-GB-2312", 52936},
|
1373
|
+
{"IBM273", 20273},
|
1374
|
+
{"IBM277", 20277},
|
1375
|
+
{"IBM278", 20278},
|
1376
|
+
{"IBM280", 20280},
|
1377
|
+
{"IBM284", 20284},
|
1378
|
+
{"IBM285", 20285},
|
1379
|
+
{"IBM290", 20290},
|
1380
|
+
{"IBM297", 20297},
|
1381
|
+
{"IBM367", 1252},
|
1382
|
+
{"IBM420", 20420},
|
1383
|
+
{"IBM423", 20423},
|
1384
|
+
{"IBM424", 20424},
|
1385
|
+
{"IBM819", 1252},
|
1386
|
+
{"IBM871", 20871},
|
1387
|
+
{"IBM880", 20880},
|
1388
|
+
{"IBM905", 20905},
|
1389
|
+
{"IBM924", 20924},
|
1390
|
+
{"ISO-8859-1", 28591},
|
1391
|
+
{"ISO-8859-13", 28603},
|
1392
|
+
{"ISO-8859-15", 28605},
|
1393
|
+
{"ISO-8859-2", 28592},
|
1394
|
+
{"ISO-8859-3", 28593},
|
1395
|
+
{"ISO-8859-4", 28594},
|
1396
|
+
{"ISO-8859-5", 28595},
|
1397
|
+
{"ISO-8859-6", 28596},
|
1398
|
+
{"ISO-8859-7", 28597},
|
1399
|
+
{"ISO-8859-8", 28598},
|
1400
|
+
{"ISO-8859-9", 28599},
|
1401
|
+
{"ISO8859-1", 28591},
|
1402
|
+
{"ISO8859-13", 28603},
|
1403
|
+
{"ISO8859-15", 28605},
|
1404
|
+
{"ISO8859-2", 28592},
|
1405
|
+
{"ISO8859-3", 28593},
|
1406
|
+
{"ISO8859-4", 28594},
|
1407
|
+
{"ISO8859-5", 28595},
|
1408
|
+
{"ISO8859-6", 28596},
|
1409
|
+
{"ISO8859-7", 28597},
|
1410
|
+
{"ISO8859-8", 28598},
|
1411
|
+
{"ISO8859-9", 28599},
|
1412
|
+
{"JOHAB", 1361},
|
1413
|
+
{"KOI8-R", 20866},
|
1414
|
+
{"KOI8-U", 21866},
|
1415
|
+
{"KS_C_5601-1987", 949},
|
1416
|
+
{"LATIN1", 1252},
|
1417
|
+
{"LATIN2", 28592},
|
1418
|
+
{"MACINTOSH", 10000},
|
1419
|
+
{"SHIFT-JIS", 932},
|
1420
|
+
{"SHIFT_JIS", 932},
|
1421
|
+
{"SJIS", 932},
|
1422
|
+
{"US", 1252},
|
1423
|
+
{"US-ASCII", 1252},
|
1424
|
+
{"UTF-16", 1200},
|
1425
|
+
{"UTF-16BE", 1201},
|
1426
|
+
{"UTF-16LE", 1200},
|
1427
|
+
{"UTF-8", CP_UTF8},
|
1428
|
+
{"X-EUROPA", 29001},
|
1429
|
+
{"X-MAC-ARABIC", 10004},
|
1430
|
+
{"X-MAC-CE", 10029},
|
1431
|
+
{"X-MAC-CHINESEIMP", 10008},
|
1432
|
+
{"X-MAC-CHINESETRAD", 10002},
|
1433
|
+
{"X-MAC-CROATIAN", 10082},
|
1434
|
+
{"X-MAC-CYRILLIC", 10007},
|
1435
|
+
{"X-MAC-GREEK", 10006},
|
1436
|
+
{"X-MAC-HEBREW", 10005},
|
1437
|
+
{"X-MAC-ICELANDIC", 10079},
|
1438
|
+
{"X-MAC-JAPANESE", 10001},
|
1439
|
+
{"X-MAC-KOREAN", 10003},
|
1440
|
+
{"X-MAC-ROMANIAN", 10010},
|
1441
|
+
{"X-MAC-THAI", 10021},
|
1442
|
+
{"X-MAC-TURKISH", 10081},
|
1443
|
+
{"X-MAC-UKRAINIAN", 10017},
|
1444
|
+
};
|
1445
|
+
static unsigned
|
1446
|
+
make_codepage_from_charset(const char *charset)
|
1447
|
+
{
|
1448
|
+
char cs[16];
|
1449
|
+
char *p;
|
1450
|
+
unsigned cp;
|
1451
|
+
int a, b;
|
1452
|
+
|
1453
|
+
if (charset == NULL || strlen(charset) > 15)
|
1454
|
+
return -1;
|
1455
|
+
|
1456
|
+
/* Copy name to uppercase. */
|
1457
|
+
p = cs;
|
1458
|
+
while (*charset) {
|
1459
|
+
char c = *charset++;
|
1460
|
+
if (c >= 'a' && c <= 'z')
|
1461
|
+
c -= 'a' - 'A';
|
1462
|
+
*p++ = c;
|
1463
|
+
}
|
1464
|
+
*p++ = '\0';
|
1465
|
+
cp = -1;
|
1466
|
+
|
1467
|
+
/* Look it up in the table first, so that we can easily
|
1468
|
+
* override CP367, which we map to 1252 instead of 367. */
|
1469
|
+
a = 0;
|
1470
|
+
b = sizeof(charsets)/sizeof(charsets[0]);
|
1471
|
+
while (b > a) {
|
1472
|
+
int c = (b + a) / 2;
|
1473
|
+
int r = strcmp(charsets[c].name, cs);
|
1474
|
+
if (r < 0)
|
1475
|
+
a = c + 1;
|
1476
|
+
else if (r > 0)
|
1477
|
+
b = c;
|
1478
|
+
else
|
1479
|
+
return charsets[c].cp;
|
1480
|
+
}
|
1481
|
+
|
1482
|
+
/* If it's not in the table, try to parse it. */
|
1483
|
+
switch (*cs) {
|
1484
|
+
case 'C':
|
1485
|
+
if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') {
|
1486
|
+
cp = my_atoi(cs + 2);
|
1487
|
+
} else if (strcmp(cs, "CP_ACP") == 0)
|
1488
|
+
cp = get_current_codepage();
|
1489
|
+
else if (strcmp(cs, "CP_OEMCP") == 0)
|
1490
|
+
cp = get_current_oemcp();
|
1491
|
+
break;
|
1492
|
+
case 'I':
|
1493
|
+
if (cs[1] == 'B' && cs[2] == 'M' &&
|
1494
|
+
cs[3] >= '0' && cs[3] <= '9') {
|
1495
|
+
cp = my_atoi(cs + 3);
|
1496
|
+
}
|
1497
|
+
break;
|
1498
|
+
case 'W':
|
1499
|
+
if (strncmp(cs, "WINDOWS-", 8) == 0) {
|
1500
|
+
cp = my_atoi(cs + 8);
|
1501
|
+
if (cp != 874 && (cp < 1250 || cp > 1258))
|
1502
|
+
cp = -1;/* This may invalid code. */
|
1503
|
+
}
|
1504
|
+
break;
|
1505
|
+
}
|
1506
|
+
return (cp);
|
1507
|
+
}
|
1508
|
+
|
1509
|
+
/*
|
1510
|
+
* Return ANSI Code Page of current locale set by setlocale().
|
1511
|
+
*/
|
1512
|
+
static unsigned
|
1513
|
+
get_current_codepage(void)
|
1514
|
+
{
|
1515
|
+
char *locale, *p;
|
1516
|
+
unsigned cp;
|
1517
|
+
|
1518
|
+
locale = setlocale(LC_CTYPE, NULL);
|
1519
|
+
if (locale == NULL)
|
1520
|
+
return (GetACP());
|
1521
|
+
if (locale[0] == 'C' && locale[1] == '\0')
|
1522
|
+
return (CP_C_LOCALE);
|
1523
|
+
p = strrchr(locale, '.');
|
1524
|
+
if (p == NULL)
|
1525
|
+
return (GetACP());
|
1526
|
+
if (strcmp(p+1, "utf8") == 0)
|
1527
|
+
return CP_UTF8;
|
1528
|
+
cp = my_atoi(p+1);
|
1529
|
+
if ((int)cp <= 0)
|
1530
|
+
return (GetACP());
|
1531
|
+
return (cp);
|
1532
|
+
}
|
1533
|
+
|
1534
|
+
/*
 * Translation table between Locale Name and ACP/OEMCP.
 * The table is terminated by an entry whose acp is 0.  Each locale name
 * appears once: lookups take the first matching row, so repeated rows
 * could never be reached.
 */
static struct {
	unsigned acp;
	unsigned ocp;
	const char *locale;
} acp_ocp_map[] = {
	{  950,  950, "Chinese_Taiwan" },
	{  936,  936, "Chinese_People's Republic of China" },
	{ 1250,  852, "Czech_Czech Republic" },
	{ 1252,  850, "Danish_Denmark" },
	{ 1252,  850, "Dutch_Netherlands" },
	{ 1252,  850, "Dutch_Belgium" },
	{ 1252,  437, "English_United States" },
	{ 1252,  850, "English_Australia" },
	{ 1252,  850, "English_Canada" },
	{ 1252,  850, "English_New Zealand" },
	{ 1252,  850, "English_United Kingdom" },
	{ 1252,  850, "Finnish_Finland" },
	{ 1252,  850, "French_France" },
	{ 1252,  850, "French_Belgium" },
	{ 1252,  850, "French_Canada" },
	{ 1252,  850, "French_Switzerland" },
	{ 1252,  850, "German_Germany" },
	{ 1252,  850, "German_Austria" },
	{ 1252,  850, "German_Switzerland" },
	{ 1253,  737, "Greek_Greece" },
	{ 1250,  852, "Hungarian_Hungary" },
	{ 1252,  850, "Icelandic_Iceland" },
	{ 1252,  850, "Italian_Italy" },
	{ 1252,  850, "Italian_Switzerland" },
	{  932,  932, "Japanese_Japan" },
	{  949,  949, "Korean_Korea" },
	{ 1252,  850, "Norwegian (BokmOl)_Norway" },
	{ 1252,  850, "Norwegian-Nynorsk_Norway" },
	{ 1250,  852, "Polish_Poland" },
	{ 1252,  850, "Portuguese_Portugal" },
	{ 1252,  850, "Portuguese_Brazil" },
	{ 1251,  866, "Russian_Russia" },
	{ 1250,  852, "Slovak_Slovakia" },
	{ 1252,  850, "Spanish_Spain" },
	{ 1252,  850, "Spanish_Mexico" },
	{ 1252,  850, "Swedish_Sweden" },
	{ 1254,  857, "Turkish_Turkey" },
	{ 0, 0, NULL}
};
|
1585
|
+
|
1586
|
+
/*
|
1587
|
+
* Return OEM Code Page of current locale set by setlocale().
|
1588
|
+
*/
|
1589
|
+
static unsigned
|
1590
|
+
get_current_oemcp(void)
|
1591
|
+
{
|
1592
|
+
int i;
|
1593
|
+
char *locale, *p;
|
1594
|
+
size_t len;
|
1595
|
+
|
1596
|
+
locale = setlocale(LC_CTYPE, NULL);
|
1597
|
+
if (locale == NULL)
|
1598
|
+
return (GetOEMCP());
|
1599
|
+
if (locale[0] == 'C' && locale[1] == '\0')
|
1600
|
+
return (CP_C_LOCALE);
|
1601
|
+
|
1602
|
+
p = strrchr(locale, '.');
|
1603
|
+
if (p == NULL)
|
1604
|
+
return (GetOEMCP());
|
1605
|
+
len = p - locale;
|
1606
|
+
for (i = 0; acp_ocp_map[i].acp; i++) {
|
1607
|
+
if (strncmp(acp_ocp_map[i].locale, locale, len) == 0)
|
1608
|
+
return (acp_ocp_map[i].ocp);
|
1609
|
+
}
|
1610
|
+
return (GetOEMCP());
|
1611
|
+
}
|
1612
|
+
#else
|
1613
|
+
|
1614
|
+
/*
|
1615
|
+
* POSIX platform does not use CodePage.
|
1616
|
+
*/
|
1617
|
+
|
1618
|
+
/* Non-Windows build: there is no code page, so always report "unknown". */
static unsigned
get_current_codepage(void)
{
	const unsigned unknown_codepage = (unsigned)-1;

	return (unknown_codepage);
}
|
1623
|
+
/* Non-Windows build: charset names never map to a code page. */
static unsigned
make_codepage_from_charset(const char *charset)
{
	const unsigned unknown_codepage = (unsigned)-1;

	(void)charset; /* UNUSED */
	return (unknown_codepage);
}
|
1629
|
+
/* Non-Windows build: there is no OEM code page, so report "unknown". */
static unsigned
get_current_oemcp(void)
{
	const unsigned unknown_codepage = (unsigned)-1;

	return (unknown_codepage);
}
|
1634
|
+
|
1635
|
+
#endif /* defined(_WIN32) && !defined(__CYGWIN__) */
|
1636
|
+
|
1637
|
+
/*
|
1638
|
+
* Return a string conversion object.
|
1639
|
+
*/
|
1640
|
+
static struct archive_string_conv *
|
1641
|
+
get_sconv_object(struct archive *a, const char *fc, const char *tc, int flag)
|
1642
|
+
{
|
1643
|
+
struct archive_string_conv *sc;
|
1644
|
+
unsigned current_codepage;
|
1645
|
+
|
1646
|
+
/* Check if we have made the sconv object. */
|
1647
|
+
sc = find_sconv_object(a, fc, tc);
|
1648
|
+
if (sc != NULL)
|
1649
|
+
return (sc);
|
1650
|
+
|
1651
|
+
if (a == NULL)
|
1652
|
+
current_codepage = get_current_codepage();
|
1653
|
+
else
|
1654
|
+
current_codepage = a->current_codepage;
|
1655
|
+
|
1656
|
+
sc = create_sconv_object(canonical_charset_name(fc),
|
1657
|
+
canonical_charset_name(tc), current_codepage, flag);
|
1658
|
+
if (sc == NULL) {
|
1659
|
+
if (a != NULL)
|
1660
|
+
archive_set_error(a, ENOMEM,
|
1661
|
+
"Could not allocate memory for "
|
1662
|
+
"a string conversion object");
|
1663
|
+
return (NULL);
|
1664
|
+
}
|
1665
|
+
|
1666
|
+
/*
|
1667
|
+
* If there is no converter for current string conversion object,
|
1668
|
+
* we cannot handle this conversion.
|
1669
|
+
*/
|
1670
|
+
if (sc->nconverter == 0) {
|
1671
|
+
if (a != NULL) {
|
1672
|
+
#if HAVE_ICONV
|
1673
|
+
archive_set_error(a, ARCHIVE_ERRNO_MISC,
|
1674
|
+
"iconv_open failed : Cannot handle ``%s''",
|
1675
|
+
(flag & SCONV_TO_CHARSET)?tc:fc);
|
1676
|
+
#else
|
1677
|
+
archive_set_error(a, ARCHIVE_ERRNO_MISC,
|
1678
|
+
"A character-set conversion not fully supported "
|
1679
|
+
"on this platform");
|
1680
|
+
#endif
|
1681
|
+
}
|
1682
|
+
/* Failed; free a sconv object. */
|
1683
|
+
free_sconv_object(sc);
|
1684
|
+
return (NULL);
|
1685
|
+
}
|
1686
|
+
|
1687
|
+
/*
|
1688
|
+
* Success!
|
1689
|
+
*/
|
1690
|
+
if (a != NULL)
|
1691
|
+
add_sconv_object(a, sc);
|
1692
|
+
return (sc);
|
1693
|
+
}
|
1694
|
+
|
1695
|
+
static const char *
|
1696
|
+
get_current_charset(struct archive *a)
|
1697
|
+
{
|
1698
|
+
const char *cur_charset;
|
1699
|
+
|
1700
|
+
if (a == NULL)
|
1701
|
+
cur_charset = default_iconv_charset("");
|
1702
|
+
else {
|
1703
|
+
cur_charset = default_iconv_charset(a->current_code);
|
1704
|
+
if (a->current_code == NULL) {
|
1705
|
+
a->current_code = strdup(cur_charset);
|
1706
|
+
a->current_codepage = get_current_codepage();
|
1707
|
+
a->current_oemcp = get_current_oemcp();
|
1708
|
+
}
|
1709
|
+
}
|
1710
|
+
return (cur_charset);
|
1711
|
+
}
|
1712
|
+
|
1713
|
+
/*
|
1714
|
+
* Make and Return a string conversion object.
|
1715
|
+
* Return NULL if the platform does not support the specified conversion
|
1716
|
+
* and best_effort is 0.
|
1717
|
+
* If best_effort is set, A string conversion object must be returned
|
1718
|
+
* unless memory allocation for the object fails, but the conversion
|
1719
|
+
* might fail when non-ASCII code is found.
|
1720
|
+
*/
|
1721
|
+
struct archive_string_conv *
|
1722
|
+
archive_string_conversion_to_charset(struct archive *a, const char *charset,
|
1723
|
+
int best_effort)
|
1724
|
+
{
|
1725
|
+
int flag = SCONV_TO_CHARSET;
|
1726
|
+
|
1727
|
+
if (best_effort)
|
1728
|
+
flag |= SCONV_BEST_EFFORT;
|
1729
|
+
return (get_sconv_object(a, get_current_charset(a), charset, flag));
|
1730
|
+
}
|
1731
|
+
|
1732
|
+
struct archive_string_conv *
|
1733
|
+
archive_string_conversion_from_charset(struct archive *a, const char *charset,
|
1734
|
+
int best_effort)
|
1735
|
+
{
|
1736
|
+
int flag = SCONV_FROM_CHARSET;
|
1737
|
+
|
1738
|
+
if (best_effort)
|
1739
|
+
flag |= SCONV_BEST_EFFORT;
|
1740
|
+
return (get_sconv_object(a, charset, get_current_charset(a), flag));
|
1741
|
+
}
|
1742
|
+
|
1743
|
+
/*
|
1744
|
+
* archive_string_default_conversion_*_archive() are provided for Windows
|
1745
|
+
 * platform because other archiver applications use CP_OEMCP for
|
1746
|
+
* MultiByteToWideChar() and WideCharToMultiByte() for the filenames
|
1747
|
+
* in tar or zip files. But mbstowcs/wcstombs(CRT) usually use CP_ACP
|
1748
|
+
* unless you use setlocale(LC_ALL, ".OCP")(specify CP_OEMCP).
|
1749
|
+
* So we should make a string conversion between CP_ACP and CP_OEMCP
|
1750
|
+
* for compatibility.
|
1751
|
+
*/
|
1752
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1753
|
+
struct archive_string_conv *
|
1754
|
+
archive_string_default_conversion_for_read(struct archive *a)
|
1755
|
+
{
|
1756
|
+
const char *cur_charset = get_current_charset(a);
|
1757
|
+
char oemcp[16];
|
1758
|
+
|
1759
|
+
/* NOTE: a check of cur_charset is unneeded but we need
|
1760
|
+
* that get_current_charset() has been surely called at
|
1761
|
+
* this time whatever C compiler optimized. */
|
1762
|
+
if (cur_charset != NULL &&
|
1763
|
+
(a->current_codepage == CP_C_LOCALE ||
|
1764
|
+
a->current_codepage == a->current_oemcp))
|
1765
|
+
return (NULL);/* no conversion. */
|
1766
|
+
|
1767
|
+
_snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);
|
1768
|
+
/* Make sure a null termination must be set. */
|
1769
|
+
oemcp[sizeof(oemcp)-1] = '\0';
|
1770
|
+
return (get_sconv_object(a, oemcp, cur_charset,
|
1771
|
+
SCONV_FROM_CHARSET));
|
1772
|
+
}
|
1773
|
+
|
1774
|
+
struct archive_string_conv *
|
1775
|
+
archive_string_default_conversion_for_write(struct archive *a)
|
1776
|
+
{
|
1777
|
+
const char *cur_charset = get_current_charset(a);
|
1778
|
+
char oemcp[16];
|
1779
|
+
|
1780
|
+
/* NOTE: a check of cur_charset is unneeded but we need
|
1781
|
+
* that get_current_charset() has been surely called at
|
1782
|
+
* this time whatever C compiler optimized. */
|
1783
|
+
if (cur_charset != NULL &&
|
1784
|
+
(a->current_codepage == CP_C_LOCALE ||
|
1785
|
+
a->current_codepage == a->current_oemcp))
|
1786
|
+
return (NULL);/* no conversion. */
|
1787
|
+
|
1788
|
+
_snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);
|
1789
|
+
/* Make sure a null termination must be set. */
|
1790
|
+
oemcp[sizeof(oemcp)-1] = '\0';
|
1791
|
+
return (get_sconv_object(a, cur_charset, oemcp,
|
1792
|
+
SCONV_TO_CHARSET));
|
1793
|
+
}
|
1794
|
+
#else
|
1795
|
+
/* Non-Windows build: reading never needs a default conversion. */
struct archive_string_conv *
archive_string_default_conversion_for_read(struct archive *a)
{
	struct archive_string_conv *no_conversion = NULL;

	(void)a; /* UNUSED */
	return (no_conversion);
}
|
1801
|
+
|
1802
|
+
/* Non-Windows build: writing never needs a default conversion. */
struct archive_string_conv *
archive_string_default_conversion_for_write(struct archive *a)
{
	struct archive_string_conv *no_conversion = NULL;

	(void)a; /* UNUSED */
	return (no_conversion);
}
|
1808
|
+
#endif
|
1809
|
+
|
1810
|
+
/*
|
1811
|
+
* Dispose of all character conversion objects in the archive object.
|
1812
|
+
*/
|
1813
|
+
void
|
1814
|
+
archive_string_conversion_free(struct archive *a)
|
1815
|
+
{
|
1816
|
+
struct archive_string_conv *sc;
|
1817
|
+
struct archive_string_conv *sc_next;
|
1818
|
+
|
1819
|
+
for (sc = a->sconv; sc != NULL; sc = sc_next) {
|
1820
|
+
sc_next = sc->next;
|
1821
|
+
free_sconv_object(sc);
|
1822
|
+
}
|
1823
|
+
a->sconv = NULL;
|
1824
|
+
free(a->current_code);
|
1825
|
+
a->current_code = NULL;
|
1826
|
+
}
|
1827
|
+
|
1828
|
+
/*
|
1829
|
+
* Return a conversion charset name.
|
1830
|
+
*/
|
1831
|
+
const char *
|
1832
|
+
archive_string_conversion_charset_name(struct archive_string_conv *sc)
|
1833
|
+
{
|
1834
|
+
if (sc->flag & SCONV_TO_CHARSET)
|
1835
|
+
return (sc->to_charset);
|
1836
|
+
else
|
1837
|
+
return (sc->from_charset);
|
1838
|
+
}
|
1839
|
+
|
1840
|
+
/*
|
1841
|
+
* Change the behavior of a string conversion.
|
1842
|
+
*/
|
1843
|
+
void
|
1844
|
+
archive_string_conversion_set_opt(struct archive_string_conv *sc, int opt)
|
1845
|
+
{
|
1846
|
+
switch (opt) {
|
1847
|
+
/*
|
1848
|
+
* A filename in UTF-8 was made with libarchive 2.x in a wrong
|
1849
|
+
* assumption that wchar_t was Unicode.
|
1850
|
+
* This option enables simulating the assumption in order to read
|
1851
|
+
* that filename correctly.
|
1852
|
+
*/
|
1853
|
+
case SCONV_SET_OPT_UTF8_LIBARCHIVE2X:
|
1854
|
+
#if (defined(_WIN32) && !defined(__CYGWIN__)) \
|
1855
|
+
|| defined(__STDC_ISO_10646__) || defined(__APPLE__)
|
1856
|
+
/*
|
1857
|
+
* Nothing to do for it since wchar_t on these platforms
|
1858
|
+
* is really Unicode.
|
1859
|
+
*/
|
1860
|
+
(void)sc; /* UNUSED */
|
1861
|
+
#else
|
1862
|
+
if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) {
|
1863
|
+
sc->flag |= SCONV_UTF8_LIBARCHIVE_2;
|
1864
|
+
/* Set up string converters. */
|
1865
|
+
setup_converter(sc);
|
1866
|
+
}
|
1867
|
+
#endif
|
1868
|
+
break;
|
1869
|
+
case SCONV_SET_OPT_NORMALIZATION_C:
|
1870
|
+
if ((sc->flag & SCONV_NORMALIZATION_C) == 0) {
|
1871
|
+
sc->flag |= SCONV_NORMALIZATION_C;
|
1872
|
+
sc->flag &= ~SCONV_NORMALIZATION_D;
|
1873
|
+
/* Set up string converters. */
|
1874
|
+
setup_converter(sc);
|
1875
|
+
}
|
1876
|
+
break;
|
1877
|
+
case SCONV_SET_OPT_NORMALIZATION_D:
|
1878
|
+
#if defined(HAVE_ICONV)
|
1879
|
+
/*
|
1880
|
+
* If iconv will take the string, do not change the
|
1881
|
+
* setting of the normalization.
|
1882
|
+
*/
|
1883
|
+
if (!(sc->flag & SCONV_WIN_CP) &&
|
1884
|
+
(sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
|
1885
|
+
!(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
|
1886
|
+
break;
|
1887
|
+
#endif
|
1888
|
+
if ((sc->flag & SCONV_NORMALIZATION_D) == 0) {
|
1889
|
+
sc->flag |= SCONV_NORMALIZATION_D;
|
1890
|
+
sc->flag &= ~SCONV_NORMALIZATION_C;
|
1891
|
+
/* Set up string converters. */
|
1892
|
+
setup_converter(sc);
|
1893
|
+
}
|
1894
|
+
break;
|
1895
|
+
default:
|
1896
|
+
break;
|
1897
|
+
}
|
1898
|
+
}
|
1899
|
+
|
1900
|
+
/*
|
1901
|
+
*
|
1902
|
+
* Copy one archive_string to another in locale conversion.
|
1903
|
+
*
|
1904
|
+
* archive_strncat_l();
|
1905
|
+
* archive_strncpy_l();
|
1906
|
+
*
|
1907
|
+
*/
|
1908
|
+
|
1909
|
+
/*
 * Bounded strlen(): return the number of bytes before the first NUL in
 * _p, looking at no more than the first n bytes.  A NULL pointer has
 * length 0.
 */
static size_t
mbsnbytes(const void *_p, size_t n)
{
	const char *bytes = (const char *)_p;
	size_t count;

	if (bytes == NULL)
		return (0);

	for (count = 0; count < n && bytes[count] != '\0'; count++)
		continue;
	return (count);
}
|
1928
|
+
|
1929
|
+
/*
 * Length in bytes of a UTF-16 string: counts 2-byte units until a unit
 * whose two bytes are both zero, looking at no more than n/2 units.
 * The result is always even; a NULL pointer has length 0.
 */
static size_t
utf16nbytes(const void *_p, size_t n)
{
	const char *bytes = (const char *)_p;
	size_t units, max_units;

	if (bytes == NULL)
		return (0);

	max_units = n >> 1;	/* An odd trailing byte is ignored. */
	for (units = 0; units < max_units; units++) {
		const char *unit = bytes + (units << 1);

		if (unit[0] == 0 && unit[1] == 0)
			break;
	}
	return (units << 1);
}
|
1949
|
+
|
1950
|
+
int
|
1951
|
+
archive_strncpy_l(struct archive_string *as, const void *_p, size_t n,
|
1952
|
+
struct archive_string_conv *sc)
|
1953
|
+
{
|
1954
|
+
as->length = 0;
|
1955
|
+
return (archive_strncat_l(as, _p, n, sc));
|
1956
|
+
}
|
1957
|
+
|
1958
|
+
int
|
1959
|
+
archive_strncat_l(struct archive_string *as, const void *_p, size_t n,
|
1960
|
+
struct archive_string_conv *sc)
|
1961
|
+
{
|
1962
|
+
const void *s;
|
1963
|
+
size_t length = 0;
|
1964
|
+
int i, r = 0, r2;
|
1965
|
+
|
1966
|
+
if (_p != NULL && n > 0) {
|
1967
|
+
if (sc != NULL && (sc->flag & SCONV_FROM_UTF16))
|
1968
|
+
length = utf16nbytes(_p, n);
|
1969
|
+
else
|
1970
|
+
length = mbsnbytes(_p, n);
|
1971
|
+
}
|
1972
|
+
|
1973
|
+
/* We must allocate memory even if there is no data for conversion
|
1974
|
+
* or copy. This simulates archive_string_append behavior. */
|
1975
|
+
if (length == 0) {
|
1976
|
+
int tn = 1;
|
1977
|
+
if (sc != NULL && (sc->flag & SCONV_TO_UTF16))
|
1978
|
+
tn = 2;
|
1979
|
+
if (archive_string_ensure(as, as->length + tn) == NULL)
|
1980
|
+
return (-1);
|
1981
|
+
as->s[as->length] = 0;
|
1982
|
+
if (tn == 2)
|
1983
|
+
as->s[as->length+1] = 0;
|
1984
|
+
return (0);
|
1985
|
+
}
|
1986
|
+
|
1987
|
+
/*
|
1988
|
+
* If sc is NULL, we just make a copy.
|
1989
|
+
*/
|
1990
|
+
if (sc == NULL) {
|
1991
|
+
if (archive_string_append(as, _p, length) == NULL)
|
1992
|
+
return (-1);/* No memory */
|
1993
|
+
return (0);
|
1994
|
+
}
|
1995
|
+
|
1996
|
+
s = _p;
|
1997
|
+
i = 0;
|
1998
|
+
if (sc->nconverter > 1) {
|
1999
|
+
sc->utftmp.length = 0;
|
2000
|
+
r2 = sc->converter[0](&(sc->utftmp), s, length, sc);
|
2001
|
+
if (r2 != 0 && errno == ENOMEM)
|
2002
|
+
return (r2);
|
2003
|
+
if (r > r2)
|
2004
|
+
r = r2;
|
2005
|
+
s = sc->utftmp.s;
|
2006
|
+
length = sc->utftmp.length;
|
2007
|
+
++i;
|
2008
|
+
}
|
2009
|
+
r2 = sc->converter[i](as, s, length, sc);
|
2010
|
+
if (r > r2)
|
2011
|
+
r = r2;
|
2012
|
+
return (r);
|
2013
|
+
}
|
2014
|
+
|
2015
|
+
#if HAVE_ICONV
|
2016
|
+
|
2017
|
+
/*
|
2018
|
+
* Return -1 if conversion fails.
|
2019
|
+
*/
|
2020
|
+
static int
|
2021
|
+
iconv_strncat_in_locale(struct archive_string *as, const void *_p,
|
2022
|
+
size_t length, struct archive_string_conv *sc)
|
2023
|
+
{
|
2024
|
+
ICONV_CONST char *itp;
|
2025
|
+
size_t remaining;
|
2026
|
+
iconv_t cd;
|
2027
|
+
char *outp;
|
2028
|
+
size_t avail, bs;
|
2029
|
+
int return_value = 0; /* success */
|
2030
|
+
int to_size, from_size;
|
2031
|
+
|
2032
|
+
if (sc->flag & SCONV_TO_UTF16)
|
2033
|
+
to_size = 2;
|
2034
|
+
else
|
2035
|
+
to_size = 1;
|
2036
|
+
if (sc->flag & SCONV_FROM_UTF16)
|
2037
|
+
from_size = 2;
|
2038
|
+
else
|
2039
|
+
from_size = 1;
|
2040
|
+
|
2041
|
+
if (archive_string_ensure(as, as->length + length*2+to_size) == NULL)
|
2042
|
+
return (-1);
|
2043
|
+
|
2044
|
+
cd = sc->cd;
|
2045
|
+
itp = (char *)(uintptr_t)_p;
|
2046
|
+
remaining = length;
|
2047
|
+
outp = as->s + as->length;
|
2048
|
+
avail = as->buffer_length - as->length - to_size;
|
2049
|
+
while (remaining >= (size_t)from_size) {
|
2050
|
+
size_t result = iconv(cd, &itp, &remaining, &outp, &avail);
|
2051
|
+
|
2052
|
+
if (result != (size_t)-1)
|
2053
|
+
break; /* Conversion completed. */
|
2054
|
+
|
2055
|
+
if (errno == EILSEQ || errno == EINVAL) {
|
2056
|
+
/*
|
2057
|
+
* If an output charset is UTF-8 or UTF-16BE/LE,
|
2058
|
+
* unknown character should be U+FFFD
|
2059
|
+
* (replacement character).
|
2060
|
+
*/
|
2061
|
+
if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) {
|
2062
|
+
size_t rbytes;
|
2063
|
+
if (sc->flag & SCONV_TO_UTF8)
|
2064
|
+
rbytes = sizeof(utf8_replacement_char);
|
2065
|
+
else
|
2066
|
+
rbytes = 2;
|
2067
|
+
|
2068
|
+
if (avail < rbytes) {
|
2069
|
+
as->length = outp - as->s;
|
2070
|
+
bs = as->buffer_length +
|
2071
|
+
(remaining * to_size) + rbytes;
|
2072
|
+
if (NULL ==
|
2073
|
+
archive_string_ensure(as, bs))
|
2074
|
+
return (-1);
|
2075
|
+
outp = as->s + as->length;
|
2076
|
+
avail = as->buffer_length
|
2077
|
+
- as->length - to_size;
|
2078
|
+
}
|
2079
|
+
if (sc->flag & SCONV_TO_UTF8)
|
2080
|
+
memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char));
|
2081
|
+
else if (sc->flag & SCONV_TO_UTF16BE)
|
2082
|
+
archive_be16enc(outp, UNICODE_R_CHAR);
|
2083
|
+
else
|
2084
|
+
archive_le16enc(outp, UNICODE_R_CHAR);
|
2085
|
+
outp += rbytes;
|
2086
|
+
avail -= rbytes;
|
2087
|
+
} else {
|
2088
|
+
/* Skip the illegal input bytes. */
|
2089
|
+
*outp++ = '?';
|
2090
|
+
avail--;
|
2091
|
+
}
|
2092
|
+
itp += from_size;
|
2093
|
+
remaining -= from_size;
|
2094
|
+
return_value = -1; /* failure */
|
2095
|
+
} else {
|
2096
|
+
/* E2BIG no output buffer,
|
2097
|
+
* Increase an output buffer. */
|
2098
|
+
as->length = outp - as->s;
|
2099
|
+
bs = as->buffer_length + remaining * 2;
|
2100
|
+
if (NULL == archive_string_ensure(as, bs))
|
2101
|
+
return (-1);
|
2102
|
+
outp = as->s + as->length;
|
2103
|
+
avail = as->buffer_length - as->length - to_size;
|
2104
|
+
}
|
2105
|
+
}
|
2106
|
+
as->length = outp - as->s;
|
2107
|
+
as->s[as->length] = 0;
|
2108
|
+
if (to_size == 2)
|
2109
|
+
as->s[as->length+1] = 0;
|
2110
|
+
return (return_value);
|
2111
|
+
}
|
2112
|
+
|
2113
|
+
#endif /* HAVE_ICONV */
|
2114
|
+
|
2115
|
+
|
2116
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
2117
|
+
|
2118
|
+
/*
|
2119
|
+
* Translate a string from a some CodePage to an another CodePage by
|
2120
|
+
* Windows APIs, and copy the result. Return -1 if conversion fails.
|
2121
|
+
*/
|
2122
|
+
static int
|
2123
|
+
strncat_in_codepage(struct archive_string *as,
|
2124
|
+
const void *_p, size_t length, struct archive_string_conv *sc)
|
2125
|
+
{
|
2126
|
+
const char *s = (const char *)_p;
|
2127
|
+
struct archive_wstring aws;
|
2128
|
+
size_t l;
|
2129
|
+
int r, saved_flag;
|
2130
|
+
|
2131
|
+
archive_string_init(&aws);
|
2132
|
+
saved_flag = sc->flag;
|
2133
|
+
sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C);
|
2134
|
+
r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc);
|
2135
|
+
sc->flag = saved_flag;
|
2136
|
+
if (r != 0) {
|
2137
|
+
archive_wstring_free(&aws);
|
2138
|
+
if (errno != ENOMEM)
|
2139
|
+
archive_string_append(as, s, length);
|
2140
|
+
return (-1);
|
2141
|
+
}
|
2142
|
+
|
2143
|
+
l = as->length;
|
2144
|
+
r = archive_string_append_from_wcs_in_codepage(
|
2145
|
+
as, aws.s, aws.length, sc);
|
2146
|
+
if (r != 0 && errno != ENOMEM && l == as->length)
|
2147
|
+
archive_string_append(as, s, length);
|
2148
|
+
archive_wstring_free(&aws);
|
2149
|
+
return (r);
|
2150
|
+
}
|
2151
|
+
|
2152
|
+
/*
|
2153
|
+
* Test whether MBS ==> WCS is okay.
|
2154
|
+
*/
|
2155
|
+
static int
|
2156
|
+
invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
|
2157
|
+
{
|
2158
|
+
const char *p = (const char *)_p;
|
2159
|
+
unsigned codepage;
|
2160
|
+
DWORD mbflag = MB_ERR_INVALID_CHARS;
|
2161
|
+
|
2162
|
+
if (sc->flag & SCONV_FROM_CHARSET)
|
2163
|
+
codepage = sc->to_cp;
|
2164
|
+
else
|
2165
|
+
codepage = sc->from_cp;
|
2166
|
+
|
2167
|
+
if (codepage == CP_C_LOCALE)
|
2168
|
+
return (0);
|
2169
|
+
if (codepage != CP_UTF8)
|
2170
|
+
mbflag |= MB_PRECOMPOSED;
|
2171
|
+
|
2172
|
+
if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0)
|
2173
|
+
return (-1); /* Invalid */
|
2174
|
+
return (0); /* Okay */
|
2175
|
+
}
|
2176
|
+
|
2177
|
+
#else
|
2178
|
+
|
2179
|
+
/*
 * Test whether MBS ==> WCS is okay (non-Windows version).
 *
 * Decodes up to `n' bytes with mbrtowc()/mbtowc().  Returns -1 at the
 * first sequence the decoder reports as an error ((size_t)-1 or
 * (size_t)-2), and 0 when all bytes decode or the decoder returns 0.
 */
static int
invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
{
	const char *cursor = (const char *)_p;
	size_t used;

#if HAVE_MBRTOWC
	mbstate_t state;

	memset(&state, 0, sizeof(state));
#else
	/* Clear the shift state before starting. */
	mbtowc(NULL, NULL, 0);
#endif
	for (; n != 0; cursor += used, n -= used) {
		wchar_t wc;

#if HAVE_MBRTOWC
		used = mbrtowc(&wc, cursor, n, &state);
#else
		used = mbtowc(&wc, cursor, n);
#endif
		if (used == (size_t)-1 || used == (size_t)-2)
			return (-1);	/* Invalid. */
		if (used == 0)
			break;		/* Decoder returned 0: stop. */
	}
	(void)sc; /* UNUSED */
	return (0);	/* All okay. */
}
|
2214
|
+
|
2215
|
+
#endif /* defined(_WIN32) && !defined(__CYGWIN__) */
|
2216
|
+
|
2217
|
+
/*
|
2218
|
+
* Basically returns -1 because we cannot make a conversion of charset
|
2219
|
+
* without iconv but in some cases this would return 0.
|
2220
|
+
* Returns 0 if all copied characters are ASCII.
|
2221
|
+
* Returns 0 if both from-locale and to-locale are the same and those
|
2222
|
+
* can be WCS with no error.
|
2223
|
+
*/
|
2224
|
+
static int
|
2225
|
+
best_effort_strncat_in_locale(struct archive_string *as, const void *_p,
|
2226
|
+
size_t length, struct archive_string_conv *sc)
|
2227
|
+
{
|
2228
|
+
size_t remaining;
|
2229
|
+
const uint8_t *itp;
|
2230
|
+
int return_value = 0; /* success */
|
2231
|
+
|
2232
|
+
/*
|
2233
|
+
* If both from-locale and to-locale is the same, this makes a copy.
|
2234
|
+
* And then this checks all copied MBS can be WCS if so returns 0.
|
2235
|
+
*/
|
2236
|
+
if (sc->same) {
|
2237
|
+
if (archive_string_append(as, _p, length) == NULL)
|
2238
|
+
return (-1);/* No memory */
|
2239
|
+
return (invalid_mbs(_p, length, sc));
|
2240
|
+
}
|
2241
|
+
|
2242
|
+
/*
|
2243
|
+
* If a character is ASCII, this just copies it. If not, this
|
2244
|
+
* assigns '?' character instead but in UTF-8 locale this assigns
|
2245
|
+
* byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD,
|
2246
|
+
* a Replacement Character in Unicode.
|
2247
|
+
*/
|
2248
|
+
|
2249
|
+
remaining = length;
|
2250
|
+
itp = (const uint8_t *)_p;
|
2251
|
+
while (*itp && remaining > 0) {
|
2252
|
+
if (*itp > 127) {
|
2253
|
+
// Non-ASCII: Substitute with suitable replacement
|
2254
|
+
if (sc->flag & SCONV_TO_UTF8) {
|
2255
|
+
if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) {
|
2256
|
+
__archive_errx(1, "Out of memory");
|
2257
|
+
}
|
2258
|
+
} else {
|
2259
|
+
archive_strappend_char(as, '?');
|
2260
|
+
}
|
2261
|
+
return_value = -1;
|
2262
|
+
} else {
|
2263
|
+
archive_strappend_char(as, *itp);
|
2264
|
+
}
|
2265
|
+
++itp;
|
2266
|
+
}
|
2267
|
+
return (return_value);
|
2268
|
+
}
|
2269
|
+
|
2270
|
+
|
2271
|
+
/*
|
2272
|
+
* Unicode conversion functions.
|
2273
|
+
* - UTF-8 <===> UTF-8 in removing surrogate pairs.
|
2274
|
+
* - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs.
|
2275
|
+
* - UTF-8 made by libarchive 2.x ===> UTF-8.
|
2276
|
+
* - UTF-16BE <===> UTF-8.
|
2277
|
+
*
|
2278
|
+
*/
|
2279
|
+
|
2280
|
+
/*
|
2281
|
+
* Utility to convert a single UTF-8 sequence.
|
2282
|
+
*
|
2283
|
+
* Usually return used bytes, return used byte in negative value when
|
2284
|
+
* a unicode character is replaced with U+FFFD.
|
2285
|
+
* See also http://unicode.org/review/pr-121.html Public Review Issue #121
|
2286
|
+
* Recommended Practice for Replacement Characters.
|
2287
|
+
*/
|
2288
|
+
static int
|
2289
|
+
_utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
|
2290
|
+
{
|
2291
|
+
static const char utf8_count[256] = {
|
2292
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */
|
2293
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */
|
2294
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */
|
2295
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */
|
2296
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */
|
2297
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */
|
2298
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */
|
2299
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */
|
2300
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */
|
2301
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */
|
2302
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */
|
2303
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */
|
2304
|
+
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */
|
2305
|
+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */
|
2306
|
+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */
|
2307
|
+
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */
|
2308
|
+
};
|
2309
|
+
int ch, i;
|
2310
|
+
int cnt;
|
2311
|
+
uint32_t wc;
|
2312
|
+
|
2313
|
+
/* Sanity check. */
|
2314
|
+
if (n == 0)
|
2315
|
+
return (0);
|
2316
|
+
/*
|
2317
|
+
* Decode 1-4 bytes depending on the value of the first byte.
|
2318
|
+
*/
|
2319
|
+
ch = (unsigned char)*s;
|
2320
|
+
if (ch == 0)
|
2321
|
+
return (0); /* Standard: return 0 for end-of-string. */
|
2322
|
+
cnt = utf8_count[ch];
|
2323
|
+
|
2324
|
+
/* Invalid sequence or there are not plenty bytes. */
|
2325
|
+
if ((int)n < cnt) {
|
2326
|
+
cnt = (int)n;
|
2327
|
+
for (i = 1; i < cnt; i++) {
|
2328
|
+
if ((s[i] & 0xc0) != 0x80) {
|
2329
|
+
cnt = i;
|
2330
|
+
break;
|
2331
|
+
}
|
2332
|
+
}
|
2333
|
+
goto invalid_sequence;
|
2334
|
+
}
|
2335
|
+
|
2336
|
+
/* Make a Unicode code point from a single UTF-8 sequence. */
|
2337
|
+
switch (cnt) {
|
2338
|
+
case 1: /* 1 byte sequence. */
|
2339
|
+
*pwc = ch & 0x7f;
|
2340
|
+
return (cnt);
|
2341
|
+
case 2: /* 2 bytes sequence. */
|
2342
|
+
if ((s[1] & 0xc0) != 0x80) {
|
2343
|
+
cnt = 1;
|
2344
|
+
goto invalid_sequence;
|
2345
|
+
}
|
2346
|
+
*pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f);
|
2347
|
+
return (cnt);
|
2348
|
+
case 3: /* 3 bytes sequence. */
|
2349
|
+
if ((s[1] & 0xc0) != 0x80) {
|
2350
|
+
cnt = 1;
|
2351
|
+
goto invalid_sequence;
|
2352
|
+
}
|
2353
|
+
if ((s[2] & 0xc0) != 0x80) {
|
2354
|
+
cnt = 2;
|
2355
|
+
goto invalid_sequence;
|
2356
|
+
}
|
2357
|
+
wc = ((ch & 0x0f) << 12)
|
2358
|
+
| ((s[1] & 0x3f) << 6)
|
2359
|
+
| (s[2] & 0x3f);
|
2360
|
+
if (wc < 0x800)
|
2361
|
+
goto invalid_sequence;/* Overlong sequence. */
|
2362
|
+
break;
|
2363
|
+
case 4: /* 4 bytes sequence. */
|
2364
|
+
if ((s[1] & 0xc0) != 0x80) {
|
2365
|
+
cnt = 1;
|
2366
|
+
goto invalid_sequence;
|
2367
|
+
}
|
2368
|
+
if ((s[2] & 0xc0) != 0x80) {
|
2369
|
+
cnt = 2;
|
2370
|
+
goto invalid_sequence;
|
2371
|
+
}
|
2372
|
+
if ((s[3] & 0xc0) != 0x80) {
|
2373
|
+
cnt = 3;
|
2374
|
+
goto invalid_sequence;
|
2375
|
+
}
|
2376
|
+
wc = ((ch & 0x07) << 18)
|
2377
|
+
| ((s[1] & 0x3f) << 12)
|
2378
|
+
| ((s[2] & 0x3f) << 6)
|
2379
|
+
| (s[3] & 0x3f);
|
2380
|
+
if (wc < 0x10000)
|
2381
|
+
goto invalid_sequence;/* Overlong sequence. */
|
2382
|
+
break;
|
2383
|
+
default: /* Others are all invalid sequence. */
|
2384
|
+
if (ch == 0xc0 || ch == 0xc1)
|
2385
|
+
cnt = 2;
|
2386
|
+
else if (ch >= 0xf5 && ch <= 0xf7)
|
2387
|
+
cnt = 4;
|
2388
|
+
else if (ch >= 0xf8 && ch <= 0xfb)
|
2389
|
+
cnt = 5;
|
2390
|
+
else if (ch == 0xfc || ch == 0xfd)
|
2391
|
+
cnt = 6;
|
2392
|
+
else
|
2393
|
+
cnt = 1;
|
2394
|
+
if ((int)n < cnt)
|
2395
|
+
cnt = (int)n;
|
2396
|
+
for (i = 1; i < cnt; i++) {
|
2397
|
+
if ((s[i] & 0xc0) != 0x80) {
|
2398
|
+
cnt = i;
|
2399
|
+
break;
|
2400
|
+
}
|
2401
|
+
}
|
2402
|
+
goto invalid_sequence;
|
2403
|
+
}
|
2404
|
+
|
2405
|
+
/* The code point larger than 0x10FFFF is not legal
|
2406
|
+
* Unicode values. */
|
2407
|
+
if (wc > UNICODE_MAX)
|
2408
|
+
goto invalid_sequence;
|
2409
|
+
/* Correctly gets a Unicode, returns used bytes. */
|
2410
|
+
*pwc = wc;
|
2411
|
+
return (cnt);
|
2412
|
+
invalid_sequence:
|
2413
|
+
*pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
|
2414
|
+
return (cnt * -1);
|
2415
|
+
}
|
2416
|
+
|
2417
|
+
/*
 * Decode a single UTF-8 sequence like _utf8_to_unicode(), but report a
 * 3-byte sequence that decodes to a surrogate value as invalid by
 * returning -3.  In that case *pwc is left holding the decoded surrogate.
 */
static int
utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	int used = _utf8_to_unicode(pwc, s, n);

	/* Any of Surrogate pair is not legal Unicode values. */
	return ((used == 3 && IS_SURROGATE_PAIR_LA(*pwc)) ? -3 : used);
}
|
2428
|
+
|
2429
|
+
/*
 * Assemble a code point from a surrogate pair: `uc' is the high (first)
 * surrogate and `uc2' the low (second) one.  No range checking is done.
 */
static inline uint32_t
combine_surrogate_pair(uint32_t uc, uint32_t uc2)
{
	const uint32_t high_bits = (uc - 0xD800) * 0x400;
	const uint32_t low_bits = uc2 - 0xDC00;

	return (high_bits + low_bits + 0x10000);
}
|
2438
|
+
|
2439
|
+
/*
|
2440
|
+
* Convert a single UTF-8/CESU-8 sequence to a Unicode code point in
|
2441
|
+
* removing surrogate pairs.
|
2442
|
+
*
|
2443
|
+
* CESU-8: The Compatibility Encoding Scheme for UTF-16.
|
2444
|
+
*
|
2445
|
+
* Usually return used bytes, return used byte in negative value when
|
2446
|
+
* a unicode character is replaced with U+FFFD.
|
2447
|
+
*/
|
2448
|
+
static int
|
2449
|
+
cesu8_to_unicode(uint32_t *pwc, const char *s, size_t n)
|
2450
|
+
{
|
2451
|
+
uint32_t wc = 0;
|
2452
|
+
int cnt;
|
2453
|
+
|
2454
|
+
cnt = _utf8_to_unicode(&wc, s, n);
|
2455
|
+
if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) {
|
2456
|
+
uint32_t wc2 = 0;
|
2457
|
+
if (n - 3 < 3) {
|
2458
|
+
/* Invalid byte sequence. */
|
2459
|
+
goto invalid_sequence;
|
2460
|
+
}
|
2461
|
+
cnt = _utf8_to_unicode(&wc2, s+3, n-3);
|
2462
|
+
if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) {
|
2463
|
+
/* Invalid byte sequence. */
|
2464
|
+
goto invalid_sequence;
|
2465
|
+
}
|
2466
|
+
wc = combine_surrogate_pair(wc, wc2);
|
2467
|
+
cnt = 6;
|
2468
|
+
} else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) {
|
2469
|
+
/* Invalid byte sequence. */
|
2470
|
+
goto invalid_sequence;
|
2471
|
+
}
|
2472
|
+
*pwc = wc;
|
2473
|
+
return (cnt);
|
2474
|
+
invalid_sequence:
|
2475
|
+
*pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
|
2476
|
+
if (cnt > 0)
|
2477
|
+
cnt *= -1;
|
2478
|
+
return (cnt);
|
2479
|
+
}
|
2480
|
+
|
2481
|
+
/*
|
2482
|
+
* Convert a Unicode code point to a single UTF-8 sequence.
|
2483
|
+
*
|
2484
|
+
* NOTE:This function does not check if the Unicode is legal or not.
|
2485
|
+
* Please you definitely check it before calling this.
|
2486
|
+
*/
|
2487
|
+
static size_t
|
2488
|
+
unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
|
2489
|
+
{
|
2490
|
+
char *_p = p;
|
2491
|
+
|
2492
|
+
/* Invalid Unicode char maps to Replacement character */
|
2493
|
+
if (uc > UNICODE_MAX)
|
2494
|
+
uc = UNICODE_R_CHAR;
|
2495
|
+
/* Translate code point to UTF8 */
|
2496
|
+
if (uc <= 0x7f) {
|
2497
|
+
if (remaining == 0)
|
2498
|
+
return (0);
|
2499
|
+
*p++ = (char)uc;
|
2500
|
+
} else if (uc <= 0x7ff) {
|
2501
|
+
if (remaining < 2)
|
2502
|
+
return (0);
|
2503
|
+
*p++ = 0xc0 | ((uc >> 6) & 0x1f);
|
2504
|
+
*p++ = 0x80 | (uc & 0x3f);
|
2505
|
+
} else if (uc <= 0xffff) {
|
2506
|
+
if (remaining < 3)
|
2507
|
+
return (0);
|
2508
|
+
*p++ = 0xe0 | ((uc >> 12) & 0x0f);
|
2509
|
+
*p++ = 0x80 | ((uc >> 6) & 0x3f);
|
2510
|
+
*p++ = 0x80 | (uc & 0x3f);
|
2511
|
+
} else {
|
2512
|
+
if (remaining < 4)
|
2513
|
+
return (0);
|
2514
|
+
*p++ = 0xf0 | ((uc >> 18) & 0x07);
|
2515
|
+
*p++ = 0x80 | ((uc >> 12) & 0x3f);
|
2516
|
+
*p++ = 0x80 | ((uc >> 6) & 0x3f);
|
2517
|
+
*p++ = 0x80 | (uc & 0x3f);
|
2518
|
+
}
|
2519
|
+
return (p - _p);
|
2520
|
+
}
|
2521
|
+
|
2522
|
+
/* Parse one UTF-16BE sequence; see utf16_to_unicode() for the contract. */
static int
utf16be_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 1;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}
|
2527
|
+
|
2528
|
+
/* Parse one UTF-16LE sequence; see utf16_to_unicode() for the contract. */
static int
utf16le_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 0;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}
|
2533
|
+
|
2534
|
+
static int
|
2535
|
+
utf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be)
|
2536
|
+
{
|
2537
|
+
const char *utf16 = s;
|
2538
|
+
unsigned uc;
|
2539
|
+
|
2540
|
+
if (n == 0)
|
2541
|
+
return (0);
|
2542
|
+
if (n == 1) {
|
2543
|
+
/* set the Replacement Character instead. */
|
2544
|
+
*pwc = UNICODE_R_CHAR;
|
2545
|
+
return (-1);
|
2546
|
+
}
|
2547
|
+
|
2548
|
+
if (be)
|
2549
|
+
uc = archive_be16dec(utf16);
|
2550
|
+
else
|
2551
|
+
uc = archive_le16dec(utf16);
|
2552
|
+
utf16 += 2;
|
2553
|
+
|
2554
|
+
/* If this is a surrogate pair, assemble the full code point.*/
|
2555
|
+
if (IS_HIGH_SURROGATE_LA(uc)) {
|
2556
|
+
unsigned uc2;
|
2557
|
+
|
2558
|
+
if (n >= 4) {
|
2559
|
+
if (be)
|
2560
|
+
uc2 = archive_be16dec(utf16);
|
2561
|
+
else
|
2562
|
+
uc2 = archive_le16dec(utf16);
|
2563
|
+
} else
|
2564
|
+
uc2 = 0;
|
2565
|
+
if (IS_LOW_SURROGATE_LA(uc2)) {
|
2566
|
+
uc = combine_surrogate_pair(uc, uc2);
|
2567
|
+
utf16 += 2;
|
2568
|
+
} else {
|
2569
|
+
/* Undescribed code point should be U+FFFD
|
2570
|
+
* (replacement character). */
|
2571
|
+
*pwc = UNICODE_R_CHAR;
|
2572
|
+
return (-2);
|
2573
|
+
}
|
2574
|
+
}
|
2575
|
+
|
2576
|
+
/*
|
2577
|
+
* Surrogate pair values(0xd800 through 0xdfff) are only
|
2578
|
+
* used by UTF-16, so, after above calculation, the code
|
2579
|
+
* must not be surrogate values, and Unicode has no codes
|
2580
|
+
* larger than 0x10ffff. Thus, those are not legal Unicode
|
2581
|
+
* values.
|
2582
|
+
*/
|
2583
|
+
if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) {
|
2584
|
+
/* Undescribed code point should be U+FFFD
|
2585
|
+
* (replacement character). */
|
2586
|
+
*pwc = UNICODE_R_CHAR;
|
2587
|
+
return (((int)(utf16 - s)) * -1);
|
2588
|
+
}
|
2589
|
+
*pwc = uc;
|
2590
|
+
return ((int)(utf16 - s));
|
2591
|
+
}
|
2592
|
+
|
2593
|
+
static size_t
|
2594
|
+
unicode_to_utf16be(char *p, size_t remaining, uint32_t uc)
|
2595
|
+
{
|
2596
|
+
char *utf16 = p;
|
2597
|
+
|
2598
|
+
if (uc > 0xffff) {
|
2599
|
+
/* We have a code point that won't fit into a
|
2600
|
+
* wchar_t; convert it to a surrogate pair. */
|
2601
|
+
if (remaining < 4)
|
2602
|
+
return (0);
|
2603
|
+
uc -= 0x10000;
|
2604
|
+
archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800);
|
2605
|
+
archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00);
|
2606
|
+
return (4);
|
2607
|
+
} else {
|
2608
|
+
if (remaining < 2)
|
2609
|
+
return (0);
|
2610
|
+
archive_be16enc(utf16, uc);
|
2611
|
+
return (2);
|
2612
|
+
}
|
2613
|
+
}
|
2614
|
+
|
2615
|
+
static size_t
|
2616
|
+
unicode_to_utf16le(char *p, size_t remaining, uint32_t uc)
|
2617
|
+
{
|
2618
|
+
char *utf16 = p;
|
2619
|
+
|
2620
|
+
if (uc > 0xffff) {
|
2621
|
+
/* We have a code point that won't fit into a
|
2622
|
+
* wchar_t; convert it to a surrogate pair. */
|
2623
|
+
if (remaining < 4)
|
2624
|
+
return (0);
|
2625
|
+
uc -= 0x10000;
|
2626
|
+
archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800);
|
2627
|
+
archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00);
|
2628
|
+
return (4);
|
2629
|
+
} else {
|
2630
|
+
if (remaining < 2)
|
2631
|
+
return (0);
|
2632
|
+
archive_le16enc(utf16, uc);
|
2633
|
+
return (2);
|
2634
|
+
}
|
2635
|
+
}
|
2636
|
+
|
2637
|
+
/*
|
2638
|
+
* Copy UTF-8 string in checking surrogate pair.
|
2639
|
+
* If any surrogate pair are found, it would be canonicalized.
|
2640
|
+
*/
|
2641
|
+
static int
|
2642
|
+
strncat_from_utf8_to_utf8(struct archive_string *as, const void *_p,
|
2643
|
+
size_t len, struct archive_string_conv *sc)
|
2644
|
+
{
|
2645
|
+
const char *s;
|
2646
|
+
char *p, *endp;
|
2647
|
+
int n, ret = 0;
|
2648
|
+
|
2649
|
+
(void)sc; /* UNUSED */
|
2650
|
+
|
2651
|
+
if (archive_string_ensure(as, as->length + len + 1) == NULL)
|
2652
|
+
return (-1);
|
2653
|
+
|
2654
|
+
s = (const char *)_p;
|
2655
|
+
p = as->s + as->length;
|
2656
|
+
endp = as->s + as->buffer_length -1;
|
2657
|
+
do {
|
2658
|
+
uint32_t uc;
|
2659
|
+
const char *ss = s;
|
2660
|
+
size_t w;
|
2661
|
+
|
2662
|
+
/*
|
2663
|
+
* Forward byte sequence until a conversion of that is needed.
|
2664
|
+
*/
|
2665
|
+
while ((n = utf8_to_unicode(&uc, s, len)) > 0) {
|
2666
|
+
s += n;
|
2667
|
+
len -= n;
|
2668
|
+
}
|
2669
|
+
if (ss < s) {
|
2670
|
+
if (p + (s - ss) > endp) {
|
2671
|
+
as->length = p - as->s;
|
2672
|
+
if (archive_string_ensure(as,
|
2673
|
+
as->buffer_length + len + 1) == NULL)
|
2674
|
+
return (-1);
|
2675
|
+
p = as->s + as->length;
|
2676
|
+
endp = as->s + as->buffer_length -1;
|
2677
|
+
}
|
2678
|
+
|
2679
|
+
memcpy(p, ss, s - ss);
|
2680
|
+
p += s - ss;
|
2681
|
+
}
|
2682
|
+
|
2683
|
+
/*
|
2684
|
+
* If n is negative, current byte sequence needs a replacement.
|
2685
|
+
*/
|
2686
|
+
if (n < 0) {
|
2687
|
+
if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) {
|
2688
|
+
/* Current byte sequence may be CESU-8. */
|
2689
|
+
n = cesu8_to_unicode(&uc, s, len);
|
2690
|
+
}
|
2691
|
+
if (n < 0) {
|
2692
|
+
ret = -1;
|
2693
|
+
n *= -1;/* Use a replaced unicode character. */
|
2694
|
+
}
|
2695
|
+
|
2696
|
+
/* Rebuild UTF-8 byte sequence. */
|
2697
|
+
while ((w = unicode_to_utf8(p, endp - p, uc)) == 0) {
|
2698
|
+
as->length = p - as->s;
|
2699
|
+
if (archive_string_ensure(as,
|
2700
|
+
as->buffer_length + len + 1) == NULL)
|
2701
|
+
return (-1);
|
2702
|
+
p = as->s + as->length;
|
2703
|
+
endp = as->s + as->buffer_length -1;
|
2704
|
+
}
|
2705
|
+
p += w;
|
2706
|
+
s += n;
|
2707
|
+
len -= n;
|
2708
|
+
}
|
2709
|
+
} while (n > 0);
|
2710
|
+
as->length = p - as->s;
|
2711
|
+
as->s[as->length] = '\0';
|
2712
|
+
return (ret);
|
2713
|
+
}
|
2714
|
+
|
2715
|
+
static int
|
2716
|
+
archive_string_append_unicode(struct archive_string *as, const void *_p,
|
2717
|
+
size_t len, struct archive_string_conv *sc)
|
2718
|
+
{
|
2719
|
+
const char *s;
|
2720
|
+
char *p, *endp;
|
2721
|
+
uint32_t uc;
|
2722
|
+
size_t w;
|
2723
|
+
int n, ret = 0, ts, tm;
|
2724
|
+
int (*parse)(uint32_t *, const char *, size_t);
|
2725
|
+
size_t (*unparse)(char *, size_t, uint32_t);
|
2726
|
+
|
2727
|
+
if (sc->flag & SCONV_TO_UTF16BE) {
|
2728
|
+
unparse = unicode_to_utf16be;
|
2729
|
+
ts = 2;
|
2730
|
+
} else if (sc->flag & SCONV_TO_UTF16LE) {
|
2731
|
+
unparse = unicode_to_utf16le;
|
2732
|
+
ts = 2;
|
2733
|
+
} else if (sc->flag & SCONV_TO_UTF8) {
|
2734
|
+
unparse = unicode_to_utf8;
|
2735
|
+
ts = 1;
|
2736
|
+
} else {
|
2737
|
+
/*
|
2738
|
+
* This case is going to be converted to another
|
2739
|
+
* character-set through iconv.
|
2740
|
+
*/
|
2741
|
+
if (sc->flag & SCONV_FROM_UTF16BE) {
|
2742
|
+
unparse = unicode_to_utf16be;
|
2743
|
+
ts = 2;
|
2744
|
+
} else if (sc->flag & SCONV_FROM_UTF16LE) {
|
2745
|
+
unparse = unicode_to_utf16le;
|
2746
|
+
ts = 2;
|
2747
|
+
} else {
|
2748
|
+
unparse = unicode_to_utf8;
|
2749
|
+
ts = 1;
|
2750
|
+
}
|
2751
|
+
}
|
2752
|
+
|
2753
|
+
if (sc->flag & SCONV_FROM_UTF16BE) {
|
2754
|
+
parse = utf16be_to_unicode;
|
2755
|
+
tm = 1;
|
2756
|
+
} else if (sc->flag & SCONV_FROM_UTF16LE) {
|
2757
|
+
parse = utf16le_to_unicode;
|
2758
|
+
tm = 1;
|
2759
|
+
} else {
|
2760
|
+
parse = cesu8_to_unicode;
|
2761
|
+
tm = ts;
|
2762
|
+
}
|
2763
|
+
|
2764
|
+
if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
|
2765
|
+
return (-1);
|
2766
|
+
|
2767
|
+
s = (const char *)_p;
|
2768
|
+
p = as->s + as->length;
|
2769
|
+
endp = as->s + as->buffer_length - ts;
|
2770
|
+
while ((n = parse(&uc, s, len)) != 0) {
|
2771
|
+
if (n < 0) {
|
2772
|
+
/* Use a replaced unicode character. */
|
2773
|
+
n *= -1;
|
2774
|
+
ret = -1;
|
2775
|
+
}
|
2776
|
+
s += n;
|
2777
|
+
len -= n;
|
2778
|
+
while ((w = unparse(p, endp - p, uc)) == 0) {
|
2779
|
+
/* There is not enough output buffer so
|
2780
|
+
* we have to expand it. */
|
2781
|
+
as->length = p - as->s;
|
2782
|
+
if (archive_string_ensure(as,
|
2783
|
+
as->buffer_length + len * tm + ts) == NULL)
|
2784
|
+
return (-1);
|
2785
|
+
p = as->s + as->length;
|
2786
|
+
endp = as->s + as->buffer_length - ts;
|
2787
|
+
}
|
2788
|
+
p += w;
|
2789
|
+
}
|
2790
|
+
as->length = p - as->s;
|
2791
|
+
as->s[as->length] = '\0';
|
2792
|
+
if (ts == 2)
|
2793
|
+
as->s[as->length+1] = '\0';
|
2794
|
+
return (ret);
|
2795
|
+
}
|
2796
|
+
|
2797
|
+
/*
 * Constants for Hangul composition.  This information comes from
 * Unicode Standard Annex #15, http://unicode.org/reports/tr15/
 */
#define HC_SBASE	0xAC00	/* Base of the composed syllables. */
#define HC_LBASE	0x1100	/* Base of the L code points. */
#define HC_VBASE	0x1161	/* Base of the V code points. */
#define HC_TBASE	0x11A7	/* Base of the T code points. */
#define HC_LCOUNT	19
#define HC_VCOUNT	21
#define HC_TCOUNT	28
#define HC_NCOUNT	(HC_VCOUNT * HC_TCOUNT)
#define HC_SCOUNT	(HC_LCOUNT * HC_NCOUNT)
|
2810
|
+
|
2811
|
+
static uint32_t
|
2812
|
+
get_nfc(uint32_t uc, uint32_t uc2)
|
2813
|
+
{
|
2814
|
+
int t, b;
|
2815
|
+
|
2816
|
+
t = 0;
|
2817
|
+
b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1;
|
2818
|
+
while (b >= t) {
|
2819
|
+
int m = (t + b) / 2;
|
2820
|
+
if (u_composition_table[m].cp1 < uc)
|
2821
|
+
t = m + 1;
|
2822
|
+
else if (u_composition_table[m].cp1 > uc)
|
2823
|
+
b = m - 1;
|
2824
|
+
else if (u_composition_table[m].cp2 < uc2)
|
2825
|
+
t = m + 1;
|
2826
|
+
else if (u_composition_table[m].cp2 > uc2)
|
2827
|
+
b = m - 1;
|
2828
|
+
else
|
2829
|
+
return (u_composition_table[m].nfc);
|
2830
|
+
}
|
2831
|
+
return (0);
|
2832
|
+
}
|
2833
|
+
|
2834
|
+
/* The maximum number of Following Decomposable Characters. */
#define FDC_MAX		10
|
2836
|
+
|
2837
|
+
/*
 * Update first code point.
 *
 * Stores `new_uc' in the caller's `uc' and clears `ucptr', which makes
 * WRITE_UC() re-encode the code point rather than copy source bytes.
 * The argument is parenthesized so that any expression can be passed.
 */
#define UPDATE_UC(new_uc)	do {		\
	uc = (new_uc);				\
	ucptr = NULL;				\
} while (0)
|
2844
|
+
|
2845
|
+
/*
 * Replace first code point with second code point: the caller's
 * (uc, ucptr, n) take the values of (uc2, uc2ptr, n2).
 */
#define REPLACE_UC_WITH_UC2()			\
do {						\
	n = n2;					\
	ucptr = uc2ptr;				\
	uc = uc2;				\
} while (0)
|
2853
|
+
|
2854
|
+
/*
 * Enlarge the output buffer of `as' and recompute the write pointers.
 * Uses the caller's locals as, p, endp, len, tm and ts, and makes the
 * enclosing function return -1 when archive_string_ensure() fails.
 */
#define EXPAND_BUFFER()						\
do {								\
	/* Record what has been written so far. */		\
	as->length = p - as->s;					\
	if (archive_string_ensure(as,				\
	    as->buffer_length + len * tm + ts) == NULL)		\
		return (-1);					\
	/* Recompute both pointers from as->s after the call. */\
	p = as->s + as->length;					\
	endp = as->s + as->buffer_length - ts;			\
} while (0)
|
2862
|
+
|
2863
|
+
/*
 * Encode `uc' at `p' with the caller's unparse() function, expanding the
 * buffer for as long as unparse() reports that it does not fit (returns
 * 0), then advance `p' by the number of bytes written.
 *
 * EXPAND_BUFFER() reassigns the caller's locals named p and endp, so
 * those are the variables that must be passed here.  All arguments are
 * parenthesized in the expansion.
 */
#define UNPARSE(p, endp, uc)	do {				\
	while ((w = unparse((p), (endp) - (p), (uc))) == 0) {	\
		EXPAND_BUFFER();				\
	}							\
	(p) += w;						\
} while (0)
|
2869
|
+
|
2870
|
+
/*
 * Write first code point.
 * If the code point has not been changed from its original code (ucptr
 * is set), this just copies its n bytes (1 to 4) from the original
 * buffer and clears ucptr.
 * If not, this re-encodes uc with UNPARSE() and copies that.
 */
#define WRITE_UC()	do {					\
	if (ucptr == NULL) {					\
		UNPARSE(p, endp, uc);				\
	} else {						\
		if (p + n > endp)				\
			EXPAND_BUFFER();			\
		if (n >= 1 && n <= 4) {				\
			int _k;					\
			for (_k = 0; _k < n; _k++)		\
				*p++ = ucptr[_k];		\
		}						\
		ucptr = NULL;					\
	}							\
} while (0)
|
2899
|
+
|
2900
|
+
/*
 * Collect following decomposable code points.
 *
 * Starting at index `start', parses further code points from the input
 * into ucx[] (and their CCC() values into ccx[]) until one of:
 *  - parse() returns a value <= 0,
 *  - the class test `cl >= cx && cl != 228 && cx != 228' stops it, or
 *  - FDC_MAX entries are in use, in which case ret is set to -1.
 * On exit ucx_size holds the number of entries in use.
 *
 * Uses the caller's locals parse, s, len, nx, cx, cl, ucx, ccx, ret and
 * ucx_size.  The `start' argument is parenthesized so any expression can
 * be passed.
 */
#define COLLECT_CPS(start)	do {				\
	int _i;							\
	for (_i = (start); _i < FDC_MAX ; _i++) {		\
		nx = parse(&ucx[_i], s, len);			\
		if (nx <= 0)					\
			break;					\
		cx = CCC(ucx[_i]);				\
		if (cl >= cx && cl != 228 && cx != 228)		\
			break;					\
		s += nx;					\
		len -= nx;					\
		cl = cx;					\
		ccx[_i] = cx;					\
	}							\
	if (_i >= FDC_MAX) {					\
		ret = -1;					\
		ucx_size = FDC_MAX;				\
	} else							\
		ucx_size = _i;					\
} while (0)
|
2923
|
+
|
2924
|
+
/*
|
2925
|
+
* Normalize UTF-8/UTF-16BE characters to Form C and copy the result.
|
2926
|
+
*
|
2927
|
+
* TODO: Convert composition exclusions, which are never converted
|
2928
|
+
* from NFC,NFD,NFKC and NFKD, to Form C.
|
2929
|
+
*/
|
2930
|
+
static int
|
2931
|
+
archive_string_normalize_C(struct archive_string *as, const void *_p,
|
2932
|
+
size_t len, struct archive_string_conv *sc)
|
2933
|
+
{
|
2934
|
+
const char *s = (const char *)_p;
|
2935
|
+
char *p, *endp;
|
2936
|
+
uint32_t uc, uc2;
|
2937
|
+
size_t w;
|
2938
|
+
int always_replace, n, n2, ret = 0, spair, ts, tm;
|
2939
|
+
int (*parse)(uint32_t *, const char *, size_t);
|
2940
|
+
size_t (*unparse)(char *, size_t, uint32_t);
|
2941
|
+
|
2942
|
+
always_replace = 1;
|
2943
|
+
ts = 1;/* text size. */
|
2944
|
+
if (sc->flag & SCONV_TO_UTF16BE) {
|
2945
|
+
unparse = unicode_to_utf16be;
|
2946
|
+
ts = 2;
|
2947
|
+
if (sc->flag & SCONV_FROM_UTF16BE)
|
2948
|
+
always_replace = 0;
|
2949
|
+
} else if (sc->flag & SCONV_TO_UTF16LE) {
|
2950
|
+
unparse = unicode_to_utf16le;
|
2951
|
+
ts = 2;
|
2952
|
+
if (sc->flag & SCONV_FROM_UTF16LE)
|
2953
|
+
always_replace = 0;
|
2954
|
+
} else if (sc->flag & SCONV_TO_UTF8) {
|
2955
|
+
unparse = unicode_to_utf8;
|
2956
|
+
if (sc->flag & SCONV_FROM_UTF8)
|
2957
|
+
always_replace = 0;
|
2958
|
+
} else {
|
2959
|
+
/*
|
2960
|
+
* This case is going to be converted to another
|
2961
|
+
* character-set through iconv.
|
2962
|
+
*/
|
2963
|
+
always_replace = 0;
|
2964
|
+
if (sc->flag & SCONV_FROM_UTF16BE) {
|
2965
|
+
unparse = unicode_to_utf16be;
|
2966
|
+
ts = 2;
|
2967
|
+
} else if (sc->flag & SCONV_FROM_UTF16LE) {
|
2968
|
+
unparse = unicode_to_utf16le;
|
2969
|
+
ts = 2;
|
2970
|
+
} else {
|
2971
|
+
unparse = unicode_to_utf8;
|
2972
|
+
}
|
2973
|
+
}
|
2974
|
+
|
2975
|
+
if (sc->flag & SCONV_FROM_UTF16BE) {
|
2976
|
+
parse = utf16be_to_unicode;
|
2977
|
+
tm = 1;
|
2978
|
+
spair = 4;/* surrogate pair size in UTF-16. */
|
2979
|
+
} else if (sc->flag & SCONV_FROM_UTF16LE) {
|
2980
|
+
parse = utf16le_to_unicode;
|
2981
|
+
tm = 1;
|
2982
|
+
spair = 4;/* surrogate pair size in UTF-16. */
|
2983
|
+
} else {
|
2984
|
+
parse = cesu8_to_unicode;
|
2985
|
+
tm = ts;
|
2986
|
+
spair = 6;/* surrogate pair size in UTF-8. */
|
2987
|
+
}
|
2988
|
+
|
2989
|
+
if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
|
2990
|
+
return (-1);
|
2991
|
+
|
2992
|
+
p = as->s + as->length;
|
2993
|
+
endp = as->s + as->buffer_length - ts;
|
2994
|
+
while ((n = parse(&uc, s, len)) != 0) {
|
2995
|
+
const char *ucptr, *uc2ptr;
|
2996
|
+
|
2997
|
+
if (n < 0) {
|
2998
|
+
/* Use a replaced unicode character. */
|
2999
|
+
UNPARSE(p, endp, uc);
|
3000
|
+
s += n*-1;
|
3001
|
+
len -= n*-1;
|
3002
|
+
ret = -1;
|
3003
|
+
continue;
|
3004
|
+
} else if (n == spair || always_replace)
|
3005
|
+
/* uc is converted from a surrogate pair.
|
3006
|
+
* this should be treated as a changed code. */
|
3007
|
+
ucptr = NULL;
|
3008
|
+
else
|
3009
|
+
ucptr = s;
|
3010
|
+
s += n;
|
3011
|
+
len -= n;
|
3012
|
+
|
3013
|
+
/* Read second code point. */
|
3014
|
+
while ((n2 = parse(&uc2, s, len)) > 0) {
|
3015
|
+
uint32_t ucx[FDC_MAX];
|
3016
|
+
int ccx[FDC_MAX];
|
3017
|
+
int cl, cx, i, nx, ucx_size;
|
3018
|
+
int LIndex,SIndex;
|
3019
|
+
uint32_t nfc;
|
3020
|
+
|
3021
|
+
if (n2 == spair || always_replace)
|
3022
|
+
/* uc2 is converted from a surrogate pair.
|
3023
|
+
* this should be treated as a changed code. */
|
3024
|
+
uc2ptr = NULL;
|
3025
|
+
else
|
3026
|
+
uc2ptr = s;
|
3027
|
+
s += n2;
|
3028
|
+
len -= n2;
|
3029
|
+
|
3030
|
+
/*
|
3031
|
+
* If current second code point is out of decomposable
|
3032
|
+
* code points, finding compositions is unneeded.
|
3033
|
+
*/
|
3034
|
+
if (!IS_DECOMPOSABLE_BLOCK(uc2)) {
|
3035
|
+
WRITE_UC();
|
3036
|
+
REPLACE_UC_WITH_UC2();
|
3037
|
+
continue;
|
3038
|
+
}
|
3039
|
+
|
3040
|
+
/*
|
3041
|
+
* Try to combine current code points.
|
3042
|
+
*/
|
3043
|
+
/*
|
3044
|
+
* We have to combine Hangul characters according to
|
3045
|
+
* http://unicode.org/reports/tr15/#Hangul
|
3046
|
+
*/
|
3047
|
+
if (0 <= (LIndex = uc - HC_LBASE) &&
|
3048
|
+
LIndex < HC_LCOUNT) {
|
3049
|
+
/*
|
3050
|
+
* Hangul Composition.
|
3051
|
+
* 1. Two current code points are L and V.
|
3052
|
+
*/
|
3053
|
+
int VIndex = uc2 - HC_VBASE;
|
3054
|
+
if (0 <= VIndex && VIndex < HC_VCOUNT) {
|
3055
|
+
/* Make syllable of form LV. */
|
3056
|
+
UPDATE_UC(HC_SBASE +
|
3057
|
+
(LIndex * HC_VCOUNT + VIndex) *
|
3058
|
+
HC_TCOUNT);
|
3059
|
+
} else {
|
3060
|
+
WRITE_UC();
|
3061
|
+
REPLACE_UC_WITH_UC2();
|
3062
|
+
}
|
3063
|
+
continue;
|
3064
|
+
} else if (0 <= (SIndex = uc - HC_SBASE) &&
|
3065
|
+
SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) {
|
3066
|
+
/*
|
3067
|
+
* Hangul Composition.
|
3068
|
+
* 2. Two current code points are LV and T.
|
3069
|
+
*/
|
3070
|
+
int TIndex = uc2 - HC_TBASE;
|
3071
|
+
if (0 < TIndex && TIndex < HC_TCOUNT) {
|
3072
|
+
/* Make syllable of form LVT. */
|
3073
|
+
UPDATE_UC(uc + TIndex);
|
3074
|
+
} else {
|
3075
|
+
WRITE_UC();
|
3076
|
+
REPLACE_UC_WITH_UC2();
|
3077
|
+
}
|
3078
|
+
continue;
|
3079
|
+
} else if ((nfc = get_nfc(uc, uc2)) != 0) {
|
3080
|
+
/* A composition to current code points
|
3081
|
+
* is found. */
|
3082
|
+
UPDATE_UC(nfc);
|
3083
|
+
continue;
|
3084
|
+
} else if ((cl = CCC(uc2)) == 0) {
|
3085
|
+
/* Clearly 'uc2' the second code point is not
|
3086
|
+
* a decomposable code. */
|
3087
|
+
WRITE_UC();
|
3088
|
+
REPLACE_UC_WITH_UC2();
|
3089
|
+
continue;
|
3090
|
+
}
|
3091
|
+
|
3092
|
+
/*
|
3093
|
+
* Collect following decomposable code points.
|
3094
|
+
*/
|
3095
|
+
cx = 0;
|
3096
|
+
ucx[0] = uc2;
|
3097
|
+
ccx[0] = cl;
|
3098
|
+
COLLECT_CPS(1);
|
3099
|
+
|
3100
|
+
/*
|
3101
|
+
* Find a composed code in the collected code points.
|
3102
|
+
*/
|
3103
|
+
i = 1;
|
3104
|
+
while (i < ucx_size) {
|
3105
|
+
int j;
|
3106
|
+
|
3107
|
+
if ((nfc = get_nfc(uc, ucx[i])) == 0) {
|
3108
|
+
i++;
|
3109
|
+
continue;
|
3110
|
+
}
|
3111
|
+
|
3112
|
+
/*
|
3113
|
+
* nfc is composed of uc and ucx[i].
|
3114
|
+
*/
|
3115
|
+
UPDATE_UC(nfc);
|
3116
|
+
|
3117
|
+
/*
|
3118
|
+
* Remove ucx[i] by shifting
|
3119
|
+
* following code points.
|
3120
|
+
*/
|
3121
|
+
for (j = i; j+1 < ucx_size; j++) {
|
3122
|
+
ucx[j] = ucx[j+1];
|
3123
|
+
ccx[j] = ccx[j+1];
|
3124
|
+
}
|
3125
|
+
ucx_size --;
|
3126
|
+
|
3127
|
+
/*
|
3128
|
+
* Collect following code points blocked
|
3129
|
+
* by ucx[i] the removed code point.
|
3130
|
+
*/
|
3131
|
+
if (ucx_size > 0 && i == ucx_size &&
|
3132
|
+
nx > 0 && cx == cl) {
|
3133
|
+
cl = ccx[ucx_size-1];
|
3134
|
+
COLLECT_CPS(ucx_size);
|
3135
|
+
}
|
3136
|
+
/*
|
3137
|
+
* Restart finding a composed code with
|
3138
|
+
* the updated uc from the top of the
|
3139
|
+
* collected code points.
|
3140
|
+
*/
|
3141
|
+
i = 0;
|
3142
|
+
}
|
3143
|
+
|
3144
|
+
/*
|
3145
|
+
* Apparently the current code points are not
|
3146
|
+
* decomposed characters or already composed.
|
3147
|
+
*/
|
3148
|
+
WRITE_UC();
|
3149
|
+
for (i = 0; i < ucx_size; i++)
|
3150
|
+
UNPARSE(p, endp, ucx[i]);
|
3151
|
+
|
3152
|
+
/*
|
3153
|
+
* Flush out remaining canonical combining characters.
|
3154
|
+
*/
|
3155
|
+
if (nx > 0 && cx == cl && len > 0) {
|
3156
|
+
while ((nx = parse(&ucx[0], s, len))
|
3157
|
+
> 0) {
|
3158
|
+
cx = CCC(ucx[0]);
|
3159
|
+
if (cl > cx)
|
3160
|
+
break;
|
3161
|
+
s += nx;
|
3162
|
+
len -= nx;
|
3163
|
+
cl = cx;
|
3164
|
+
UNPARSE(p, endp, ucx[0]);
|
3165
|
+
}
|
3166
|
+
}
|
3167
|
+
break;
|
3168
|
+
}
|
3169
|
+
if (n2 < 0) {
|
3170
|
+
WRITE_UC();
|
3171
|
+
/* Use a replaced unicode character. */
|
3172
|
+
UNPARSE(p, endp, uc2);
|
3173
|
+
s += n2*-1;
|
3174
|
+
len -= n2*-1;
|
3175
|
+
ret = -1;
|
3176
|
+
continue;
|
3177
|
+
} else if (n2 == 0) {
|
3178
|
+
WRITE_UC();
|
3179
|
+
break;
|
3180
|
+
}
|
3181
|
+
}
|
3182
|
+
as->length = p - as->s;
|
3183
|
+
as->s[as->length] = '\0';
|
3184
|
+
if (ts == 2)
|
3185
|
+
as->s[as->length+1] = '\0';
|
3186
|
+
return (ret);
|
3187
|
+
}
|
3188
|
+
|
3189
|
+
static int
|
3190
|
+
get_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc)
|
3191
|
+
{
|
3192
|
+
int t, b;
|
3193
|
+
|
3194
|
+
/*
|
3195
|
+
* These are not converted to NFD on Mac OS.
|
3196
|
+
*/
|
3197
|
+
if ((uc >= 0x2000 && uc <= 0x2FFF) ||
|
3198
|
+
(uc >= 0xF900 && uc <= 0xFAFF) ||
|
3199
|
+
(uc >= 0x2F800 && uc <= 0x2FAFF))
|
3200
|
+
return (0);
|
3201
|
+
/*
|
3202
|
+
* Those code points are not converted to NFD on Mac OS.
|
3203
|
+
* I do not know the reason because it is undocumented.
|
3204
|
+
* NFC NFD
|
3205
|
+
* 1109A ==> 11099 110BA
|
3206
|
+
* 1109C ==> 1109B 110BA
|
3207
|
+
* 110AB ==> 110A5 110BA
|
3208
|
+
*/
|
3209
|
+
if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB)
|
3210
|
+
return (0);
|
3211
|
+
|
3212
|
+
t = 0;
|
3213
|
+
b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1;
|
3214
|
+
while (b >= t) {
|
3215
|
+
int m = (t + b) / 2;
|
3216
|
+
if (u_decomposition_table[m].nfc < uc)
|
3217
|
+
t = m + 1;
|
3218
|
+
else if (u_decomposition_table[m].nfc > uc)
|
3219
|
+
b = m - 1;
|
3220
|
+
else {
|
3221
|
+
*cp1 = u_decomposition_table[m].cp1;
|
3222
|
+
*cp2 = u_decomposition_table[m].cp2;
|
3223
|
+
return (1);
|
3224
|
+
}
|
3225
|
+
}
|
3226
|
+
return (0);
|
3227
|
+
}
|
3228
|
+
|
3229
|
+
/*
 * Replace the current code point `uc' with `cp' and clear `ucptr'.
 * Both `uc' and `ucptr' must be variables in the scope where the macro
 * is expanded.  The argument is parenthesized so that any expression can
 * be passed safely.
 */
#define REPLACE_UC_WITH(cp) do {		\
	uc = (cp);				\
	ucptr = NULL;				\
} while (0)
|
3233
|
+
|
3234
|
+
/*
|
3235
|
+
* Normalize UTF-8 characters to Form D and copy the result.
|
3236
|
+
*/
|
3237
|
+
static int
|
3238
|
+
archive_string_normalize_D(struct archive_string *as, const void *_p,
|
3239
|
+
size_t len, struct archive_string_conv *sc)
|
3240
|
+
{
|
3241
|
+
const char *s = (const char *)_p;
|
3242
|
+
char *p, *endp;
|
3243
|
+
uint32_t uc, uc2;
|
3244
|
+
size_t w;
|
3245
|
+
int always_replace, n, n2, ret = 0, spair, ts, tm;
|
3246
|
+
int (*parse)(uint32_t *, const char *, size_t);
|
3247
|
+
size_t (*unparse)(char *, size_t, uint32_t);
|
3248
|
+
|
3249
|
+
always_replace = 1;
|
3250
|
+
ts = 1;/* text size. */
|
3251
|
+
if (sc->flag & SCONV_TO_UTF16BE) {
|
3252
|
+
unparse = unicode_to_utf16be;
|
3253
|
+
ts = 2;
|
3254
|
+
if (sc->flag & SCONV_FROM_UTF16BE)
|
3255
|
+
always_replace = 0;
|
3256
|
+
} else if (sc->flag & SCONV_TO_UTF16LE) {
|
3257
|
+
unparse = unicode_to_utf16le;
|
3258
|
+
ts = 2;
|
3259
|
+
if (sc->flag & SCONV_FROM_UTF16LE)
|
3260
|
+
always_replace = 0;
|
3261
|
+
} else if (sc->flag & SCONV_TO_UTF8) {
|
3262
|
+
unparse = unicode_to_utf8;
|
3263
|
+
if (sc->flag & SCONV_FROM_UTF8)
|
3264
|
+
always_replace = 0;
|
3265
|
+
} else {
|
3266
|
+
/*
|
3267
|
+
* This case is going to be converted to another
|
3268
|
+
* character-set through iconv.
|
3269
|
+
*/
|
3270
|
+
always_replace = 0;
|
3271
|
+
if (sc->flag & SCONV_FROM_UTF16BE) {
|
3272
|
+
unparse = unicode_to_utf16be;
|
3273
|
+
ts = 2;
|
3274
|
+
} else if (sc->flag & SCONV_FROM_UTF16LE) {
|
3275
|
+
unparse = unicode_to_utf16le;
|
3276
|
+
ts = 2;
|
3277
|
+
} else {
|
3278
|
+
unparse = unicode_to_utf8;
|
3279
|
+
}
|
3280
|
+
}
|
3281
|
+
|
3282
|
+
if (sc->flag & SCONV_FROM_UTF16BE) {
|
3283
|
+
parse = utf16be_to_unicode;
|
3284
|
+
tm = 1;
|
3285
|
+
spair = 4;/* surrogate pair size in UTF-16. */
|
3286
|
+
} else if (sc->flag & SCONV_FROM_UTF16LE) {
|
3287
|
+
parse = utf16le_to_unicode;
|
3288
|
+
tm = 1;
|
3289
|
+
spair = 4;/* surrogate pair size in UTF-16. */
|
3290
|
+
} else {
|
3291
|
+
parse = cesu8_to_unicode;
|
3292
|
+
tm = ts;
|
3293
|
+
spair = 6;/* surrogate pair size in UTF-8. */
|
3294
|
+
}
|
3295
|
+
|
3296
|
+
if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
|
3297
|
+
return (-1);
|
3298
|
+
|
3299
|
+
p = as->s + as->length;
|
3300
|
+
endp = as->s + as->buffer_length - ts;
|
3301
|
+
while ((n = parse(&uc, s, len)) != 0) {
|
3302
|
+
const char *ucptr;
|
3303
|
+
uint32_t cp1, cp2;
|
3304
|
+
int SIndex;
|
3305
|
+
struct {
|
3306
|
+
uint32_t uc;
|
3307
|
+
int ccc;
|
3308
|
+
} fdc[FDC_MAX];
|
3309
|
+
int fdi, fdj;
|
3310
|
+
int ccc;
|
3311
|
+
|
3312
|
+
check_first_code:
|
3313
|
+
if (n < 0) {
|
3314
|
+
/* Use a replaced unicode character. */
|
3315
|
+
UNPARSE(p, endp, uc);
|
3316
|
+
s += n*-1;
|
3317
|
+
len -= n*-1;
|
3318
|
+
ret = -1;
|
3319
|
+
continue;
|
3320
|
+
} else if (n == spair || always_replace)
|
3321
|
+
/* uc is converted from a surrogate pair.
|
3322
|
+
* this should be treated as a changed code. */
|
3323
|
+
ucptr = NULL;
|
3324
|
+
else
|
3325
|
+
ucptr = s;
|
3326
|
+
s += n;
|
3327
|
+
len -= n;
|
3328
|
+
|
3329
|
+
/* Hangul Decomposition. */
|
3330
|
+
if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) {
|
3331
|
+
int L = HC_LBASE + SIndex / HC_NCOUNT;
|
3332
|
+
int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT;
|
3333
|
+
int T = HC_TBASE + SIndex % HC_TCOUNT;
|
3334
|
+
|
3335
|
+
REPLACE_UC_WITH(L);
|
3336
|
+
WRITE_UC();
|
3337
|
+
REPLACE_UC_WITH(V);
|
3338
|
+
WRITE_UC();
|
3339
|
+
if (T != HC_TBASE) {
|
3340
|
+
REPLACE_UC_WITH(T);
|
3341
|
+
WRITE_UC();
|
3342
|
+
}
|
3343
|
+
continue;
|
3344
|
+
}
|
3345
|
+
if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) {
|
3346
|
+
WRITE_UC();
|
3347
|
+
continue;
|
3348
|
+
}
|
3349
|
+
|
3350
|
+
fdi = 0;
|
3351
|
+
while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) {
|
3352
|
+
int k;
|
3353
|
+
|
3354
|
+
for (k = fdi; k > 0; k--)
|
3355
|
+
fdc[k] = fdc[k-1];
|
3356
|
+
fdc[0].ccc = CCC(cp2);
|
3357
|
+
fdc[0].uc = cp2;
|
3358
|
+
fdi++;
|
3359
|
+
REPLACE_UC_WITH(cp1);
|
3360
|
+
}
|
3361
|
+
|
3362
|
+
/* Read following code points. */
|
3363
|
+
while ((n2 = parse(&uc2, s, len)) > 0 &&
|
3364
|
+
(ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) {
|
3365
|
+
int j, k;
|
3366
|
+
|
3367
|
+
s += n2;
|
3368
|
+
len -= n2;
|
3369
|
+
for (j = 0; j < fdi; j++) {
|
3370
|
+
if (fdc[j].ccc > ccc)
|
3371
|
+
break;
|
3372
|
+
}
|
3373
|
+
if (j < fdi) {
|
3374
|
+
for (k = fdi; k > j; k--)
|
3375
|
+
fdc[k] = fdc[k-1];
|
3376
|
+
fdc[j].ccc = ccc;
|
3377
|
+
fdc[j].uc = uc2;
|
3378
|
+
} else {
|
3379
|
+
fdc[fdi].ccc = ccc;
|
3380
|
+
fdc[fdi].uc = uc2;
|
3381
|
+
}
|
3382
|
+
fdi++;
|
3383
|
+
}
|
3384
|
+
|
3385
|
+
WRITE_UC();
|
3386
|
+
for (fdj = 0; fdj < fdi; fdj++) {
|
3387
|
+
REPLACE_UC_WITH(fdc[fdj].uc);
|
3388
|
+
WRITE_UC();
|
3389
|
+
}
|
3390
|
+
|
3391
|
+
if (n2 == 0)
|
3392
|
+
break;
|
3393
|
+
REPLACE_UC_WITH(uc2);
|
3394
|
+
n = n2;
|
3395
|
+
goto check_first_code;
|
3396
|
+
}
|
3397
|
+
as->length = p - as->s;
|
3398
|
+
as->s[as->length] = '\0';
|
3399
|
+
if (ts == 2)
|
3400
|
+
as->s[as->length+1] = '\0';
|
3401
|
+
return (ret);
|
3402
|
+
}
|
3403
|
+
|
3404
|
+
/*
|
3405
|
+
* libarchive 2.x made incorrect UTF-8 strings in the wrong assumption
|
3406
|
+
* that WCS is Unicode. It is true for several platforms but some are false.
|
3407
|
+
* And then people who did not use UTF-8 locale on the non Unicode WCS
|
3408
|
+
* platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those
|
3409
|
+
* now cannot get right filename from libarchive 3.x and later since we
|
3410
|
+
* fixed the wrong assumption and it is incompatible to older its versions.
|
3411
|
+
* So we provide special option, "compat-2x.x", for resolving it.
|
3412
|
+
* That option enable the string conversion of libarchive 2.x.
|
3413
|
+
*
|
3414
|
+
* Translates the wrong UTF-8 string made by libarchive 2.x into current
|
3415
|
+
* locale character set and appends to the archive_string.
|
3416
|
+
* Note: returns -1 if conversion fails.
|
3417
|
+
*/
|
3418
|
+
static int
|
3419
|
+
strncat_from_utf8_libarchive2(struct archive_string *as,
|
3420
|
+
const void *_p, size_t len, struct archive_string_conv *sc)
|
3421
|
+
{
|
3422
|
+
const char *s;
|
3423
|
+
int n;
|
3424
|
+
char *p;
|
3425
|
+
char *end;
|
3426
|
+
uint32_t unicode;
|
3427
|
+
#if HAVE_WCRTOMB
|
3428
|
+
mbstate_t shift_state;
|
3429
|
+
|
3430
|
+
memset(&shift_state, 0, sizeof(shift_state));
|
3431
|
+
#else
|
3432
|
+
/* Clear the shift state before starting. */
|
3433
|
+
wctomb(NULL, L'\0');
|
3434
|
+
#endif
|
3435
|
+
(void)sc; /* UNUSED */
|
3436
|
+
/*
|
3437
|
+
* Allocate buffer for MBS.
|
3438
|
+
* We need this allocation here since it is possible that
|
3439
|
+
* as->s is still NULL.
|
3440
|
+
*/
|
3441
|
+
if (archive_string_ensure(as, as->length + len + 1) == NULL)
|
3442
|
+
return (-1);
|
3443
|
+
|
3444
|
+
s = (const char *)_p;
|
3445
|
+
p = as->s + as->length;
|
3446
|
+
end = as->s + as->buffer_length - MB_CUR_MAX -1;
|
3447
|
+
while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) {
|
3448
|
+
wchar_t wc;
|
3449
|
+
|
3450
|
+
if (p >= end) {
|
3451
|
+
as->length = p - as->s;
|
3452
|
+
/* Re-allocate buffer for MBS. */
|
3453
|
+
if (archive_string_ensure(as,
|
3454
|
+
as->length + max(len * 2,
|
3455
|
+
(size_t)MB_CUR_MAX) + 1) == NULL)
|
3456
|
+
return (-1);
|
3457
|
+
p = as->s + as->length;
|
3458
|
+
end = as->s + as->buffer_length - MB_CUR_MAX -1;
|
3459
|
+
}
|
3460
|
+
|
3461
|
+
/*
|
3462
|
+
* As libarchive 2.x, translates the UTF-8 characters into
|
3463
|
+
* wide-characters in the assumption that WCS is Unicode.
|
3464
|
+
*/
|
3465
|
+
if (n < 0) {
|
3466
|
+
n *= -1;
|
3467
|
+
wc = L'?';
|
3468
|
+
} else
|
3469
|
+
wc = (wchar_t)unicode;
|
3470
|
+
|
3471
|
+
s += n;
|
3472
|
+
len -= n;
|
3473
|
+
/*
|
3474
|
+
* Translates the wide-character into the current locale MBS.
|
3475
|
+
*/
|
3476
|
+
#if HAVE_WCRTOMB
|
3477
|
+
n = (int)wcrtomb(p, wc, &shift_state);
|
3478
|
+
#else
|
3479
|
+
n = (int)wctomb(p, wc);
|
3480
|
+
#endif
|
3481
|
+
if (n == -1)
|
3482
|
+
return (-1);
|
3483
|
+
p += n;
|
3484
|
+
}
|
3485
|
+
as->length = p - as->s;
|
3486
|
+
as->s[as->length] = '\0';
|
3487
|
+
return (0);
|
3488
|
+
}
|
3489
|
+
|
3490
|
+
|
3491
|
+
/*
|
3492
|
+
* Conversion functions between current locale dependent MBS and UTF-16BE.
|
3493
|
+
* strncat_from_utf16be() : UTF-16BE --> MBS
|
3494
|
+
* strncat_to_utf16be() : MBS --> UTF16BE
|
3495
|
+
*/
|
3496
|
+
|
3497
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
3498
|
+
|
3499
|
+
/*
|
3500
|
+
* Convert a UTF-16BE/LE string to current locale and copy the result.
|
3501
|
+
* Return -1 if conversion fails.
|
3502
|
+
*/
|
3503
|
+
static int
|
3504
|
+
win_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes,
|
3505
|
+
struct archive_string_conv *sc, int be)
|
3506
|
+
{
|
3507
|
+
struct archive_string tmp;
|
3508
|
+
const char *u16;
|
3509
|
+
int ll;
|
3510
|
+
BOOL defchar;
|
3511
|
+
char *mbs;
|
3512
|
+
size_t mbs_size, b;
|
3513
|
+
int ret = 0;
|
3514
|
+
|
3515
|
+
bytes &= ~1;
|
3516
|
+
if (archive_string_ensure(as, as->length + bytes +1) == NULL)
|
3517
|
+
return (-1);
|
3518
|
+
|
3519
|
+
mbs = as->s + as->length;
|
3520
|
+
mbs_size = as->buffer_length - as->length -1;
|
3521
|
+
|
3522
|
+
if (sc->to_cp == CP_C_LOCALE) {
|
3523
|
+
/*
|
3524
|
+
* "C" locale special process.
|
3525
|
+
*/
|
3526
|
+
u16 = _p;
|
3527
|
+
ll = 0;
|
3528
|
+
for (b = 0; b < bytes; b += 2) {
|
3529
|
+
uint16_t val;
|
3530
|
+
if (be)
|
3531
|
+
val = archive_be16dec(u16+b);
|
3532
|
+
else
|
3533
|
+
val = archive_le16dec(u16+b);
|
3534
|
+
if (val > 255) {
|
3535
|
+
*mbs++ = '?';
|
3536
|
+
ret = -1;
|
3537
|
+
} else
|
3538
|
+
*mbs++ = (char)(val&0xff);
|
3539
|
+
ll++;
|
3540
|
+
}
|
3541
|
+
as->length += ll;
|
3542
|
+
as->s[as->length] = '\0';
|
3543
|
+
return (ret);
|
3544
|
+
}
|
3545
|
+
|
3546
|
+
archive_string_init(&tmp);
|
3547
|
+
if (be) {
|
3548
|
+
if (is_big_endian()) {
|
3549
|
+
u16 = _p;
|
3550
|
+
} else {
|
3551
|
+
if (archive_string_ensure(&tmp, bytes+2) == NULL)
|
3552
|
+
return (-1);
|
3553
|
+
memcpy(tmp.s, _p, bytes);
|
3554
|
+
for (b = 0; b < bytes; b += 2) {
|
3555
|
+
uint16_t val = archive_be16dec(tmp.s+b);
|
3556
|
+
archive_le16enc(tmp.s+b, val);
|
3557
|
+
}
|
3558
|
+
u16 = tmp.s;
|
3559
|
+
}
|
3560
|
+
} else {
|
3561
|
+
if (!is_big_endian()) {
|
3562
|
+
u16 = _p;
|
3563
|
+
} else {
|
3564
|
+
if (archive_string_ensure(&tmp, bytes+2) == NULL)
|
3565
|
+
return (-1);
|
3566
|
+
memcpy(tmp.s, _p, bytes);
|
3567
|
+
for (b = 0; b < bytes; b += 2) {
|
3568
|
+
uint16_t val = archive_le16dec(tmp.s+b);
|
3569
|
+
archive_be16enc(tmp.s+b, val);
|
3570
|
+
}
|
3571
|
+
u16 = tmp.s;
|
3572
|
+
}
|
3573
|
+
}
|
3574
|
+
|
3575
|
+
do {
|
3576
|
+
defchar = 0;
|
3577
|
+
ll = WideCharToMultiByte(sc->to_cp, 0,
|
3578
|
+
(LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size,
|
3579
|
+
NULL, &defchar);
|
3580
|
+
/* Exit loop if we succeeded */
|
3581
|
+
if (ll != 0 ||
|
3582
|
+
GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
|
3583
|
+
break;
|
3584
|
+
}
|
3585
|
+
/* Else expand buffer and loop to try again. */
|
3586
|
+
ll = WideCharToMultiByte(sc->to_cp, 0,
|
3587
|
+
(LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL);
|
3588
|
+
if (archive_string_ensure(as, ll +1) == NULL)
|
3589
|
+
return (-1);
|
3590
|
+
mbs = as->s + as->length;
|
3591
|
+
mbs_size = as->buffer_length - as->length -1;
|
3592
|
+
} while (1);
|
3593
|
+
archive_string_free(&tmp);
|
3594
|
+
as->length += ll;
|
3595
|
+
as->s[as->length] = '\0';
|
3596
|
+
if (ll == 0 || defchar)
|
3597
|
+
ret = -1;
|
3598
|
+
return (ret);
|
3599
|
+
}
|
3600
|
+
|
3601
|
+
/* UTF-16BE entry point: forwards to win_strncat_from_utf16() with be = 1. */
static int
win_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	int r;

	r = win_strncat_from_utf16(as, _p, bytes, sc, 1);
	return (r);
}
|
3607
|
+
|
3608
|
+
/* UTF-16LE entry point: forwards to win_strncat_from_utf16() with be = 0. */
static int
win_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	int r;

	r = win_strncat_from_utf16(as, _p, bytes, sc, 0);
	return (r);
}
|
3614
|
+
|
3615
|
+
/*
 * Return 1 if the in-memory representation of the 16-bit value 1 decodes
 * to 1 with archive_be16dec(), otherwise 0.
 */
static int
is_big_endian(void)
{
	uint16_t probe = 1;

	if (archive_be16dec(&probe) == 1)
		return (1);
	return (0);
}
|
3622
|
+
|
3623
|
+
/*
|
3624
|
+
* Convert a current locale string to UTF-16BE/LE and copy the result.
|
3625
|
+
* Return -1 if conversion fails.
|
3626
|
+
*/
|
3627
|
+
static int
|
3628
|
+
win_strncat_to_utf16(struct archive_string *as16, const void *_p,
|
3629
|
+
size_t length, struct archive_string_conv *sc, int bigendian)
|
3630
|
+
{
|
3631
|
+
const char *s = (const char *)_p;
|
3632
|
+
char *u16;
|
3633
|
+
size_t count, avail;
|
3634
|
+
|
3635
|
+
if (archive_string_ensure(as16,
|
3636
|
+
as16->length + (length + 1) * 2) == NULL)
|
3637
|
+
return (-1);
|
3638
|
+
|
3639
|
+
u16 = as16->s + as16->length;
|
3640
|
+
avail = as16->buffer_length - 2;
|
3641
|
+
if (sc->from_cp == CP_C_LOCALE) {
|
3642
|
+
/*
|
3643
|
+
* "C" locale special process.
|
3644
|
+
*/
|
3645
|
+
count = 0;
|
3646
|
+
while (count < length && *s) {
|
3647
|
+
if (bigendian)
|
3648
|
+
archive_be16enc(u16, *s);
|
3649
|
+
else
|
3650
|
+
archive_le16enc(u16, *s);
|
3651
|
+
u16 += 2;
|
3652
|
+
s++;
|
3653
|
+
count++;
|
3654
|
+
}
|
3655
|
+
as16->length += count << 1;
|
3656
|
+
as16->s[as16->length] = 0;
|
3657
|
+
as16->s[as16->length+1] = 0;
|
3658
|
+
return (0);
|
3659
|
+
}
|
3660
|
+
do {
|
3661
|
+
count = MultiByteToWideChar(sc->from_cp,
|
3662
|
+
MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1);
|
3663
|
+
/* Exit loop if we succeeded */
|
3664
|
+
if (count != 0 ||
|
3665
|
+
GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
|
3666
|
+
break;
|
3667
|
+
}
|
3668
|
+
/* Expand buffer and try again */
|
3669
|
+
count = MultiByteToWideChar(sc->from_cp,
|
3670
|
+
MB_PRECOMPOSED, s, (int)length, NULL, 0);
|
3671
|
+
if (archive_string_ensure(as16, (count +1) * 2)
|
3672
|
+
== NULL)
|
3673
|
+
return (-1);
|
3674
|
+
u16 = as16->s + as16->length;
|
3675
|
+
avail = as16->buffer_length - 2;
|
3676
|
+
} while (1);
|
3677
|
+
as16->length += count * 2;
|
3678
|
+
as16->s[as16->length] = 0;
|
3679
|
+
as16->s[as16->length+1] = 0;
|
3680
|
+
if (count == 0)
|
3681
|
+
return (-1);
|
3682
|
+
|
3683
|
+
if (is_big_endian()) {
|
3684
|
+
if (!bigendian) {
|
3685
|
+
while (count > 0) {
|
3686
|
+
uint16_t v = archive_be16dec(u16);
|
3687
|
+
archive_le16enc(u16, v);
|
3688
|
+
u16 += 2;
|
3689
|
+
count--;
|
3690
|
+
}
|
3691
|
+
}
|
3692
|
+
} else {
|
3693
|
+
if (bigendian) {
|
3694
|
+
while (count > 0) {
|
3695
|
+
uint16_t v = archive_le16dec(u16);
|
3696
|
+
archive_be16enc(u16, v);
|
3697
|
+
u16 += 2;
|
3698
|
+
count--;
|
3699
|
+
}
|
3700
|
+
}
|
3701
|
+
}
|
3702
|
+
return (0);
|
3703
|
+
}
|
3704
|
+
|
3705
|
+
/* UTF-16BE entry point: forwards to win_strncat_to_utf16() with bigendian = 1. */
static int
win_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	int r;

	r = win_strncat_to_utf16(as16, _p, length, sc, 1);
	return (r);
}
|
3711
|
+
|
3712
|
+
/* UTF-16LE entry point: forwards to win_strncat_to_utf16() with bigendian = 0. */
static int
win_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	int r;

	r = win_strncat_to_utf16(as16, _p, length, sc, 0);
	return (r);
}
|
3718
|
+
|
3719
|
+
#endif /* _WIN32 && !__CYGWIN__ */
|
3720
|
+
|
3721
|
+
/*
|
3722
|
+
* Do the best effort for conversions.
|
3723
|
+
* We cannot handle UTF-16BE character-set without such iconv,
|
3724
|
+
* but there is a chance if a string consists just ASCII code or
|
3725
|
+
* a current locale is UTF-8.
|
3726
|
+
*/
|
3727
|
+
|
3728
|
+
/*
|
3729
|
+
* Convert a UTF-16BE string to current locale and copy the result.
|
3730
|
+
* Return -1 if conversion fails.
|
3731
|
+
*/
|
3732
|
+
static int
|
3733
|
+
best_effort_strncat_from_utf16(struct archive_string *as, const void *_p,
|
3734
|
+
size_t bytes, struct archive_string_conv *sc, int be)
|
3735
|
+
{
|
3736
|
+
const char *utf16 = (const char *)_p;
|
3737
|
+
char *mbs;
|
3738
|
+
uint32_t uc;
|
3739
|
+
int n, ret;
|
3740
|
+
|
3741
|
+
(void)sc; /* UNUSED */
|
3742
|
+
/*
|
3743
|
+
* Other case, we should do the best effort.
|
3744
|
+
* If all character are ASCII(<0x7f), we can convert it.
|
3745
|
+
* if not , we set a alternative character and return -1.
|
3746
|
+
*/
|
3747
|
+
ret = 0;
|
3748
|
+
if (archive_string_ensure(as, as->length + bytes +1) == NULL)
|
3749
|
+
return (-1);
|
3750
|
+
mbs = as->s + as->length;
|
3751
|
+
|
3752
|
+
while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) {
|
3753
|
+
if (n < 0) {
|
3754
|
+
n *= -1;
|
3755
|
+
ret = -1;
|
3756
|
+
}
|
3757
|
+
bytes -= n;
|
3758
|
+
utf16 += n;
|
3759
|
+
|
3760
|
+
if (uc > 127) {
|
3761
|
+
/* We cannot handle it. */
|
3762
|
+
*mbs++ = '?';
|
3763
|
+
ret = -1;
|
3764
|
+
} else
|
3765
|
+
*mbs++ = (char)uc;
|
3766
|
+
}
|
3767
|
+
as->length = mbs - as->s;
|
3768
|
+
as->s[as->length] = '\0';
|
3769
|
+
return (ret);
|
3770
|
+
}
|
3771
|
+
|
3772
|
+
/* UTF-16BE entry point: forwards to best_effort_strncat_from_utf16() with be = 1. */
static int
best_effort_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	int r;

	r = best_effort_strncat_from_utf16(as, _p, bytes, sc, 1);
	return (r);
}
|
3778
|
+
|
3779
|
+
/* UTF-16LE entry point: forwards to best_effort_strncat_from_utf16() with be = 0. */
static int
best_effort_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	int r;

	r = best_effort_strncat_from_utf16(as, _p, bytes, sc, 0);
	return (r);
}
|
3785
|
+
|
3786
|
+
/*
|
3787
|
+
* Convert a current locale string to UTF-16BE/LE and copy the result.
|
3788
|
+
* Return -1 if conversion fails.
|
3789
|
+
*/
|
3790
|
+
static int
|
3791
|
+
best_effort_strncat_to_utf16(struct archive_string *as16, const void *_p,
|
3792
|
+
size_t length, struct archive_string_conv *sc, int bigendian)
|
3793
|
+
{
|
3794
|
+
const char *s = (const char *)_p;
|
3795
|
+
char *utf16;
|
3796
|
+
size_t remaining;
|
3797
|
+
int ret;
|
3798
|
+
|
3799
|
+
(void)sc; /* UNUSED */
|
3800
|
+
/*
|
3801
|
+
* Other case, we should do the best effort.
|
3802
|
+
* If all character are ASCII(<0x7f), we can convert it.
|
3803
|
+
* if not , we set a alternative character and return -1.
|
3804
|
+
*/
|
3805
|
+
ret = 0;
|
3806
|
+
remaining = length;
|
3807
|
+
|
3808
|
+
if (archive_string_ensure(as16,
|
3809
|
+
as16->length + (length + 1) * 2) == NULL)
|
3810
|
+
return (-1);
|
3811
|
+
|
3812
|
+
utf16 = as16->s + as16->length;
|
3813
|
+
while (remaining--) {
|
3814
|
+
unsigned c = *s++;
|
3815
|
+
if (c > 127) {
|
3816
|
+
/* We cannot handle it. */
|
3817
|
+
c = UNICODE_R_CHAR;
|
3818
|
+
ret = -1;
|
3819
|
+
}
|
3820
|
+
if (bigendian)
|
3821
|
+
archive_be16enc(utf16, c);
|
3822
|
+
else
|
3823
|
+
archive_le16enc(utf16, c);
|
3824
|
+
utf16 += 2;
|
3825
|
+
}
|
3826
|
+
as16->length = utf16 - as16->s;
|
3827
|
+
as16->s[as16->length] = 0;
|
3828
|
+
as16->s[as16->length+1] = 0;
|
3829
|
+
return (ret);
|
3830
|
+
}
|
3831
|
+
|
3832
|
+
/* UTF-16BE entry point: forwards to best_effort_strncat_to_utf16() with bigendian = 1. */
static int
best_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	int r;

	r = best_effort_strncat_to_utf16(as16, _p, length, sc, 1);
	return (r);
}
|
3838
|
+
|
3839
|
+
/* UTF-16LE entry point: forwards to best_effort_strncat_to_utf16() with bigendian = 0. */
static int
best_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	int r;

	r = best_effort_strncat_to_utf16(as16, _p, length, sc, 0);
	return (r);
}
|
3845
|
+
|
3846
|
+
|
3847
|
+
/*
|
3848
|
+
* Multistring operations.
|
3849
|
+
*/
|
3850
|
+
|
3851
|
+
void
|
3852
|
+
archive_mstring_clean(struct archive_mstring *aes)
|
3853
|
+
{
|
3854
|
+
archive_wstring_free(&(aes->aes_wcs));
|
3855
|
+
archive_string_free(&(aes->aes_mbs));
|
3856
|
+
archive_string_free(&(aes->aes_utf8));
|
3857
|
+
archive_string_free(&(aes->aes_mbs_in_locale));
|
3858
|
+
aes->aes_set = 0;
|
3859
|
+
}
|
3860
|
+
|
3861
|
+
void
|
3862
|
+
archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src)
|
3863
|
+
{
|
3864
|
+
dest->aes_set = src->aes_set;
|
3865
|
+
archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs));
|
3866
|
+
archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8));
|
3867
|
+
archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs));
|
3868
|
+
}
|
3869
|
+
|
3870
|
+
int
|
3871
|
+
archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes,
|
3872
|
+
const char **p)
|
3873
|
+
{
|
3874
|
+
struct archive_string_conv *sc;
|
3875
|
+
int r;
|
3876
|
+
|
3877
|
+
/* If we already have a UTF8 form, return that immediately. */
|
3878
|
+
if (aes->aes_set & AES_SET_UTF8) {
|
3879
|
+
*p = aes->aes_utf8.s;
|
3880
|
+
return (0);
|
3881
|
+
}
|
3882
|
+
|
3883
|
+
*p = NULL;
|
3884
|
+
/* Try converting WCS to MBS first if MBS does not exist yet. */
|
3885
|
+
if ((aes->aes_set & AES_SET_MBS) == 0) {
|
3886
|
+
const char *pm; /* unused */
|
3887
|
+
archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */
|
3888
|
+
}
|
3889
|
+
if (aes->aes_set & AES_SET_MBS) {
|
3890
|
+
sc = archive_string_conversion_to_charset(a, "UTF-8", 1);
|
3891
|
+
if (sc == NULL)
|
3892
|
+
return (-1);/* Couldn't allocate memory for sc. */
|
3893
|
+
r = archive_strncpy_l(&(aes->aes_utf8), aes->aes_mbs.s,
|
3894
|
+
aes->aes_mbs.length, sc);
|
3895
|
+
if (a == NULL)
|
3896
|
+
free_sconv_object(sc);
|
3897
|
+
if (r == 0) {
|
3898
|
+
aes->aes_set |= AES_SET_UTF8;
|
3899
|
+
*p = aes->aes_utf8.s;
|
3900
|
+
return (0);/* success. */
|
3901
|
+
} else
|
3902
|
+
return (-1);/* failure. */
|
3903
|
+
}
|
3904
|
+
return (0);/* success. */
|
3905
|
+
}
|
3906
|
+
|
3907
|
+
int
|
3908
|
+
archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes,
|
3909
|
+
const char **p)
|
3910
|
+
{
|
3911
|
+
struct archive_string_conv *sc;
|
3912
|
+
int r, ret = 0;
|
3913
|
+
|
3914
|
+
/* If we already have an MBS form, return that immediately. */
|
3915
|
+
if (aes->aes_set & AES_SET_MBS) {
|
3916
|
+
*p = aes->aes_mbs.s;
|
3917
|
+
return (ret);
|
3918
|
+
}
|
3919
|
+
|
3920
|
+
*p = NULL;
|
3921
|
+
/* If there's a WCS form, try converting with the native locale. */
|
3922
|
+
if (aes->aes_set & AES_SET_WCS) {
|
3923
|
+
archive_string_empty(&(aes->aes_mbs));
|
3924
|
+
r = archive_string_append_from_wcs(&(aes->aes_mbs),
|
3925
|
+
aes->aes_wcs.s, aes->aes_wcs.length);
|
3926
|
+
*p = aes->aes_mbs.s;
|
3927
|
+
if (r == 0) {
|
3928
|
+
aes->aes_set |= AES_SET_MBS;
|
3929
|
+
return (ret);
|
3930
|
+
} else
|
3931
|
+
ret = -1;
|
3932
|
+
}
|
3933
|
+
|
3934
|
+
/* If there's a UTF-8 form, try converting with the native locale. */
|
3935
|
+
if (aes->aes_set & AES_SET_UTF8) {
|
3936
|
+
archive_string_empty(&(aes->aes_mbs));
|
3937
|
+
sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
|
3938
|
+
if (sc == NULL)
|
3939
|
+
return (-1);/* Couldn't allocate memory for sc. */
|
3940
|
+
r = archive_strncpy_l(&(aes->aes_mbs),
|
3941
|
+
aes->aes_utf8.s, aes->aes_utf8.length, sc);
|
3942
|
+
if (a == NULL)
|
3943
|
+
free_sconv_object(sc);
|
3944
|
+
*p = aes->aes_mbs.s;
|
3945
|
+
if (r == 0) {
|
3946
|
+
aes->aes_set |= AES_SET_MBS;
|
3947
|
+
ret = 0;/* success; overwrite previous error. */
|
3948
|
+
} else
|
3949
|
+
ret = -1;/* failure. */
|
3950
|
+
}
|
3951
|
+
return (ret);
|
3952
|
+
}
|
3953
|
+
|
3954
|
+
int
|
3955
|
+
archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes,
|
3956
|
+
const wchar_t **wp)
|
3957
|
+
{
|
3958
|
+
int r, ret = 0;
|
3959
|
+
|
3960
|
+
(void)a;/* UNUSED */
|
3961
|
+
/* Return WCS form if we already have it. */
|
3962
|
+
if (aes->aes_set & AES_SET_WCS) {
|
3963
|
+
*wp = aes->aes_wcs.s;
|
3964
|
+
return (ret);
|
3965
|
+
}
|
3966
|
+
|
3967
|
+
*wp = NULL;
|
3968
|
+
/* Try converting UTF8 to MBS first if MBS does not exist yet. */
|
3969
|
+
if ((aes->aes_set & AES_SET_MBS) == 0) {
|
3970
|
+
const char *p; /* unused */
|
3971
|
+
archive_mstring_get_mbs(a, aes, &p); /* ignore errors, we'll handle it later */
|
3972
|
+
}
|
3973
|
+
/* Try converting MBS to WCS using native locale. */
|
3974
|
+
if (aes->aes_set & AES_SET_MBS) {
|
3975
|
+
archive_wstring_empty(&(aes->aes_wcs));
|
3976
|
+
r = archive_wstring_append_from_mbs(&(aes->aes_wcs),
|
3977
|
+
aes->aes_mbs.s, aes->aes_mbs.length);
|
3978
|
+
if (r == 0) {
|
3979
|
+
aes->aes_set |= AES_SET_WCS;
|
3980
|
+
*wp = aes->aes_wcs.s;
|
3981
|
+
} else
|
3982
|
+
ret = -1;/* failure. */
|
3983
|
+
}
|
3984
|
+
return (ret);
|
3985
|
+
}
|
3986
|
+
|
3987
|
+
int
|
3988
|
+
archive_mstring_get_mbs_l(struct archive *a, struct archive_mstring *aes,
|
3989
|
+
const char **p, size_t *length, struct archive_string_conv *sc)
|
3990
|
+
{
|
3991
|
+
int ret = 0;
|
3992
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
3993
|
+
int r;
|
3994
|
+
|
3995
|
+
/*
|
3996
|
+
* Internationalization programming on Windows must use Wide
|
3997
|
+
* characters because Windows platform cannot make locale UTF-8.
|
3998
|
+
*/
|
3999
|
+
if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) {
|
4000
|
+
archive_string_empty(&(aes->aes_mbs_in_locale));
|
4001
|
+
r = archive_string_append_from_wcs_in_codepage(
|
4002
|
+
&(aes->aes_mbs_in_locale), aes->aes_wcs.s,
|
4003
|
+
aes->aes_wcs.length, sc);
|
4004
|
+
if (r == 0) {
|
4005
|
+
*p = aes->aes_mbs_in_locale.s;
|
4006
|
+
if (length != NULL)
|
4007
|
+
*length = aes->aes_mbs_in_locale.length;
|
4008
|
+
return (0);
|
4009
|
+
} else if (errno == ENOMEM)
|
4010
|
+
return (-1);
|
4011
|
+
else
|
4012
|
+
ret = -1;
|
4013
|
+
}
|
4014
|
+
#endif
|
4015
|
+
|
4016
|
+
/* If there is not an MBS form but there is a WCS or UTF8 form, try converting
|
4017
|
+
* with the native locale to be used for translating it to specified
|
4018
|
+
* character-set. */
|
4019
|
+
if ((aes->aes_set & AES_SET_MBS) == 0) {
|
4020
|
+
const char *pm; /* unused */
|
4021
|
+
archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */
|
4022
|
+
}
|
4023
|
+
/* If we already have an MBS form, use it to be translated to
|
4024
|
+
* specified character-set. */
|
4025
|
+
if (aes->aes_set & AES_SET_MBS) {
|
4026
|
+
if (sc == NULL) {
|
4027
|
+
/* Conversion is unneeded. */
|
4028
|
+
*p = aes->aes_mbs.s;
|
4029
|
+
if (length != NULL)
|
4030
|
+
*length = aes->aes_mbs.length;
|
4031
|
+
return (0);
|
4032
|
+
}
|
4033
|
+
ret = archive_strncpy_l(&(aes->aes_mbs_in_locale),
|
4034
|
+
aes->aes_mbs.s, aes->aes_mbs.length, sc);
|
4035
|
+
*p = aes->aes_mbs_in_locale.s;
|
4036
|
+
if (length != NULL)
|
4037
|
+
*length = aes->aes_mbs_in_locale.length;
|
4038
|
+
} else {
|
4039
|
+
*p = NULL;
|
4040
|
+
if (length != NULL)
|
4041
|
+
*length = 0;
|
4042
|
+
}
|
4043
|
+
return (ret);
|
4044
|
+
}
|
4045
|
+
|
4046
|
+
int
|
4047
|
+
archive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs)
|
4048
|
+
{
|
4049
|
+
if (mbs == NULL) {
|
4050
|
+
aes->aes_set = 0;
|
4051
|
+
return (0);
|
4052
|
+
}
|
4053
|
+
return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs)));
|
4054
|
+
}
|
4055
|
+
|
4056
|
+
int
|
4057
|
+
archive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs,
|
4058
|
+
size_t len)
|
4059
|
+
{
|
4060
|
+
if (mbs == NULL) {
|
4061
|
+
aes->aes_set = 0;
|
4062
|
+
return (0);
|
4063
|
+
}
|
4064
|
+
aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */
|
4065
|
+
archive_strncpy(&(aes->aes_mbs), mbs, len);
|
4066
|
+
archive_string_empty(&(aes->aes_utf8));
|
4067
|
+
archive_wstring_empty(&(aes->aes_wcs));
|
4068
|
+
return (0);
|
4069
|
+
}
|
4070
|
+
|
4071
|
+
/*
 * Store the NUL-terminated wide string `wcs` in `aes`.
 *
 * Delegates to archive_mstring_copy_wcs_len(); a NULL `wcs` is passed
 * through with a length of 0.
 */
int
archive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs)
{
	size_t wcs_len = 0;

	if (wcs != NULL)
		wcs_len = wcslen(wcs);
	return archive_mstring_copy_wcs_len(aes, wcs, wcs_len);
}
|
4077
|
+
|
4078
|
+
int
|
4079
|
+
archive_mstring_copy_utf8(struct archive_mstring *aes, const char *utf8)
|
4080
|
+
{
|
4081
|
+
if (utf8 == NULL) {
|
4082
|
+
aes->aes_set = 0;
|
4083
|
+
return (0);
|
4084
|
+
}
|
4085
|
+
aes->aes_set = AES_SET_UTF8;
|
4086
|
+
archive_string_empty(&(aes->aes_mbs));
|
4087
|
+
archive_string_empty(&(aes->aes_wcs));
|
4088
|
+
archive_strncpy(&(aes->aes_utf8), utf8, strlen(utf8));
|
4089
|
+
return (int)strlen(utf8);
|
4090
|
+
}
|
4091
|
+
|
4092
|
+
int
|
4093
|
+
archive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs,
|
4094
|
+
size_t len)
|
4095
|
+
{
|
4096
|
+
if (wcs == NULL) {
|
4097
|
+
aes->aes_set = 0;
|
4098
|
+
return (0);
|
4099
|
+
}
|
4100
|
+
aes->aes_set = AES_SET_WCS; /* Only WCS form set. */
|
4101
|
+
archive_string_empty(&(aes->aes_mbs));
|
4102
|
+
archive_string_empty(&(aes->aes_utf8));
|
4103
|
+
archive_wstrncpy(&(aes->aes_wcs), wcs, len);
|
4104
|
+
return (0);
|
4105
|
+
}
|
4106
|
+
|
4107
|
+
int
|
4108
|
+
archive_mstring_copy_mbs_len_l(struct archive_mstring *aes,
|
4109
|
+
const char *mbs, size_t len, struct archive_string_conv *sc)
|
4110
|
+
{
|
4111
|
+
int r;
|
4112
|
+
|
4113
|
+
if (mbs == NULL) {
|
4114
|
+
aes->aes_set = 0;
|
4115
|
+
return (0);
|
4116
|
+
}
|
4117
|
+
archive_string_empty(&(aes->aes_mbs));
|
4118
|
+
archive_wstring_empty(&(aes->aes_wcs));
|
4119
|
+
archive_string_empty(&(aes->aes_utf8));
|
4120
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
4121
|
+
/*
|
4122
|
+
* Internationalization programming on Windows must use Wide
|
4123
|
+
* characters because Windows platform cannot make locale UTF-8.
|
4124
|
+
*/
|
4125
|
+
if (sc == NULL) {
|
4126
|
+
if (archive_string_append(&(aes->aes_mbs),
|
4127
|
+
mbs, mbsnbytes(mbs, len)) == NULL) {
|
4128
|
+
aes->aes_set = 0;
|
4129
|
+
r = -1;
|
4130
|
+
} else {
|
4131
|
+
aes->aes_set = AES_SET_MBS;
|
4132
|
+
r = 0;
|
4133
|
+
}
|
4134
|
+
#if defined(HAVE_ICONV)
|
4135
|
+
} else if (sc != NULL && sc->cd_w != (iconv_t)-1) {
|
4136
|
+
/*
|
4137
|
+
* This case happens only when MultiByteToWideChar() cannot
|
4138
|
+
* handle sc->from_cp, and we have to iconv in order to
|
4139
|
+
* translate character-set to wchar_t,UTF-16.
|
4140
|
+
*/
|
4141
|
+
iconv_t cd = sc->cd;
|
4142
|
+
unsigned from_cp;
|
4143
|
+
int flag;
|
4144
|
+
|
4145
|
+
/*
|
4146
|
+
* Translate multi-bytes from some character-set to UTF-8.
|
4147
|
+
*/
|
4148
|
+
sc->cd = sc->cd_w;
|
4149
|
+
r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc);
|
4150
|
+
sc->cd = cd;
|
4151
|
+
if (r != 0) {
|
4152
|
+
aes->aes_set = 0;
|
4153
|
+
return (r);
|
4154
|
+
}
|
4155
|
+
aes->aes_set = AES_SET_UTF8;
|
4156
|
+
|
4157
|
+
/*
|
4158
|
+
* Append the UTF-8 string into wstring.
|
4159
|
+
*/
|
4160
|
+
flag = sc->flag;
|
4161
|
+
sc->flag &= ~(SCONV_NORMALIZATION_C
|
4162
|
+
| SCONV_TO_UTF16| SCONV_FROM_UTF16);
|
4163
|
+
from_cp = sc->from_cp;
|
4164
|
+
sc->from_cp = CP_UTF8;
|
4165
|
+
r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
|
4166
|
+
aes->aes_utf8.s, aes->aes_utf8.length, sc);
|
4167
|
+
sc->flag = flag;
|
4168
|
+
sc->from_cp = from_cp;
|
4169
|
+
if (r == 0)
|
4170
|
+
aes->aes_set |= AES_SET_WCS;
|
4171
|
+
#endif
|
4172
|
+
} else {
|
4173
|
+
r = archive_wstring_append_from_mbs_in_codepage(
|
4174
|
+
&(aes->aes_wcs), mbs, len, sc);
|
4175
|
+
if (r == 0)
|
4176
|
+
aes->aes_set = AES_SET_WCS;
|
4177
|
+
else
|
4178
|
+
aes->aes_set = 0;
|
4179
|
+
}
|
4180
|
+
#else
|
4181
|
+
r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc);
|
4182
|
+
if (r == 0)
|
4183
|
+
aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */
|
4184
|
+
else
|
4185
|
+
aes->aes_set = 0;
|
4186
|
+
#endif
|
4187
|
+
return (r);
|
4188
|
+
}
|
4189
|
+
|
4190
|
+
/*
|
4191
|
+
* The 'update' form tries to proactively update all forms of
|
4192
|
+
* this string (WCS and MBS) and returns an error if any of
|
4193
|
+
* them fail. This is used by the 'pax' handler, for instance,
|
4194
|
+
* to detect and report character-conversion failures early while
|
4195
|
+
* still allowing clients to get potentially useful values from
|
4196
|
+
* the more tolerant lazy conversions. (get_mbs and get_wcs will
|
4197
|
+
* strive to give the user something useful, so you can get hopefully
|
4198
|
+
* usable values even if some of the character conversions are failing.)
|
4199
|
+
*/
|
4200
|
+
int
|
4201
|
+
archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
|
4202
|
+
const char *utf8)
|
4203
|
+
{
|
4204
|
+
struct archive_string_conv *sc;
|
4205
|
+
int r;
|
4206
|
+
|
4207
|
+
if (utf8 == NULL) {
|
4208
|
+
aes->aes_set = 0;
|
4209
|
+
return (0); /* Succeeded in clearing everything. */
|
4210
|
+
}
|
4211
|
+
|
4212
|
+
/* Save the UTF8 string. */
|
4213
|
+
archive_strcpy(&(aes->aes_utf8), utf8);
|
4214
|
+
|
4215
|
+
/* Empty the mbs and wcs strings. */
|
4216
|
+
archive_string_empty(&(aes->aes_mbs));
|
4217
|
+
archive_wstring_empty(&(aes->aes_wcs));
|
4218
|
+
|
4219
|
+
aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */
|
4220
|
+
|
4221
|
+
/* Try converting UTF-8 to MBS, return false on failure. */
|
4222
|
+
sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
|
4223
|
+
if (sc == NULL)
|
4224
|
+
return (-1);/* Couldn't allocate memory for sc. */
|
4225
|
+
r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
|
4226
|
+
if (a == NULL)
|
4227
|
+
free_sconv_object(sc);
|
4228
|
+
if (r != 0)
|
4229
|
+
return (-1);
|
4230
|
+
aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */
|
4231
|
+
|
4232
|
+
/* Try converting MBS to WCS, return false on failure. */
|
4233
|
+
if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s,
|
4234
|
+
aes->aes_mbs.length))
|
4235
|
+
return (-1);
|
4236
|
+
aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
|
4237
|
+
|
4238
|
+
/* All conversions succeeded. */
|
4239
|
+
return (0);
|
4240
|
+
}
|