libarchive-static 1.0.5 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/ext/extconf.rb +2 -9
- data/ext/libarchive-0.1.1/ext/archive_read_support_compression.c +6 -6
- data/ext/libarchive-0.1.1/ext/archive_read_support_compression.o +0 -0
- data/ext/libarchive-0.1.1/ext/archive_read_support_format.o +0 -0
- data/ext/libarchive-0.1.1/ext/archive_write_open_rb_str.c +1 -1
- data/ext/libarchive-0.1.1/ext/archive_write_open_rb_str.o +0 -0
- data/ext/libarchive-0.1.1/ext/archive_write_set_compression.c +5 -5
- data/ext/libarchive-0.1.1/ext/archive_write_set_compression.o +0 -0
- data/ext/libarchive-0.1.1/ext/config.h +23 -0
- data/ext/libarchive-0.1.1/ext/config.log +230 -0
- data/ext/libarchive-0.1.1/ext/config.status +671 -0
- data/ext/libarchive-0.1.1/ext/libarchive.c +1 -1
- data/ext/libarchive-0.1.1/ext/libarchive.o +0 -0
- data/ext/libarchive-0.1.1/ext/libarchive_archive.c +7 -7
- data/ext/libarchive-0.1.1/ext/libarchive_archive.o +0 -0
- data/ext/libarchive-0.1.1/ext/libarchive_entry.c +6 -0
- data/ext/libarchive-0.1.1/ext/libarchive_entry.o +0 -0
- data/ext/libarchive-0.1.1/ext/libarchive_internal.h +0 -1
- data/ext/libarchive-0.1.1/ext/libarchive_reader.c +6 -4
- data/ext/libarchive-0.1.1/ext/libarchive_reader.o +0 -0
- data/ext/libarchive-0.1.1/ext/libarchive_ruby.so +0 -0
- data/ext/libarchive-0.1.1/ext/libarchive_win32.h +1 -1
- data/ext/libarchive-0.1.1/ext/libarchive_writer.c +2 -2
- data/ext/libarchive-0.1.1/ext/libarchive_writer.o +0 -0
- data/ext/libarchive-3.6.2/Makefile.in +16892 -0
- data/ext/libarchive-3.6.2/build/autoconf/ax_append_compile_flags.m4 +67 -0
- data/ext/libarchive-3.6.2/build/autoconf/ax_append_flag.m4 +71 -0
- data/ext/libarchive-3.6.2/build/autoconf/ax_check_compile_flag.m4 +74 -0
- data/ext/libarchive-3.6.2/build/autoconf/ax_require_defined.m4 +37 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/build/autoconf/check_stdcall_func.m4 +0 -0
- data/ext/libarchive-3.6.2/build/autoconf/compile +348 -0
- data/ext/libarchive-3.6.2/build/autoconf/config.guess +1754 -0
- data/ext/libarchive-3.6.2/build/autoconf/config.rpath +696 -0
- data/ext/libarchive-3.6.2/build/autoconf/config.sub +1890 -0
- data/ext/libarchive-3.6.2/build/autoconf/depcomp +791 -0
- data/ext/libarchive-3.6.2/build/autoconf/iconv.m4 +271 -0
- data/ext/libarchive-3.6.2/build/autoconf/install-sh +541 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/build/autoconf/la_uid_t.m4 +0 -0
- data/ext/libarchive-3.6.2/build/autoconf/lib-ld.m4 +109 -0
- data/ext/libarchive-3.6.2/build/autoconf/lib-link.m4 +777 -0
- data/ext/libarchive-3.6.2/build/autoconf/lib-prefix.m4 +224 -0
- data/ext/libarchive-3.6.2/build/autoconf/ltmain.sh +11251 -0
- data/ext/libarchive-3.6.2/build/autoconf/m4_ax_compile_check_sizeof.m4 +115 -0
- data/ext/libarchive-3.6.2/build/autoconf/missing +215 -0
- data/ext/libarchive-3.6.2/build/autoconf/test-driver +153 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/build/pkgconfig/libarchive.pc.in +4 -1
- data/ext/libarchive-3.6.2/config.h.in +1504 -0
- data/ext/libarchive-3.6.2/configure +25558 -0
- data/ext/libarchive-3.6.2/libarchive/archive.h +1212 -0
- data/ext/libarchive-3.6.2/libarchive/archive_acl.c +2097 -0
- data/ext/libarchive-3.6.2/libarchive/archive_acl_private.h +83 -0
- data/ext/libarchive-3.6.2/libarchive/archive_blake2.h +197 -0
- data/ext/libarchive-3.6.2/libarchive/archive_blake2_impl.h +161 -0
- data/ext/libarchive-3.6.2/libarchive/archive_blake2s_ref.c +369 -0
- data/ext/libarchive-3.6.2/libarchive/archive_blake2sp_ref.c +361 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_check_magic.c +63 -22
- data/ext/libarchive-3.6.2/libarchive/archive_cmdline.c +227 -0
- data/ext/libarchive-3.6.2/libarchive/archive_cmdline_private.h +47 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_crc32.h +17 -0
- data/ext/libarchive-3.6.2/libarchive/archive_cryptor.c +534 -0
- data/ext/libarchive-3.6.2/libarchive/archive_cryptor_private.h +188 -0
- data/ext/libarchive-3.6.2/libarchive/archive_digest.c +1505 -0
- data/ext/libarchive-3.6.2/libarchive/archive_digest_private.h +416 -0
- data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_darwin.c +559 -0
- data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_freebsd.c +712 -0
- data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_linux.c +760 -0
- data/ext/libarchive-3.6.2/libarchive/archive_disk_acl_sunos.c +824 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_endian.h +48 -15
- data/ext/libarchive-3.6.2/libarchive/archive_entry.c +2149 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry.h +305 -106
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_copy_bhfi.c +5 -4
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_copy_stat.c +9 -3
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_link_resolver.c +104 -62
- data/ext/libarchive-3.6.2/libarchive/archive_entry_locale.h +92 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_private.h +65 -49
- data/ext/libarchive-3.6.2/libarchive/archive_entry_sparse.c +156 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_stat.c +6 -6
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_strmode.c +1 -1
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_entry_xattr.c +4 -6
- data/ext/libarchive-3.6.2/libarchive/archive_getdate.c +1165 -0
- data/ext/libarchive-3.6.2/libarchive/archive_getdate.h +39 -0
- data/ext/libarchive-3.6.2/libarchive/archive_hmac.c +334 -0
- data/ext/libarchive-3.6.2/libarchive/archive_hmac_private.h +117 -0
- data/ext/libarchive-3.6.2/libarchive/archive_match.c +1875 -0
- data/ext/libarchive-3.6.2/libarchive/archive_openssl_evp_private.h +53 -0
- data/ext/libarchive-3.6.2/libarchive/archive_openssl_hmac_private.h +54 -0
- data/ext/libarchive-3.6.2/libarchive/archive_options.c +218 -0
- data/ext/libarchive-3.6.2/libarchive/archive_options_private.h +51 -0
- data/ext/libarchive-3.6.2/libarchive/archive_pack_dev.c +337 -0
- data/ext/libarchive-3.6.2/libarchive/archive_pack_dev.h +49 -0
- data/ext/libarchive-3.6.2/libarchive/archive_pathmatch.c +463 -0
- data/ext/libarchive-3.6.2/libarchive/archive_pathmatch.h +52 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_platform.h +77 -9
- data/ext/libarchive-3.6.2/libarchive/archive_platform_acl.h +55 -0
- data/ext/libarchive-3.6.2/libarchive/archive_platform_xattr.h +47 -0
- data/ext/libarchive-3.6.2/libarchive/archive_ppmd7.c +1168 -0
- data/ext/libarchive-3.6.2/libarchive/archive_ppmd7_private.h +119 -0
- data/ext/libarchive-3.6.2/libarchive/archive_ppmd8.c +1287 -0
- data/ext/libarchive-3.6.2/libarchive/archive_ppmd8_private.h +148 -0
- data/ext/libarchive-3.6.2/libarchive/archive_ppmd_private.h +151 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_private.h +74 -18
- data/ext/libarchive-3.6.2/libarchive/archive_random.c +272 -0
- data/ext/libarchive-3.6.2/libarchive/archive_random_private.h +36 -0
- data/ext/libarchive-3.6.2/libarchive/archive_rb.c +709 -0
- data/ext/libarchive-3.6.2/libarchive/archive_rb.h +113 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read.c +1756 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_add_passphrase.c +190 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_append_filter.c +204 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_data_into_fd.c +64 -18
- data/ext/libarchive-3.6.2/libarchive/archive_read_disk_entry_from_file.c +1086 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_disk_posix.c +2732 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_disk_private.h +40 -4
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_disk_set_standard_lookup.c +21 -11
- data/ext/libarchive-3.6.2/libarchive/archive_read_disk_windows.c +2479 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_extract.c +60 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_extract.c → libarchive-3.6.2/libarchive/archive_read_extract2.c} +34 -61
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_open_fd.c +70 -49
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_open_file.c +38 -23
- data/ext/libarchive-3.6.2/libarchive/archive_read_open_filename.c +586 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_open_memory.c +58 -28
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_private.h +127 -59
- data/ext/libarchive-3.6.2/libarchive/archive_read_set_format.c +117 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_set_options.c +133 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_all.c → libarchive-3.6.2/libarchive/archive_read_support_filter_all.c} +35 -10
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_by_code.c +83 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_bzip2.c → libarchive-3.6.2/libarchive/archive_read_support_filter_bzip2.c} +38 -26
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_compress.c → libarchive-3.6.2/libarchive/archive_read_support_filter_compress.c} +52 -44
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_grzip.c +112 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_gzip.c → libarchive-3.6.2/libarchive/archive_read_support_filter_gzip.c} +108 -37
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_lrzip.c +122 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_lz4.c +742 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_lzop.c +499 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_none.c → libarchive-3.6.2/libarchive/archive_read_support_filter_none.c} +15 -3
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_program.c → libarchive-3.6.2/libarchive/archive_read_support_filter_program.c} +114 -77
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_rpm.c → libarchive-3.6.2/libarchive/archive_read_support_filter_rpm.c} +31 -31
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_uu.c → libarchive-3.6.2/libarchive/archive_read_support_filter_uu.c} +141 -85
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_compression_xz.c → libarchive-3.6.2/libarchive/archive_read_support_filter_xz.c} +369 -284
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_filter_zstd.c +297 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_7zip.c +3900 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_all.c +89 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_ar.c +126 -72
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_by_code.c +92 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_cab.c +3228 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_cpio.c +1104 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_empty.c +14 -11
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_iso9660.c +990 -541
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_lha.c +2916 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_mtree.c +2150 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_rar.c +3797 -0
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_rar5.c +4251 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_raw.c +38 -31
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_tar.c +1157 -629
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_warc.c +848 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_read_support_format_xar.c +439 -258
- data/ext/libarchive-3.6.2/libarchive/archive_read_support_format_zip.c +4270 -0
- data/ext/libarchive-3.6.2/libarchive/archive_string.c +4240 -0
- data/ext/libarchive-3.6.2/libarchive/archive_string.h +243 -0
- data/ext/libarchive-3.6.2/libarchive/archive_string_composition.h +2292 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_string_sprintf.c +44 -16
- data/ext/libarchive-3.6.2/libarchive/archive_util.c +655 -0
- data/ext/libarchive-3.6.2/libarchive/archive_version_details.c +151 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_virtual.c +85 -16
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_windows.c +214 -541
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_windows.h +74 -106
- data/ext/libarchive-3.6.2/libarchive/archive_write.c +828 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter.c +72 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_b64encode.c +304 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_by_name.c +77 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_bzip2.c +401 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_write_set_compression_compress.c → libarchive-3.6.2/libarchive/archive_write_add_filter_compress.c} +86 -131
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_grzip.c +135 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_gzip.c +442 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_lrzip.c +197 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_lz4.c +700 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_lzop.c +478 -0
- data/ext/{libarchive-2.8.4/libarchive/archive_read_support_format_all.c → libarchive-3.6.2/libarchive/archive_write_add_filter_none.c} +11 -11
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_program.c +391 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_uuencode.c +295 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_xz.c +545 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_add_filter_zstd.c +418 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_disk_posix.c +4711 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_disk_private.h +9 -2
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_disk_set_standard_lookup.c +30 -29
- data/ext/libarchive-3.6.2/libarchive/archive_write_disk_windows.c +2842 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_fd.c +15 -10
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_file.c +15 -9
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_filename.c +128 -20
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_open_memory.c +7 -18
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_private.h +72 -29
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format.c +56 -3
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_7zip.c +2322 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_ar.c +54 -34
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_by_name.c +20 -2
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio.c +11 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio_binary.c +610 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio_newc.c +457 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_cpio_odc.c +500 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_filter_by_ext.c +142 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_gnutar.c +755 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_iso9660.c +8165 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_mtree.c +2217 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_pax.c +1049 -387
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_private.h +42 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_raw.c +125 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_shar.c +62 -47
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/archive_write_set_format_ustar.c +279 -108
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_v7tar.c +638 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_warc.c +453 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_xar.c +3259 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_format_zip.c +1704 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_options.c +130 -0
- data/ext/libarchive-3.6.2/libarchive/archive_write_set_passphrase.c +95 -0
- data/ext/libarchive-3.6.2/libarchive/archive_xxhash.h +48 -0
- data/ext/libarchive-3.6.2/libarchive/config_freebsd.h +271 -0
- data/ext/{libarchive-2.8.4 → libarchive-3.6.2}/libarchive/filter_fork.h +10 -5
- data/ext/{libarchive-2.8.4/libarchive/filter_fork.c → libarchive-3.6.2/libarchive/filter_fork_posix.c} +98 -19
- data/ext/libarchive-3.6.2/libarchive/filter_fork_windows.c +236 -0
- data/ext/libarchive-3.6.2/libarchive/xxhash.c +525 -0
- data/ext/libarchive-static-makefile +144 -80
- data/ext/libarchive-static-wrapper-makefile +1 -1
- data/ext/zlib-1.2.13/Makefile.in +404 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/adler32.c +51 -34
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/compress.c +27 -21
- data/ext/zlib-1.2.13/configure +922 -0
- data/ext/zlib-1.2.13/crc32.c +1125 -0
- data/ext/zlib-1.2.13/crc32.h +9446 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/deflate.c +842 -459
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/deflate.h +37 -33
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/gzclose.c +0 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/gzguts.h +103 -16
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/gzlib.c +155 -53
- data/ext/zlib-1.2.13/gzread.c +650 -0
- data/ext/zlib-1.2.13/gzwrite.c +677 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/infback.c +24 -12
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inffast.c +49 -66
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inffast.h +0 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inffixed.h +3 -3
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inflate.c +209 -94
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inflate.h +9 -5
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inftrees.c +24 -50
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/inftrees.h +1 -1
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/trees.c +135 -198
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/trees.h +0 -0
- data/ext/zlib-1.2.13/uncompr.c +93 -0
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/zconf.h +182 -63
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/zlib.h +617 -295
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/zutil.c +50 -41
- data/ext/{zlib-1.2.5 → zlib-1.2.13}/zutil.h +83 -82
- metadata +244 -137
- data/ext/libarchive-0.1.1/libarchive.c +0 -1762
- data/ext/libarchive-2.8.4/Makefile.in +0 -7076
- data/ext/libarchive-2.8.4/build/autoconf/compile +0 -143
- data/ext/libarchive-2.8.4/build/autoconf/config.guess +0 -1502
- data/ext/libarchive-2.8.4/build/autoconf/config.sub +0 -1708
- data/ext/libarchive-2.8.4/build/autoconf/depcomp +0 -630
- data/ext/libarchive-2.8.4/build/autoconf/install-sh +0 -291
- data/ext/libarchive-2.8.4/build/autoconf/ltmain.sh +0 -8406
- data/ext/libarchive-2.8.4/build/autoconf/missing +0 -376
- data/ext/libarchive-2.8.4/config.h.in +0 -772
- data/ext/libarchive-2.8.4/configure +0 -17916
- data/ext/libarchive-2.8.4/libarchive/archive.h +0 -741
- data/ext/libarchive-2.8.4/libarchive/archive_entry.c +0 -2202
- data/ext/libarchive-2.8.4/libarchive/archive_hash.h +0 -281
- data/ext/libarchive-2.8.4/libarchive/archive_read.c +0 -1249
- data/ext/libarchive-2.8.4/libarchive/archive_read_disk.c +0 -198
- data/ext/libarchive-2.8.4/libarchive/archive_read_disk_entry_from_file.c +0 -570
- data/ext/libarchive-2.8.4/libarchive/archive_read_open_filename.c +0 -272
- data/ext/libarchive-2.8.4/libarchive/archive_read_support_format_cpio.c +0 -777
- data/ext/libarchive-2.8.4/libarchive/archive_read_support_format_mtree.c +0 -1304
- data/ext/libarchive-2.8.4/libarchive/archive_read_support_format_zip.c +0 -903
- data/ext/libarchive-2.8.4/libarchive/archive_string.c +0 -453
- data/ext/libarchive-2.8.4/libarchive/archive_string.h +0 -148
- data/ext/libarchive-2.8.4/libarchive/archive_util.c +0 -391
- data/ext/libarchive-2.8.4/libarchive/archive_write.c +0 -466
- data/ext/libarchive-2.8.4/libarchive/archive_write_disk.c +0 -2628
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_bzip2.c +0 -408
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_gzip.c +0 -477
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_none.c +0 -257
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_program.c +0 -347
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_compression_xz.c +0 -438
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_cpio.c +0 -344
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_cpio_newc.c +0 -295
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_mtree.c +0 -1050
- data/ext/libarchive-2.8.4/libarchive/archive_write_set_format_zip.c +0 -667
- data/ext/libarchive-2.8.4/libarchive/config_freebsd.h +0 -154
- data/ext/libarchive-2.8.4/libarchive/filter_fork_windows.c +0 -113
- data/ext/zlib-1.2.5/Makefile.in +0 -257
- data/ext/zlib-1.2.5/configure +0 -596
- data/ext/zlib-1.2.5/crc32.c +0 -442
- data/ext/zlib-1.2.5/crc32.h +0 -441
- data/ext/zlib-1.2.5/example.c +0 -565
- data/ext/zlib-1.2.5/gzread.c +0 -653
- data/ext/zlib-1.2.5/gzwrite.c +0 -531
- data/ext/zlib-1.2.5/minigzip.c +0 -440
- data/ext/zlib-1.2.5/uncompr.c +0 -59
@@ -0,0 +1,4240 @@
|
|
1
|
+
/*-
|
2
|
+
* Copyright (c) 2003-2011 Tim Kientzle
|
3
|
+
* Copyright (c) 2011-2012 Michihiro NAKAJIMA
|
4
|
+
* All rights reserved.
|
5
|
+
*
|
6
|
+
* Redistribution and use in source and binary forms, with or without
|
7
|
+
* modification, are permitted provided that the following conditions
|
8
|
+
* are met:
|
9
|
+
* 1. Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* 2. Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
|
16
|
+
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
17
|
+
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
18
|
+
* IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
|
19
|
+
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
20
|
+
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
21
|
+
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
22
|
+
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
23
|
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
24
|
+
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25
|
+
*/
|
26
|
+
|
27
|
+
#include "archive_platform.h"
|
28
|
+
__FBSDID("$FreeBSD: head/lib/libarchive/archive_string.c 201095 2009-12-28 02:33:22Z kientzle $");
|
29
|
+
|
30
|
+
/*
|
31
|
+
* Basic resizable string support, to simplify manipulating arbitrary-sized
|
32
|
+
* strings while minimizing heap activity.
|
33
|
+
*
|
34
|
+
* In particular, the buffer used by a string object is only grown; it
|
35
|
+
* never shrinks, so you can clear and reuse the same string object
|
36
|
+
* without incurring additional memory allocations.
|
37
|
+
*/
|
38
|
+
|
39
|
+
#ifdef HAVE_ERRNO_H
|
40
|
+
#include <errno.h>
|
41
|
+
#endif
|
42
|
+
#ifdef HAVE_ICONV_H
|
43
|
+
#include <iconv.h>
|
44
|
+
#endif
|
45
|
+
#ifdef HAVE_LANGINFO_H
|
46
|
+
#include <langinfo.h>
|
47
|
+
#endif
|
48
|
+
#ifdef HAVE_LOCALCHARSET_H
|
49
|
+
#include <localcharset.h>
|
50
|
+
#endif
|
51
|
+
#ifdef HAVE_STDLIB_H
|
52
|
+
#include <stdlib.h>
|
53
|
+
#endif
|
54
|
+
#ifdef HAVE_STRING_H
|
55
|
+
#include <string.h>
|
56
|
+
#endif
|
57
|
+
#ifdef HAVE_WCHAR_H
|
58
|
+
#include <wchar.h>
|
59
|
+
#endif
|
60
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
61
|
+
#include <windows.h>
|
62
|
+
#include <locale.h>
|
63
|
+
#endif
|
64
|
+
|
65
|
+
#include "archive_endian.h"
|
66
|
+
#include "archive_private.h"
|
67
|
+
#include "archive_string.h"
|
68
|
+
#include "archive_string_composition.h"
|
69
|
+
|
70
|
+
/*
 * Fallback for builds where neither HAVE_WMEMCPY nor a wmemcpy macro is
 * defined: forward to memcpy() with the element count scaled to bytes.
 */
#if !defined(HAVE_WMEMCPY) && !defined(wmemcpy)
|
71
|
+
#define wmemcpy(a,b,i) (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t))
|
72
|
+
#endif
|
73
|
+
|
74
|
+
/*
 * Same fallback for wmemmove(), forwarding to memmove() so that
 * overlapping source and destination regions remain permitted.
 */
#if !defined(HAVE_WMEMMOVE) && !defined(wmemmove)
|
75
|
+
#define wmemmove(a,b,i) (wchar_t *)memmove((a), (b), (i) * sizeof(wchar_t))
|
76
|
+
#endif
|
77
|
+
|
78
|
+
/* Drop any max() macro already in effect before defining the local one. */
#undef max
|
79
|
+
/* NOTE: function-like macro; each argument may be evaluated twice. */
#define max(a, b) ((a)>(b)?(a):(b))
|
80
|
+
|
81
|
+
/*
 * Describes one string conversion between a "from" and a "to" character
 * set.  Each side is recorded both as a charset name and as an unsigned
 * value (from_cp/to_cp; presumably a code page number -- confirm against
 * make_codepage_from_charset()).  The `next' pointer allows objects to be
 * chained into a singly linked list.
 */
struct archive_string_conv {
|
82
|
+
struct archive_string_conv *next;
|
83
|
+
char *from_charset;
|
84
|
+
char *to_charset;
|
85
|
+
unsigned from_cp;
|
86
|
+
unsigned to_cp;
|
87
|
+
/* Set to 1 if from_charset and to_charset are the same. */
|
88
|
+
int same;
|
89
|
+
/* Presumably a bitwise OR of the SCONV_* values defined below; note that
 * bit 5 (1<<5) is not assigned in this list. */
int flag;
|
90
|
+
#define SCONV_TO_CHARSET 1 /* MBS is being converted to the specified
|
91
|
+
* charset. */
|
92
|
+
#define SCONV_FROM_CHARSET (1<<1) /* MBS is being converted from the
|
93
|
+
* specified charset. */
|
94
|
+
#define SCONV_BEST_EFFORT (1<<2) /* Copy at least the ASCII codes. */
|
95
|
+
#define SCONV_WIN_CP (1<<3) /* Use the Windows API for converting
|
96
|
+
* MBS. */
|
97
|
+
#define SCONV_UTF8_LIBARCHIVE_2 (1<<4) /* Incorrect UTF-8 made by libarchive
|
98
|
+
* 2.x under a wrong assumption. */
|
99
|
+
#define SCONV_NORMALIZATION_C (1<<6) /* Need normalization to Form C
|
100
|
+
* before UTF-8 characters are actually
|
101
|
+
* processed. */
|
102
|
+
#define SCONV_NORMALIZATION_D (1<<7) /* Need normalization to Form D
|
103
|
+
* before UTF-8 characters are actually
|
104
|
+
* processed.
|
105
|
+
* Currently this is only for Mac OS X. */
|
106
|
+
#define SCONV_TO_UTF8 (1<<8) /* "to charset" side is UTF-8. */
|
107
|
+
#define SCONV_FROM_UTF8 (1<<9) /* "from charset" side is UTF-8. */
|
108
|
+
#define SCONV_TO_UTF16BE (1<<10) /* "to charset" side is UTF-16BE. */
|
109
|
+
#define SCONV_FROM_UTF16BE (1<<11) /* "from charset" side is UTF-16BE. */
|
110
|
+
#define SCONV_TO_UTF16LE (1<<12) /* "to charset" side is UTF-16LE. */
|
111
|
+
#define SCONV_FROM_UTF16LE (1<<13) /* "from charset" side is UTF-16LE. */
|
112
|
+
/* Masks matching either byte order of UTF-16 on the given side. */
#define SCONV_TO_UTF16 (SCONV_TO_UTF16BE | SCONV_TO_UTF16LE)
|
113
|
+
#define SCONV_FROM_UTF16 (SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE)
|
114
|
+
|
115
|
+
/* iconv descriptors exist only when the build defines HAVE_ICONV. */
#if HAVE_ICONV
|
116
|
+
iconv_t cd;
|
117
|
+
iconv_t cd_w;/* Used by archive_mstring on
|
118
|
+
* Windows. */
|
119
|
+
#endif
|
120
|
+
/* A temporary buffer for normalization. */
|
121
|
+
struct archive_string utftmp;
|
122
|
+
/* Up to two converter callbacks; nconverter presumably counts how many
 * entries are in use -- TODO confirm against the code that fills them. */
int (*converter[2])(struct archive_string *, const void *, size_t,
|
123
|
+
struct archive_string_conv *);
|
124
|
+
int nconverter;
|
125
|
+
};
|
126
|
+
|
127
|
+
/* Code page constants used by this file. */
#define CP_C_LOCALE 0 /* "C" locale only for this file. */
|
128
|
+
#define CP_UTF16LE 1200
|
129
|
+
#define CP_UTF16BE 1201
|
130
|
+
|
131
|
+
/* Range tests on a UTF-16 code unit / code point value `uc'. */
#define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF)
|
132
|
+
#define IS_LOW_SURROGATE_LA(uc) ((uc) >= 0xDC00 && (uc) <= 0xDFFF)
|
133
|
+
/* Despite the name, this is true for any single value in the surrogate
 * range (high or low), not for a matched pair. */
#define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF)
|
134
|
+
/* Largest code point value accepted as Unicode. */
#define UNICODE_MAX 0x10FFFF
|
135
|
+
#define UNICODE_R_CHAR 0xFFFD /* Replacement character. */
|
136
|
+
/* U+FFFD (replacement character) encoded in UTF-8; exactly three bytes,
 * with no terminating NUL in the array. */
|
137
|
+
static const char utf8_replacement_char[] = {0xef, 0xbf, 0xbd};
|
138
|
+
|
139
|
+
static struct archive_string_conv *find_sconv_object(struct archive *,
|
140
|
+
const char *, const char *);
|
141
|
+
static void add_sconv_object(struct archive *, struct archive_string_conv *);
|
142
|
+
static struct archive_string_conv *create_sconv_object(const char *,
|
143
|
+
const char *, unsigned, int);
|
144
|
+
static void free_sconv_object(struct archive_string_conv *);
|
145
|
+
static struct archive_string_conv *get_sconv_object(struct archive *,
|
146
|
+
const char *, const char *, int);
|
147
|
+
static unsigned make_codepage_from_charset(const char *);
|
148
|
+
static unsigned get_current_codepage(void);
|
149
|
+
static unsigned get_current_oemcp(void);
|
150
|
+
static size_t mbsnbytes(const void *, size_t);
|
151
|
+
static size_t utf16nbytes(const void *, size_t);
|
152
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
153
|
+
static int archive_wstring_append_from_mbs_in_codepage(
|
154
|
+
struct archive_wstring *, const char *, size_t,
|
155
|
+
struct archive_string_conv *);
|
156
|
+
static int archive_string_append_from_wcs_in_codepage(struct archive_string *,
|
157
|
+
const wchar_t *, size_t, struct archive_string_conv *);
|
158
|
+
static int is_big_endian(void);
|
159
|
+
static int strncat_in_codepage(struct archive_string *, const void *,
|
160
|
+
size_t, struct archive_string_conv *);
|
161
|
+
static int win_strncat_from_utf16be(struct archive_string *, const void *,
|
162
|
+
size_t, struct archive_string_conv *);
|
163
|
+
static int win_strncat_from_utf16le(struct archive_string *, const void *,
|
164
|
+
size_t, struct archive_string_conv *);
|
165
|
+
static int win_strncat_to_utf16be(struct archive_string *, const void *,
|
166
|
+
size_t, struct archive_string_conv *);
|
167
|
+
static int win_strncat_to_utf16le(struct archive_string *, const void *,
|
168
|
+
size_t, struct archive_string_conv *);
|
169
|
+
#endif
|
170
|
+
static int best_effort_strncat_from_utf16be(struct archive_string *,
|
171
|
+
const void *, size_t, struct archive_string_conv *);
|
172
|
+
static int best_effort_strncat_from_utf16le(struct archive_string *,
|
173
|
+
const void *, size_t, struct archive_string_conv *);
|
174
|
+
static int best_effort_strncat_to_utf16be(struct archive_string *,
|
175
|
+
const void *, size_t, struct archive_string_conv *);
|
176
|
+
static int best_effort_strncat_to_utf16le(struct archive_string *,
|
177
|
+
const void *, size_t, struct archive_string_conv *);
|
178
|
+
#if defined(HAVE_ICONV)
|
179
|
+
static int iconv_strncat_in_locale(struct archive_string *, const void *,
|
180
|
+
size_t, struct archive_string_conv *);
|
181
|
+
#endif
|
182
|
+
static int best_effort_strncat_in_locale(struct archive_string *,
|
183
|
+
const void *, size_t, struct archive_string_conv *);
|
184
|
+
static int _utf8_to_unicode(uint32_t *, const char *, size_t);
|
185
|
+
static int utf8_to_unicode(uint32_t *, const char *, size_t);
|
186
|
+
static inline uint32_t combine_surrogate_pair(uint32_t, uint32_t);
|
187
|
+
static int cesu8_to_unicode(uint32_t *, const char *, size_t);
|
188
|
+
static size_t unicode_to_utf8(char *, size_t, uint32_t);
|
189
|
+
static int utf16_to_unicode(uint32_t *, const char *, size_t, int);
|
190
|
+
static size_t unicode_to_utf16be(char *, size_t, uint32_t);
|
191
|
+
static size_t unicode_to_utf16le(char *, size_t, uint32_t);
|
192
|
+
static int strncat_from_utf8_libarchive2(struct archive_string *,
|
193
|
+
const void *, size_t, struct archive_string_conv *);
|
194
|
+
static int strncat_from_utf8_to_utf8(struct archive_string *, const void *,
|
195
|
+
size_t, struct archive_string_conv *);
|
196
|
+
static int archive_string_normalize_C(struct archive_string *, const void *,
|
197
|
+
size_t, struct archive_string_conv *);
|
198
|
+
static int archive_string_normalize_D(struct archive_string *, const void *,
|
199
|
+
size_t, struct archive_string_conv *);
|
200
|
+
static int archive_string_append_unicode(struct archive_string *,
|
201
|
+
const void *, size_t, struct archive_string_conv *);
|
202
|
+
|
203
|
+
static struct archive_string *
|
204
|
+
archive_string_append(struct archive_string *as, const char *p, size_t s)
|
205
|
+
{
|
206
|
+
if (archive_string_ensure(as, as->length + s + 1) == NULL)
|
207
|
+
return (NULL);
|
208
|
+
if (s)
|
209
|
+
memmove(as->s + as->length, p, s);
|
210
|
+
as->length += s;
|
211
|
+
as->s[as->length] = 0;
|
212
|
+
return (as);
|
213
|
+
}
|
214
|
+
|
215
|
+
static struct archive_wstring *
|
216
|
+
archive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s)
|
217
|
+
{
|
218
|
+
if (archive_wstring_ensure(as, as->length + s + 1) == NULL)
|
219
|
+
return (NULL);
|
220
|
+
if (s)
|
221
|
+
wmemmove(as->s + as->length, p, s);
|
222
|
+
as->length += s;
|
223
|
+
as->s[as->length] = 0;
|
224
|
+
return (as);
|
225
|
+
}
|
226
|
+
|
227
|
+
/*
 * Externally visible entry point that forwards to the file-local
 * archive_string_append(); returns whatever that call returns
 * (NULL when the buffer could not be grown).
 */
struct archive_string *
archive_array_append(struct archive_string *as, const char *p, size_t s)
{
	struct archive_string *appended;

	appended = archive_string_append(as, p, s);
	return (appended);
}
|
232
|
+
|
233
|
+
void
|
234
|
+
archive_string_concat(struct archive_string *dest, struct archive_string *src)
|
235
|
+
{
|
236
|
+
if (archive_string_append(dest, src->s, src->length) == NULL)
|
237
|
+
__archive_errx(1, "Out of memory");
|
238
|
+
}
|
239
|
+
|
240
|
+
void
|
241
|
+
archive_wstring_concat(struct archive_wstring *dest,
|
242
|
+
struct archive_wstring *src)
|
243
|
+
{
|
244
|
+
if (archive_wstring_append(dest, src->s, src->length) == NULL)
|
245
|
+
__archive_errx(1, "Out of memory");
|
246
|
+
}
|
247
|
+
|
248
|
+
void
|
249
|
+
archive_string_free(struct archive_string *as)
|
250
|
+
{
|
251
|
+
as->length = 0;
|
252
|
+
as->buffer_length = 0;
|
253
|
+
free(as->s);
|
254
|
+
as->s = NULL;
|
255
|
+
}
|
256
|
+
|
257
|
+
void
|
258
|
+
archive_wstring_free(struct archive_wstring *as)
|
259
|
+
{
|
260
|
+
as->length = 0;
|
261
|
+
as->buffer_length = 0;
|
262
|
+
free(as->s);
|
263
|
+
as->s = NULL;
|
264
|
+
}
|
265
|
+
|
266
|
+
/*
 * Make sure `as` can hold at least `s` wide characters.
 *
 * The request is converted to a byte count and handed to
 * archive_string_ensure(), so the buffer_length of a wide string is
 * tracked in bytes.  Returns `as` on success.  On failure the string
 * is wiped, errno is set to ENOMEM and NULL is returned, matching the
 * failure behavior of archive_string_ensure().
 */
struct archive_wstring *
archive_wstring_ensure(struct archive_wstring *as, size_t s)
{
	/*
	 * Refuse requests whose byte size does not fit in size_t.
	 * Without this check the multiplication below would wrap and
	 * the caller would receive a buffer far smaller than requested.
	 */
	if (s > (size_t)-1 / sizeof(wchar_t)) {
		archive_wstring_free(as);
		errno = ENOMEM;
		return (NULL);
	}
	return (struct archive_wstring *)
	    archive_string_ensure((struct archive_string *)as,
		s * sizeof(wchar_t));
}
|
273
|
+
|
274
|
+
/* Returns NULL on any allocation failure. */
|
275
|
+
struct archive_string *
|
276
|
+
archive_string_ensure(struct archive_string *as, size_t s)
|
277
|
+
{
|
278
|
+
char *p;
|
279
|
+
size_t new_length;
|
280
|
+
|
281
|
+
/* If buffer is already big enough, don't reallocate. */
|
282
|
+
if (as->s && (s <= as->buffer_length))
|
283
|
+
return (as);
|
284
|
+
|
285
|
+
/*
|
286
|
+
* Growing the buffer at least exponentially ensures that
|
287
|
+
* append operations are always linear in the number of
|
288
|
+
* characters appended. Using a smaller growth rate for
|
289
|
+
* larger buffers reduces memory waste somewhat at the cost of
|
290
|
+
* a larger constant factor.
|
291
|
+
*/
|
292
|
+
if (as->buffer_length < 32)
|
293
|
+
/* Start with a minimum 32-character buffer. */
|
294
|
+
new_length = 32;
|
295
|
+
else if (as->buffer_length < 8192)
|
296
|
+
/* Buffers under 8k are doubled for speed. */
|
297
|
+
new_length = as->buffer_length + as->buffer_length;
|
298
|
+
else {
|
299
|
+
/* Buffers 8k and over grow by at least 25% each time. */
|
300
|
+
new_length = as->buffer_length + as->buffer_length / 4;
|
301
|
+
/* Be safe: If size wraps, fail. */
|
302
|
+
if (new_length < as->buffer_length) {
|
303
|
+
/* On failure, wipe the string and return NULL. */
|
304
|
+
archive_string_free(as);
|
305
|
+
errno = ENOMEM;/* Make sure errno has ENOMEM. */
|
306
|
+
return (NULL);
|
307
|
+
}
|
308
|
+
}
|
309
|
+
/*
|
310
|
+
* The computation above is a lower limit to how much we'll
|
311
|
+
* grow the buffer. In any case, we have to grow it enough to
|
312
|
+
* hold the request.
|
313
|
+
*/
|
314
|
+
if (new_length < s)
|
315
|
+
new_length = s;
|
316
|
+
/* Now we can reallocate the buffer. */
|
317
|
+
p = (char *)realloc(as->s, new_length);
|
318
|
+
if (p == NULL) {
|
319
|
+
/* On failure, wipe the string and return NULL. */
|
320
|
+
archive_string_free(as);
|
321
|
+
errno = ENOMEM;/* Make sure errno has ENOMEM. */
|
322
|
+
return (NULL);
|
323
|
+
}
|
324
|
+
|
325
|
+
as->s = p;
|
326
|
+
as->buffer_length = new_length;
|
327
|
+
return (as);
|
328
|
+
}
|
329
|
+
|
330
|
+
/*
|
331
|
+
* TODO: See if there's a way to avoid scanning
|
332
|
+
* the source string twice. Then test to see
|
333
|
+
* if it actually helps (remember that we're almost
|
334
|
+
* always called with pretty short arguments, so
|
335
|
+
* such an optimization might not help).
|
336
|
+
*/
|
337
|
+
/*
 * Append at most `n` bytes of the string at `_p` to `as`, stopping
 * early at the first NUL byte.  Returns `as`; an allocation failure
 * is reported through __archive_errx().
 */
struct archive_string *
archive_strncat(struct archive_string *as, const void *_p, size_t n)
{
	const char *src = (const char *)_p;
	size_t used;

	/* Bounded strlen: the length test comes first, so src[n] is
	 * never read. */
	for (used = 0; used < n && src[used] != '\0'; used++)
		;

	as = archive_string_append(as, src, used);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
|
356
|
+
|
357
|
+
/*
 * Append at most `n` wide characters of the string at `p` to `as`,
 * stopping early at the first terminating zero.  Returns `as`; an
 * allocation failure is reported through __archive_errx().
 */
struct archive_wstring *
archive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n)
{
	size_t used;

	/* Bounded wcslen: the length test comes first, so p[n] is
	 * never read. */
	for (used = 0; used < n && p[used] != 0; used++)
		;

	as = archive_wstring_append(as, p, used);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
|
374
|
+
|
375
|
+
/*
 * Append the NUL-terminated string at `p` to `as`.
 *
 * Implemented as archive_strncat() with a 16MB (0x1000000) bound, on
 * the expectation that no caller passes a longer source; a longer
 * source would be cut at that bound.
 * TODO: Review all uses of strcat in the source and try to replace
 * them with strncat().
 */
struct archive_string *
archive_strcat(struct archive_string *as, const void *p)
{
	const size_t bound = 0x1000000;

	return (archive_strncat(as, p, bound));
}
|
386
|
+
|
387
|
+
/*
 * Append the zero-terminated wide string at `p` to `as`.
 * Implemented as archive_wstrncat() with a bound of 0x1000000 wide
 * characters; a longer source would be cut at that bound.
 */
struct archive_wstring *
archive_wstrcat(struct archive_wstring *as, const wchar_t *p)
{
	const size_t bound = 0x1000000;

	return (archive_wstrncat(as, p, bound));
}
|
393
|
+
|
394
|
+
/*
 * Append the single byte `c` to `as`.  Returns `as`; an allocation
 * failure is reported through __archive_errx().
 */
struct archive_string *
archive_strappend_char(struct archive_string *as, char c)
{
	as = archive_string_append(as, &c, 1);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
|
401
|
+
|
402
|
+
/*
 * Append the single wide character `c` to `as`.  Returns `as`; an
 * allocation failure is reported through __archive_errx().
 */
struct archive_wstring *
archive_wstrappend_wchar(struct archive_wstring *as, wchar_t c)
{
	as = archive_wstring_append(as, &c, 1);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
|
409
|
+
|
410
|
+
/*
|
411
|
+
* Get the "current character set" name to use with iconv.
|
412
|
+
* On FreeBSD, the empty character set name "" chooses
|
413
|
+
* the correct character encoding for the current locale,
|
414
|
+
* so this isn't necessary.
|
415
|
+
* But iconv on Mac OS 10.6 doesn't seem to handle this correctly;
|
416
|
+
* on that system, we have to explicitly call nl_langinfo()
|
417
|
+
* to get the right name. Not sure about other platforms.
|
418
|
+
*
|
419
|
+
* NOTE: GNU libiconv does not recognize the character-set name
|
420
|
+
* which some platform nl_langinfo(CODESET) returns, so we should
|
421
|
+
* use locale_charset() instead of nl_langinfo(CODESET) for GNU libiconv.
|
422
|
+
*/
|
423
|
+
/*
 * Return the charset name to use: `charset` itself when it is a
 * non-empty string, otherwise the platform's idea of the current
 * charset (locale_charset(), nl_langinfo(CODESET), or "" depending on
 * what the build provides).
 */
static const char *
default_iconv_charset(const char *charset)
{
	const int have_name = (charset != NULL && *charset != '\0');

	if (!have_name) {
#if HAVE_LOCALE_CHARSET && !defined(__APPLE__)
		/* locale_charset() is broken on Mac OS */
		return (locale_charset());
#elif HAVE_NL_LANGINFO
		return (nl_langinfo(CODESET));
#else
		return ("");
#endif
	}
	return (charset);
}
|
436
|
+
|
437
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
438
|
+
|
439
|
+
/*
 * Convert MBS to WCS and append the result to `dest`.
 * Note: returns -1 if conversion fails.
 *
 * Windows variant: delegates to the code-page aware helper with no
 * conversion object, which makes the helper use the current code page.
 */
int
archive_wstring_append_from_mbs(struct archive_wstring *dest,
    const char *p, size_t len)
{
	struct archive_string_conv *no_conversion = NULL;

	return (archive_wstring_append_from_mbs_in_codepage(dest, p, len,
	    no_conversion));
}
|
449
|
+
|
450
|
+
static int
|
451
|
+
archive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest,
|
452
|
+
const char *s, size_t length, struct archive_string_conv *sc)
|
453
|
+
{
|
454
|
+
int count, ret = 0;
|
455
|
+
UINT from_cp;
|
456
|
+
|
457
|
+
if (sc != NULL)
|
458
|
+
from_cp = sc->from_cp;
|
459
|
+
else
|
460
|
+
from_cp = get_current_codepage();
|
461
|
+
|
462
|
+
if (from_cp == CP_C_LOCALE) {
|
463
|
+
/*
|
464
|
+
* "C" locale special processing.
|
465
|
+
*/
|
466
|
+
wchar_t *ws;
|
467
|
+
const unsigned char *mp;
|
468
|
+
|
469
|
+
if (NULL == archive_wstring_ensure(dest,
|
470
|
+
dest->length + length + 1))
|
471
|
+
return (-1);
|
472
|
+
|
473
|
+
ws = dest->s + dest->length;
|
474
|
+
mp = (const unsigned char *)s;
|
475
|
+
count = 0;
|
476
|
+
while (count < (int)length && *mp) {
|
477
|
+
*ws++ = (wchar_t)*mp++;
|
478
|
+
count++;
|
479
|
+
}
|
480
|
+
} else if (sc != NULL &&
|
481
|
+
(sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) {
|
482
|
+
/*
|
483
|
+
* Normalize UTF-8 and UTF-16BE and convert it directly
|
484
|
+
* to UTF-16 as wchar_t.
|
485
|
+
*/
|
486
|
+
struct archive_string u16;
|
487
|
+
int saved_flag = sc->flag;/* save current flag. */
|
488
|
+
|
489
|
+
if (is_big_endian())
|
490
|
+
sc->flag |= SCONV_TO_UTF16BE;
|
491
|
+
else
|
492
|
+
sc->flag |= SCONV_TO_UTF16LE;
|
493
|
+
|
494
|
+
if (sc->flag & SCONV_FROM_UTF16) {
|
495
|
+
/*
|
496
|
+
* UTF-16BE/LE NFD ===> UTF-16 NFC
|
497
|
+
* UTF-16BE/LE NFC ===> UTF-16 NFD
|
498
|
+
*/
|
499
|
+
count = (int)utf16nbytes(s, length);
|
500
|
+
} else {
|
501
|
+
/*
|
502
|
+
* UTF-8 NFD ===> UTF-16 NFC
|
503
|
+
* UTF-8 NFC ===> UTF-16 NFD
|
504
|
+
*/
|
505
|
+
count = (int)mbsnbytes(s, length);
|
506
|
+
}
|
507
|
+
u16.s = (char *)dest->s;
|
508
|
+
u16.length = dest->length << 1;;
|
509
|
+
u16.buffer_length = dest->buffer_length;
|
510
|
+
if (sc->flag & SCONV_NORMALIZATION_C)
|
511
|
+
ret = archive_string_normalize_C(&u16, s, count, sc);
|
512
|
+
else
|
513
|
+
ret = archive_string_normalize_D(&u16, s, count, sc);
|
514
|
+
dest->s = (wchar_t *)u16.s;
|
515
|
+
dest->length = u16.length >> 1;
|
516
|
+
dest->buffer_length = u16.buffer_length;
|
517
|
+
sc->flag = saved_flag;/* restore the saved flag. */
|
518
|
+
return (ret);
|
519
|
+
} else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) {
|
520
|
+
count = (int)utf16nbytes(s, length);
|
521
|
+
count >>= 1; /* to be WCS length */
|
522
|
+
/* Allocate memory for WCS. */
|
523
|
+
if (NULL == archive_wstring_ensure(dest,
|
524
|
+
dest->length + count + 1))
|
525
|
+
return (-1);
|
526
|
+
wmemcpy(dest->s + dest->length, (const wchar_t *)s, count);
|
527
|
+
if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) {
|
528
|
+
uint16_t *u16 = (uint16_t *)(dest->s + dest->length);
|
529
|
+
int b;
|
530
|
+
for (b = 0; b < count; b++) {
|
531
|
+
uint16_t val = archive_le16dec(u16+b);
|
532
|
+
archive_be16enc(u16+b, val);
|
533
|
+
}
|
534
|
+
} else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) {
|
535
|
+
uint16_t *u16 = (uint16_t *)(dest->s + dest->length);
|
536
|
+
int b;
|
537
|
+
for (b = 0; b < count; b++) {
|
538
|
+
uint16_t val = archive_be16dec(u16+b);
|
539
|
+
archive_le16enc(u16+b, val);
|
540
|
+
}
|
541
|
+
}
|
542
|
+
} else {
|
543
|
+
DWORD mbflag;
|
544
|
+
size_t buffsize;
|
545
|
+
|
546
|
+
if (sc == NULL)
|
547
|
+
mbflag = 0;
|
548
|
+
else if (sc->flag & SCONV_FROM_CHARSET) {
|
549
|
+
/* Do not trust the length which comes from
|
550
|
+
* an archive file. */
|
551
|
+
length = mbsnbytes(s, length);
|
552
|
+
mbflag = 0;
|
553
|
+
} else
|
554
|
+
mbflag = MB_PRECOMPOSED;
|
555
|
+
|
556
|
+
buffsize = dest->length + length + 1;
|
557
|
+
do {
|
558
|
+
/* Allocate memory for WCS. */
|
559
|
+
if (NULL == archive_wstring_ensure(dest, buffsize))
|
560
|
+
return (-1);
|
561
|
+
/* Convert MBS to WCS. */
|
562
|
+
count = MultiByteToWideChar(from_cp,
|
563
|
+
mbflag, s, (int)length, dest->s + dest->length,
|
564
|
+
(int)(dest->buffer_length >> 1) -1);
|
565
|
+
if (count == 0 &&
|
566
|
+
GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
|
567
|
+
/* Expand the WCS buffer. */
|
568
|
+
buffsize = dest->buffer_length << 1;
|
569
|
+
continue;
|
570
|
+
}
|
571
|
+
if (count == 0 && length != 0)
|
572
|
+
ret = -1;
|
573
|
+
break;
|
574
|
+
} while (1);
|
575
|
+
}
|
576
|
+
dest->length += count;
|
577
|
+
dest->s[dest->length] = L'\0';
|
578
|
+
return (ret);
|
579
|
+
}
|
580
|
+
|
581
|
+
#else
|
582
|
+
|
583
|
+
/*
|
584
|
+
* Convert MBS to WCS.
|
585
|
+
* Note: returns -1 if conversion fails.
|
586
|
+
*/
|
587
|
+
int
|
588
|
+
archive_wstring_append_from_mbs(struct archive_wstring *dest,
|
589
|
+
const char *p, size_t len)
|
590
|
+
{
|
591
|
+
size_t r;
|
592
|
+
int ret_val = 0;
|
593
|
+
/*
|
594
|
+
* No single byte will be more than one wide character,
|
595
|
+
* so this length estimate will always be big enough.
|
596
|
+
*/
|
597
|
+
// size_t wcs_length = len;
|
598
|
+
size_t mbs_length = len;
|
599
|
+
const char *mbs = p;
|
600
|
+
wchar_t *wcs;
|
601
|
+
#if HAVE_MBRTOWC
|
602
|
+
mbstate_t shift_state;
|
603
|
+
|
604
|
+
memset(&shift_state, 0, sizeof(shift_state));
|
605
|
+
#endif
|
606
|
+
/*
|
607
|
+
* As we decided to have wcs_length == mbs_length == len
|
608
|
+
* we can use len here instead of wcs_length
|
609
|
+
*/
|
610
|
+
if (NULL == archive_wstring_ensure(dest, dest->length + len + 1))
|
611
|
+
return (-1);
|
612
|
+
wcs = dest->s + dest->length;
|
613
|
+
/*
|
614
|
+
* We cannot use mbsrtowcs/mbstowcs here because those may convert
|
615
|
+
* extra MBS when strlen(p) > len and one wide character consists of
|
616
|
+
* multi bytes.
|
617
|
+
*/
|
618
|
+
while (*mbs && mbs_length > 0) {
|
619
|
+
/*
|
620
|
+
* The buffer we allocated is always big enough.
|
621
|
+
* Keep this code path in a comment if we decide to choose
|
622
|
+
* smaller wcs_length in the future
|
623
|
+
*/
|
624
|
+
/*
|
625
|
+
if (wcs_length == 0) {
|
626
|
+
dest->length = wcs - dest->s;
|
627
|
+
dest->s[dest->length] = L'\0';
|
628
|
+
wcs_length = mbs_length;
|
629
|
+
if (NULL == archive_wstring_ensure(dest,
|
630
|
+
dest->length + wcs_length + 1))
|
631
|
+
return (-1);
|
632
|
+
wcs = dest->s + dest->length;
|
633
|
+
}
|
634
|
+
*/
|
635
|
+
#if HAVE_MBRTOWC
|
636
|
+
r = mbrtowc(wcs, mbs, mbs_length, &shift_state);
|
637
|
+
#else
|
638
|
+
r = mbtowc(wcs, mbs, mbs_length);
|
639
|
+
#endif
|
640
|
+
if (r == (size_t)-1 || r == (size_t)-2) {
|
641
|
+
ret_val = -1;
|
642
|
+
break;
|
643
|
+
}
|
644
|
+
if (r == 0 || r > mbs_length)
|
645
|
+
break;
|
646
|
+
wcs++;
|
647
|
+
// wcs_length--;
|
648
|
+
mbs += r;
|
649
|
+
mbs_length -= r;
|
650
|
+
}
|
651
|
+
dest->length = wcs - dest->s;
|
652
|
+
dest->s[dest->length] = L'\0';
|
653
|
+
return (ret_val);
|
654
|
+
}
|
655
|
+
|
656
|
+
#endif
|
657
|
+
|
658
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
659
|
+
|
660
|
+
/*
|
661
|
+
* WCS ==> MBS.
|
662
|
+
* Note: returns -1 if conversion fails.
|
663
|
+
*
|
664
|
+
* Win32 builds use WideCharToMultiByte from the Windows API.
|
665
|
+
* (Maybe Cygwin should too? WideCharToMultiByte will know a
|
666
|
+
* lot more about local character encodings than the wcrtomb()
|
667
|
+
* wrapper is going to know.)
|
668
|
+
*/
|
669
|
+
/*
 * Windows variant: delegates to the code-page aware helper with no
 * conversion object, which makes the helper use the current code page.
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	struct archive_string_conv *no_conversion = NULL;

	return (archive_string_append_from_wcs_in_codepage(as, w, len,
	    no_conversion));
}
|
675
|
+
|
676
|
+
static int
|
677
|
+
archive_string_append_from_wcs_in_codepage(struct archive_string *as,
|
678
|
+
const wchar_t *ws, size_t len, struct archive_string_conv *sc)
|
679
|
+
{
|
680
|
+
BOOL defchar_used, *dp;
|
681
|
+
int count, ret = 0;
|
682
|
+
UINT to_cp;
|
683
|
+
int wslen = (int)len;
|
684
|
+
|
685
|
+
if (sc != NULL)
|
686
|
+
to_cp = sc->to_cp;
|
687
|
+
else
|
688
|
+
to_cp = get_current_codepage();
|
689
|
+
|
690
|
+
if (to_cp == CP_C_LOCALE) {
|
691
|
+
/*
|
692
|
+
* "C" locale special processing.
|
693
|
+
*/
|
694
|
+
const wchar_t *wp = ws;
|
695
|
+
char *p;
|
696
|
+
|
697
|
+
if (NULL == archive_string_ensure(as,
|
698
|
+
as->length + wslen +1))
|
699
|
+
return (-1);
|
700
|
+
p = as->s + as->length;
|
701
|
+
count = 0;
|
702
|
+
defchar_used = 0;
|
703
|
+
while (count < wslen && *wp) {
|
704
|
+
if (*wp > 255) {
|
705
|
+
*p++ = '?';
|
706
|
+
wp++;
|
707
|
+
defchar_used = 1;
|
708
|
+
} else
|
709
|
+
*p++ = (char)*wp++;
|
710
|
+
count++;
|
711
|
+
}
|
712
|
+
} else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) {
|
713
|
+
uint16_t *u16;
|
714
|
+
|
715
|
+
if (NULL ==
|
716
|
+
archive_string_ensure(as, as->length + len * 2 + 2))
|
717
|
+
return (-1);
|
718
|
+
u16 = (uint16_t *)(as->s + as->length);
|
719
|
+
count = 0;
|
720
|
+
defchar_used = 0;
|
721
|
+
if (sc->flag & SCONV_TO_UTF16BE) {
|
722
|
+
while (count < (int)len && *ws) {
|
723
|
+
archive_be16enc(u16+count, *ws);
|
724
|
+
ws++;
|
725
|
+
count++;
|
726
|
+
}
|
727
|
+
} else {
|
728
|
+
while (count < (int)len && *ws) {
|
729
|
+
archive_le16enc(u16+count, *ws);
|
730
|
+
ws++;
|
731
|
+
count++;
|
732
|
+
}
|
733
|
+
}
|
734
|
+
count <<= 1; /* to be byte size */
|
735
|
+
} else {
|
736
|
+
/* Make sure the MBS buffer has plenty to set. */
|
737
|
+
if (NULL ==
|
738
|
+
archive_string_ensure(as, as->length + len * 2 + 1))
|
739
|
+
return (-1);
|
740
|
+
do {
|
741
|
+
defchar_used = 0;
|
742
|
+
if (to_cp == CP_UTF8 || sc == NULL)
|
743
|
+
dp = NULL;
|
744
|
+
else
|
745
|
+
dp = &defchar_used;
|
746
|
+
count = WideCharToMultiByte(to_cp, 0, ws, wslen,
|
747
|
+
as->s + as->length,
|
748
|
+
(int)as->buffer_length - (int)as->length - 1, NULL, dp);
|
749
|
+
if (count == 0 &&
|
750
|
+
GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
|
751
|
+
/* Expand the MBS buffer and retry. */
|
752
|
+
if (NULL == archive_string_ensure(as,
|
753
|
+
as->buffer_length + len))
|
754
|
+
return (-1);
|
755
|
+
continue;
|
756
|
+
}
|
757
|
+
if (count == 0)
|
758
|
+
ret = -1;
|
759
|
+
break;
|
760
|
+
} while (1);
|
761
|
+
}
|
762
|
+
as->length += count;
|
763
|
+
as->s[as->length] = '\0';
|
764
|
+
return (defchar_used?-1:ret);
|
765
|
+
}
|
766
|
+
|
767
|
+
#elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB)
|
768
|
+
|
769
|
+
/*
|
770
|
+
* Translates a wide character string into current locale character set
|
771
|
+
* and appends to the archive_string. Note: returns -1 if conversion
|
772
|
+
* fails.
|
773
|
+
*/
|
774
|
+
int
|
775
|
+
archive_string_append_from_wcs(struct archive_string *as,
|
776
|
+
const wchar_t *w, size_t len)
|
777
|
+
{
|
778
|
+
/* We cannot use the standard wcstombs() here because it
|
779
|
+
* cannot tell us how big the output buffer should be. So
|
780
|
+
* I've built a loop around wcrtomb() or wctomb() that
|
781
|
+
* converts a character at a time and resizes the string as
|
782
|
+
* needed. We prefer wcrtomb() when it's available because
|
783
|
+
* it's thread-safe. */
|
784
|
+
int n, ret_val = 0;
|
785
|
+
char *p;
|
786
|
+
char *end;
|
787
|
+
#if HAVE_WCRTOMB
|
788
|
+
mbstate_t shift_state;
|
789
|
+
|
790
|
+
memset(&shift_state, 0, sizeof(shift_state));
|
791
|
+
#else
|
792
|
+
/* Clear the shift state before starting. */
|
793
|
+
wctomb(NULL, L'\0');
|
794
|
+
#endif
|
795
|
+
/*
|
796
|
+
* Allocate buffer for MBS.
|
797
|
+
* We need this allocation here since it is possible that
|
798
|
+
* as->s is still NULL.
|
799
|
+
*/
|
800
|
+
if (archive_string_ensure(as, as->length + len + 1) == NULL)
|
801
|
+
return (-1);
|
802
|
+
|
803
|
+
p = as->s + as->length;
|
804
|
+
end = as->s + as->buffer_length - MB_CUR_MAX -1;
|
805
|
+
while (*w != L'\0' && len > 0) {
|
806
|
+
if (p >= end) {
|
807
|
+
as->length = p - as->s;
|
808
|
+
as->s[as->length] = '\0';
|
809
|
+
/* Re-allocate buffer for MBS. */
|
810
|
+
if (archive_string_ensure(as,
|
811
|
+
as->length + max(len * 2,
|
812
|
+
(size_t)MB_CUR_MAX) + 1) == NULL)
|
813
|
+
return (-1);
|
814
|
+
p = as->s + as->length;
|
815
|
+
end = as->s + as->buffer_length - MB_CUR_MAX -1;
|
816
|
+
}
|
817
|
+
#if HAVE_WCRTOMB
|
818
|
+
n = wcrtomb(p, *w++, &shift_state);
|
819
|
+
#else
|
820
|
+
n = wctomb(p, *w++);
|
821
|
+
#endif
|
822
|
+
if (n == -1) {
|
823
|
+
if (errno == EILSEQ) {
|
824
|
+
/* Skip an illegal wide char. */
|
825
|
+
*p++ = '?';
|
826
|
+
ret_val = -1;
|
827
|
+
} else {
|
828
|
+
ret_val = -1;
|
829
|
+
break;
|
830
|
+
}
|
831
|
+
} else
|
832
|
+
p += n;
|
833
|
+
len--;
|
834
|
+
}
|
835
|
+
as->length = p - as->s;
|
836
|
+
as->s[as->length] = '\0';
|
837
|
+
return (ret_val);
|
838
|
+
}
|
839
|
+
|
840
|
+
#else /* HAVE_WCTOMB || HAVE_WCRTOMB */
|
841
|
+
|
842
|
+
/*
|
843
|
+
* TODO: Test if __STDC_ISO_10646__ is defined.
|
844
|
+
* Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion
|
845
|
+
* one character at a time. If a non-Windows platform doesn't have
|
846
|
+
* either of these, fall back to the built-in UTF8 conversion.
|
847
|
+
*/
|
848
|
+
/*
 * Stub used when neither wctomb() nor wcrtomb() is available: no
 * conversion is attempted.  Always sets errno to ENOSYS and returns -1.
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	errno = ENOSYS;

	/* All parameters are intentionally unused. */
	(void)len;
	(void)w;
	(void)as;
	return (-1);
}
|
858
|
+
|
859
|
+
#endif /* HAVE_WCTOMB || HAVE_WCRTOMB */
|
860
|
+
|
861
|
+
/*
|
862
|
+
* Find a string conversion object by a pair of 'from' charset name
|
863
|
+
* and 'to' charset name from an archive object.
|
864
|
+
* Return NULL if not found.
|
865
|
+
*/
|
866
|
+
static struct archive_string_conv *
|
867
|
+
find_sconv_object(struct archive *a, const char *fc, const char *tc)
|
868
|
+
{
|
869
|
+
struct archive_string_conv *sc;
|
870
|
+
|
871
|
+
if (a == NULL)
|
872
|
+
return (NULL);
|
873
|
+
|
874
|
+
for (sc = a->sconv; sc != NULL; sc = sc->next) {
|
875
|
+
if (strcmp(sc->from_charset, fc) == 0 &&
|
876
|
+
strcmp(sc->to_charset, tc) == 0)
|
877
|
+
break;
|
878
|
+
}
|
879
|
+
return (sc);
|
880
|
+
}
|
881
|
+
|
882
|
+
/*
|
883
|
+
* Register a string object to an archive object.
|
884
|
+
*/
|
885
|
+
static void
|
886
|
+
add_sconv_object(struct archive *a, struct archive_string_conv *sc)
|
887
|
+
{
|
888
|
+
struct archive_string_conv **psc;
|
889
|
+
|
890
|
+
/* Add a new sconv to sconv list. */
|
891
|
+
psc = &(a->sconv);
|
892
|
+
while (*psc != NULL)
|
893
|
+
psc = &((*psc)->next);
|
894
|
+
*psc = sc;
|
895
|
+
}
|
896
|
+
|
897
|
+
static void
|
898
|
+
add_converter(struct archive_string_conv *sc, int (*converter)
|
899
|
+
(struct archive_string *, const void *, size_t,
|
900
|
+
struct archive_string_conv *))
|
901
|
+
{
|
902
|
+
if (sc == NULL || sc->nconverter >= 2)
|
903
|
+
__archive_errx(1, "Programming error");
|
904
|
+
sc->converter[sc->nconverter++] = converter;
|
905
|
+
}
|
906
|
+
|
907
|
+
static void
|
908
|
+
setup_converter(struct archive_string_conv *sc)
|
909
|
+
{
|
910
|
+
|
911
|
+
/* Reset. */
|
912
|
+
sc->nconverter = 0;
|
913
|
+
|
914
|
+
/*
|
915
|
+
* Perform special sequence for the incorrect UTF-8 filenames
|
916
|
+
* made by libarchive2.x.
|
917
|
+
*/
|
918
|
+
if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) {
|
919
|
+
add_converter(sc, strncat_from_utf8_libarchive2);
|
920
|
+
return;
|
921
|
+
}
|
922
|
+
|
923
|
+
/*
|
924
|
+
* Convert a string to UTF-16BE/LE.
|
925
|
+
*/
|
926
|
+
if (sc->flag & SCONV_TO_UTF16) {
|
927
|
+
/*
|
928
|
+
* If the current locale is UTF-8, we can translate
|
929
|
+
* a UTF-8 string into a UTF-16BE string.
|
930
|
+
*/
|
931
|
+
if (sc->flag & SCONV_FROM_UTF8) {
|
932
|
+
add_converter(sc, archive_string_append_unicode);
|
933
|
+
return;
|
934
|
+
}
|
935
|
+
|
936
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
937
|
+
if (sc->flag & SCONV_WIN_CP) {
|
938
|
+
if (sc->flag & SCONV_TO_UTF16BE)
|
939
|
+
add_converter(sc, win_strncat_to_utf16be);
|
940
|
+
else
|
941
|
+
add_converter(sc, win_strncat_to_utf16le);
|
942
|
+
return;
|
943
|
+
}
|
944
|
+
#endif
|
945
|
+
|
946
|
+
#if defined(HAVE_ICONV)
|
947
|
+
if (sc->cd != (iconv_t)-1) {
|
948
|
+
add_converter(sc, iconv_strncat_in_locale);
|
949
|
+
return;
|
950
|
+
}
|
951
|
+
#endif
|
952
|
+
|
953
|
+
if (sc->flag & SCONV_BEST_EFFORT) {
|
954
|
+
if (sc->flag & SCONV_TO_UTF16BE)
|
955
|
+
add_converter(sc,
|
956
|
+
best_effort_strncat_to_utf16be);
|
957
|
+
else
|
958
|
+
add_converter(sc,
|
959
|
+
best_effort_strncat_to_utf16le);
|
960
|
+
} else
|
961
|
+
/* Make sure we have no converter. */
|
962
|
+
sc->nconverter = 0;
|
963
|
+
return;
|
964
|
+
}
|
965
|
+
|
966
|
+
/*
|
967
|
+
* Convert a string from UTF-16BE/LE.
|
968
|
+
*/
|
969
|
+
if (sc->flag & SCONV_FROM_UTF16) {
|
970
|
+
/*
|
971
|
+
* At least we should normalize a UTF-16BE string.
|
972
|
+
*/
|
973
|
+
if (sc->flag & SCONV_NORMALIZATION_D)
|
974
|
+
add_converter(sc,archive_string_normalize_D);
|
975
|
+
else if (sc->flag & SCONV_NORMALIZATION_C)
|
976
|
+
add_converter(sc, archive_string_normalize_C);
|
977
|
+
|
978
|
+
if (sc->flag & SCONV_TO_UTF8) {
|
979
|
+
/*
|
980
|
+
* If the current locale is UTF-8, we can translate
|
981
|
+
* a UTF-16BE/LE string into a UTF-8 string directly.
|
982
|
+
*/
|
983
|
+
if (!(sc->flag &
|
984
|
+
(SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))
|
985
|
+
add_converter(sc,
|
986
|
+
archive_string_append_unicode);
|
987
|
+
return;
|
988
|
+
}
|
989
|
+
|
990
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
991
|
+
if (sc->flag & SCONV_WIN_CP) {
|
992
|
+
if (sc->flag & SCONV_FROM_UTF16BE)
|
993
|
+
add_converter(sc, win_strncat_from_utf16be);
|
994
|
+
else
|
995
|
+
add_converter(sc, win_strncat_from_utf16le);
|
996
|
+
return;
|
997
|
+
}
|
998
|
+
#endif
|
999
|
+
|
1000
|
+
#if defined(HAVE_ICONV)
|
1001
|
+
if (sc->cd != (iconv_t)-1) {
|
1002
|
+
add_converter(sc, iconv_strncat_in_locale);
|
1003
|
+
return;
|
1004
|
+
}
|
1005
|
+
#endif
|
1006
|
+
|
1007
|
+
if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))
|
1008
|
+
== (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))
|
1009
|
+
add_converter(sc, best_effort_strncat_from_utf16be);
|
1010
|
+
else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))
|
1011
|
+
== (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))
|
1012
|
+
add_converter(sc, best_effort_strncat_from_utf16le);
|
1013
|
+
else
|
1014
|
+
/* Make sure we have no converter. */
|
1015
|
+
sc->nconverter = 0;
|
1016
|
+
return;
|
1017
|
+
}
|
1018
|
+
|
1019
|
+
if (sc->flag & SCONV_FROM_UTF8) {
|
1020
|
+
/*
|
1021
|
+
* At least we should normalize a UTF-8 string.
|
1022
|
+
*/
|
1023
|
+
if (sc->flag & SCONV_NORMALIZATION_D)
|
1024
|
+
add_converter(sc,archive_string_normalize_D);
|
1025
|
+
else if (sc->flag & SCONV_NORMALIZATION_C)
|
1026
|
+
add_converter(sc, archive_string_normalize_C);
|
1027
|
+
|
1028
|
+
/*
|
1029
|
+
* Copy UTF-8 string with a check of CESU-8.
|
1030
|
+
* Apparently, iconv does not check surrogate pairs in UTF-8
|
1031
|
+
* when both from-charset and to-charset are UTF-8, and then
|
1032
|
+
* we use our UTF-8 copy code.
|
1033
|
+
*/
|
1034
|
+
if (sc->flag & SCONV_TO_UTF8) {
|
1035
|
+
/*
|
1036
|
+
* If the current locale is UTF-8, we can translate
|
1037
|
+
* a UTF-16BE string into a UTF-8 string directly.
|
1038
|
+
*/
|
1039
|
+
if (!(sc->flag &
|
1040
|
+
(SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))
|
1041
|
+
add_converter(sc, strncat_from_utf8_to_utf8);
|
1042
|
+
return;
|
1043
|
+
}
|
1044
|
+
}
|
1045
|
+
|
1046
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1047
|
+
/*
|
1048
|
+
* On Windows we can use Windows API for a string conversion.
|
1049
|
+
*/
|
1050
|
+
if (sc->flag & SCONV_WIN_CP) {
|
1051
|
+
add_converter(sc, strncat_in_codepage);
|
1052
|
+
return;
|
1053
|
+
}
|
1054
|
+
#endif
|
1055
|
+
|
1056
|
+
#if HAVE_ICONV
|
1057
|
+
if (sc->cd != (iconv_t)-1) {
|
1058
|
+
add_converter(sc, iconv_strncat_in_locale);
|
1059
|
+
/*
|
1060
|
+
* iconv generally does not support UTF-8-MAC and so
|
1061
|
+
* we have to the output of iconv from NFC to NFD if
|
1062
|
+
* need.
|
1063
|
+
*/
|
1064
|
+
if ((sc->flag & SCONV_FROM_CHARSET) &&
|
1065
|
+
(sc->flag & SCONV_TO_UTF8)) {
|
1066
|
+
if (sc->flag & SCONV_NORMALIZATION_D)
|
1067
|
+
add_converter(sc, archive_string_normalize_D);
|
1068
|
+
}
|
1069
|
+
return;
|
1070
|
+
}
|
1071
|
+
#endif
|
1072
|
+
|
1073
|
+
/*
|
1074
|
+
* Try conversion in the best effort or no conversion.
|
1075
|
+
*/
|
1076
|
+
if ((sc->flag & SCONV_BEST_EFFORT) || sc->same)
|
1077
|
+
add_converter(sc, best_effort_strncat_in_locale);
|
1078
|
+
else
|
1079
|
+
/* Make sure we have no converter. */
|
1080
|
+
sc->nconverter = 0;
|
1081
|
+
}
|
1082
|
+
|
1083
|
+
/*
 * Return the canonical spelling of a charset name.  Only UTF-8,
 * UTF-16BE, UTF-16LE and CP932 (with or without the hyphen for the
 * UTF names, in any ASCII letter case) are recognized; every other
 * input, including NULL, "" and names longer than 15 characters, is
 * returned unchanged.
 */
static const char *
canonical_charset_name(const char *charset)
{
	static const struct {
		const char *spelling;
		const char *canonical;
	} known[] = {
		{ "UTF-8",    "UTF-8" },
		{ "UTF8",     "UTF-8" },
		{ "UTF-16BE", "UTF-16BE" },
		{ "UTF16BE",  "UTF-16BE" },
		{ "UTF-16LE", "UTF-16LE" },
		{ "UTF16LE",  "UTF-16LE" },
		{ "CP932",    "CP932" },
	};
	char upper[16];
	size_t i, n;

	if (charset == NULL || charset[0] == '\0')
		return (charset);
	n = strlen(charset);
	if (n > 15)
		return (charset);

	/* Copy name to uppercase (ASCII letters only). */
	for (i = 0; i < n; i++) {
		char c = charset[i];

		if (c >= 'a' && c <= 'z')
			c -= 'a' - 'A';
		upper[i] = c;
	}
	upper[n] = '\0';

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(upper, known[i].spelling) == 0)
			return (known[i].canonical);
	}
	return (charset);
}
|
1122
|
+
|
1123
|
+
/*
|
1124
|
+
* Create a string conversion object.
|
1125
|
+
*/
|
1126
|
+
static struct archive_string_conv *
|
1127
|
+
create_sconv_object(const char *fc, const char *tc,
|
1128
|
+
unsigned current_codepage, int flag)
|
1129
|
+
{
|
1130
|
+
struct archive_string_conv *sc;
|
1131
|
+
|
1132
|
+
sc = calloc(1, sizeof(*sc));
|
1133
|
+
if (sc == NULL)
|
1134
|
+
return (NULL);
|
1135
|
+
sc->next = NULL;
|
1136
|
+
sc->from_charset = strdup(fc);
|
1137
|
+
if (sc->from_charset == NULL) {
|
1138
|
+
free(sc);
|
1139
|
+
return (NULL);
|
1140
|
+
}
|
1141
|
+
sc->to_charset = strdup(tc);
|
1142
|
+
if (sc->to_charset == NULL) {
|
1143
|
+
free(sc->from_charset);
|
1144
|
+
free(sc);
|
1145
|
+
return (NULL);
|
1146
|
+
}
|
1147
|
+
archive_string_init(&sc->utftmp);
|
1148
|
+
|
1149
|
+
if (flag & SCONV_TO_CHARSET) {
|
1150
|
+
/*
|
1151
|
+
* Convert characters from the current locale charset to
|
1152
|
+
* a specified charset.
|
1153
|
+
*/
|
1154
|
+
sc->from_cp = current_codepage;
|
1155
|
+
sc->to_cp = make_codepage_from_charset(tc);
|
1156
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1157
|
+
if (IsValidCodePage(sc->to_cp))
|
1158
|
+
flag |= SCONV_WIN_CP;
|
1159
|
+
#endif
|
1160
|
+
} else if (flag & SCONV_FROM_CHARSET) {
|
1161
|
+
/*
|
1162
|
+
* Convert characters from a specified charset to
|
1163
|
+
* the current locale charset.
|
1164
|
+
*/
|
1165
|
+
sc->to_cp = current_codepage;
|
1166
|
+
sc->from_cp = make_codepage_from_charset(fc);
|
1167
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1168
|
+
if (IsValidCodePage(sc->from_cp))
|
1169
|
+
flag |= SCONV_WIN_CP;
|
1170
|
+
#endif
|
1171
|
+
}
|
1172
|
+
|
1173
|
+
/*
|
1174
|
+
* Check if "from charset" and "to charset" are the same.
|
1175
|
+
*/
|
1176
|
+
if (strcmp(fc, tc) == 0 ||
|
1177
|
+
(sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp))
|
1178
|
+
sc->same = 1;
|
1179
|
+
else
|
1180
|
+
sc->same = 0;
|
1181
|
+
|
1182
|
+
/*
|
1183
|
+
* Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE.
|
1184
|
+
*/
|
1185
|
+
if (strcmp(tc, "UTF-8") == 0)
|
1186
|
+
flag |= SCONV_TO_UTF8;
|
1187
|
+
else if (strcmp(tc, "UTF-16BE") == 0)
|
1188
|
+
flag |= SCONV_TO_UTF16BE;
|
1189
|
+
else if (strcmp(tc, "UTF-16LE") == 0)
|
1190
|
+
flag |= SCONV_TO_UTF16LE;
|
1191
|
+
if (strcmp(fc, "UTF-8") == 0)
|
1192
|
+
flag |= SCONV_FROM_UTF8;
|
1193
|
+
else if (strcmp(fc, "UTF-16BE") == 0)
|
1194
|
+
flag |= SCONV_FROM_UTF16BE;
|
1195
|
+
else if (strcmp(fc, "UTF-16LE") == 0)
|
1196
|
+
flag |= SCONV_FROM_UTF16LE;
|
1197
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1198
|
+
if (sc->to_cp == CP_UTF8)
|
1199
|
+
flag |= SCONV_TO_UTF8;
|
1200
|
+
else if (sc->to_cp == CP_UTF16BE)
|
1201
|
+
flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP;
|
1202
|
+
else if (sc->to_cp == CP_UTF16LE)
|
1203
|
+
flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP;
|
1204
|
+
if (sc->from_cp == CP_UTF8)
|
1205
|
+
flag |= SCONV_FROM_UTF8;
|
1206
|
+
else if (sc->from_cp == CP_UTF16BE)
|
1207
|
+
flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP;
|
1208
|
+
else if (sc->from_cp == CP_UTF16LE)
|
1209
|
+
flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP;
|
1210
|
+
#endif
|
1211
|
+
|
1212
|
+
/*
|
1213
|
+
* Set a flag for Unicode NFD. Usually iconv cannot correctly
|
1214
|
+
* handle it. So we have to translate NFD characters to NFC ones
|
1215
|
+
* ourselves before iconv handles. Another reason is to prevent
|
1216
|
+
* that the same sight of two filenames, one is NFC and other
|
1217
|
+
* is NFD, would be in its directory.
|
1218
|
+
* On Mac OS X, although its filesystem layer automatically
|
1219
|
+
* convert filenames to NFD, it would be useful for filename
|
1220
|
+
* comparing to find out the same filenames that we normalize
|
1221
|
+
* that to be NFD ourselves.
|
1222
|
+
*/
|
1223
|
+
if ((flag & SCONV_FROM_CHARSET) &&
|
1224
|
+
(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) {
|
1225
|
+
#if defined(__APPLE__)
|
1226
|
+
if (flag & SCONV_TO_UTF8)
|
1227
|
+
flag |= SCONV_NORMALIZATION_D;
|
1228
|
+
else
|
1229
|
+
#endif
|
1230
|
+
flag |= SCONV_NORMALIZATION_C;
|
1231
|
+
}
|
1232
|
+
#if defined(__APPLE__)
|
1233
|
+
/*
|
1234
|
+
* In case writing an archive file, make sure that a filename
|
1235
|
+
* going to be passed to iconv is a Unicode NFC string since
|
1236
|
+
* a filename in HFS Plus filesystem is a Unicode NFD one and
|
1237
|
+
* iconv cannot handle it with "UTF-8" charset. It is simpler
|
1238
|
+
* than a use of "UTF-8-MAC" charset.
|
1239
|
+
*/
|
1240
|
+
if ((flag & SCONV_TO_CHARSET) &&
|
1241
|
+
(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
|
1242
|
+
!(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
|
1243
|
+
flag |= SCONV_NORMALIZATION_C;
|
1244
|
+
/*
|
1245
|
+
* In case reading an archive file. make sure that a filename
|
1246
|
+
* will be passed to users is a Unicode NFD string in order to
|
1247
|
+
* correctly compare the filename with other one which comes
|
1248
|
+
* from HFS Plus filesystem.
|
1249
|
+
*/
|
1250
|
+
if ((flag & SCONV_FROM_CHARSET) &&
|
1251
|
+
!(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
|
1252
|
+
(flag & SCONV_TO_UTF8))
|
1253
|
+
flag |= SCONV_NORMALIZATION_D;
|
1254
|
+
#endif
|
1255
|
+
|
1256
|
+
#if defined(HAVE_ICONV)
|
1257
|
+
sc->cd_w = (iconv_t)-1;
|
1258
|
+
/*
|
1259
|
+
* Create an iconv object.
|
1260
|
+
*/
|
1261
|
+
if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) &&
|
1262
|
+
(flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) ||
|
1263
|
+
(flag & SCONV_WIN_CP)) {
|
1264
|
+
/* This case we won't use iconv. */
|
1265
|
+
sc->cd = (iconv_t)-1;
|
1266
|
+
} else {
|
1267
|
+
sc->cd = iconv_open(tc, fc);
|
1268
|
+
if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) {
|
1269
|
+
/*
|
1270
|
+
* Unfortunately, all of iconv implements do support
|
1271
|
+
* "CP932" character-set, so we should use "SJIS"
|
1272
|
+
* instead if iconv_open failed.
|
1273
|
+
*/
|
1274
|
+
if (strcmp(tc, "CP932") == 0)
|
1275
|
+
sc->cd = iconv_open("SJIS", fc);
|
1276
|
+
else if (strcmp(fc, "CP932") == 0)
|
1277
|
+
sc->cd = iconv_open(tc, "SJIS");
|
1278
|
+
}
|
1279
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1280
|
+
/*
|
1281
|
+
* archive_mstring on Windows directly convert multi-bytes
|
1282
|
+
* into archive_wstring in order not to depend on locale
|
1283
|
+
* so that you can do a I18N programming. This will be
|
1284
|
+
* used only in archive_mstring_copy_mbs_len_l so far.
|
1285
|
+
*/
|
1286
|
+
if (flag & SCONV_FROM_CHARSET) {
|
1287
|
+
sc->cd_w = iconv_open("UTF-8", fc);
|
1288
|
+
if (sc->cd_w == (iconv_t)-1 &&
|
1289
|
+
(sc->flag & SCONV_BEST_EFFORT)) {
|
1290
|
+
if (strcmp(fc, "CP932") == 0)
|
1291
|
+
sc->cd_w = iconv_open("UTF-8", "SJIS");
|
1292
|
+
}
|
1293
|
+
}
|
1294
|
+
#endif /* _WIN32 && !__CYGWIN__ */
|
1295
|
+
}
|
1296
|
+
#endif /* HAVE_ICONV */
|
1297
|
+
|
1298
|
+
sc->flag = flag;
|
1299
|
+
|
1300
|
+
/*
|
1301
|
+
* Set up converters.
|
1302
|
+
*/
|
1303
|
+
setup_converter(sc);
|
1304
|
+
|
1305
|
+
return (sc);
|
1306
|
+
}
|
1307
|
+
|
1308
|
+
/*
|
1309
|
+
* Free a string conversion object.
|
1310
|
+
*/
|
1311
|
+
static void
|
1312
|
+
free_sconv_object(struct archive_string_conv *sc)
|
1313
|
+
{
|
1314
|
+
free(sc->from_charset);
|
1315
|
+
free(sc->to_charset);
|
1316
|
+
archive_string_free(&sc->utftmp);
|
1317
|
+
#if HAVE_ICONV
|
1318
|
+
if (sc->cd != (iconv_t)-1)
|
1319
|
+
iconv_close(sc->cd);
|
1320
|
+
if (sc->cd_w != (iconv_t)-1)
|
1321
|
+
iconv_close(sc->cd_w);
|
1322
|
+
#endif
|
1323
|
+
free(sc);
|
1324
|
+
}
|
1325
|
+
|
1326
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1327
|
+
/*
 * Parse a string consisting solely of decimal digits.
 *
 * Returns the parsed value, or (unsigned)-1 if the string contains a
 * non-digit character or the value does not fit in an unsigned int.
 * An empty string yields 0.
 */
static unsigned
my_atoi(const char *p)
{
	const unsigned failed = (unsigned)-1;
	unsigned cp = 0;

	for (; *p != '\0'; p++) {
		unsigned digit;

		if (*p < '0' || *p > '9')
			return (failed);
		digit = (unsigned)(*p - '0');
		/* Reject input that would otherwise wrap around silently. */
		if (cp > (failed - digit) / 10)
			return (failed);
		cp = cp * 10 + digit;
	}
	return (cp);
}
|
1342
|
+
|
1343
|
+
/*
|
1344
|
+
* Translate Charset name (as used by iconv) into CodePage (as used by Windows)
|
1345
|
+
* Return -1 if failed.
|
1346
|
+
*
|
1347
|
+
* Note: This translation code may be insufficient.
|
1348
|
+
*/
|
1349
|
+
static struct charset {
|
1350
|
+
const char *name;
|
1351
|
+
unsigned cp;
|
1352
|
+
} charsets[] = {
|
1353
|
+
/* MUST BE SORTED! */
|
1354
|
+
{"ASCII", 1252},
|
1355
|
+
{"ASMO-708", 708},
|
1356
|
+
{"BIG5", 950},
|
1357
|
+
{"CHINESE", 936},
|
1358
|
+
{"CP367", 1252},
|
1359
|
+
{"CP819", 1252},
|
1360
|
+
{"CP1025", 21025},
|
1361
|
+
{"DOS-720", 720},
|
1362
|
+
{"DOS-862", 862},
|
1363
|
+
{"EUC-CN", 51936},
|
1364
|
+
{"EUC-JP", 51932},
|
1365
|
+
{"EUC-KR", 949},
|
1366
|
+
{"EUCCN", 51936},
|
1367
|
+
{"EUCJP", 51932},
|
1368
|
+
{"EUCKR", 949},
|
1369
|
+
{"GB18030", 54936},
|
1370
|
+
{"GB2312", 936},
|
1371
|
+
{"HEBREW", 1255},
|
1372
|
+
{"HZ-GB-2312", 52936},
|
1373
|
+
{"IBM273", 20273},
|
1374
|
+
{"IBM277", 20277},
|
1375
|
+
{"IBM278", 20278},
|
1376
|
+
{"IBM280", 20280},
|
1377
|
+
{"IBM284", 20284},
|
1378
|
+
{"IBM285", 20285},
|
1379
|
+
{"IBM290", 20290},
|
1380
|
+
{"IBM297", 20297},
|
1381
|
+
{"IBM367", 1252},
|
1382
|
+
{"IBM420", 20420},
|
1383
|
+
{"IBM423", 20423},
|
1384
|
+
{"IBM424", 20424},
|
1385
|
+
{"IBM819", 1252},
|
1386
|
+
{"IBM871", 20871},
|
1387
|
+
{"IBM880", 20880},
|
1388
|
+
{"IBM905", 20905},
|
1389
|
+
{"IBM924", 20924},
|
1390
|
+
{"ISO-8859-1", 28591},
|
1391
|
+
{"ISO-8859-13", 28603},
|
1392
|
+
{"ISO-8859-15", 28605},
|
1393
|
+
{"ISO-8859-2", 28592},
|
1394
|
+
{"ISO-8859-3", 28593},
|
1395
|
+
{"ISO-8859-4", 28594},
|
1396
|
+
{"ISO-8859-5", 28595},
|
1397
|
+
{"ISO-8859-6", 28596},
|
1398
|
+
{"ISO-8859-7", 28597},
|
1399
|
+
{"ISO-8859-8", 28598},
|
1400
|
+
{"ISO-8859-9", 28599},
|
1401
|
+
{"ISO8859-1", 28591},
|
1402
|
+
{"ISO8859-13", 28603},
|
1403
|
+
{"ISO8859-15", 28605},
|
1404
|
+
{"ISO8859-2", 28592},
|
1405
|
+
{"ISO8859-3", 28593},
|
1406
|
+
{"ISO8859-4", 28594},
|
1407
|
+
{"ISO8859-5", 28595},
|
1408
|
+
{"ISO8859-6", 28596},
|
1409
|
+
{"ISO8859-7", 28597},
|
1410
|
+
{"ISO8859-8", 28598},
|
1411
|
+
{"ISO8859-9", 28599},
|
1412
|
+
{"JOHAB", 1361},
|
1413
|
+
{"KOI8-R", 20866},
|
1414
|
+
{"KOI8-U", 21866},
|
1415
|
+
{"KS_C_5601-1987", 949},
|
1416
|
+
{"LATIN1", 1252},
|
1417
|
+
{"LATIN2", 28592},
|
1418
|
+
{"MACINTOSH", 10000},
|
1419
|
+
{"SHIFT-JIS", 932},
|
1420
|
+
{"SHIFT_JIS", 932},
|
1421
|
+
{"SJIS", 932},
|
1422
|
+
{"US", 1252},
|
1423
|
+
{"US-ASCII", 1252},
|
1424
|
+
{"UTF-16", 1200},
|
1425
|
+
{"UTF-16BE", 1201},
|
1426
|
+
{"UTF-16LE", 1200},
|
1427
|
+
{"UTF-8", CP_UTF8},
|
1428
|
+
{"X-EUROPA", 29001},
|
1429
|
+
{"X-MAC-ARABIC", 10004},
|
1430
|
+
{"X-MAC-CE", 10029},
|
1431
|
+
{"X-MAC-CHINESEIMP", 10008},
|
1432
|
+
{"X-MAC-CHINESETRAD", 10002},
|
1433
|
+
{"X-MAC-CROATIAN", 10082},
|
1434
|
+
{"X-MAC-CYRILLIC", 10007},
|
1435
|
+
{"X-MAC-GREEK", 10006},
|
1436
|
+
{"X-MAC-HEBREW", 10005},
|
1437
|
+
{"X-MAC-ICELANDIC", 10079},
|
1438
|
+
{"X-MAC-JAPANESE", 10001},
|
1439
|
+
{"X-MAC-KOREAN", 10003},
|
1440
|
+
{"X-MAC-ROMANIAN", 10010},
|
1441
|
+
{"X-MAC-THAI", 10021},
|
1442
|
+
{"X-MAC-TURKISH", 10081},
|
1443
|
+
{"X-MAC-UKRAINIAN", 10017},
|
1444
|
+
};
|
1445
|
+
static unsigned
|
1446
|
+
make_codepage_from_charset(const char *charset)
|
1447
|
+
{
|
1448
|
+
char cs[16];
|
1449
|
+
char *p;
|
1450
|
+
unsigned cp;
|
1451
|
+
int a, b;
|
1452
|
+
|
1453
|
+
if (charset == NULL || strlen(charset) > 15)
|
1454
|
+
return -1;
|
1455
|
+
|
1456
|
+
/* Copy name to uppercase. */
|
1457
|
+
p = cs;
|
1458
|
+
while (*charset) {
|
1459
|
+
char c = *charset++;
|
1460
|
+
if (c >= 'a' && c <= 'z')
|
1461
|
+
c -= 'a' - 'A';
|
1462
|
+
*p++ = c;
|
1463
|
+
}
|
1464
|
+
*p++ = '\0';
|
1465
|
+
cp = -1;
|
1466
|
+
|
1467
|
+
/* Look it up in the table first, so that we can easily
|
1468
|
+
* override CP367, which we map to 1252 instead of 367. */
|
1469
|
+
a = 0;
|
1470
|
+
b = sizeof(charsets)/sizeof(charsets[0]);
|
1471
|
+
while (b > a) {
|
1472
|
+
int c = (b + a) / 2;
|
1473
|
+
int r = strcmp(charsets[c].name, cs);
|
1474
|
+
if (r < 0)
|
1475
|
+
a = c + 1;
|
1476
|
+
else if (r > 0)
|
1477
|
+
b = c;
|
1478
|
+
else
|
1479
|
+
return charsets[c].cp;
|
1480
|
+
}
|
1481
|
+
|
1482
|
+
/* If it's not in the table, try to parse it. */
|
1483
|
+
switch (*cs) {
|
1484
|
+
case 'C':
|
1485
|
+
if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') {
|
1486
|
+
cp = my_atoi(cs + 2);
|
1487
|
+
} else if (strcmp(cs, "CP_ACP") == 0)
|
1488
|
+
cp = get_current_codepage();
|
1489
|
+
else if (strcmp(cs, "CP_OEMCP") == 0)
|
1490
|
+
cp = get_current_oemcp();
|
1491
|
+
break;
|
1492
|
+
case 'I':
|
1493
|
+
if (cs[1] == 'B' && cs[2] == 'M' &&
|
1494
|
+
cs[3] >= '0' && cs[3] <= '9') {
|
1495
|
+
cp = my_atoi(cs + 3);
|
1496
|
+
}
|
1497
|
+
break;
|
1498
|
+
case 'W':
|
1499
|
+
if (strncmp(cs, "WINDOWS-", 8) == 0) {
|
1500
|
+
cp = my_atoi(cs + 8);
|
1501
|
+
if (cp != 874 && (cp < 1250 || cp > 1258))
|
1502
|
+
cp = -1;/* This may invalid code. */
|
1503
|
+
}
|
1504
|
+
break;
|
1505
|
+
}
|
1506
|
+
return (cp);
|
1507
|
+
}
|
1508
|
+
|
1509
|
+
/*
|
1510
|
+
* Return ANSI Code Page of current locale set by setlocale().
|
1511
|
+
*/
|
1512
|
+
static unsigned
|
1513
|
+
get_current_codepage(void)
|
1514
|
+
{
|
1515
|
+
char *locale, *p;
|
1516
|
+
unsigned cp;
|
1517
|
+
|
1518
|
+
locale = setlocale(LC_CTYPE, NULL);
|
1519
|
+
if (locale == NULL)
|
1520
|
+
return (GetACP());
|
1521
|
+
if (locale[0] == 'C' && locale[1] == '\0')
|
1522
|
+
return (CP_C_LOCALE);
|
1523
|
+
p = strrchr(locale, '.');
|
1524
|
+
if (p == NULL)
|
1525
|
+
return (GetACP());
|
1526
|
+
if (strcmp(p+1, "utf8") == 0)
|
1527
|
+
return CP_UTF8;
|
1528
|
+
cp = my_atoi(p+1);
|
1529
|
+
if ((int)cp <= 0)
|
1530
|
+
return (GetACP());
|
1531
|
+
return (cp);
|
1532
|
+
}
|
1533
|
+
|
1534
|
+
/*
 * Translation table between Locale Name and ACP/OEMCP.
 *
 * The table is terminated by an all-zero entry.  Each locale name
 * appears once; the original order is kept so that first-match
 * lookups give the same result.
 */
static struct {
	unsigned acp;		/* ANSI code page. */
	unsigned ocp;		/* OEM code page. */
	const char *locale;	/* Locale name without the ".codepage" part. */
} acp_ocp_map[] = {
	{  950,  950, "Chinese_Taiwan" },
	{  936,  936, "Chinese_People's Republic of China" },
	{ 1250,  852, "Czech_Czech Republic" },
	{ 1252,  850, "Danish_Denmark" },
	{ 1252,  850, "Dutch_Netherlands" },
	{ 1252,  850, "Dutch_Belgium" },
	{ 1252,  437, "English_United States" },
	{ 1252,  850, "English_Australia" },
	{ 1252,  850, "English_Canada" },
	{ 1252,  850, "English_New Zealand" },
	{ 1252,  850, "English_United Kingdom" },
	{ 1252,  850, "Finnish_Finland" },
	{ 1252,  850, "French_France" },
	{ 1252,  850, "French_Belgium" },
	{ 1252,  850, "French_Canada" },
	{ 1252,  850, "French_Switzerland" },
	{ 1252,  850, "German_Germany" },
	{ 1252,  850, "German_Austria" },
	{ 1252,  850, "German_Switzerland" },
	{ 1253,  737, "Greek_Greece" },
	{ 1250,  852, "Hungarian_Hungary" },
	{ 1252,  850, "Icelandic_Iceland" },
	{ 1252,  850, "Italian_Italy" },
	{ 1252,  850, "Italian_Switzerland" },
	{  932,  932, "Japanese_Japan" },
	{  949,  949, "Korean_Korea" },
	{ 1252,  850, "Norwegian (BokmOl)_Norway" },
	{ 1252,  850, "Norwegian-Nynorsk_Norway" },
	{ 1250,  852, "Polish_Poland" },
	{ 1252,  850, "Portuguese_Portugal" },
	{ 1252,  850, "Portuguese_Brazil" },
	{ 1251,  866, "Russian_Russia" },
	{ 1250,  852, "Slovak_Slovakia" },
	{ 1252,  850, "Spanish_Spain" },
	{ 1252,  850, "Spanish_Mexico" },
	{ 1252,  850, "Swedish_Sweden" },
	{ 1254,  857, "Turkish_Turkey" },
	{ 0, 0, NULL}
};
|
1585
|
+
|
1586
|
+
/*
|
1587
|
+
* Return OEM Code Page of current locale set by setlocale().
|
1588
|
+
*/
|
1589
|
+
static unsigned
|
1590
|
+
get_current_oemcp(void)
|
1591
|
+
{
|
1592
|
+
int i;
|
1593
|
+
char *locale, *p;
|
1594
|
+
size_t len;
|
1595
|
+
|
1596
|
+
locale = setlocale(LC_CTYPE, NULL);
|
1597
|
+
if (locale == NULL)
|
1598
|
+
return (GetOEMCP());
|
1599
|
+
if (locale[0] == 'C' && locale[1] == '\0')
|
1600
|
+
return (CP_C_LOCALE);
|
1601
|
+
|
1602
|
+
p = strrchr(locale, '.');
|
1603
|
+
if (p == NULL)
|
1604
|
+
return (GetOEMCP());
|
1605
|
+
len = p - locale;
|
1606
|
+
for (i = 0; acp_ocp_map[i].acp; i++) {
|
1607
|
+
if (strncmp(acp_ocp_map[i].locale, locale, len) == 0)
|
1608
|
+
return (acp_ocp_map[i].ocp);
|
1609
|
+
}
|
1610
|
+
return (GetOEMCP());
|
1611
|
+
}
|
1612
|
+
#else
|
1613
|
+
|
1614
|
+
/*
|
1615
|
+
* POSIX platform does not use CodePage.
|
1616
|
+
*/
|
1617
|
+
|
1618
|
+
/* POSIX build: there is no code page, so always report "unknown". */
static unsigned
get_current_codepage(void)
{
	const unsigned unknown_codepage = (unsigned)-1;

	return (unknown_codepage);
}
|
1623
|
+
/* POSIX build: charset names never map to a code page. */
static unsigned
make_codepage_from_charset(const char *charset)
{
	const unsigned unknown_codepage = (unsigned)-1;

	(void)charset; /* UNUSED */
	return (unknown_codepage);
}
|
1629
|
+
/* POSIX build: there is no OEM code page, so always report "unknown". */
static unsigned
get_current_oemcp(void)
{
	const unsigned unknown_codepage = (unsigned)-1;

	return (unknown_codepage);
}
|
1634
|
+
|
1635
|
+
#endif /* defined(_WIN32) && !defined(__CYGWIN__) */
|
1636
|
+
|
1637
|
+
/*
|
1638
|
+
* Return a string conversion object.
|
1639
|
+
*/
|
1640
|
+
static struct archive_string_conv *
|
1641
|
+
get_sconv_object(struct archive *a, const char *fc, const char *tc, int flag)
|
1642
|
+
{
|
1643
|
+
struct archive_string_conv *sc;
|
1644
|
+
unsigned current_codepage;
|
1645
|
+
|
1646
|
+
/* Check if we have made the sconv object. */
|
1647
|
+
sc = find_sconv_object(a, fc, tc);
|
1648
|
+
if (sc != NULL)
|
1649
|
+
return (sc);
|
1650
|
+
|
1651
|
+
if (a == NULL)
|
1652
|
+
current_codepage = get_current_codepage();
|
1653
|
+
else
|
1654
|
+
current_codepage = a->current_codepage;
|
1655
|
+
|
1656
|
+
sc = create_sconv_object(canonical_charset_name(fc),
|
1657
|
+
canonical_charset_name(tc), current_codepage, flag);
|
1658
|
+
if (sc == NULL) {
|
1659
|
+
if (a != NULL)
|
1660
|
+
archive_set_error(a, ENOMEM,
|
1661
|
+
"Could not allocate memory for "
|
1662
|
+
"a string conversion object");
|
1663
|
+
return (NULL);
|
1664
|
+
}
|
1665
|
+
|
1666
|
+
/*
|
1667
|
+
* If there is no converter for current string conversion object,
|
1668
|
+
* we cannot handle this conversion.
|
1669
|
+
*/
|
1670
|
+
if (sc->nconverter == 0) {
|
1671
|
+
if (a != NULL) {
|
1672
|
+
#if HAVE_ICONV
|
1673
|
+
archive_set_error(a, ARCHIVE_ERRNO_MISC,
|
1674
|
+
"iconv_open failed : Cannot handle ``%s''",
|
1675
|
+
(flag & SCONV_TO_CHARSET)?tc:fc);
|
1676
|
+
#else
|
1677
|
+
archive_set_error(a, ARCHIVE_ERRNO_MISC,
|
1678
|
+
"A character-set conversion not fully supported "
|
1679
|
+
"on this platform");
|
1680
|
+
#endif
|
1681
|
+
}
|
1682
|
+
/* Failed; free a sconv object. */
|
1683
|
+
free_sconv_object(sc);
|
1684
|
+
return (NULL);
|
1685
|
+
}
|
1686
|
+
|
1687
|
+
/*
|
1688
|
+
* Success!
|
1689
|
+
*/
|
1690
|
+
if (a != NULL)
|
1691
|
+
add_sconv_object(a, sc);
|
1692
|
+
return (sc);
|
1693
|
+
}
|
1694
|
+
|
1695
|
+
static const char *
|
1696
|
+
get_current_charset(struct archive *a)
|
1697
|
+
{
|
1698
|
+
const char *cur_charset;
|
1699
|
+
|
1700
|
+
if (a == NULL)
|
1701
|
+
cur_charset = default_iconv_charset("");
|
1702
|
+
else {
|
1703
|
+
cur_charset = default_iconv_charset(a->current_code);
|
1704
|
+
if (a->current_code == NULL) {
|
1705
|
+
a->current_code = strdup(cur_charset);
|
1706
|
+
a->current_codepage = get_current_codepage();
|
1707
|
+
a->current_oemcp = get_current_oemcp();
|
1708
|
+
}
|
1709
|
+
}
|
1710
|
+
return (cur_charset);
|
1711
|
+
}
|
1712
|
+
|
1713
|
+
/*
|
1714
|
+
* Make and Return a string conversion object.
|
1715
|
+
* Return NULL if the platform does not support the specified conversion
|
1716
|
+
* and best_effort is 0.
|
1717
|
+
* If best_effort is set, A string conversion object must be returned
|
1718
|
+
* unless memory allocation for the object fails, but the conversion
|
1719
|
+
* might fail when non-ASCII code is found.
|
1720
|
+
*/
|
1721
|
+
struct archive_string_conv *
|
1722
|
+
archive_string_conversion_to_charset(struct archive *a, const char *charset,
|
1723
|
+
int best_effort)
|
1724
|
+
{
|
1725
|
+
int flag = SCONV_TO_CHARSET;
|
1726
|
+
|
1727
|
+
if (best_effort)
|
1728
|
+
flag |= SCONV_BEST_EFFORT;
|
1729
|
+
return (get_sconv_object(a, get_current_charset(a), charset, flag));
|
1730
|
+
}
|
1731
|
+
|
1732
|
+
struct archive_string_conv *
|
1733
|
+
archive_string_conversion_from_charset(struct archive *a, const char *charset,
|
1734
|
+
int best_effort)
|
1735
|
+
{
|
1736
|
+
int flag = SCONV_FROM_CHARSET;
|
1737
|
+
|
1738
|
+
if (best_effort)
|
1739
|
+
flag |= SCONV_BEST_EFFORT;
|
1740
|
+
return (get_sconv_object(a, charset, get_current_charset(a), flag));
|
1741
|
+
}
|
1742
|
+
|
1743
|
+
/*
|
1744
|
+
* archive_string_default_conversion_*_archive() are provided for Windows
|
1745
|
+
 * platform because other archiver applications use CP_OEMCP for
|
1746
|
+
* MultiByteToWideChar() and WideCharToMultiByte() for the filenames
|
1747
|
+
* in tar or zip files. But mbstowcs/wcstombs(CRT) usually use CP_ACP
|
1748
|
+
* unless you use setlocale(LC_ALL, ".OCP")(specify CP_OEMCP).
|
1749
|
+
* So we should make a string conversion between CP_ACP and CP_OEMCP
|
1750
|
+
* for compatibility.
|
1751
|
+
*/
|
1752
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
1753
|
+
struct archive_string_conv *
|
1754
|
+
archive_string_default_conversion_for_read(struct archive *a)
|
1755
|
+
{
|
1756
|
+
const char *cur_charset = get_current_charset(a);
|
1757
|
+
char oemcp[16];
|
1758
|
+
|
1759
|
+
/* NOTE: a check of cur_charset is unneeded but we need
|
1760
|
+
* that get_current_charset() has been surely called at
|
1761
|
+
* this time whatever C compiler optimized. */
|
1762
|
+
if (cur_charset != NULL &&
|
1763
|
+
(a->current_codepage == CP_C_LOCALE ||
|
1764
|
+
a->current_codepage == a->current_oemcp))
|
1765
|
+
return (NULL);/* no conversion. */
|
1766
|
+
|
1767
|
+
_snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);
|
1768
|
+
/* Make sure a null termination must be set. */
|
1769
|
+
oemcp[sizeof(oemcp)-1] = '\0';
|
1770
|
+
return (get_sconv_object(a, oemcp, cur_charset,
|
1771
|
+
SCONV_FROM_CHARSET));
|
1772
|
+
}
|
1773
|
+
|
1774
|
+
struct archive_string_conv *
|
1775
|
+
archive_string_default_conversion_for_write(struct archive *a)
|
1776
|
+
{
|
1777
|
+
const char *cur_charset = get_current_charset(a);
|
1778
|
+
char oemcp[16];
|
1779
|
+
|
1780
|
+
/* NOTE: a check of cur_charset is unneeded but we need
|
1781
|
+
* that get_current_charset() has been surely called at
|
1782
|
+
* this time whatever C compiler optimized. */
|
1783
|
+
if (cur_charset != NULL &&
|
1784
|
+
(a->current_codepage == CP_C_LOCALE ||
|
1785
|
+
a->current_codepage == a->current_oemcp))
|
1786
|
+
return (NULL);/* no conversion. */
|
1787
|
+
|
1788
|
+
_snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);
|
1789
|
+
/* Make sure a null termination must be set. */
|
1790
|
+
oemcp[sizeof(oemcp)-1] = '\0';
|
1791
|
+
return (get_sconv_object(a, cur_charset, oemcp,
|
1792
|
+
SCONV_TO_CHARSET));
|
1793
|
+
}
|
1794
|
+
#else
|
1795
|
+
/* Non-Windows build: reading needs no default conversion. */
struct archive_string_conv *
archive_string_default_conversion_for_read(struct archive *a)
{
	struct archive_string_conv *no_conversion = NULL;

	(void)a; /* UNUSED */
	return (no_conversion);
}
|
1801
|
+
|
1802
|
+
/* Non-Windows build: writing needs no default conversion. */
struct archive_string_conv *
archive_string_default_conversion_for_write(struct archive *a)
{
	struct archive_string_conv *no_conversion = NULL;

	(void)a; /* UNUSED */
	return (no_conversion);
}
|
1808
|
+
#endif
|
1809
|
+
|
1810
|
+
/*
|
1811
|
+
* Dispose of all character conversion objects in the archive object.
|
1812
|
+
*/
|
1813
|
+
void
|
1814
|
+
archive_string_conversion_free(struct archive *a)
|
1815
|
+
{
|
1816
|
+
struct archive_string_conv *sc;
|
1817
|
+
struct archive_string_conv *sc_next;
|
1818
|
+
|
1819
|
+
for (sc = a->sconv; sc != NULL; sc = sc_next) {
|
1820
|
+
sc_next = sc->next;
|
1821
|
+
free_sconv_object(sc);
|
1822
|
+
}
|
1823
|
+
a->sconv = NULL;
|
1824
|
+
free(a->current_code);
|
1825
|
+
a->current_code = NULL;
|
1826
|
+
}
|
1827
|
+
|
1828
|
+
/*
|
1829
|
+
* Return a conversion charset name.
|
1830
|
+
*/
|
1831
|
+
const char *
|
1832
|
+
archive_string_conversion_charset_name(struct archive_string_conv *sc)
|
1833
|
+
{
|
1834
|
+
if (sc->flag & SCONV_TO_CHARSET)
|
1835
|
+
return (sc->to_charset);
|
1836
|
+
else
|
1837
|
+
return (sc->from_charset);
|
1838
|
+
}
|
1839
|
+
|
1840
|
+
/*
|
1841
|
+
* Change the behavior of a string conversion.
|
1842
|
+
*/
|
1843
|
+
void
|
1844
|
+
archive_string_conversion_set_opt(struct archive_string_conv *sc, int opt)
|
1845
|
+
{
|
1846
|
+
switch (opt) {
|
1847
|
+
/*
|
1848
|
+
* A filename in UTF-8 was made with libarchive 2.x in a wrong
|
1849
|
+
* assumption that wchar_t was Unicode.
|
1850
|
+
* This option enables simulating the assumption in order to read
|
1851
|
+
* that filename correctly.
|
1852
|
+
*/
|
1853
|
+
case SCONV_SET_OPT_UTF8_LIBARCHIVE2X:
|
1854
|
+
#if (defined(_WIN32) && !defined(__CYGWIN__)) \
|
1855
|
+
|| defined(__STDC_ISO_10646__) || defined(__APPLE__)
|
1856
|
+
/*
|
1857
|
+
* Nothing to do for it since wchar_t on these platforms
|
1858
|
+
* is really Unicode.
|
1859
|
+
*/
|
1860
|
+
(void)sc; /* UNUSED */
|
1861
|
+
#else
|
1862
|
+
if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) {
|
1863
|
+
sc->flag |= SCONV_UTF8_LIBARCHIVE_2;
|
1864
|
+
/* Set up string converters. */
|
1865
|
+
setup_converter(sc);
|
1866
|
+
}
|
1867
|
+
#endif
|
1868
|
+
break;
|
1869
|
+
case SCONV_SET_OPT_NORMALIZATION_C:
|
1870
|
+
if ((sc->flag & SCONV_NORMALIZATION_C) == 0) {
|
1871
|
+
sc->flag |= SCONV_NORMALIZATION_C;
|
1872
|
+
sc->flag &= ~SCONV_NORMALIZATION_D;
|
1873
|
+
/* Set up string converters. */
|
1874
|
+
setup_converter(sc);
|
1875
|
+
}
|
1876
|
+
break;
|
1877
|
+
case SCONV_SET_OPT_NORMALIZATION_D:
|
1878
|
+
#if defined(HAVE_ICONV)
|
1879
|
+
/*
|
1880
|
+
* If iconv will take the string, do not change the
|
1881
|
+
* setting of the normalization.
|
1882
|
+
*/
|
1883
|
+
if (!(sc->flag & SCONV_WIN_CP) &&
|
1884
|
+
(sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
|
1885
|
+
!(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
|
1886
|
+
break;
|
1887
|
+
#endif
|
1888
|
+
if ((sc->flag & SCONV_NORMALIZATION_D) == 0) {
|
1889
|
+
sc->flag |= SCONV_NORMALIZATION_D;
|
1890
|
+
sc->flag &= ~SCONV_NORMALIZATION_C;
|
1891
|
+
/* Set up string converters. */
|
1892
|
+
setup_converter(sc);
|
1893
|
+
}
|
1894
|
+
break;
|
1895
|
+
default:
|
1896
|
+
break;
|
1897
|
+
}
|
1898
|
+
}
|
1899
|
+
|
1900
|
+
/*
|
1901
|
+
*
|
1902
|
+
* Copy one archive_string to another in locale conversion.
|
1903
|
+
*
|
1904
|
+
* archive_strncat_l();
|
1905
|
+
* archive_strncpy_l();
|
1906
|
+
*
|
1907
|
+
*/
|
1908
|
+
|
1909
|
+
/*
 * Return the length in bytes of the multi-byte string at _p, looking
 * at no more than the first n bytes.  Like strlen(p), except that it
 * never examines p[n] or anything beyond it.  Returns 0 if _p is NULL.
 */
static size_t
mbsnbytes(const void *_p, size_t n)
{
	const char *p, *nul;

	if (_p == NULL)
		return (0);
	p = (const char *)_p;

	/* memchr() stops at the first NUL or after n bytes. */
	nul = memchr(p, '\0', n);
	return ((nul != NULL) ? (size_t)(nul - p) : n);
}
|
1928
|
+
|
1929
|
+
/*
 * Return the length in bytes of the UTF-16 string at _p, looking at no
 * more than n bytes.  The string ends at the first 16-bit unit whose
 * two bytes are both zero; an odd trailing byte of n is ignored.
 * Returns 0 if _p is NULL.
 */
static size_t
utf16nbytes(const void *_p, size_t n)
{
	const char *unit = (const char *)_p;
	size_t units, limit;

	if (unit == NULL)
		return (0);

	limit = n >> 1;		/* Whole 16-bit units available. */
	for (units = 0; units < limit; units++, unit += 2) {
		if (unit[0] == 0 && unit[1] == 0)
			break;
	}
	return (units << 1);
}
|
1949
|
+
|
1950
|
+
int
|
1951
|
+
archive_strncpy_l(struct archive_string *as, const void *_p, size_t n,
|
1952
|
+
struct archive_string_conv *sc)
|
1953
|
+
{
|
1954
|
+
as->length = 0;
|
1955
|
+
return (archive_strncat_l(as, _p, n, sc));
|
1956
|
+
}
|
1957
|
+
|
1958
|
+
int
|
1959
|
+
archive_strncat_l(struct archive_string *as, const void *_p, size_t n,
|
1960
|
+
struct archive_string_conv *sc)
|
1961
|
+
{
|
1962
|
+
const void *s;
|
1963
|
+
size_t length = 0;
|
1964
|
+
int i, r = 0, r2;
|
1965
|
+
|
1966
|
+
if (_p != NULL && n > 0) {
|
1967
|
+
if (sc != NULL && (sc->flag & SCONV_FROM_UTF16))
|
1968
|
+
length = utf16nbytes(_p, n);
|
1969
|
+
else
|
1970
|
+
length = mbsnbytes(_p, n);
|
1971
|
+
}
|
1972
|
+
|
1973
|
+
/* We must allocate memory even if there is no data for conversion
|
1974
|
+
* or copy. This simulates archive_string_append behavior. */
|
1975
|
+
if (length == 0) {
|
1976
|
+
int tn = 1;
|
1977
|
+
if (sc != NULL && (sc->flag & SCONV_TO_UTF16))
|
1978
|
+
tn = 2;
|
1979
|
+
if (archive_string_ensure(as, as->length + tn) == NULL)
|
1980
|
+
return (-1);
|
1981
|
+
as->s[as->length] = 0;
|
1982
|
+
if (tn == 2)
|
1983
|
+
as->s[as->length+1] = 0;
|
1984
|
+
return (0);
|
1985
|
+
}
|
1986
|
+
|
1987
|
+
/*
|
1988
|
+
* If sc is NULL, we just make a copy.
|
1989
|
+
*/
|
1990
|
+
if (sc == NULL) {
|
1991
|
+
if (archive_string_append(as, _p, length) == NULL)
|
1992
|
+
return (-1);/* No memory */
|
1993
|
+
return (0);
|
1994
|
+
}
|
1995
|
+
|
1996
|
+
s = _p;
|
1997
|
+
i = 0;
|
1998
|
+
if (sc->nconverter > 1) {
|
1999
|
+
sc->utftmp.length = 0;
|
2000
|
+
r2 = sc->converter[0](&(sc->utftmp), s, length, sc);
|
2001
|
+
if (r2 != 0 && errno == ENOMEM)
|
2002
|
+
return (r2);
|
2003
|
+
if (r > r2)
|
2004
|
+
r = r2;
|
2005
|
+
s = sc->utftmp.s;
|
2006
|
+
length = sc->utftmp.length;
|
2007
|
+
++i;
|
2008
|
+
}
|
2009
|
+
r2 = sc->converter[i](as, s, length, sc);
|
2010
|
+
if (r > r2)
|
2011
|
+
r = r2;
|
2012
|
+
return (r);
|
2013
|
+
}
|
2014
|
+
|
2015
|
+
#if HAVE_ICONV
|
2016
|
+
|
2017
|
+
/*
|
2018
|
+
* Return -1 if conversion fails.
|
2019
|
+
*/
|
2020
|
+
static int
|
2021
|
+
iconv_strncat_in_locale(struct archive_string *as, const void *_p,
|
2022
|
+
size_t length, struct archive_string_conv *sc)
|
2023
|
+
{
|
2024
|
+
ICONV_CONST char *itp;
|
2025
|
+
size_t remaining;
|
2026
|
+
iconv_t cd;
|
2027
|
+
char *outp;
|
2028
|
+
size_t avail, bs;
|
2029
|
+
int return_value = 0; /* success */
|
2030
|
+
int to_size, from_size;
|
2031
|
+
|
2032
|
+
if (sc->flag & SCONV_TO_UTF16)
|
2033
|
+
to_size = 2;
|
2034
|
+
else
|
2035
|
+
to_size = 1;
|
2036
|
+
if (sc->flag & SCONV_FROM_UTF16)
|
2037
|
+
from_size = 2;
|
2038
|
+
else
|
2039
|
+
from_size = 1;
|
2040
|
+
|
2041
|
+
if (archive_string_ensure(as, as->length + length*2+to_size) == NULL)
|
2042
|
+
return (-1);
|
2043
|
+
|
2044
|
+
cd = sc->cd;
|
2045
|
+
itp = (char *)(uintptr_t)_p;
|
2046
|
+
remaining = length;
|
2047
|
+
outp = as->s + as->length;
|
2048
|
+
avail = as->buffer_length - as->length - to_size;
|
2049
|
+
while (remaining >= (size_t)from_size) {
|
2050
|
+
size_t result = iconv(cd, &itp, &remaining, &outp, &avail);
|
2051
|
+
|
2052
|
+
if (result != (size_t)-1)
|
2053
|
+
break; /* Conversion completed. */
|
2054
|
+
|
2055
|
+
if (errno == EILSEQ || errno == EINVAL) {
|
2056
|
+
/*
|
2057
|
+
* If an output charset is UTF-8 or UTF-16BE/LE,
|
2058
|
+
* unknown character should be U+FFFD
|
2059
|
+
* (replacement character).
|
2060
|
+
*/
|
2061
|
+
if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) {
|
2062
|
+
size_t rbytes;
|
2063
|
+
if (sc->flag & SCONV_TO_UTF8)
|
2064
|
+
rbytes = sizeof(utf8_replacement_char);
|
2065
|
+
else
|
2066
|
+
rbytes = 2;
|
2067
|
+
|
2068
|
+
if (avail < rbytes) {
|
2069
|
+
as->length = outp - as->s;
|
2070
|
+
bs = as->buffer_length +
|
2071
|
+
(remaining * to_size) + rbytes;
|
2072
|
+
if (NULL ==
|
2073
|
+
archive_string_ensure(as, bs))
|
2074
|
+
return (-1);
|
2075
|
+
outp = as->s + as->length;
|
2076
|
+
avail = as->buffer_length
|
2077
|
+
- as->length - to_size;
|
2078
|
+
}
|
2079
|
+
if (sc->flag & SCONV_TO_UTF8)
|
2080
|
+
memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char));
|
2081
|
+
else if (sc->flag & SCONV_TO_UTF16BE)
|
2082
|
+
archive_be16enc(outp, UNICODE_R_CHAR);
|
2083
|
+
else
|
2084
|
+
archive_le16enc(outp, UNICODE_R_CHAR);
|
2085
|
+
outp += rbytes;
|
2086
|
+
avail -= rbytes;
|
2087
|
+
} else {
|
2088
|
+
/* Skip the illegal input bytes. */
|
2089
|
+
*outp++ = '?';
|
2090
|
+
avail--;
|
2091
|
+
}
|
2092
|
+
itp += from_size;
|
2093
|
+
remaining -= from_size;
|
2094
|
+
return_value = -1; /* failure */
|
2095
|
+
} else {
|
2096
|
+
/* E2BIG no output buffer,
|
2097
|
+
* Increase an output buffer. */
|
2098
|
+
as->length = outp - as->s;
|
2099
|
+
bs = as->buffer_length + remaining * 2;
|
2100
|
+
if (NULL == archive_string_ensure(as, bs))
|
2101
|
+
return (-1);
|
2102
|
+
outp = as->s + as->length;
|
2103
|
+
avail = as->buffer_length - as->length - to_size;
|
2104
|
+
}
|
2105
|
+
}
|
2106
|
+
as->length = outp - as->s;
|
2107
|
+
as->s[as->length] = 0;
|
2108
|
+
if (to_size == 2)
|
2109
|
+
as->s[as->length+1] = 0;
|
2110
|
+
return (return_value);
|
2111
|
+
}
|
2112
|
+
|
2113
|
+
#endif /* HAVE_ICONV */
|
2114
|
+
|
2115
|
+
|
2116
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
2117
|
+
|
2118
|
+
/*
|
2119
|
+
* Translate a string from a some CodePage to an another CodePage by
|
2120
|
+
* Windows APIs, and copy the result. Return -1 if conversion fails.
|
2121
|
+
*/
|
2122
|
+
static int
|
2123
|
+
strncat_in_codepage(struct archive_string *as,
|
2124
|
+
const void *_p, size_t length, struct archive_string_conv *sc)
|
2125
|
+
{
|
2126
|
+
const char *s = (const char *)_p;
|
2127
|
+
struct archive_wstring aws;
|
2128
|
+
size_t l;
|
2129
|
+
int r, saved_flag;
|
2130
|
+
|
2131
|
+
archive_string_init(&aws);
|
2132
|
+
saved_flag = sc->flag;
|
2133
|
+
sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C);
|
2134
|
+
r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc);
|
2135
|
+
sc->flag = saved_flag;
|
2136
|
+
if (r != 0) {
|
2137
|
+
archive_wstring_free(&aws);
|
2138
|
+
if (errno != ENOMEM)
|
2139
|
+
archive_string_append(as, s, length);
|
2140
|
+
return (-1);
|
2141
|
+
}
|
2142
|
+
|
2143
|
+
l = as->length;
|
2144
|
+
r = archive_string_append_from_wcs_in_codepage(
|
2145
|
+
as, aws.s, aws.length, sc);
|
2146
|
+
if (r != 0 && errno != ENOMEM && l == as->length)
|
2147
|
+
archive_string_append(as, s, length);
|
2148
|
+
archive_wstring_free(&aws);
|
2149
|
+
return (r);
|
2150
|
+
}
|
2151
|
+
|
2152
|
+
/*
|
2153
|
+
* Test whether MBS ==> WCS is okay.
|
2154
|
+
*/
|
2155
|
+
static int
|
2156
|
+
invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
|
2157
|
+
{
|
2158
|
+
const char *p = (const char *)_p;
|
2159
|
+
unsigned codepage;
|
2160
|
+
DWORD mbflag = MB_ERR_INVALID_CHARS;
|
2161
|
+
|
2162
|
+
if (sc->flag & SCONV_FROM_CHARSET)
|
2163
|
+
codepage = sc->to_cp;
|
2164
|
+
else
|
2165
|
+
codepage = sc->from_cp;
|
2166
|
+
|
2167
|
+
if (codepage == CP_C_LOCALE)
|
2168
|
+
return (0);
|
2169
|
+
if (codepage != CP_UTF8)
|
2170
|
+
mbflag |= MB_PRECOMPOSED;
|
2171
|
+
|
2172
|
+
if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0)
|
2173
|
+
return (-1); /* Invalid */
|
2174
|
+
return (0); /* Okay */
|
2175
|
+
}
|
2176
|
+
|
2177
|
+
#else
|
2178
|
+
|
2179
|
+
/*
 * Test whether MBS ==> WCS is okay (non-Windows variant).
 *
 * Walks the first `n` bytes of `_p` one multibyte character at a time
 * with mbrtowc() (or mbtowc() when HAVE_MBRTOWC is not set), starting
 * from a cleared shift state.  Decoding a NUL character stops the walk
 * early.  `sc` is not used.
 *
 * Returns 0 if every character decoded, -1 as soon as the decoder
 * reports (size_t)-1 or (size_t)-2.
 */
static int
invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
{
	const char *cursor = (const char *)_p;
	size_t left = n;
#if HAVE_MBRTOWC
	mbstate_t state;
#endif

	(void)sc; /* UNUSED */

#if HAVE_MBRTOWC
	memset(&state, 0, sizeof(state));
#else
	/* Reset the decoder's internal shift state. */
	mbtowc(NULL, NULL, 0);
#endif
	while (left != 0) {
		wchar_t decoded;
		size_t used;

#if HAVE_MBRTOWC
		used = mbrtowc(&decoded, cursor, left, &state);
#else
		used = mbtowc(&decoded, cursor, left);
#endif
		if (used == (size_t)-1 || used == (size_t)-2)
			return (-1); /* Invalid. */
		if (used == 0)
			return (0); /* Reached a NUL character. */
		cursor += used;
		left -= used;
	}
	return (0); /* All okay. */
}
|
2214
|
+
|
2215
|
+
#endif /* defined(_WIN32) && !defined(__CYGWIN__) */
|
2216
|
+
|
2217
|
+
/*
|
2218
|
+
* Basically returns -1 because we cannot make a conversion of charset
|
2219
|
+
* without iconv but in some cases this would return 0.
|
2220
|
+
* Returns 0 if all copied characters are ASCII.
|
2221
|
+
* Returns 0 if both from-locale and to-locale are the same and those
|
2222
|
+
* can be WCS with no error.
|
2223
|
+
*/
|
2224
|
+
static int
|
2225
|
+
best_effort_strncat_in_locale(struct archive_string *as, const void *_p,
|
2226
|
+
size_t length, struct archive_string_conv *sc)
|
2227
|
+
{
|
2228
|
+
size_t remaining;
|
2229
|
+
const uint8_t *itp;
|
2230
|
+
int return_value = 0; /* success */
|
2231
|
+
|
2232
|
+
/*
|
2233
|
+
* If both from-locale and to-locale is the same, this makes a copy.
|
2234
|
+
* And then this checks all copied MBS can be WCS if so returns 0.
|
2235
|
+
*/
|
2236
|
+
if (sc->same) {
|
2237
|
+
if (archive_string_append(as, _p, length) == NULL)
|
2238
|
+
return (-1);/* No memory */
|
2239
|
+
return (invalid_mbs(_p, length, sc));
|
2240
|
+
}
|
2241
|
+
|
2242
|
+
/*
|
2243
|
+
* If a character is ASCII, this just copies it. If not, this
|
2244
|
+
* assigns '?' character instead but in UTF-8 locale this assigns
|
2245
|
+
* byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD,
|
2246
|
+
* a Replacement Character in Unicode.
|
2247
|
+
*/
|
2248
|
+
|
2249
|
+
remaining = length;
|
2250
|
+
itp = (const uint8_t *)_p;
|
2251
|
+
while (*itp && remaining > 0) {
|
2252
|
+
if (*itp > 127) {
|
2253
|
+
// Non-ASCII: Substitute with suitable replacement
|
2254
|
+
if (sc->flag & SCONV_TO_UTF8) {
|
2255
|
+
if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) {
|
2256
|
+
__archive_errx(1, "Out of memory");
|
2257
|
+
}
|
2258
|
+
} else {
|
2259
|
+
archive_strappend_char(as, '?');
|
2260
|
+
}
|
2261
|
+
return_value = -1;
|
2262
|
+
} else {
|
2263
|
+
archive_strappend_char(as, *itp);
|
2264
|
+
}
|
2265
|
+
++itp;
|
2266
|
+
}
|
2267
|
+
return (return_value);
|
2268
|
+
}
|
2269
|
+
|
2270
|
+
|
2271
|
+
/*
|
2272
|
+
* Unicode conversion functions.
|
2273
|
+
* - UTF-8 <===> UTF-8 in removing surrogate pairs.
|
2274
|
+
* - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs.
|
2275
|
+
* - UTF-8 made by libarchive 2.x ===> UTF-8.
|
2276
|
+
* - UTF-16BE <===> UTF-8.
|
2277
|
+
*
|
2278
|
+
*/
|
2279
|
+
|
2280
|
+
/*
|
2281
|
+
* Utility to convert a single UTF-8 sequence.
|
2282
|
+
*
|
2283
|
+
* Usually return used bytes, return used byte in negative value when
|
2284
|
+
* a unicode character is replaced with U+FFFD.
|
2285
|
+
* See also http://unicode.org/review/pr-121.html Public Review Issue #121
|
2286
|
+
* Recommended Practice for Replacement Characters.
|
2287
|
+
*/
|
2288
|
+
static int
|
2289
|
+
_utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
|
2290
|
+
{
|
2291
|
+
static const char utf8_count[256] = {
|
2292
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */
|
2293
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */
|
2294
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */
|
2295
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */
|
2296
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */
|
2297
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */
|
2298
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */
|
2299
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */
|
2300
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */
|
2301
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */
|
2302
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */
|
2303
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */
|
2304
|
+
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */
|
2305
|
+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */
|
2306
|
+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */
|
2307
|
+
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */
|
2308
|
+
};
|
2309
|
+
int ch, i;
|
2310
|
+
int cnt;
|
2311
|
+
uint32_t wc;
|
2312
|
+
|
2313
|
+
/* Sanity check. */
|
2314
|
+
if (n == 0)
|
2315
|
+
return (0);
|
2316
|
+
/*
|
2317
|
+
* Decode 1-4 bytes depending on the value of the first byte.
|
2318
|
+
*/
|
2319
|
+
ch = (unsigned char)*s;
|
2320
|
+
if (ch == 0)
|
2321
|
+
return (0); /* Standard: return 0 for end-of-string. */
|
2322
|
+
cnt = utf8_count[ch];
|
2323
|
+
|
2324
|
+
/* Invalid sequence or there are not plenty bytes. */
|
2325
|
+
if ((int)n < cnt) {
|
2326
|
+
cnt = (int)n;
|
2327
|
+
for (i = 1; i < cnt; i++) {
|
2328
|
+
if ((s[i] & 0xc0) != 0x80) {
|
2329
|
+
cnt = i;
|
2330
|
+
break;
|
2331
|
+
}
|
2332
|
+
}
|
2333
|
+
goto invalid_sequence;
|
2334
|
+
}
|
2335
|
+
|
2336
|
+
/* Make a Unicode code point from a single UTF-8 sequence. */
|
2337
|
+
switch (cnt) {
|
2338
|
+
case 1: /* 1 byte sequence. */
|
2339
|
+
*pwc = ch & 0x7f;
|
2340
|
+
return (cnt);
|
2341
|
+
case 2: /* 2 bytes sequence. */
|
2342
|
+
if ((s[1] & 0xc0) != 0x80) {
|
2343
|
+
cnt = 1;
|
2344
|
+
goto invalid_sequence;
|
2345
|
+
}
|
2346
|
+
*pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f);
|
2347
|
+
return (cnt);
|
2348
|
+
case 3: /* 3 bytes sequence. */
|
2349
|
+
if ((s[1] & 0xc0) != 0x80) {
|
2350
|
+
cnt = 1;
|
2351
|
+
goto invalid_sequence;
|
2352
|
+
}
|
2353
|
+
if ((s[2] & 0xc0) != 0x80) {
|
2354
|
+
cnt = 2;
|
2355
|
+
goto invalid_sequence;
|
2356
|
+
}
|
2357
|
+
wc = ((ch & 0x0f) << 12)
|
2358
|
+
| ((s[1] & 0x3f) << 6)
|
2359
|
+
| (s[2] & 0x3f);
|
2360
|
+
if (wc < 0x800)
|
2361
|
+
goto invalid_sequence;/* Overlong sequence. */
|
2362
|
+
break;
|
2363
|
+
case 4: /* 4 bytes sequence. */
|
2364
|
+
if ((s[1] & 0xc0) != 0x80) {
|
2365
|
+
cnt = 1;
|
2366
|
+
goto invalid_sequence;
|
2367
|
+
}
|
2368
|
+
if ((s[2] & 0xc0) != 0x80) {
|
2369
|
+
cnt = 2;
|
2370
|
+
goto invalid_sequence;
|
2371
|
+
}
|
2372
|
+
if ((s[3] & 0xc0) != 0x80) {
|
2373
|
+
cnt = 3;
|
2374
|
+
goto invalid_sequence;
|
2375
|
+
}
|
2376
|
+
wc = ((ch & 0x07) << 18)
|
2377
|
+
| ((s[1] & 0x3f) << 12)
|
2378
|
+
| ((s[2] & 0x3f) << 6)
|
2379
|
+
| (s[3] & 0x3f);
|
2380
|
+
if (wc < 0x10000)
|
2381
|
+
goto invalid_sequence;/* Overlong sequence. */
|
2382
|
+
break;
|
2383
|
+
default: /* Others are all invalid sequence. */
|
2384
|
+
if (ch == 0xc0 || ch == 0xc1)
|
2385
|
+
cnt = 2;
|
2386
|
+
else if (ch >= 0xf5 && ch <= 0xf7)
|
2387
|
+
cnt = 4;
|
2388
|
+
else if (ch >= 0xf8 && ch <= 0xfb)
|
2389
|
+
cnt = 5;
|
2390
|
+
else if (ch == 0xfc || ch == 0xfd)
|
2391
|
+
cnt = 6;
|
2392
|
+
else
|
2393
|
+
cnt = 1;
|
2394
|
+
if ((int)n < cnt)
|
2395
|
+
cnt = (int)n;
|
2396
|
+
for (i = 1; i < cnt; i++) {
|
2397
|
+
if ((s[i] & 0xc0) != 0x80) {
|
2398
|
+
cnt = i;
|
2399
|
+
break;
|
2400
|
+
}
|
2401
|
+
}
|
2402
|
+
goto invalid_sequence;
|
2403
|
+
}
|
2404
|
+
|
2405
|
+
/* The code point larger than 0x10FFFF is not legal
|
2406
|
+
* Unicode values. */
|
2407
|
+
if (wc > UNICODE_MAX)
|
2408
|
+
goto invalid_sequence;
|
2409
|
+
/* Correctly gets a Unicode, returns used bytes. */
|
2410
|
+
*pwc = wc;
|
2411
|
+
return (cnt);
|
2412
|
+
invalid_sequence:
|
2413
|
+
*pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
|
2414
|
+
return (cnt * -1);
|
2415
|
+
}
|
2416
|
+
|
2417
|
+
/*
 * Strict UTF-8 decoder: same as _utf8_to_unicode() except that a
 * three-byte sequence decoding to a surrogate code point is reported as
 * invalid (-3).  In that case *pwc keeps the surrogate value rather than
 * the replacement character.
 */
static int
utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int used = _utf8_to_unicode(pwc, s, n);

	/* Surrogate values are not legal Unicode scalar values. */
	const int is_surrogate = (used == 3) && IS_SURROGATE_PAIR_LA(*pwc);

	return (is_surrogate ? -3 : used);
}
|
2428
|
+
|
2429
|
+
/*
 * Build one code point from a UTF-16 surrogate pair: `uc` is the high
 * (leading) unit and `uc2` the low (trailing) unit.  The inputs are not
 * validated.
 */
static inline uint32_t
combine_surrogate_pair(uint32_t uc, uint32_t uc2)
{
	const uint32_t high_bits = (uc - 0xD800) << 10;
	const uint32_t low_bits = uc2 - 0xDC00;

	return (high_bits + low_bits + 0x10000);
}
|
2438
|
+
|
2439
|
+
/*
|
2440
|
+
* Convert a single UTF-8/CESU-8 sequence to a Unicode code point in
|
2441
|
+
* removing surrogate pairs.
|
2442
|
+
*
|
2443
|
+
* CESU-8: The Compatibility Encoding Scheme for UTF-16.
|
2444
|
+
*
|
2445
|
+
* Usually return used bytes, return used byte in negative value when
|
2446
|
+
* a unicode character is replaced with U+FFFD.
|
2447
|
+
*/
|
2448
|
+
static int
|
2449
|
+
cesu8_to_unicode(uint32_t *pwc, const char *s, size_t n)
|
2450
|
+
{
|
2451
|
+
uint32_t wc = 0;
|
2452
|
+
int cnt;
|
2453
|
+
|
2454
|
+
cnt = _utf8_to_unicode(&wc, s, n);
|
2455
|
+
if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) {
|
2456
|
+
uint32_t wc2 = 0;
|
2457
|
+
if (n - 3 < 3) {
|
2458
|
+
/* Invalid byte sequence. */
|
2459
|
+
goto invalid_sequence;
|
2460
|
+
}
|
2461
|
+
cnt = _utf8_to_unicode(&wc2, s+3, n-3);
|
2462
|
+
if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) {
|
2463
|
+
/* Invalid byte sequence. */
|
2464
|
+
goto invalid_sequence;
|
2465
|
+
}
|
2466
|
+
wc = combine_surrogate_pair(wc, wc2);
|
2467
|
+
cnt = 6;
|
2468
|
+
} else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) {
|
2469
|
+
/* Invalid byte sequence. */
|
2470
|
+
goto invalid_sequence;
|
2471
|
+
}
|
2472
|
+
*pwc = wc;
|
2473
|
+
return (cnt);
|
2474
|
+
invalid_sequence:
|
2475
|
+
*pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
|
2476
|
+
if (cnt > 0)
|
2477
|
+
cnt *= -1;
|
2478
|
+
return (cnt);
|
2479
|
+
}
|
2480
|
+
|
2481
|
+
/*
|
2482
|
+
* Convert a Unicode code point to a single UTF-8 sequence.
|
2483
|
+
*
|
2484
|
+
* NOTE:This function does not check if the Unicode is legal or not.
|
2485
|
+
* Please you definitely check it before calling this.
|
2486
|
+
*/
|
2487
|
+
static size_t
|
2488
|
+
unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
|
2489
|
+
{
|
2490
|
+
char *_p = p;
|
2491
|
+
|
2492
|
+
/* Invalid Unicode char maps to Replacement character */
|
2493
|
+
if (uc > UNICODE_MAX)
|
2494
|
+
uc = UNICODE_R_CHAR;
|
2495
|
+
/* Translate code point to UTF8 */
|
2496
|
+
if (uc <= 0x7f) {
|
2497
|
+
if (remaining == 0)
|
2498
|
+
return (0);
|
2499
|
+
*p++ = (char)uc;
|
2500
|
+
} else if (uc <= 0x7ff) {
|
2501
|
+
if (remaining < 2)
|
2502
|
+
return (0);
|
2503
|
+
*p++ = 0xc0 | ((uc >> 6) & 0x1f);
|
2504
|
+
*p++ = 0x80 | (uc & 0x3f);
|
2505
|
+
} else if (uc <= 0xffff) {
|
2506
|
+
if (remaining < 3)
|
2507
|
+
return (0);
|
2508
|
+
*p++ = 0xe0 | ((uc >> 12) & 0x0f);
|
2509
|
+
*p++ = 0x80 | ((uc >> 6) & 0x3f);
|
2510
|
+
*p++ = 0x80 | (uc & 0x3f);
|
2511
|
+
} else {
|
2512
|
+
if (remaining < 4)
|
2513
|
+
return (0);
|
2514
|
+
*p++ = 0xf0 | ((uc >> 18) & 0x07);
|
2515
|
+
*p++ = 0x80 | ((uc >> 12) & 0x3f);
|
2516
|
+
*p++ = 0x80 | ((uc >> 6) & 0x3f);
|
2517
|
+
*p++ = 0x80 | (uc & 0x3f);
|
2518
|
+
}
|
2519
|
+
return (p - _p);
|
2520
|
+
}
|
2521
|
+
|
2522
|
+
/*
 * Decode one code point from big-endian UTF-16; see utf16_to_unicode().
 */
static int
utf16be_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 1;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}
|
2527
|
+
|
2528
|
+
/*
 * Decode one code point from little-endian UTF-16; see
 * utf16_to_unicode().
 */
static int
utf16le_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 0;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}
|
2533
|
+
|
2534
|
+
static int
|
2535
|
+
utf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be)
|
2536
|
+
{
|
2537
|
+
const char *utf16 = s;
|
2538
|
+
unsigned uc;
|
2539
|
+
|
2540
|
+
if (n == 0)
|
2541
|
+
return (0);
|
2542
|
+
if (n == 1) {
|
2543
|
+
/* set the Replacement Character instead. */
|
2544
|
+
*pwc = UNICODE_R_CHAR;
|
2545
|
+
return (-1);
|
2546
|
+
}
|
2547
|
+
|
2548
|
+
if (be)
|
2549
|
+
uc = archive_be16dec(utf16);
|
2550
|
+
else
|
2551
|
+
uc = archive_le16dec(utf16);
|
2552
|
+
utf16 += 2;
|
2553
|
+
|
2554
|
+
/* If this is a surrogate pair, assemble the full code point.*/
|
2555
|
+
if (IS_HIGH_SURROGATE_LA(uc)) {
|
2556
|
+
unsigned uc2;
|
2557
|
+
|
2558
|
+
if (n >= 4) {
|
2559
|
+
if (be)
|
2560
|
+
uc2 = archive_be16dec(utf16);
|
2561
|
+
else
|
2562
|
+
uc2 = archive_le16dec(utf16);
|
2563
|
+
} else
|
2564
|
+
uc2 = 0;
|
2565
|
+
if (IS_LOW_SURROGATE_LA(uc2)) {
|
2566
|
+
uc = combine_surrogate_pair(uc, uc2);
|
2567
|
+
utf16 += 2;
|
2568
|
+
} else {
|
2569
|
+
/* Undescribed code point should be U+FFFD
|
2570
|
+
* (replacement character). */
|
2571
|
+
*pwc = UNICODE_R_CHAR;
|
2572
|
+
return (-2);
|
2573
|
+
}
|
2574
|
+
}
|
2575
|
+
|
2576
|
+
/*
|
2577
|
+
* Surrogate pair values(0xd800 through 0xdfff) are only
|
2578
|
+
* used by UTF-16, so, after above calculation, the code
|
2579
|
+
* must not be surrogate values, and Unicode has no codes
|
2580
|
+
* larger than 0x10ffff. Thus, those are not legal Unicode
|
2581
|
+
* values.
|
2582
|
+
*/
|
2583
|
+
if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) {
|
2584
|
+
/* Undescribed code point should be U+FFFD
|
2585
|
+
* (replacement character). */
|
2586
|
+
*pwc = UNICODE_R_CHAR;
|
2587
|
+
return (((int)(utf16 - s)) * -1);
|
2588
|
+
}
|
2589
|
+
*pwc = uc;
|
2590
|
+
return ((int)(utf16 - s));
|
2591
|
+
}
|
2592
|
+
|
2593
|
+
/*
 * Encode the code point `uc` as big-endian UTF-16 at `p`.
 *
 * Returns the number of bytes written: 2, or 4 when `uc` is above 0xffff
 * and is written as a surrogate pair.  Returns 0 without writing when
 * `remaining` is too small.
 */
static size_t
unicode_to_utf16be(char *p, size_t remaining, uint32_t uc)
{
	uint32_t offset;

	if (uc <= 0xffff) {
		/* Fits into a single 16-bit unit. */
		if (remaining < 2)
			return (0);
		archive_be16enc(p, uc);
		return (2);
	}

	/* Does not fit into 16 bits; write it as a surrogate pair. */
	if (remaining < 4)
		return (0);
	offset = uc - 0x10000;
	archive_be16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
	archive_be16enc(p + 2, (offset & 0x3ff) + 0xDC00);
	return (4);
}
|
2614
|
+
|
2615
|
+
/*
 * Encode the code point `uc` as little-endian UTF-16 at `p`.
 *
 * Returns the number of bytes written: 2, or 4 when `uc` is above 0xffff
 * and is written as a surrogate pair.  Returns 0 without writing when
 * `remaining` is too small.
 */
static size_t
unicode_to_utf16le(char *p, size_t remaining, uint32_t uc)
{
	uint32_t offset;

	if (uc <= 0xffff) {
		/* Fits into a single 16-bit unit. */
		if (remaining < 2)
			return (0);
		archive_le16enc(p, uc);
		return (2);
	}

	/* Does not fit into 16 bits; write it as a surrogate pair. */
	if (remaining < 4)
		return (0);
	offset = uc - 0x10000;
	archive_le16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
	archive_le16enc(p + 2, (offset & 0x3ff) + 0xDC00);
	return (4);
}
|
2636
|
+
|
2637
|
+
/*
|
2638
|
+
* Copy UTF-8 string in checking surrogate pair.
|
2639
|
+
* If any surrogate pair are found, it would be canonicalized.
|
2640
|
+
*/
|
2641
|
+
static int
|
2642
|
+
strncat_from_utf8_to_utf8(struct archive_string *as, const void *_p,
|
2643
|
+
size_t len, struct archive_string_conv *sc)
|
2644
|
+
{
|
2645
|
+
const char *s;
|
2646
|
+
char *p, *endp;
|
2647
|
+
int n, ret = 0;
|
2648
|
+
|
2649
|
+
(void)sc; /* UNUSED */
|
2650
|
+
|
2651
|
+
if (archive_string_ensure(as, as->length + len + 1) == NULL)
|
2652
|
+
return (-1);
|
2653
|
+
|
2654
|
+
s = (const char *)_p;
|
2655
|
+
p = as->s + as->length;
|
2656
|
+
endp = as->s + as->buffer_length -1;
|
2657
|
+
do {
|
2658
|
+
uint32_t uc;
|
2659
|
+
const char *ss = s;
|
2660
|
+
size_t w;
|
2661
|
+
|
2662
|
+
/*
|
2663
|
+
* Forward byte sequence until a conversion of that is needed.
|
2664
|
+
*/
|
2665
|
+
while ((n = utf8_to_unicode(&uc, s, len)) > 0) {
|
2666
|
+
s += n;
|
2667
|
+
len -= n;
|
2668
|
+
}
|
2669
|
+
if (ss < s) {
|
2670
|
+
if (p + (s - ss) > endp) {
|
2671
|
+
as->length = p - as->s;
|
2672
|
+
if (archive_string_ensure(as,
|
2673
|
+
as->buffer_length + len + 1) == NULL)
|
2674
|
+
return (-1);
|
2675
|
+
p = as->s + as->length;
|
2676
|
+
endp = as->s + as->buffer_length -1;
|
2677
|
+
}
|
2678
|
+
|
2679
|
+
memcpy(p, ss, s - ss);
|
2680
|
+
p += s - ss;
|
2681
|
+
}
|
2682
|
+
|
2683
|
+
/*
|
2684
|
+
* If n is negative, current byte sequence needs a replacement.
|
2685
|
+
*/
|
2686
|
+
if (n < 0) {
|
2687
|
+
if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) {
|
2688
|
+
/* Current byte sequence may be CESU-8. */
|
2689
|
+
n = cesu8_to_unicode(&uc, s, len);
|
2690
|
+
}
|
2691
|
+
if (n < 0) {
|
2692
|
+
ret = -1;
|
2693
|
+
n *= -1;/* Use a replaced unicode character. */
|
2694
|
+
}
|
2695
|
+
|
2696
|
+
/* Rebuild UTF-8 byte sequence. */
|
2697
|
+
while ((w = unicode_to_utf8(p, endp - p, uc)) == 0) {
|
2698
|
+
as->length = p - as->s;
|
2699
|
+
if (archive_string_ensure(as,
|
2700
|
+
as->buffer_length + len + 1) == NULL)
|
2701
|
+
return (-1);
|
2702
|
+
p = as->s + as->length;
|
2703
|
+
endp = as->s + as->buffer_length -1;
|
2704
|
+
}
|
2705
|
+
p += w;
|
2706
|
+
s += n;
|
2707
|
+
len -= n;
|
2708
|
+
}
|
2709
|
+
} while (n > 0);
|
2710
|
+
as->length = p - as->s;
|
2711
|
+
as->s[as->length] = '\0';
|
2712
|
+
return (ret);
|
2713
|
+
}
|
2714
|
+
|
2715
|
+
static int
|
2716
|
+
archive_string_append_unicode(struct archive_string *as, const void *_p,
|
2717
|
+
size_t len, struct archive_string_conv *sc)
|
2718
|
+
{
|
2719
|
+
const char *s;
|
2720
|
+
char *p, *endp;
|
2721
|
+
uint32_t uc;
|
2722
|
+
size_t w;
|
2723
|
+
int n, ret = 0, ts, tm;
|
2724
|
+
int (*parse)(uint32_t *, const char *, size_t);
|
2725
|
+
size_t (*unparse)(char *, size_t, uint32_t);
|
2726
|
+
|
2727
|
+
if (sc->flag & SCONV_TO_UTF16BE) {
|
2728
|
+
unparse = unicode_to_utf16be;
|
2729
|
+
ts = 2;
|
2730
|
+
} else if (sc->flag & SCONV_TO_UTF16LE) {
|
2731
|
+
unparse = unicode_to_utf16le;
|
2732
|
+
ts = 2;
|
2733
|
+
} else if (sc->flag & SCONV_TO_UTF8) {
|
2734
|
+
unparse = unicode_to_utf8;
|
2735
|
+
ts = 1;
|
2736
|
+
} else {
|
2737
|
+
/*
|
2738
|
+
* This case is going to be converted to another
|
2739
|
+
* character-set through iconv.
|
2740
|
+
*/
|
2741
|
+
if (sc->flag & SCONV_FROM_UTF16BE) {
|
2742
|
+
unparse = unicode_to_utf16be;
|
2743
|
+
ts = 2;
|
2744
|
+
} else if (sc->flag & SCONV_FROM_UTF16LE) {
|
2745
|
+
unparse = unicode_to_utf16le;
|
2746
|
+
ts = 2;
|
2747
|
+
} else {
|
2748
|
+
unparse = unicode_to_utf8;
|
2749
|
+
ts = 1;
|
2750
|
+
}
|
2751
|
+
}
|
2752
|
+
|
2753
|
+
if (sc->flag & SCONV_FROM_UTF16BE) {
|
2754
|
+
parse = utf16be_to_unicode;
|
2755
|
+
tm = 1;
|
2756
|
+
} else if (sc->flag & SCONV_FROM_UTF16LE) {
|
2757
|
+
parse = utf16le_to_unicode;
|
2758
|
+
tm = 1;
|
2759
|
+
} else {
|
2760
|
+
parse = cesu8_to_unicode;
|
2761
|
+
tm = ts;
|
2762
|
+
}
|
2763
|
+
|
2764
|
+
if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
|
2765
|
+
return (-1);
|
2766
|
+
|
2767
|
+
s = (const char *)_p;
|
2768
|
+
p = as->s + as->length;
|
2769
|
+
endp = as->s + as->buffer_length - ts;
|
2770
|
+
while ((n = parse(&uc, s, len)) != 0) {
|
2771
|
+
if (n < 0) {
|
2772
|
+
/* Use a replaced unicode character. */
|
2773
|
+
n *= -1;
|
2774
|
+
ret = -1;
|
2775
|
+
}
|
2776
|
+
s += n;
|
2777
|
+
len -= n;
|
2778
|
+
while ((w = unparse(p, endp - p, uc)) == 0) {
|
2779
|
+
/* There is not enough output buffer so
|
2780
|
+
* we have to expand it. */
|
2781
|
+
as->length = p - as->s;
|
2782
|
+
if (archive_string_ensure(as,
|
2783
|
+
as->buffer_length + len * tm + ts) == NULL)
|
2784
|
+
return (-1);
|
2785
|
+
p = as->s + as->length;
|
2786
|
+
endp = as->s + as->buffer_length - ts;
|
2787
|
+
}
|
2788
|
+
p += w;
|
2789
|
+
}
|
2790
|
+
as->length = p - as->s;
|
2791
|
+
as->s[as->length] = '\0';
|
2792
|
+
if (ts == 2)
|
2793
|
+
as->s[as->length+1] = '\0';
|
2794
|
+
return (ret);
|
2795
|
+
}
|
2796
|
+
|
2797
|
+
/*
|
2798
|
+
* Following Constants for Hangul compositions this information comes from
|
2799
|
+
* Unicode Standard Annex #15 http://unicode.org/reports/tr15/
|
2800
|
+
*/
|
2801
|
+
#define HC_SBASE 0xAC00
|
2802
|
+
#define HC_LBASE 0x1100
|
2803
|
+
#define HC_VBASE 0x1161
|
2804
|
+
#define HC_TBASE 0x11A7
|
2805
|
+
#define HC_LCOUNT 19
|
2806
|
+
#define HC_VCOUNT 21
|
2807
|
+
#define HC_TCOUNT 28
|
2808
|
+
#define HC_NCOUNT (HC_VCOUNT * HC_TCOUNT)
|
2809
|
+
#define HC_SCOUNT (HC_LCOUNT * HC_NCOUNT)
|
2810
|
+
|
2811
|
+
static uint32_t
|
2812
|
+
get_nfc(uint32_t uc, uint32_t uc2)
|
2813
|
+
{
|
2814
|
+
int t, b;
|
2815
|
+
|
2816
|
+
t = 0;
|
2817
|
+
b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1;
|
2818
|
+
while (b >= t) {
|
2819
|
+
int m = (t + b) / 2;
|
2820
|
+
if (u_composition_table[m].cp1 < uc)
|
2821
|
+
t = m + 1;
|
2822
|
+
else if (u_composition_table[m].cp1 > uc)
|
2823
|
+
b = m - 1;
|
2824
|
+
else if (u_composition_table[m].cp2 < uc2)
|
2825
|
+
t = m + 1;
|
2826
|
+
else if (u_composition_table[m].cp2 > uc2)
|
2827
|
+
b = m - 1;
|
2828
|
+
else
|
2829
|
+
return (u_composition_table[m].nfc);
|
2830
|
+
}
|
2831
|
+
return (0);
|
2832
|
+
}
|
2833
|
+
|
2834
|
+
#define FDC_MAX 10 /* The maximum number of Following Decomposable
|
2835
|
+
* Characters. */
|
2836
|
+
|
2837
|
+
/*
|
2838
|
+
* Update first code point.
|
2839
|
+
*/
|
2840
|
+
#define UPDATE_UC(new_uc) do { \
|
2841
|
+
uc = new_uc; \
|
2842
|
+
ucptr = NULL; \
|
2843
|
+
} while (0)
|
2844
|
+
|
2845
|
+
/*
|
2846
|
+
* Replace first code point with second code point.
|
2847
|
+
*/
|
2848
|
+
#define REPLACE_UC_WITH_UC2() do { \
|
2849
|
+
uc = uc2; \
|
2850
|
+
ucptr = uc2ptr; \
|
2851
|
+
n = n2; \
|
2852
|
+
} while (0)
|
2853
|
+
|
2854
|
+
#define EXPAND_BUFFER() do { \
|
2855
|
+
as->length = p - as->s; \
|
2856
|
+
if (archive_string_ensure(as, \
|
2857
|
+
as->buffer_length + len * tm + ts) == NULL)\
|
2858
|
+
return (-1); \
|
2859
|
+
p = as->s + as->length; \
|
2860
|
+
endp = as->s + as->buffer_length - ts; \
|
2861
|
+
} while (0)
|
2862
|
+
|
2863
|
+
#define UNPARSE(p, endp, uc) do { \
|
2864
|
+
while ((w = unparse(p, (endp) - (p), uc)) == 0) {\
|
2865
|
+
EXPAND_BUFFER(); \
|
2866
|
+
} \
|
2867
|
+
p += w; \
|
2868
|
+
} while (0)
|
2869
|
+
|
2870
|
+
/*
|
2871
|
+
* Write first code point.
|
2872
|
+
* If the code point has not been changed from its original code,
|
2873
|
+
* this just copies it from its original buffer pointer.
|
2874
|
+
* If not, this converts it to UTF-8 byte sequence and copies it.
|
2875
|
+
*/
|
2876
|
+
#define WRITE_UC() do { \
|
2877
|
+
if (ucptr) { \
|
2878
|
+
if (p + n > endp) \
|
2879
|
+
EXPAND_BUFFER(); \
|
2880
|
+
switch (n) { \
|
2881
|
+
case 4: \
|
2882
|
+
*p++ = *ucptr++; \
|
2883
|
+
/* FALL THROUGH */ \
|
2884
|
+
case 3: \
|
2885
|
+
*p++ = *ucptr++; \
|
2886
|
+
/* FALL THROUGH */ \
|
2887
|
+
case 2: \
|
2888
|
+
*p++ = *ucptr++; \
|
2889
|
+
/* FALL THROUGH */ \
|
2890
|
+
case 1: \
|
2891
|
+
*p++ = *ucptr; \
|
2892
|
+
break; \
|
2893
|
+
} \
|
2894
|
+
ucptr = NULL; \
|
2895
|
+
} else { \
|
2896
|
+
UNPARSE(p, endp, uc); \
|
2897
|
+
} \
|
2898
|
+
} while (0)
|
2899
|
+
|
2900
|
+
/*
|
2901
|
+
* Collect following decomposable code points.
|
2902
|
+
*/
|
2903
|
+
#define COLLECT_CPS(start) do { \
|
2904
|
+
int _i; \
|
2905
|
+
for (_i = start; _i < FDC_MAX ; _i++) { \
|
2906
|
+
nx = parse(&ucx[_i], s, len); \
|
2907
|
+
if (nx <= 0) \
|
2908
|
+
break; \
|
2909
|
+
cx = CCC(ucx[_i]); \
|
2910
|
+
if (cl >= cx && cl != 228 && cx != 228)\
|
2911
|
+
break; \
|
2912
|
+
s += nx; \
|
2913
|
+
len -= nx; \
|
2914
|
+
cl = cx; \
|
2915
|
+
ccx[_i] = cx; \
|
2916
|
+
} \
|
2917
|
+
if (_i >= FDC_MAX) { \
|
2918
|
+
ret = -1; \
|
2919
|
+
ucx_size = FDC_MAX; \
|
2920
|
+
} else \
|
2921
|
+
ucx_size = _i; \
|
2922
|
+
} while (0)
|
2923
|
+
|
2924
|
+
/*
|
2925
|
+
* Normalize UTF-8/UTF-16BE characters to Form C and copy the result.
|
2926
|
+
*
|
2927
|
+
* TODO: Convert composition exclusions, which are never converted
|
2928
|
+
* from NFC,NFD,NFKC and NFKD, to Form C.
|
2929
|
+
*/
|
2930
|
+
static int
|
2931
|
+
archive_string_normalize_C(struct archive_string *as, const void *_p,
|
2932
|
+
size_t len, struct archive_string_conv *sc)
|
2933
|
+
{
|
2934
|
+
const char *s = (const char *)_p;
|
2935
|
+
char *p, *endp;
|
2936
|
+
uint32_t uc, uc2;
|
2937
|
+
size_t w;
|
2938
|
+
int always_replace, n, n2, ret = 0, spair, ts, tm;
|
2939
|
+
int (*parse)(uint32_t *, const char *, size_t);
|
2940
|
+
size_t (*unparse)(char *, size_t, uint32_t);
|
2941
|
+
|
2942
|
+
always_replace = 1;
|
2943
|
+
ts = 1;/* text size. */
|
2944
|
+
if (sc->flag & SCONV_TO_UTF16BE) {
|
2945
|
+
unparse = unicode_to_utf16be;
|
2946
|
+
ts = 2;
|
2947
|
+
if (sc->flag & SCONV_FROM_UTF16BE)
|
2948
|
+
always_replace = 0;
|
2949
|
+
} else if (sc->flag & SCONV_TO_UTF16LE) {
|
2950
|
+
unparse = unicode_to_utf16le;
|
2951
|
+
ts = 2;
|
2952
|
+
if (sc->flag & SCONV_FROM_UTF16LE)
|
2953
|
+
always_replace = 0;
|
2954
|
+
} else if (sc->flag & SCONV_TO_UTF8) {
|
2955
|
+
unparse = unicode_to_utf8;
|
2956
|
+
if (sc->flag & SCONV_FROM_UTF8)
|
2957
|
+
always_replace = 0;
|
2958
|
+
} else {
|
2959
|
+
/*
|
2960
|
+
* This case is going to be converted to another
|
2961
|
+
* character-set through iconv.
|
2962
|
+
*/
|
2963
|
+
always_replace = 0;
|
2964
|
+
if (sc->flag & SCONV_FROM_UTF16BE) {
|
2965
|
+
unparse = unicode_to_utf16be;
|
2966
|
+
ts = 2;
|
2967
|
+
} else if (sc->flag & SCONV_FROM_UTF16LE) {
|
2968
|
+
unparse = unicode_to_utf16le;
|
2969
|
+
ts = 2;
|
2970
|
+
} else {
|
2971
|
+
unparse = unicode_to_utf8;
|
2972
|
+
}
|
2973
|
+
}
|
2974
|
+
|
2975
|
+
if (sc->flag & SCONV_FROM_UTF16BE) {
|
2976
|
+
parse = utf16be_to_unicode;
|
2977
|
+
tm = 1;
|
2978
|
+
spair = 4;/* surrogate pair size in UTF-16. */
|
2979
|
+
} else if (sc->flag & SCONV_FROM_UTF16LE) {
|
2980
|
+
parse = utf16le_to_unicode;
|
2981
|
+
tm = 1;
|
2982
|
+
spair = 4;/* surrogate pair size in UTF-16. */
|
2983
|
+
} else {
|
2984
|
+
parse = cesu8_to_unicode;
|
2985
|
+
tm = ts;
|
2986
|
+
spair = 6;/* surrogate pair size in UTF-8. */
|
2987
|
+
}
|
2988
|
+
|
2989
|
+
if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
|
2990
|
+
return (-1);
|
2991
|
+
|
2992
|
+
p = as->s + as->length;
|
2993
|
+
endp = as->s + as->buffer_length - ts;
|
2994
|
+
while ((n = parse(&uc, s, len)) != 0) {
|
2995
|
+
const char *ucptr, *uc2ptr;
|
2996
|
+
|
2997
|
+
if (n < 0) {
|
2998
|
+
/* Use a replaced unicode character. */
|
2999
|
+
UNPARSE(p, endp, uc);
|
3000
|
+
s += n*-1;
|
3001
|
+
len -= n*-1;
|
3002
|
+
ret = -1;
|
3003
|
+
continue;
|
3004
|
+
} else if (n == spair || always_replace)
|
3005
|
+
/* uc is converted from a surrogate pair.
|
3006
|
+
* this should be treated as a changed code. */
|
3007
|
+
ucptr = NULL;
|
3008
|
+
else
|
3009
|
+
ucptr = s;
|
3010
|
+
s += n;
|
3011
|
+
len -= n;
|
3012
|
+
|
3013
|
+
/* Read second code point. */
|
3014
|
+
while ((n2 = parse(&uc2, s, len)) > 0) {
|
3015
|
+
uint32_t ucx[FDC_MAX];
|
3016
|
+
int ccx[FDC_MAX];
|
3017
|
+
int cl, cx, i, nx, ucx_size;
|
3018
|
+
int LIndex,SIndex;
|
3019
|
+
uint32_t nfc;
|
3020
|
+
|
3021
|
+
if (n2 == spair || always_replace)
|
3022
|
+
/* uc2 is converted from a surrogate pair.
|
3023
|
+
* this should be treated as a changed code. */
|
3024
|
+
uc2ptr = NULL;
|
3025
|
+
else
|
3026
|
+
uc2ptr = s;
|
3027
|
+
s += n2;
|
3028
|
+
len -= n2;
|
3029
|
+
|
3030
|
+
/*
|
3031
|
+
* If current second code point is out of decomposable
|
3032
|
+
* code points, finding compositions is unneeded.
|
3033
|
+
*/
|
3034
|
+
if (!IS_DECOMPOSABLE_BLOCK(uc2)) {
|
3035
|
+
WRITE_UC();
|
3036
|
+
REPLACE_UC_WITH_UC2();
|
3037
|
+
continue;
|
3038
|
+
}
|
3039
|
+
|
3040
|
+
/*
|
3041
|
+
* Try to combine current code points.
|
3042
|
+
*/
|
3043
|
+
/*
|
3044
|
+
* We have to combine Hangul characters according to
|
3045
|
+
* http://unicode.org/reports/tr15/#Hangul
|
3046
|
+
*/
|
3047
|
+
if (0 <= (LIndex = uc - HC_LBASE) &&
|
3048
|
+
LIndex < HC_LCOUNT) {
|
3049
|
+
/*
|
3050
|
+
* Hangul Composition.
|
3051
|
+
* 1. Two current code points are L and V.
|
3052
|
+
*/
|
3053
|
+
int VIndex = uc2 - HC_VBASE;
|
3054
|
+
if (0 <= VIndex && VIndex < HC_VCOUNT) {
|
3055
|
+
/* Make syllable of form LV. */
|
3056
|
+
UPDATE_UC(HC_SBASE +
|
3057
|
+
(LIndex * HC_VCOUNT + VIndex) *
|
3058
|
+
HC_TCOUNT);
|
3059
|
+
} else {
|
3060
|
+
WRITE_UC();
|
3061
|
+
REPLACE_UC_WITH_UC2();
|
3062
|
+
}
|
3063
|
+
continue;
|
3064
|
+
} else if (0 <= (SIndex = uc - HC_SBASE) &&
|
3065
|
+
SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) {
|
3066
|
+
/*
|
3067
|
+
* Hangul Composition.
|
3068
|
+
* 2. Two current code points are LV and T.
|
3069
|
+
*/
|
3070
|
+
int TIndex = uc2 - HC_TBASE;
|
3071
|
+
if (0 < TIndex && TIndex < HC_TCOUNT) {
|
3072
|
+
/* Make syllable of form LVT. */
|
3073
|
+
UPDATE_UC(uc + TIndex);
|
3074
|
+
} else {
|
3075
|
+
WRITE_UC();
|
3076
|
+
REPLACE_UC_WITH_UC2();
|
3077
|
+
}
|
3078
|
+
continue;
|
3079
|
+
} else if ((nfc = get_nfc(uc, uc2)) != 0) {
|
3080
|
+
/* A composition to current code points
|
3081
|
+
* is found. */
|
3082
|
+
UPDATE_UC(nfc);
|
3083
|
+
continue;
|
3084
|
+
} else if ((cl = CCC(uc2)) == 0) {
|
3085
|
+
/* Clearly 'uc2' the second code point is not
|
3086
|
+
* a decomposable code. */
|
3087
|
+
WRITE_UC();
|
3088
|
+
REPLACE_UC_WITH_UC2();
|
3089
|
+
continue;
|
3090
|
+
}
|
3091
|
+
|
3092
|
+
/*
|
3093
|
+
* Collect following decomposable code points.
|
3094
|
+
*/
|
3095
|
+
cx = 0;
|
3096
|
+
ucx[0] = uc2;
|
3097
|
+
ccx[0] = cl;
|
3098
|
+
COLLECT_CPS(1);
|
3099
|
+
|
3100
|
+
/*
|
3101
|
+
* Find a composed code in the collected code points.
|
3102
|
+
*/
|
3103
|
+
i = 1;
|
3104
|
+
while (i < ucx_size) {
|
3105
|
+
int j;
|
3106
|
+
|
3107
|
+
if ((nfc = get_nfc(uc, ucx[i])) == 0) {
|
3108
|
+
i++;
|
3109
|
+
continue;
|
3110
|
+
}
|
3111
|
+
|
3112
|
+
/*
|
3113
|
+
* nfc is composed of uc and ucx[i].
|
3114
|
+
*/
|
3115
|
+
UPDATE_UC(nfc);
|
3116
|
+
|
3117
|
+
/*
|
3118
|
+
* Remove ucx[i] by shifting
|
3119
|
+
* following code points.
|
3120
|
+
*/
|
3121
|
+
for (j = i; j+1 < ucx_size; j++) {
|
3122
|
+
ucx[j] = ucx[j+1];
|
3123
|
+
ccx[j] = ccx[j+1];
|
3124
|
+
}
|
3125
|
+
ucx_size --;
|
3126
|
+
|
3127
|
+
/*
|
3128
|
+
* Collect following code points blocked
|
3129
|
+
* by ucx[i] the removed code point.
|
3130
|
+
*/
|
3131
|
+
if (ucx_size > 0 && i == ucx_size &&
|
3132
|
+
nx > 0 && cx == cl) {
|
3133
|
+
cl = ccx[ucx_size-1];
|
3134
|
+
COLLECT_CPS(ucx_size);
|
3135
|
+
}
|
3136
|
+
/*
|
3137
|
+
* Restart finding a composed code with
|
3138
|
+
* the updated uc from the top of the
|
3139
|
+
* collected code points.
|
3140
|
+
*/
|
3141
|
+
i = 0;
|
3142
|
+
}
|
3143
|
+
|
3144
|
+
/*
|
3145
|
+
* Apparently the current code points are not
|
3146
|
+
* decomposed characters or already composed.
|
3147
|
+
*/
|
3148
|
+
WRITE_UC();
|
3149
|
+
for (i = 0; i < ucx_size; i++)
|
3150
|
+
UNPARSE(p, endp, ucx[i]);
|
3151
|
+
|
3152
|
+
/*
|
3153
|
+
* Flush out remaining canonical combining characters.
|
3154
|
+
*/
|
3155
|
+
if (nx > 0 && cx == cl && len > 0) {
|
3156
|
+
while ((nx = parse(&ucx[0], s, len))
|
3157
|
+
> 0) {
|
3158
|
+
cx = CCC(ucx[0]);
|
3159
|
+
if (cl > cx)
|
3160
|
+
break;
|
3161
|
+
s += nx;
|
3162
|
+
len -= nx;
|
3163
|
+
cl = cx;
|
3164
|
+
UNPARSE(p, endp, ucx[0]);
|
3165
|
+
}
|
3166
|
+
}
|
3167
|
+
break;
|
3168
|
+
}
|
3169
|
+
if (n2 < 0) {
|
3170
|
+
WRITE_UC();
|
3171
|
+
/* Use a replaced unicode character. */
|
3172
|
+
UNPARSE(p, endp, uc2);
|
3173
|
+
s += n2*-1;
|
3174
|
+
len -= n2*-1;
|
3175
|
+
ret = -1;
|
3176
|
+
continue;
|
3177
|
+
} else if (n2 == 0) {
|
3178
|
+
WRITE_UC();
|
3179
|
+
break;
|
3180
|
+
}
|
3181
|
+
}
|
3182
|
+
as->length = p - as->s;
|
3183
|
+
as->s[as->length] = '\0';
|
3184
|
+
if (ts == 2)
|
3185
|
+
as->s[as->length+1] = '\0';
|
3186
|
+
return (ret);
|
3187
|
+
}
|
3188
|
+
|
3189
|
+
static int
|
3190
|
+
get_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc)
|
3191
|
+
{
|
3192
|
+
int t, b;
|
3193
|
+
|
3194
|
+
/*
|
3195
|
+
* These are not converted to NFD on Mac OS.
|
3196
|
+
*/
|
3197
|
+
if ((uc >= 0x2000 && uc <= 0x2FFF) ||
|
3198
|
+
(uc >= 0xF900 && uc <= 0xFAFF) ||
|
3199
|
+
(uc >= 0x2F800 && uc <= 0x2FAFF))
|
3200
|
+
return (0);
|
3201
|
+
/*
|
3202
|
+
* Those code points are not converted to NFD on Mac OS.
|
3203
|
+
* I do not know the reason because it is undocumented.
|
3204
|
+
* NFC NFD
|
3205
|
+
* 1109A ==> 11099 110BA
|
3206
|
+
* 1109C ==> 1109B 110BA
|
3207
|
+
* 110AB ==> 110A5 110BA
|
3208
|
+
*/
|
3209
|
+
if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB)
|
3210
|
+
return (0);
|
3211
|
+
|
3212
|
+
t = 0;
|
3213
|
+
b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1;
|
3214
|
+
while (b >= t) {
|
3215
|
+
int m = (t + b) / 2;
|
3216
|
+
if (u_decomposition_table[m].nfc < uc)
|
3217
|
+
t = m + 1;
|
3218
|
+
else if (u_decomposition_table[m].nfc > uc)
|
3219
|
+
b = m - 1;
|
3220
|
+
else {
|
3221
|
+
*cp1 = u_decomposition_table[m].cp1;
|
3222
|
+
*cp2 = u_decomposition_table[m].cp2;
|
3223
|
+
return (1);
|
3224
|
+
}
|
3225
|
+
}
|
3226
|
+
return (0);
|
3227
|
+
}
|
3228
|
+
|
3229
|
+
/*
 * Replace the current code point with `cp'.
 *
 * Operates on the locals `uc' and `ucptr' of the enclosing function.
 * Clearing `ucptr' marks the code point as "changed", i.e. no longer
 * backed by bytes of the input string (the same convention the
 * surrogate-pair handling uses when it sets the pointer to NULL).
 * The argument is parenthesized so that any expression can be passed.
 */
#define REPLACE_UC_WITH(cp) do {		\
	uc = (cp);				\
	ucptr = NULL;				\
} while (0)
|
3233
|
+
|
3234
|
+
/*
 * Normalize UTF-8 characters to Form D and copy the result.
 *
 * Appends the decomposed form of the `len' bytes at `_p' to `as'.
 * The input and output encodings (UTF-8 or UTF-16BE/LE) are selected
 * from the SCONV_FROM_xxx / SCONV_TO_xxx bits in sc->flag.
 *
 * Returns 0 on success, and -1 if the output buffer could not be
 * allocated or if parse() reported a negative length for some input
 * sequence (in that case the value parse() stored is still emitted and
 * conversion continues).
 *
 * NOTE(review): UNPARSE(), WRITE_UC(), CCC() and IS_DECOMPOSABLE_BLOCK()
 * are macros defined outside this part of the file; they presumably use
 * the locals p, endp, w, ts, unparse, uc and ucptr declared here.
 */
static int
archive_string_normalize_D(struct archive_string *as, const void *_p,
    size_t len, struct archive_string_conv *sc)
{
	const char *s = (const char *)_p;
	char *p, *endp;
	uint32_t uc, uc2;
	size_t w;	/* Not referenced directly; presumably used by the macros. */
	int always_replace, n, n2, ret = 0, spair, ts, tm;
	int (*parse)(uint32_t *, const char *, size_t);
	size_t (*unparse)(char *, size_t, uint32_t);

	/*
	 * Choose the output encoder.  always_replace stays 1 unless the
	 * input and output encodings are the same (or the result is going
	 * to iconv); when it is 1 every code point is treated as "changed"
	 * (ucptr == NULL) below.
	 */
	always_replace = 1;
	ts = 1;/* text size. */
	if (sc->flag & SCONV_TO_UTF16BE) {
		unparse = unicode_to_utf16be;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16BE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF16LE) {
		unparse = unicode_to_utf16le;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16LE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF8) {
		unparse = unicode_to_utf8;
		if (sc->flag & SCONV_FROM_UTF8)
			always_replace = 0;
	} else {
		/*
		 * This case is going to be converted to another
		 * character-set through iconv.
		 */
		always_replace = 0;
		if (sc->flag & SCONV_FROM_UTF16BE) {
			unparse = unicode_to_utf16be;
			ts = 2;
		} else if (sc->flag & SCONV_FROM_UTF16LE) {
			unparse = unicode_to_utf16le;
			ts = 2;
		} else {
			unparse = unicode_to_utf8;
		}
	}

	/*
	 * Choose the input decoder.  tm is the multiplier applied to `len'
	 * for the initial buffer size estimate below.
	 */
	if (sc->flag & SCONV_FROM_UTF16BE) {
		parse = utf16be_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else if (sc->flag & SCONV_FROM_UTF16LE) {
		parse = utf16le_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else {
		parse = cesu8_to_unicode;
		tm = ts;
		spair = 6;/* surrogate pair size in UTF-8. */
	}

	if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
		return (-1);

	p = as->s + as->length;
	/* Leave room for the ts-byte terminator written at the end. */
	endp = as->s + as->buffer_length - ts;
	while ((n = parse(&uc, s, len)) != 0) {
		const char *ucptr;
		uint32_t cp1, cp2;
		int SIndex;
		/* Pending code points that follow `uc', kept ordered by ccc. */
		struct {
			uint32_t uc;
			int ccc;
		} fdc[FDC_MAX];
		int fdi, fdj;
		int ccc;

		/*
		 * Also entered by the goto at the bottom of the loop body,
		 * with `uc'/`n' taken from the look-ahead code point so that
		 * it is not parsed a second time.
		 */
check_first_code:
		if (n < 0) {
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc);
			s += n*-1;
			len -= n*-1;
			ret = -1;
			continue;
		} else if (n == spair || always_replace)
			/* uc is converted from a surrogate pair.
			 * this should be treated as a changed code. */
			ucptr = NULL;
		else
			ucptr = s;
		s += n;
		len -= n;

		/* Hangul Decomposition. */
		if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) {
			int L = HC_LBASE + SIndex / HC_NCOUNT;
			int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT;
			int T = HC_TBASE + SIndex % HC_TCOUNT;

			REPLACE_UC_WITH(L);
			WRITE_UC();
			REPLACE_UC_WITH(V);
			WRITE_UC();
			/* T == HC_TBASE means the syllable has no trailing part. */
			if (T != HC_TBASE) {
				REPLACE_UC_WITH(T);
				WRITE_UC();
			}
			continue;
		}
		/* A code point with a non-zero CCC is written as it is. */
		if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) {
			WRITE_UC();
			continue;
		}

		/*
		 * Decompose uc repeatedly: each round pushes the second
		 * half (cp2) onto the front of fdc[] and continues with the
		 * first half (cp1) as the new uc.
		 */
		fdi = 0;
		while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) {
			int k;

			for (k = fdi; k > 0; k--)
				fdc[k] = fdc[k-1];
			fdc[0].ccc = CCC(cp2);
			fdc[0].uc = cp2;
			fdi++;
			REPLACE_UC_WITH(cp1);
		}

		/* Read following code points. */
		while ((n2 = parse(&uc2, s, len)) > 0 &&
		    (ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) {
			int j, k;

			s += n2;
			len -= n2;
			/*
			 * Insert uc2 in front of the first entry whose ccc
			 * is greater, so entries with an equal ccc keep
			 * their input order.
			 */
			for (j = 0; j < fdi; j++) {
				if (fdc[j].ccc > ccc)
					break;
			}
			if (j < fdi) {
				for (k = fdi; k > j; k--)
					fdc[k] = fdc[k-1];
				fdc[j].ccc = ccc;
				fdc[j].uc = uc2;
			} else {
				fdc[fdi].ccc = ccc;
				fdc[fdi].uc = uc2;
			}
			fdi++;
		}

		/* Emit the base code point, then the collected ones in order. */
		WRITE_UC();
		for (fdj = 0; fdj < fdi; fdj++) {
			REPLACE_UC_WITH(fdc[fdj].uc);
			WRITE_UC();
		}

		/* n2 == 0: parse() found nothing more to read. */
		if (n2 == 0)
			break;
		/*
		 * uc2 was parsed but not consumed (s/len were not advanced
		 * for it); make it the next first code point.  A negative
		 * n2 is handled by the n < 0 branch at check_first_code.
		 */
		REPLACE_UC_WITH(uc2);
		n = n2;
		goto check_first_code;
	}
	as->length = p - as->s;
	as->s[as->length] = '\0';
	/* UTF-16 output needs a two-byte terminator. */
	if (ts == 2)
		as->s[as->length+1] = '\0';
	return (ret);
}
|
3403
|
+
|
3404
|
+
/*
|
3405
|
+
* libarchive 2.x made incorrect UTF-8 strings in the wrong assumption
|
3406
|
+
* that WCS is Unicode. It is true for several platforms but some are false.
|
3407
|
+
* And then people who did not use UTF-8 locale on the non Unicode WCS
|
3408
|
+
* platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those
|
3409
|
+
* now cannot get right filename from libarchive 3.x and later since we
|
3410
|
+
* fixed the wrong assumption and it is incompatible to older its versions.
|
3411
|
+
* So we provide special option, "compat-2x.x", for resolving it.
|
3412
|
+
* That option enable the string conversion of libarchive 2.x.
|
3413
|
+
*
|
3414
|
+
* Translates the wrong UTF-8 string made by libarchive 2.x into current
|
3415
|
+
* locale character set and appends to the archive_string.
|
3416
|
+
* Note: returns -1 if conversion fails.
|
3417
|
+
*/
|
3418
|
+
static int
|
3419
|
+
strncat_from_utf8_libarchive2(struct archive_string *as,
|
3420
|
+
const void *_p, size_t len, struct archive_string_conv *sc)
|
3421
|
+
{
|
3422
|
+
const char *s;
|
3423
|
+
int n;
|
3424
|
+
char *p;
|
3425
|
+
char *end;
|
3426
|
+
uint32_t unicode;
|
3427
|
+
#if HAVE_WCRTOMB
|
3428
|
+
mbstate_t shift_state;
|
3429
|
+
|
3430
|
+
memset(&shift_state, 0, sizeof(shift_state));
|
3431
|
+
#else
|
3432
|
+
/* Clear the shift state before starting. */
|
3433
|
+
wctomb(NULL, L'\0');
|
3434
|
+
#endif
|
3435
|
+
(void)sc; /* UNUSED */
|
3436
|
+
/*
|
3437
|
+
* Allocate buffer for MBS.
|
3438
|
+
* We need this allocation here since it is possible that
|
3439
|
+
* as->s is still NULL.
|
3440
|
+
*/
|
3441
|
+
if (archive_string_ensure(as, as->length + len + 1) == NULL)
|
3442
|
+
return (-1);
|
3443
|
+
|
3444
|
+
s = (const char *)_p;
|
3445
|
+
p = as->s + as->length;
|
3446
|
+
end = as->s + as->buffer_length - MB_CUR_MAX -1;
|
3447
|
+
while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) {
|
3448
|
+
wchar_t wc;
|
3449
|
+
|
3450
|
+
if (p >= end) {
|
3451
|
+
as->length = p - as->s;
|
3452
|
+
/* Re-allocate buffer for MBS. */
|
3453
|
+
if (archive_string_ensure(as,
|
3454
|
+
as->length + max(len * 2,
|
3455
|
+
(size_t)MB_CUR_MAX) + 1) == NULL)
|
3456
|
+
return (-1);
|
3457
|
+
p = as->s + as->length;
|
3458
|
+
end = as->s + as->buffer_length - MB_CUR_MAX -1;
|
3459
|
+
}
|
3460
|
+
|
3461
|
+
/*
|
3462
|
+
* As libarchive 2.x, translates the UTF-8 characters into
|
3463
|
+
* wide-characters in the assumption that WCS is Unicode.
|
3464
|
+
*/
|
3465
|
+
if (n < 0) {
|
3466
|
+
n *= -1;
|
3467
|
+
wc = L'?';
|
3468
|
+
} else
|
3469
|
+
wc = (wchar_t)unicode;
|
3470
|
+
|
3471
|
+
s += n;
|
3472
|
+
len -= n;
|
3473
|
+
/*
|
3474
|
+
* Translates the wide-character into the current locale MBS.
|
3475
|
+
*/
|
3476
|
+
#if HAVE_WCRTOMB
|
3477
|
+
n = (int)wcrtomb(p, wc, &shift_state);
|
3478
|
+
#else
|
3479
|
+
n = (int)wctomb(p, wc);
|
3480
|
+
#endif
|
3481
|
+
if (n == -1)
|
3482
|
+
return (-1);
|
3483
|
+
p += n;
|
3484
|
+
}
|
3485
|
+
as->length = p - as->s;
|
3486
|
+
as->s[as->length] = '\0';
|
3487
|
+
return (0);
|
3488
|
+
}
|
3489
|
+
|
3490
|
+
|
3491
|
+
/*
|
3492
|
+
* Conversion functions between current locale dependent MBS and UTF-16BE.
|
3493
|
+
* strncat_from_utf16be() : UTF-16BE --> MBS
|
3494
|
+
* strncat_to_utf16be() : MBS --> UTF16BE
|
3495
|
+
*/
|
3496
|
+
|
3497
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
3498
|
+
|
3499
|
+
/*
|
3500
|
+
* Convert a UTF-16BE/LE string to current locale and copy the result.
|
3501
|
+
* Return -1 if conversion fails.
|
3502
|
+
*/
|
3503
|
+
static int
|
3504
|
+
win_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes,
|
3505
|
+
struct archive_string_conv *sc, int be)
|
3506
|
+
{
|
3507
|
+
struct archive_string tmp;
|
3508
|
+
const char *u16;
|
3509
|
+
int ll;
|
3510
|
+
BOOL defchar;
|
3511
|
+
char *mbs;
|
3512
|
+
size_t mbs_size, b;
|
3513
|
+
int ret = 0;
|
3514
|
+
|
3515
|
+
bytes &= ~1;
|
3516
|
+
if (archive_string_ensure(as, as->length + bytes +1) == NULL)
|
3517
|
+
return (-1);
|
3518
|
+
|
3519
|
+
mbs = as->s + as->length;
|
3520
|
+
mbs_size = as->buffer_length - as->length -1;
|
3521
|
+
|
3522
|
+
if (sc->to_cp == CP_C_LOCALE) {
|
3523
|
+
/*
|
3524
|
+
* "C" locale special process.
|
3525
|
+
*/
|
3526
|
+
u16 = _p;
|
3527
|
+
ll = 0;
|
3528
|
+
for (b = 0; b < bytes; b += 2) {
|
3529
|
+
uint16_t val;
|
3530
|
+
if (be)
|
3531
|
+
val = archive_be16dec(u16+b);
|
3532
|
+
else
|
3533
|
+
val = archive_le16dec(u16+b);
|
3534
|
+
if (val > 255) {
|
3535
|
+
*mbs++ = '?';
|
3536
|
+
ret = -1;
|
3537
|
+
} else
|
3538
|
+
*mbs++ = (char)(val&0xff);
|
3539
|
+
ll++;
|
3540
|
+
}
|
3541
|
+
as->length += ll;
|
3542
|
+
as->s[as->length] = '\0';
|
3543
|
+
return (ret);
|
3544
|
+
}
|
3545
|
+
|
3546
|
+
archive_string_init(&tmp);
|
3547
|
+
if (be) {
|
3548
|
+
if (is_big_endian()) {
|
3549
|
+
u16 = _p;
|
3550
|
+
} else {
|
3551
|
+
if (archive_string_ensure(&tmp, bytes+2) == NULL)
|
3552
|
+
return (-1);
|
3553
|
+
memcpy(tmp.s, _p, bytes);
|
3554
|
+
for (b = 0; b < bytes; b += 2) {
|
3555
|
+
uint16_t val = archive_be16dec(tmp.s+b);
|
3556
|
+
archive_le16enc(tmp.s+b, val);
|
3557
|
+
}
|
3558
|
+
u16 = tmp.s;
|
3559
|
+
}
|
3560
|
+
} else {
|
3561
|
+
if (!is_big_endian()) {
|
3562
|
+
u16 = _p;
|
3563
|
+
} else {
|
3564
|
+
if (archive_string_ensure(&tmp, bytes+2) == NULL)
|
3565
|
+
return (-1);
|
3566
|
+
memcpy(tmp.s, _p, bytes);
|
3567
|
+
for (b = 0; b < bytes; b += 2) {
|
3568
|
+
uint16_t val = archive_le16dec(tmp.s+b);
|
3569
|
+
archive_be16enc(tmp.s+b, val);
|
3570
|
+
}
|
3571
|
+
u16 = tmp.s;
|
3572
|
+
}
|
3573
|
+
}
|
3574
|
+
|
3575
|
+
do {
|
3576
|
+
defchar = 0;
|
3577
|
+
ll = WideCharToMultiByte(sc->to_cp, 0,
|
3578
|
+
(LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size,
|
3579
|
+
NULL, &defchar);
|
3580
|
+
/* Exit loop if we succeeded */
|
3581
|
+
if (ll != 0 ||
|
3582
|
+
GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
|
3583
|
+
break;
|
3584
|
+
}
|
3585
|
+
/* Else expand buffer and loop to try again. */
|
3586
|
+
ll = WideCharToMultiByte(sc->to_cp, 0,
|
3587
|
+
(LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL);
|
3588
|
+
if (archive_string_ensure(as, ll +1) == NULL)
|
3589
|
+
return (-1);
|
3590
|
+
mbs = as->s + as->length;
|
3591
|
+
mbs_size = as->buffer_length - as->length -1;
|
3592
|
+
} while (1);
|
3593
|
+
archive_string_free(&tmp);
|
3594
|
+
as->length += ll;
|
3595
|
+
as->s[as->length] = '\0';
|
3596
|
+
if (ll == 0 || defchar)
|
3597
|
+
ret = -1;
|
3598
|
+
return (ret);
|
3599
|
+
}
|
3600
|
+
|
3601
|
+
/* UTF-16BE --> current locale; see win_strncat_from_utf16(). */
static int
win_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 1;

	return (win_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
|
3607
|
+
|
3608
|
+
/* UTF-16LE --> current locale; see win_strncat_from_utf16(). */
static int
win_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 0;

	return (win_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
|
3614
|
+
|
3615
|
+
/*
 * Return non-zero when the host stores 16-bit integers big-endian:
 * the value 1 decodes back to 1 through the big-endian reader only
 * if its in-memory layout is already big-endian.
 */
static int
is_big_endian(void)
{
	uint16_t probe = 1;
	int same_layout;

	same_layout = (archive_be16dec(&probe) == 1);
	return (same_layout);
}
|
3622
|
+
|
3623
|
+
/*
|
3624
|
+
* Convert a current locale string to UTF-16BE/LE and copy the result.
|
3625
|
+
* Return -1 if conversion fails.
|
3626
|
+
*/
|
3627
|
+
static int
|
3628
|
+
win_strncat_to_utf16(struct archive_string *as16, const void *_p,
|
3629
|
+
size_t length, struct archive_string_conv *sc, int bigendian)
|
3630
|
+
{
|
3631
|
+
const char *s = (const char *)_p;
|
3632
|
+
char *u16;
|
3633
|
+
size_t count, avail;
|
3634
|
+
|
3635
|
+
if (archive_string_ensure(as16,
|
3636
|
+
as16->length + (length + 1) * 2) == NULL)
|
3637
|
+
return (-1);
|
3638
|
+
|
3639
|
+
u16 = as16->s + as16->length;
|
3640
|
+
avail = as16->buffer_length - 2;
|
3641
|
+
if (sc->from_cp == CP_C_LOCALE) {
|
3642
|
+
/*
|
3643
|
+
* "C" locale special process.
|
3644
|
+
*/
|
3645
|
+
count = 0;
|
3646
|
+
while (count < length && *s) {
|
3647
|
+
if (bigendian)
|
3648
|
+
archive_be16enc(u16, *s);
|
3649
|
+
else
|
3650
|
+
archive_le16enc(u16, *s);
|
3651
|
+
u16 += 2;
|
3652
|
+
s++;
|
3653
|
+
count++;
|
3654
|
+
}
|
3655
|
+
as16->length += count << 1;
|
3656
|
+
as16->s[as16->length] = 0;
|
3657
|
+
as16->s[as16->length+1] = 0;
|
3658
|
+
return (0);
|
3659
|
+
}
|
3660
|
+
do {
|
3661
|
+
count = MultiByteToWideChar(sc->from_cp,
|
3662
|
+
MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1);
|
3663
|
+
/* Exit loop if we succeeded */
|
3664
|
+
if (count != 0 ||
|
3665
|
+
GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
|
3666
|
+
break;
|
3667
|
+
}
|
3668
|
+
/* Expand buffer and try again */
|
3669
|
+
count = MultiByteToWideChar(sc->from_cp,
|
3670
|
+
MB_PRECOMPOSED, s, (int)length, NULL, 0);
|
3671
|
+
if (archive_string_ensure(as16, (count +1) * 2)
|
3672
|
+
== NULL)
|
3673
|
+
return (-1);
|
3674
|
+
u16 = as16->s + as16->length;
|
3675
|
+
avail = as16->buffer_length - 2;
|
3676
|
+
} while (1);
|
3677
|
+
as16->length += count * 2;
|
3678
|
+
as16->s[as16->length] = 0;
|
3679
|
+
as16->s[as16->length+1] = 0;
|
3680
|
+
if (count == 0)
|
3681
|
+
return (-1);
|
3682
|
+
|
3683
|
+
if (is_big_endian()) {
|
3684
|
+
if (!bigendian) {
|
3685
|
+
while (count > 0) {
|
3686
|
+
uint16_t v = archive_be16dec(u16);
|
3687
|
+
archive_le16enc(u16, v);
|
3688
|
+
u16 += 2;
|
3689
|
+
count--;
|
3690
|
+
}
|
3691
|
+
}
|
3692
|
+
} else {
|
3693
|
+
if (bigendian) {
|
3694
|
+
while (count > 0) {
|
3695
|
+
uint16_t v = archive_le16dec(u16);
|
3696
|
+
archive_be16enc(u16, v);
|
3697
|
+
u16 += 2;
|
3698
|
+
count--;
|
3699
|
+
}
|
3700
|
+
}
|
3701
|
+
}
|
3702
|
+
return (0);
|
3703
|
+
}
|
3704
|
+
|
3705
|
+
/* Current locale --> UTF-16BE; see win_strncat_to_utf16(). */
static int
win_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int want_big_endian = 1;

	return (win_strncat_to_utf16(as16, _p, length, sc, want_big_endian));
}
|
3711
|
+
|
3712
|
+
/* Current locale --> UTF-16LE; see win_strncat_to_utf16(). */
static int
win_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int want_big_endian = 0;

	return (win_strncat_to_utf16(as16, _p, length, sc, want_big_endian));
}
|
3718
|
+
|
3719
|
+
#endif /* _WIN32 && !__CYGWIN__ */
|
3720
|
+
|
3721
|
+
/*
|
3722
|
+
* Do the best effort for conversions.
|
3723
|
+
* We cannot handle UTF-16BE character-set without such iconv,
|
3724
|
+
* but there is a chance if a string consists just ASCII code or
|
3725
|
+
* a current locale is UTF-8.
|
3726
|
+
*/
|
3727
|
+
|
3728
|
+
/*
|
3729
|
+
* Convert a UTF-16BE string to current locale and copy the result.
|
3730
|
+
* Return -1 if conversion fails.
|
3731
|
+
*/
|
3732
|
+
static int
|
3733
|
+
best_effort_strncat_from_utf16(struct archive_string *as, const void *_p,
|
3734
|
+
size_t bytes, struct archive_string_conv *sc, int be)
|
3735
|
+
{
|
3736
|
+
const char *utf16 = (const char *)_p;
|
3737
|
+
char *mbs;
|
3738
|
+
uint32_t uc;
|
3739
|
+
int n, ret;
|
3740
|
+
|
3741
|
+
(void)sc; /* UNUSED */
|
3742
|
+
/*
|
3743
|
+
* Other case, we should do the best effort.
|
3744
|
+
* If all character are ASCII(<0x7f), we can convert it.
|
3745
|
+
* if not , we set a alternative character and return -1.
|
3746
|
+
*/
|
3747
|
+
ret = 0;
|
3748
|
+
if (archive_string_ensure(as, as->length + bytes +1) == NULL)
|
3749
|
+
return (-1);
|
3750
|
+
mbs = as->s + as->length;
|
3751
|
+
|
3752
|
+
while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) {
|
3753
|
+
if (n < 0) {
|
3754
|
+
n *= -1;
|
3755
|
+
ret = -1;
|
3756
|
+
}
|
3757
|
+
bytes -= n;
|
3758
|
+
utf16 += n;
|
3759
|
+
|
3760
|
+
if (uc > 127) {
|
3761
|
+
/* We cannot handle it. */
|
3762
|
+
*mbs++ = '?';
|
3763
|
+
ret = -1;
|
3764
|
+
} else
|
3765
|
+
*mbs++ = (char)uc;
|
3766
|
+
}
|
3767
|
+
as->length = mbs - as->s;
|
3768
|
+
as->s[as->length] = '\0';
|
3769
|
+
return (ret);
|
3770
|
+
}
|
3771
|
+
|
3772
|
+
/* UTF-16BE --> current locale; see best_effort_strncat_from_utf16(). */
static int
best_effort_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 1;

	return (best_effort_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
|
3778
|
+
|
3779
|
+
/* UTF-16LE --> current locale; see best_effort_strncat_from_utf16(). */
static int
best_effort_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	const int big_endian = 0;

	return (best_effort_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
|
3785
|
+
|
3786
|
+
/*
|
3787
|
+
* Convert a current locale string to UTF-16BE/LE and copy the result.
|
3788
|
+
* Return -1 if conversion fails.
|
3789
|
+
*/
|
3790
|
+
static int
|
3791
|
+
best_effort_strncat_to_utf16(struct archive_string *as16, const void *_p,
|
3792
|
+
size_t length, struct archive_string_conv *sc, int bigendian)
|
3793
|
+
{
|
3794
|
+
const char *s = (const char *)_p;
|
3795
|
+
char *utf16;
|
3796
|
+
size_t remaining;
|
3797
|
+
int ret;
|
3798
|
+
|
3799
|
+
(void)sc; /* UNUSED */
|
3800
|
+
/*
|
3801
|
+
* Other case, we should do the best effort.
|
3802
|
+
* If all character are ASCII(<0x7f), we can convert it.
|
3803
|
+
* if not , we set a alternative character and return -1.
|
3804
|
+
*/
|
3805
|
+
ret = 0;
|
3806
|
+
remaining = length;
|
3807
|
+
|
3808
|
+
if (archive_string_ensure(as16,
|
3809
|
+
as16->length + (length + 1) * 2) == NULL)
|
3810
|
+
return (-1);
|
3811
|
+
|
3812
|
+
utf16 = as16->s + as16->length;
|
3813
|
+
while (remaining--) {
|
3814
|
+
unsigned c = *s++;
|
3815
|
+
if (c > 127) {
|
3816
|
+
/* We cannot handle it. */
|
3817
|
+
c = UNICODE_R_CHAR;
|
3818
|
+
ret = -1;
|
3819
|
+
}
|
3820
|
+
if (bigendian)
|
3821
|
+
archive_be16enc(utf16, c);
|
3822
|
+
else
|
3823
|
+
archive_le16enc(utf16, c);
|
3824
|
+
utf16 += 2;
|
3825
|
+
}
|
3826
|
+
as16->length = utf16 - as16->s;
|
3827
|
+
as16->s[as16->length] = 0;
|
3828
|
+
as16->s[as16->length+1] = 0;
|
3829
|
+
return (ret);
|
3830
|
+
}
|
3831
|
+
|
3832
|
+
/* Current locale --> UTF-16BE; see best_effort_strncat_to_utf16(). */
static int
best_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int want_big_endian = 1;

	return (best_effort_strncat_to_utf16(as16, _p, length, sc,
	    want_big_endian));
}
|
3838
|
+
|
3839
|
+
/* Current locale --> UTF-16LE; see best_effort_strncat_to_utf16(). */
static int
best_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	const int want_big_endian = 0;

	return (best_effort_strncat_to_utf16(as16, _p, length, sc,
	    want_big_endian));
}
|
3845
|
+
|
3846
|
+
|
3847
|
+
/*
|
3848
|
+
* Multistring operations.
|
3849
|
+
*/
|
3850
|
+
|
3851
|
+
void
|
3852
|
+
archive_mstring_clean(struct archive_mstring *aes)
|
3853
|
+
{
|
3854
|
+
archive_wstring_free(&(aes->aes_wcs));
|
3855
|
+
archive_string_free(&(aes->aes_mbs));
|
3856
|
+
archive_string_free(&(aes->aes_utf8));
|
3857
|
+
archive_string_free(&(aes->aes_mbs_in_locale));
|
3858
|
+
aes->aes_set = 0;
|
3859
|
+
}
|
3860
|
+
|
3861
|
+
void
|
3862
|
+
archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src)
|
3863
|
+
{
|
3864
|
+
dest->aes_set = src->aes_set;
|
3865
|
+
archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs));
|
3866
|
+
archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8));
|
3867
|
+
archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs));
|
3868
|
+
}
|
3869
|
+
|
3870
|
+
int
|
3871
|
+
archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes,
|
3872
|
+
const char **p)
|
3873
|
+
{
|
3874
|
+
struct archive_string_conv *sc;
|
3875
|
+
int r;
|
3876
|
+
|
3877
|
+
/* If we already have a UTF8 form, return that immediately. */
|
3878
|
+
if (aes->aes_set & AES_SET_UTF8) {
|
3879
|
+
*p = aes->aes_utf8.s;
|
3880
|
+
return (0);
|
3881
|
+
}
|
3882
|
+
|
3883
|
+
*p = NULL;
|
3884
|
+
/* Try converting WCS to MBS first if MBS does not exist yet. */
|
3885
|
+
if ((aes->aes_set & AES_SET_MBS) == 0) {
|
3886
|
+
const char *pm; /* unused */
|
3887
|
+
archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */
|
3888
|
+
}
|
3889
|
+
if (aes->aes_set & AES_SET_MBS) {
|
3890
|
+
sc = archive_string_conversion_to_charset(a, "UTF-8", 1);
|
3891
|
+
if (sc == NULL)
|
3892
|
+
return (-1);/* Couldn't allocate memory for sc. */
|
3893
|
+
r = archive_strncpy_l(&(aes->aes_utf8), aes->aes_mbs.s,
|
3894
|
+
aes->aes_mbs.length, sc);
|
3895
|
+
if (a == NULL)
|
3896
|
+
free_sconv_object(sc);
|
3897
|
+
if (r == 0) {
|
3898
|
+
aes->aes_set |= AES_SET_UTF8;
|
3899
|
+
*p = aes->aes_utf8.s;
|
3900
|
+
return (0);/* success. */
|
3901
|
+
} else
|
3902
|
+
return (-1);/* failure. */
|
3903
|
+
}
|
3904
|
+
return (0);/* success. */
|
3905
|
+
}
|
3906
|
+
|
3907
|
+
int
|
3908
|
+
archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes,
|
3909
|
+
const char **p)
|
3910
|
+
{
|
3911
|
+
struct archive_string_conv *sc;
|
3912
|
+
int r, ret = 0;
|
3913
|
+
|
3914
|
+
/* If we already have an MBS form, return that immediately. */
|
3915
|
+
if (aes->aes_set & AES_SET_MBS) {
|
3916
|
+
*p = aes->aes_mbs.s;
|
3917
|
+
return (ret);
|
3918
|
+
}
|
3919
|
+
|
3920
|
+
*p = NULL;
|
3921
|
+
/* If there's a WCS form, try converting with the native locale. */
|
3922
|
+
if (aes->aes_set & AES_SET_WCS) {
|
3923
|
+
archive_string_empty(&(aes->aes_mbs));
|
3924
|
+
r = archive_string_append_from_wcs(&(aes->aes_mbs),
|
3925
|
+
aes->aes_wcs.s, aes->aes_wcs.length);
|
3926
|
+
*p = aes->aes_mbs.s;
|
3927
|
+
if (r == 0) {
|
3928
|
+
aes->aes_set |= AES_SET_MBS;
|
3929
|
+
return (ret);
|
3930
|
+
} else
|
3931
|
+
ret = -1;
|
3932
|
+
}
|
3933
|
+
|
3934
|
+
/* If there's a UTF-8 form, try converting with the native locale. */
|
3935
|
+
if (aes->aes_set & AES_SET_UTF8) {
|
3936
|
+
archive_string_empty(&(aes->aes_mbs));
|
3937
|
+
sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
|
3938
|
+
if (sc == NULL)
|
3939
|
+
return (-1);/* Couldn't allocate memory for sc. */
|
3940
|
+
r = archive_strncpy_l(&(aes->aes_mbs),
|
3941
|
+
aes->aes_utf8.s, aes->aes_utf8.length, sc);
|
3942
|
+
if (a == NULL)
|
3943
|
+
free_sconv_object(sc);
|
3944
|
+
*p = aes->aes_mbs.s;
|
3945
|
+
if (r == 0) {
|
3946
|
+
aes->aes_set |= AES_SET_MBS;
|
3947
|
+
ret = 0;/* success; overwrite previous error. */
|
3948
|
+
} else
|
3949
|
+
ret = -1;/* failure. */
|
3950
|
+
}
|
3951
|
+
return (ret);
|
3952
|
+
}
|
3953
|
+
|
3954
|
+
int
|
3955
|
+
archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes,
|
3956
|
+
const wchar_t **wp)
|
3957
|
+
{
|
3958
|
+
int r, ret = 0;
|
3959
|
+
|
3960
|
+
(void)a;/* UNUSED */
|
3961
|
+
/* Return WCS form if we already have it. */
|
3962
|
+
if (aes->aes_set & AES_SET_WCS) {
|
3963
|
+
*wp = aes->aes_wcs.s;
|
3964
|
+
return (ret);
|
3965
|
+
}
|
3966
|
+
|
3967
|
+
*wp = NULL;
|
3968
|
+
/* Try converting UTF8 to MBS first if MBS does not exist yet. */
|
3969
|
+
if ((aes->aes_set & AES_SET_MBS) == 0) {
|
3970
|
+
const char *p; /* unused */
|
3971
|
+
archive_mstring_get_mbs(a, aes, &p); /* ignore errors, we'll handle it later */
|
3972
|
+
}
|
3973
|
+
/* Try converting MBS to WCS using native locale. */
|
3974
|
+
if (aes->aes_set & AES_SET_MBS) {
|
3975
|
+
archive_wstring_empty(&(aes->aes_wcs));
|
3976
|
+
r = archive_wstring_append_from_mbs(&(aes->aes_wcs),
|
3977
|
+
aes->aes_mbs.s, aes->aes_mbs.length);
|
3978
|
+
if (r == 0) {
|
3979
|
+
aes->aes_set |= AES_SET_WCS;
|
3980
|
+
*wp = aes->aes_wcs.s;
|
3981
|
+
} else
|
3982
|
+
ret = -1;/* failure. */
|
3983
|
+
}
|
3984
|
+
return (ret);
|
3985
|
+
}
|
3986
|
+
|
3987
|
+
int
|
3988
|
+
archive_mstring_get_mbs_l(struct archive *a, struct archive_mstring *aes,
|
3989
|
+
const char **p, size_t *length, struct archive_string_conv *sc)
|
3990
|
+
{
|
3991
|
+
int ret = 0;
|
3992
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
3993
|
+
int r;
|
3994
|
+
|
3995
|
+
/*
|
3996
|
+
* Internationalization programming on Windows must use Wide
|
3997
|
+
* characters because Windows platform cannot make locale UTF-8.
|
3998
|
+
*/
|
3999
|
+
if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) {
|
4000
|
+
archive_string_empty(&(aes->aes_mbs_in_locale));
|
4001
|
+
r = archive_string_append_from_wcs_in_codepage(
|
4002
|
+
&(aes->aes_mbs_in_locale), aes->aes_wcs.s,
|
4003
|
+
aes->aes_wcs.length, sc);
|
4004
|
+
if (r == 0) {
|
4005
|
+
*p = aes->aes_mbs_in_locale.s;
|
4006
|
+
if (length != NULL)
|
4007
|
+
*length = aes->aes_mbs_in_locale.length;
|
4008
|
+
return (0);
|
4009
|
+
} else if (errno == ENOMEM)
|
4010
|
+
return (-1);
|
4011
|
+
else
|
4012
|
+
ret = -1;
|
4013
|
+
}
|
4014
|
+
#endif
|
4015
|
+
|
4016
|
+
/* If there is not an MBS form but there is a WCS or UTF8 form, try converting
|
4017
|
+
* with the native locale to be used for translating it to specified
|
4018
|
+
* character-set. */
|
4019
|
+
if ((aes->aes_set & AES_SET_MBS) == 0) {
|
4020
|
+
const char *pm; /* unused */
|
4021
|
+
archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */
|
4022
|
+
}
|
4023
|
+
/* If we already have an MBS form, use it to be translated to
|
4024
|
+
* specified character-set. */
|
4025
|
+
if (aes->aes_set & AES_SET_MBS) {
|
4026
|
+
if (sc == NULL) {
|
4027
|
+
/* Conversion is unneeded. */
|
4028
|
+
*p = aes->aes_mbs.s;
|
4029
|
+
if (length != NULL)
|
4030
|
+
*length = aes->aes_mbs.length;
|
4031
|
+
return (0);
|
4032
|
+
}
|
4033
|
+
ret = archive_strncpy_l(&(aes->aes_mbs_in_locale),
|
4034
|
+
aes->aes_mbs.s, aes->aes_mbs.length, sc);
|
4035
|
+
*p = aes->aes_mbs_in_locale.s;
|
4036
|
+
if (length != NULL)
|
4037
|
+
*length = aes->aes_mbs_in_locale.length;
|
4038
|
+
} else {
|
4039
|
+
*p = NULL;
|
4040
|
+
if (length != NULL)
|
4041
|
+
*length = 0;
|
4042
|
+
}
|
4043
|
+
return (ret);
|
4044
|
+
}
|
4045
|
+
|
4046
|
+
int
|
4047
|
+
archive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs)
|
4048
|
+
{
|
4049
|
+
if (mbs == NULL) {
|
4050
|
+
aes->aes_set = 0;
|
4051
|
+
return (0);
|
4052
|
+
}
|
4053
|
+
return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs)));
|
4054
|
+
}
|
4055
|
+
|
4056
|
+
int
|
4057
|
+
archive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs,
|
4058
|
+
size_t len)
|
4059
|
+
{
|
4060
|
+
if (mbs == NULL) {
|
4061
|
+
aes->aes_set = 0;
|
4062
|
+
return (0);
|
4063
|
+
}
|
4064
|
+
aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */
|
4065
|
+
archive_strncpy(&(aes->aes_mbs), mbs, len);
|
4066
|
+
archive_string_empty(&(aes->aes_utf8));
|
4067
|
+
archive_wstring_empty(&(aes->aes_wcs));
|
4068
|
+
return (0);
|
4069
|
+
}
|
4070
|
+
|
4071
|
+
/*
 * Store the NUL-terminated wide string `wcs` into `aes`.
 *
 * Thin wrapper around archive_mstring_copy_wcs_len(): the length passed is
 * wcslen(wcs), or 0 when `wcs` is NULL (so wcslen() is never called on
 * NULL).  Returns whatever archive_mstring_copy_wcs_len() returns.
 */
int
archive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs)
{
	size_t n = 0;

	if (wcs != NULL)
		n = wcslen(wcs);
	return archive_mstring_copy_wcs_len(aes, wcs, n);
}
|
4077
|
+
|
4078
|
+
int
|
4079
|
+
archive_mstring_copy_utf8(struct archive_mstring *aes, const char *utf8)
|
4080
|
+
{
|
4081
|
+
if (utf8 == NULL) {
|
4082
|
+
aes->aes_set = 0;
|
4083
|
+
return (0);
|
4084
|
+
}
|
4085
|
+
aes->aes_set = AES_SET_UTF8;
|
4086
|
+
archive_string_empty(&(aes->aes_mbs));
|
4087
|
+
archive_string_empty(&(aes->aes_wcs));
|
4088
|
+
archive_strncpy(&(aes->aes_utf8), utf8, strlen(utf8));
|
4089
|
+
return (int)strlen(utf8);
|
4090
|
+
}
|
4091
|
+
|
4092
|
+
int
|
4093
|
+
archive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs,
|
4094
|
+
size_t len)
|
4095
|
+
{
|
4096
|
+
if (wcs == NULL) {
|
4097
|
+
aes->aes_set = 0;
|
4098
|
+
return (0);
|
4099
|
+
}
|
4100
|
+
aes->aes_set = AES_SET_WCS; /* Only WCS form set. */
|
4101
|
+
archive_string_empty(&(aes->aes_mbs));
|
4102
|
+
archive_string_empty(&(aes->aes_utf8));
|
4103
|
+
archive_wstrncpy(&(aes->aes_wcs), wcs, len);
|
4104
|
+
return (0);
|
4105
|
+
}
|
4106
|
+
|
4107
|
+
int
|
4108
|
+
archive_mstring_copy_mbs_len_l(struct archive_mstring *aes,
|
4109
|
+
const char *mbs, size_t len, struct archive_string_conv *sc)
|
4110
|
+
{
|
4111
|
+
int r;
|
4112
|
+
|
4113
|
+
if (mbs == NULL) {
|
4114
|
+
aes->aes_set = 0;
|
4115
|
+
return (0);
|
4116
|
+
}
|
4117
|
+
archive_string_empty(&(aes->aes_mbs));
|
4118
|
+
archive_wstring_empty(&(aes->aes_wcs));
|
4119
|
+
archive_string_empty(&(aes->aes_utf8));
|
4120
|
+
#if defined(_WIN32) && !defined(__CYGWIN__)
|
4121
|
+
/*
|
4122
|
+
* Internationalization programming on Windows must use Wide
|
4123
|
+
* characters because Windows platform cannot make locale UTF-8.
|
4124
|
+
*/
|
4125
|
+
if (sc == NULL) {
|
4126
|
+
if (archive_string_append(&(aes->aes_mbs),
|
4127
|
+
mbs, mbsnbytes(mbs, len)) == NULL) {
|
4128
|
+
aes->aes_set = 0;
|
4129
|
+
r = -1;
|
4130
|
+
} else {
|
4131
|
+
aes->aes_set = AES_SET_MBS;
|
4132
|
+
r = 0;
|
4133
|
+
}
|
4134
|
+
#if defined(HAVE_ICONV)
|
4135
|
+
} else if (sc != NULL && sc->cd_w != (iconv_t)-1) {
|
4136
|
+
/*
|
4137
|
+
* This case happens only when MultiByteToWideChar() cannot
|
4138
|
+
* handle sc->from_cp, and we have to iconv in order to
|
4139
|
+
* translate character-set to wchar_t,UTF-16.
|
4140
|
+
*/
|
4141
|
+
iconv_t cd = sc->cd;
|
4142
|
+
unsigned from_cp;
|
4143
|
+
int flag;
|
4144
|
+
|
4145
|
+
/*
|
4146
|
+
* Translate multi-bytes from some character-set to UTF-8.
|
4147
|
+
*/
|
4148
|
+
sc->cd = sc->cd_w;
|
4149
|
+
r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc);
|
4150
|
+
sc->cd = cd;
|
4151
|
+
if (r != 0) {
|
4152
|
+
aes->aes_set = 0;
|
4153
|
+
return (r);
|
4154
|
+
}
|
4155
|
+
aes->aes_set = AES_SET_UTF8;
|
4156
|
+
|
4157
|
+
/*
|
4158
|
+
* Append the UTF-8 string into wstring.
|
4159
|
+
*/
|
4160
|
+
flag = sc->flag;
|
4161
|
+
sc->flag &= ~(SCONV_NORMALIZATION_C
|
4162
|
+
| SCONV_TO_UTF16| SCONV_FROM_UTF16);
|
4163
|
+
from_cp = sc->from_cp;
|
4164
|
+
sc->from_cp = CP_UTF8;
|
4165
|
+
r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
|
4166
|
+
aes->aes_utf8.s, aes->aes_utf8.length, sc);
|
4167
|
+
sc->flag = flag;
|
4168
|
+
sc->from_cp = from_cp;
|
4169
|
+
if (r == 0)
|
4170
|
+
aes->aes_set |= AES_SET_WCS;
|
4171
|
+
#endif
|
4172
|
+
} else {
|
4173
|
+
r = archive_wstring_append_from_mbs_in_codepage(
|
4174
|
+
&(aes->aes_wcs), mbs, len, sc);
|
4175
|
+
if (r == 0)
|
4176
|
+
aes->aes_set = AES_SET_WCS;
|
4177
|
+
else
|
4178
|
+
aes->aes_set = 0;
|
4179
|
+
}
|
4180
|
+
#else
|
4181
|
+
r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc);
|
4182
|
+
if (r == 0)
|
4183
|
+
aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */
|
4184
|
+
else
|
4185
|
+
aes->aes_set = 0;
|
4186
|
+
#endif
|
4187
|
+
return (r);
|
4188
|
+
}
|
4189
|
+
|
4190
|
+
/*
|
4191
|
+
* The 'update' form tries to proactively update all forms of
|
4192
|
+
* this string (WCS and MBS) and returns an error if any of
|
4193
|
+
* them fail. This is used by the 'pax' handler, for instance,
|
4194
|
+
* to detect and report character-conversion failures early while
|
4195
|
+
* still allowing clients to get potentially useful values from
|
4196
|
+
* the more tolerant lazy conversions. (get_mbs and get_wcs will
|
4197
|
+
* strive to give the user something useful, so you can get hopefully
|
4198
|
+
* usable values even if some of the character conversions are failing.)
|
4199
|
+
*/
|
4200
|
+
int
|
4201
|
+
archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
|
4202
|
+
const char *utf8)
|
4203
|
+
{
|
4204
|
+
struct archive_string_conv *sc;
|
4205
|
+
int r;
|
4206
|
+
|
4207
|
+
if (utf8 == NULL) {
|
4208
|
+
aes->aes_set = 0;
|
4209
|
+
return (0); /* Succeeded in clearing everything. */
|
4210
|
+
}
|
4211
|
+
|
4212
|
+
/* Save the UTF8 string. */
|
4213
|
+
archive_strcpy(&(aes->aes_utf8), utf8);
|
4214
|
+
|
4215
|
+
/* Empty the mbs and wcs strings. */
|
4216
|
+
archive_string_empty(&(aes->aes_mbs));
|
4217
|
+
archive_wstring_empty(&(aes->aes_wcs));
|
4218
|
+
|
4219
|
+
aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */
|
4220
|
+
|
4221
|
+
/* Try converting UTF-8 to MBS, return false on failure. */
|
4222
|
+
sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
|
4223
|
+
if (sc == NULL)
|
4224
|
+
return (-1);/* Couldn't allocate memory for sc. */
|
4225
|
+
r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
|
4226
|
+
if (a == NULL)
|
4227
|
+
free_sconv_object(sc);
|
4228
|
+
if (r != 0)
|
4229
|
+
return (-1);
|
4230
|
+
aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */
|
4231
|
+
|
4232
|
+
/* Try converting MBS to WCS, return false on failure. */
|
4233
|
+
if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s,
|
4234
|
+
aes->aes_mbs.length))
|
4235
|
+
return (-1);
|
4236
|
+
aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
|
4237
|
+
|
4238
|
+
/* All conversions succeeded. */
|
4239
|
+
return (0);
|
4240
|
+
}
|