RubyGems - zip_tricks - Versions diffs - 4.7.4 → 4.8.0 - Mend

zip_tricks 4.7.4 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/.rubocop.yml +2 -0
data/CHANGELOG.md +10 -0
data/lib/zip_tricks/path_set.rb +148 -0
data/lib/zip_tricks/size_estimator.rb +3 -2
data/lib/zip_tricks/streamer.rb +30 -40
data/lib/zip_tricks/uniquify_filename.rb +38 -0
data/lib/zip_tricks/version.rb +1 -1
metadata +4 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d1119a96243d5da6b423a2a3c992efc6fbb580fa
-  data.tar.gz: 63a1d0b2c93e5b231568e83efb60220862c07b7e
+  metadata.gz: 5c7e14be1038151588e016705af5249cacbfea8a
+  data.tar.gz: fc4e68ce38ea2091f7a718a8dcfff09d3e40fbb6
 SHA512:
-  metadata.gz: 89c7300a0c3af8ceb0408d2c4315fc62e4abf5fecc53ca93219bf9ce30c146d9cc8bca193d35908f1d24516bc1298c12332850ff0352dd561ed67a56a6fe7211
-  data.tar.gz: 20cdfca60a6ac3cc5beaa57bf74dba06bd09ef18931931ddead388de68e6316650e4aad14fe96b7838b415babaafe9f8aa72ca5a9ccd547eb535f3e6f27901f4
+  metadata.gz: 05d776ff8a5c0cea81f66aba9ac37ad2400c0abea38660ca94365f15649c5d9cc8c06f6d768d2f657b70fae823fe6704bcd335611ff9693e9550246cc04ba7e6
+  data.tar.gz: de2264703c398fba2d9a0c44b485206f2acdfb119c9aff76ab8855e2d84cdc2be4da6802af7fbe967f780229d8582d256439ee923c59aa0782d4b1d07b9104f0

data/.rubocop.yml CHANGED

@@ -1,5 +1,7 @@
 inherit_gem:
   wetransfer_style: ruby/default.yml
+AllCops:
+  TargetRubyVersion: 2.1
 Layout/FirstMethodArgumentLineBreak:
   Enabled: false
 Layout/FirstMethodParameterLineBreak:

data/CHANGELOG.md CHANGED

@@ -1,3 +1,13 @@
+## 4.8.0
+* Make sure that when directories clobber files and vice versa we raise a clear error. Add `PathSet` which keeps track of entries
+  and all the directories needed to create them, document `PathSet`
+* Move the `uniquify_filenames` function into a module for easier removal later
+* Add the `auto_rename_duplicate_filenames` parameter to `Streamer` constructor. We need to make this optional
+  because making filenames unique can be very tricky when subdirectories are involved, and strictly
+  speaking we should not be applying this transformation at all - there should be no output of
+  duplicate filenames by the caller. So making the filenames unique should be available, but optional.
 ## 4.7.4
 * Use a single fixed capacity string in StreamCRC32.from_io to avoid unnecessary allocations

data/lib/zip_tricks/path_set.rb ADDED

@@ -0,0 +1,148 @@
+# rubocop:disable Layout/IndentHeredoc
+# A ZIP archive contains a flat list of entries. These entries can implicitly
+# create directories when the archive is expanded. For example, an entry with
+# the filename of "some folder/file.docx" will make the unarchiving application
+# create a directory called "some folder" automatically, and then deposit the
+# file "file.docx" in that directory. These "implicit" directories can be
+# arbitrarily nested, and create a tree structure of directories. That structure
+# however is implicit as the archive contains a flat list.
+#
+# This creates opportunities for conflicts. For example, imagine the following
+# structure:
+#
+# * `something/` - specifies an empty directory with the name "something"
+# * `something` - specifies a file, creates a conflict
+#
+# This can be prevented with filename uniqueness checks. It does get funkier however
+# as the rabbit hole goes down:
+#
+# * `dir/subdir/another_subdir/yet_another_subdir/file.bin` - declares a file and directories
+# * `dir/subdir/another_subdir/yet_another_subdir` - declares a file at one of the levels, creates a conflict
+#
+# The results of this ZIP structure aren't very easy to predict as they depend on the
+# application that opens the archive. For example, BOMArchiveHelper on macOS will expand files
+# as they are declared in the ZIP, but once a conflict occurs it will fail with "error -21". It
+# is not very transparent to the user why unarchiving fails, and such a conflict has to be - and
+# can only reliably be - prevented when the archive gets created.
+#
+# Unfortunately that conflicts with another "magical" feature of ZipTricks which automatically
+# "fixes" duplicate filenames - filenames (paths) which have already been added to the archive.
+# This fix is performed by appending (1), then (2) and so forth to the filename so that the
+# conflict is avoided. This is not possible to apply to directories, because when one of the
+# path components is reused in multiple filenames it means those entities should end up in
+# the same directory (subdirectory) once the archive is opened.
+class ZipTricks::PathSet
+  class Conflict < StandardError
+  end
+  class FileClobbersDirectory < Conflict
+  end
+  class DirectoryClobbersFile < Conflict
+  end
+  def initialize
+    @known_directories = Set.new
+    @known_files = Set.new
+  end
+  # Adds a directory path to the set of known paths, including
+  # all the directories that contain it. So, calling
+  #    add_directory_path("dir/dir2/dir3")
+  # will add "dir", "dir/dir2", "dir/dir2/dir3".
+  #
+  # @param path[String] the path to the directory to add
+  # @return [void]
+  def add_directory_path(path)
+    path_and_ancestors(path).each do |parent_directory_path|
+      if @known_files.include?(parent_directory_path)
+        # Have to use the old-fashioned heredocs because ZipTricks
+        # aims to be compatible with MRI 2.1+ syntax, and squiggly
+        # heredoc is only available starting 2.3+
+        error_message = <<ERR
+The path #{parent_directory_path.inspect} which has to be added
+as a directory is already used for a file.
+The directory at this path would get created implicitly
+to produce #{path.inspect} during decompresison.
+This would make some archive utilities refuse to open
+the ZIP.
+ERR
+        raise DirectoryClobbersFile, error_message
+      end
+      @known_directories << parent_directory_path
+    end
+  end
+  # Adds a file path to the set of known paths, including
+  # all the directories that contain it. Once a file has been added,
+  # it is no longer possible to add a directory having the same path
+  # as this would cause conflict.
+  #
+  # The operation also adds all the containing directories for the file, so
+  #    add_file_path("dir/dir2/file.doc")
+  # will add "dir" and "dir/dir2" as directories, and "dir/dir2/file.doc" as a file.
+  #
+  # @param file_path[String] the path to the file to add
+  # @return [void]
+  def add_file_path(file_path)
+    if @known_files.include?(file_path)
+      error_message = <<ERR
+The file at #{file_path.inspect} has already been included
+in the archive. Adding it the second time would cause
+the first file to be overwritten during unarchiving, and
+could also get the archive flagged as invalid.
+ERR
+      raise Conflict, error_message
+    end
+    if @known_directories.include?(file_path)
+      error_message = <<ERR
+The path #{file_path.inspect} is already used for
+a directory, but you are trying to add it as a file.
+This would make some archive utilities refuse
+to open the ZIP.
+ERR
+      raise FileClobbersDirectory, error_message
+    end
+    # Add all the directories which this file is contained in
+    *dir_components, _file_name = non_empty_path_components(file_path)
+    add_directory_path(dir_components.join('/'))
+    # ...and then the file itself
+    @known_files << file_path
+  end
+  # Tells whether a specific full path is already known to the PathSet.
+  # Can be a path for a directory or for a file.
+  #
+  # @param path_in_archive[String] the path to check for inclusion
+  # @return [Boolean]
+  def include?(path_in_archive)
+    @known_files.include?(path_in_archive) || @known_directories.include?(path_in_archive)
+  end
+  # Clears the contained sets
+  # @return [void]
+  def clear
+    @known_files.clear
+    @known_directories.clear
+  end
+  private
+  def non_empty_path_components(path)
+    path.split('/').reject(&:empty?)
+  end
+  def path_and_ancestors(path)
+    path_components = non_empty_path_components(path)
+    path_components.each_with_object([]) do |component, seen|
+      seen << [seen.last, component].compact.join('/')
+    end
+  end
+end

data/lib/zip_tricks/size_estimator.rb CHANGED

@@ -20,10 +20,11 @@ class ZipTricks::SizeEstimator
   #               uncompressed_size: 89281911, compressed_size: 121908)
   #     end
   #
+  # @param kwargs_for_streamer_new Any options to pass to Streamer, see {Streamer#initialize}
   # @return [Integer] the size of the resulting archive, in bytes
   # @yield [SizeEstimator] the estimator
-  def self.estimate
-    streamer = ZipTricks::Streamer.new(ZipTricks::NullWriter)
+  def self.estimate(**kwargs_for_streamer_new)
+    streamer = ZipTricks::Streamer.new(ZipTricks::NullWriter, **kwargs_for_streamer_new)
     estimator = new(streamer)
     yield(estimator)
     streamer.close # Returns the .tell of the contained IO

data/lib/zip_tricks/streamer.rb CHANGED

@@ -140,13 +140,19 @@ class ZipTricks::Streamer
   # @param stream[IO] the destination IO for the ZIP. Anything that responds to `<<` can be used.
   # @param writer[ZipTricks::ZipWriter] the object to be used as the writer.
   #    Defaults to an instance of ZipTricks::ZipWriter, normally you won't need to override it
-  def initialize(stream, writer: create_writer)
+  # @param auto_rename_duplicate_filenames[Boolean] whether duplicate filenames, when encountered,
+  #    should be suffixed with (1), (2) etc. Default value is `true` since it
+  #    used to be the default behavior.
+  #
+  # **DEPRECATION NOTICE** In ZipTricks version 5 `auto_rename_duplicate_filenames` will default to `false`
+  def initialize(stream, writer: create_writer, auto_rename_duplicate_filenames: true)
     raise InvalidOutput, 'The stream must respond to #<<' unless stream.respond_to?(:<<)
+    @dedupe_filenames = auto_rename_duplicate_filenames
     @out = ZipTricks::WriteAndTell.new(stream)
     @files = []
     @local_header_offsets = []
-    @filenames_set = Set.new
+    @path_set = ZipTricks::PathSet.new
     @writer = writer
   end
@@ -387,7 +393,7 @@ class ZipTricks::Streamer
     # Clear the files so that GC will not have to trace all the way to here to deallocate them
     @files.clear
-    @filenames_set.clear
+    @path_set.clear
     # and return the final offset
     @out.tell
@@ -429,22 +435,31 @@ class ZipTricks::Streamer
   private
   def add_file_and_write_local_header(
-filename:,
-modification_time:,
-crc32:,
-storage_mode:,
-compressed_size:,
-uncompressed_size:,
-use_data_descriptor:)
-    # Clean backslashes and uniqify filenames if there are duplicates
+      filename:,
+      modification_time:,
+      crc32:,
+      storage_mode:,
+      compressed_size:,
+      uncompressed_size:,
+      use_data_descriptor:)
+    # Clean backslashes
     filename = remove_backslash(filename)
-    filename = uniquify_name(filename) if @filenames_set.include?(filename)
     raise UnknownMode, "Unknown compression mode #{storage_mode}" unless [STORED, DEFLATED].include?(storage_mode)
     raise Overflow, 'Filename is too long' if filename.bytesize > 0xFFFF
+    # If we need to massage filenames to enforce uniqueness,
+    # do so before we check for file/directory conflicts
+    filename = ZipTricks::UniquifyFilename.call(filename, @path_set) if @dedupe_filenames
+    # Make sure there is no file/directory clobbering (conflicts), or - if deduping is disabled -
+    # no duplicate filenames/paths
+    if filename.end_with?('/')
+      @path_set.add_directory_path(filename)
+    else
+      @path_set.add_file_path(filename)
+    end
     if use_data_descriptor
       crc32 = 0
       compressed_size = 0
@@ -460,7 +475,6 @@ use_data_descriptor:)
                   use_data_descriptor)
     @files << e
-    @filenames_set << e.filename
     @local_header_offsets << @out.tell
     @writer.write_local_file_header(io: @out,
@@ -476,28 +490,4 @@ use_data_descriptor:)
   def remove_backslash(filename)
     filename.tr('\\', '_')
   end
-  def uniquify_name(filename)
-    # we add (1), (2), (n) at the end of a filename if there is a duplicate
-    copy_pattern = /\((\d+)\)$/
-    parts = filename.split('.')
-    ext = if parts.last =~ /gz|zip/ && parts.size > 2
-            parts.pop(2)
-          elsif parts.size > 1
-            parts.pop
-          end
-    fn_last_part = parts.pop
-    duplicate_counter = 1
-    loop do
-      fn_last_part = if fn_last_part =~ copy_pattern
-                       fn_last_part.sub(copy_pattern, "(#{duplicate_counter})")
-                     else
-                       "#{fn_last_part} (#{duplicate_counter})"
-                     end
-      new_filename = (parts + [fn_last_part, ext]).compact.join('.')
-      return new_filename unless @filenames_set.include?(new_filename)
-      duplicate_counter += 1
-    end
-  end
 end

data/lib/zip_tricks/uniquify_filename.rb ADDED

@@ -0,0 +1,38 @@
+module ZipTricks::UniquifyFilename
+  # Makes a given filename unique by appending a (n) suffix
+  # just before the filename extension. So "file.txt" gets
+  # transformed into "file (1).txt". The transformation is applied
+  # repeatedly as long as the generated filename is present
+  # in the `while_included_in` object
+  #
+  # @param path[String] the path to make unique
+  # @param while_included_in[#include?] an object that stores the list of already used paths
+  # @return [String] the path as is, or with the suffix required to make it unique
+  def self.call(path, while_included_in)
+    return path unless while_included_in.include?(path)
+    # we add (1), (2), (n) at the end of a filename before the filename extension,
+    # but only if there is a duplicate
+    copy_pattern = /\((\d+)\)$/
+    parts = path.split('.')
+    ext = if parts.last =~ /gz|zip/ && parts.size > 2
+            parts.pop(2)
+          elsif parts.size > 1
+            parts.pop
+          end
+    fn_last_part = parts.pop
+    duplicate_counter = 1
+    loop do
+      fn_last_part = if fn_last_part =~ copy_pattern
+                       fn_last_part.sub(copy_pattern, "(#{duplicate_counter})")
+                     else
+                       "#{fn_last_part} (#{duplicate_counter})"
+                     end
+      new_path = (parts + [fn_last_part, ext]).compact.join('.')
+      return new_path unless while_included_in.include?(new_path)
+      duplicate_counter += 1
+    end
+  end
+end

data/lib/zip_tricks/version.rb CHANGED

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module ZipTricks
-  VERSION = '4.7.4'
+  VERSION = '4.8.0'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: zip_tricks
 version: !ruby/object:Gem::Version
-  version: 4.7.4
+  version: 4.8.0
 platform: ruby
 authors:
 - Julik Tarkhanov
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-06-16 00:00:00.000000000 Z
+date: 2019-08-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -228,6 +228,7 @@ files:
 - lib/zip_tricks/file_reader/stored_reader.rb
 - lib/zip_tricks/null_writer.rb
 - lib/zip_tricks/output_enumerator.rb
+- lib/zip_tricks/path_set.rb
 - lib/zip_tricks/rack_body.rb
 - lib/zip_tricks/rails_streaming.rb
 - lib/zip_tricks/remote_io.rb
@@ -239,6 +240,7 @@ files:
 - lib/zip_tricks/streamer/entry.rb
 - lib/zip_tricks/streamer/stored_writer.rb
 - lib/zip_tricks/streamer/writable.rb
+- lib/zip_tricks/uniquify_filename.rb
 - lib/zip_tricks/version.rb
 - lib/zip_tricks/write_and_tell.rb
 - lib/zip_tricks/write_buffer.rb