RubyGems - readorder - Versions diffs - 1.0.0 - Mend

readorder 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

data/HISTORY +4 -0
data/LICENSE +13 -0
data/README +158 -0
data/bin/readorder +11 -0
data/gemspec.rb +53 -0
data/lib/readorder/analyzer.rb +170 -0
data/lib/readorder/cli.rb +159 -0
data/lib/readorder/command.rb +147 -0
data/lib/readorder/commands/analyze.rb +17 -0
data/lib/readorder/commands/sort.rb +26 -0
data/lib/readorder/commands/test.rb +234 -0
data/lib/readorder/datum.rb +181 -0
data/lib/readorder/filelist.rb +61 -0
data/lib/readorder/log.rb +58 -0
data/lib/readorder/paths.rb +69 -0
data/lib/readorder/runner.rb +48 -0
data/lib/readorder/version.rb +30 -0
data/lib/readorder.rb +24 -0
data/spec/analyzer_spec.rb +51 -0
data/spec/command_spec.rb +37 -0
data/spec/filelist_spec.rb +53 -0
data/spec/log_spec.rb +13 -0
data/spec/paths_spec.rb +45 -0
data/spec/runner_spec.rb +46 -0
data/spec/spec_helper.rb +57 -0
data/spec/version_spec.rb +16 -0
data/tasks/announce.rake +39 -0
data/tasks/config.rb +107 -0
data/tasks/distribution.rake +38 -0
data/tasks/documentation.rake +32 -0
data/tasks/rspec.rake +29 -0
data/tasks/rubyforge.rake +51 -0
data/tasks/utils.rb +80 -0
metadata +161 -0

data/lib/readorder/command.rb ADDED Viewed

@@ -0,0 +1,147 @@
+require 'readorder'
+module Readorder
+  # The Command is the base class for any class that wants to implement a
+  # command line command for readorder.
+  #
+  # Inheriting from this class will make the class registered and be available
+  # for invocation from the Runner class
+  #
+  # The lifecycle of a command is:
+  #
+  #   1) instantiation with a hash parameter
+  #   2) before
+  #   3) run
+  #   4) after
+  #   5) error called if the runner catches an exception from the command
+  #
+  class Command
+    class Error < ::Readorder::Error ; end
+    def self.command_name
+      name.split("::").last.downcase
+    end
+    attr_reader :options
+    attr_reader :filelist
+    attr_reader :analyzer
+    attr_reader :output
+    def initialize( opts = {} )
+      @options = opts
+      @filelist = nil
+      @analyzer = nil
+      @output = nil
+    end
+    def filelist
+      unless @filelist then
+        begin
+          @filelist = Filelist.new( @options['filelist'] )
+        rescue => fe
+          msg = "Invalid file list.  The list of files containing filenames should be given on the commandline, or filenames should be sent in on stdin."
+          raise Error, msg
+        end
+      end
+      return @filelist
+    end
+    def analyzer
+      @analyzer ||= Analyzer.new( filelist, self.get_physical? )
+    end
+    def output
+      unless @output then
+        if options['output'] then
+          logger.info "output going to #{options['output']}"
+          @output = File.open( options['output'] , "w+" )
+        else
+          @output = $stdout
+        end
+      end
+      return @output
+    end
+    def get_physical?
+      return false if @options['inode']
+      unless Datum.is_linux? then
+        logger.warn "unable to get physical block number, this is not a linux machine, it is #{Config::CONFIG['host_os']}"
+        return false
+      end
+      unless Process.euid == 0 then
+        logger.warn "no permissions to get physical block number, try running as root."
+        return false
+      end
+      return true
+    end
+    def command_name
+      self.class.command_name
+    end
+    def logger
+      ::Logging::Logger[self]
+    end
+    # called by the Runner before the command, this can be used to setup
+    # additional items for the command
+    def before() ; end
+    # called by the Runner to execute the command
+    def run
+      raise Error, "Unknown command `#{command_name}`"
+    end
+    # called by the Runner if an error is encountered during the run method
+    def error() nil; end
+    # called by runner if a signal is hit
+    def shutdown() nil; end
+    # called by runner when all is done
+    def after()
+      if output != $stdout then
+        output.close
+      end
+      if options['error-filelist'] then
+        if analyzer.bad_data.size > 0 then
+          File.open( options['error-filelist'], "w+" ) do |f|
+            analyzer.dump_bad_data_to( f )
+          end
+          logger.info "wrote error filelist to #{options['error-filelist']}"
+        end
+      end
+    end
+    class << self
+      # this method is invoked by the Ruby interpreter whenever a class inherts
+      # from Command.  This is how commands register to be invoked
+      #
+      def inherited( klass )
+        return unless klass.instance_of? Class
+        return if commands.include? klass
+        commands << klass
+      end
+      # The list of commands registered.
+      #
+      def commands
+        unless defined? @commands
+          @commands = []
+        end
+        return @commands
+      end
+      # get the command klass for the given name
+      def find( name )
+        @commands.find { |klass| klass.command_name == name }
+      end
+    end
+  end
+end
+require 'readorder/commands/sort'
+require 'readorder/commands/analyze'
+require 'readorder/commands/test'

data/lib/readorder/commands/analyze.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module Readorder
+  module Commands
+    #
+    # Analyze the list of files to sort and give a report
+    #
+    class Analyze < ::Readorder::Command
+      def run
+        analyzer.collect_data
+        output.puts @analyzer.summary_report
+        if options['data-csv'] then
+          File.open( options['data-csv'], "w+") { |f| analyzer.dump_good_data_to( f ) }
+          logger.info "dumped #{analyzer.good_data.size} rows to #{options['data-csv']}"
+        end
+      end
+    end
+  end
+end

data/lib/readorder/commands/sort.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require 'readorder/command'
+module Readorder
+  module Commands
+    #
+    # Run an analyzer to gather all the information and then output the
+    # filenames to stdout or to the output file
+    #
+    class Sort < ::Readorder::Command
+      def run
+        analyzer.collect_data
+        analyzer.log_summary_report
+        data = nil
+        if get_physical? then
+          logger.info "using physical order"
+          data = analyzer.physical_order
+        else
+          logger.info "using inode order"
+          data = analyzer.inode_order
+        end
+        data.values.each do |d|
+          output.puts d.filename
+        end
+      end
+    end
+  end
+end

data/lib/readorder/commands/test.rb ADDED Viewed

@@ -0,0 +1,234 @@
+require 'stringio'
+module Readorder
+  module Commands
+    #
+    # Test reading all the contents of a subset of the files and report summary
+    # information on how long it takes to read the files given different
+    # reading orders.
+    #
+    class Test < ::Readorder::Command
+      #
+      # call-seq:
+      #   test.before -> nil
+      #
+      # Part of the Command lifecycle.  In the Test command this makes sure we
+      # are on a Linux machine and running as root.
+      #
+      def before
+        super
+        unless Datum.is_linux? then
+          raise Error, "Only able to perform testing on linux.  I know how to dump the file sysem cache there."
+        end
+        unless Process.euid == 0 then
+          raise Error, "Must be root to perform testing."
+        end
+      end
+      #
+      # call-seq:
+      #   test.first_of( Filelist ) -> Filelist
+      #
+      # Use the *percentage* option to take the first *percentage* of the input
+      # Filelist and return a new Filelist object containing that subset.  The
+      # number of lines kept is rounded up.
+      #
+      def first_of( data )
+        percentage = options['percentage']
+        logger.info "gathering the first #{percentage}% of the data"
+        lines = []
+        data.each_line { |l| lines << l.strip }
+        # Array#first takes a count, so exactly this many lines are kept; an
+        # inclusive 0..count range would keep one line too many.
+        keep = ( lines.size.to_f * ( percentage.to_f / 100.0 ) ).ceil
+        keep = 0 if keep < 0
+        subset = lines.first( keep )
+        return Filelist.new( StringIO.new( subset.join("\n") ) )
+      end
+      #
+      # call-seq:
+      #   test.sample_from( Filelist ) -> Filelist
+      #
+      # Use the *percentage* option to take a random subsampling of data from
+      # the input Filelist and return a new Filelist object containing that
+      # subset.
+      #
+      def sample_from( data )
+        logger.info "sampling a random #{options['percentage']}% of the data"
+        samples = []
+        total = 0
+        # to_f so that a percentage given as a String works, as in first_of
+        fraction = options['percentage'].to_f / 100.0
+        data.each_line do |l|
+          total += 1
+          samples << l.strip if rand < fraction
+        end
+        logger.info "sampled #{samples.size} of #{total}"
+        return Filelist.new( StringIO.new( samples.join("\n") ) )
+      end
+      #
+      # call-seq:
+      #   test.run -> nil
+      #
+      # Part of the Command lifecycle.
+      #
+      def run
+        test_using_random_sample
+        test_using_first_of
+      end
+      #
+      # call-seq:
+      #   test.test_using_random_sample
+      #
+      # Run the full test using a random subsample of the original Filelist
+      #
+      def test_using_random_sample
+        # reset so that #filelist builds a fresh Filelist to read from
+        @filelist = nil
+        sublist = sample_from( self.filelist )
+        results = test_using_sublist( sublist )
+        output.puts "Test Using Random Sample".center(72)
+        output.puts "=" * 72
+        report_results( results )
+      end
+      #
+      # call-seq:
+      #   test.test_using_first_of
+      #
+      # Run the full test using the first *percentage* of the original
+      # Filelist
+      #
+      def test_using_first_of
+        # reset so that #filelist builds a fresh Filelist to read from
+        @filelist = nil
+        sublist = first_of( self.filelist )
+        results = test_using_sublist( sublist )
+        output.puts "Test Using First Of".center(72)
+        output.puts "=" * 72
+        report_results( results )
+      end
+      #
+      # call-seq:
+      #   test.test_using_sublist( Filelist ) -> Array of TimedValueMetric
+      #
+      # Given a Filelist, collect the data for it and run the read test once
+      # for each ordering: original order, inode number and first physical
+      # block number.
+      #
+      def test_using_sublist( sublist )
+        analyzer = Analyzer.new( sublist )
+        analyzer.collect_data
+        results = []
+        %w[ original_order inode_number first_physical_block_number ].each do |order|
+          logger.info "ordering #{analyzer.good_data.size} samples by #{order}"
+          tree = ::MultiRBTree.new
+          analyzer.good_data.each do |s|
+            rank = s.send( order )
+            tree[rank] = s
+          end
+          results << run_test( order, tree.values )
+        end
+        return results
+      end
+      #
+      # call-seq:
+      #   test.report_results( results ) -> nil
+      #
+      # Write the report of the timings to output.  The file count and size
+      # statistics are taken from the first timing.
+      #
+      def report_results( timings )
+        t = timings.first
+        output.puts
+        output.puts "  Total files read : #{"%12d" % t.value_stats.count}"
+        output.puts "  Total bytes read : #{"%12d" % t.value_stats.sum}"
+        output.puts "  Minimum filesize : #{"%12d" % t.value_stats.min}"
+        output.puts "  Average filesize : #{"%16.3f" % t.value_stats.mean}"
+        output.puts "  Maximum filesize : #{"%12d" % t.value_stats.max}"
+        output.puts "  Stddev of sizes  : #{"%16.3f" % t.value_stats.stddev}"
+        output.puts
+        output.puts ["%28s" % "read order", "%20s" % "Elapsed time (sec)", "%22s" % "Read rate (bytes/sec)" ].join(" ")
+        output.puts "-" * 72
+        timings.each do |timing|
+          columns = [ ]
+          columns << "%28s" % timing.name
+          columns << "%20.3f" % timing.timed_stats.sum
+          columns << "%22.3f" % timing.rate
+          output.puts columns.join(" ")
+        end
+        output.puts
+      end
+      #
+      #
+      # call-seq:
+      #   test.run_test( 'original', [ Datum, Datum, ... ]) -> Hitimes::TimedValueMetric
+      #
+      # Loop over all the Datum instances in the array and read the contents of
+      # the file dumping them to /dev/null.  Timings of this process are recorded
+      # and a Hitimes::TimedValueMetric is returned which holds the results.
+      #
+      def run_test( test_name, data )
+        logger.info "running #{test_name} test on #{data.size} files"
+        self.drop_caches
+        timer = ::Hitimes::TimedValueMetric.new( test_name )
+        logger.info "  begin test"
+        data.each do |d|
+          timer.start
+          bytes = dump_to_dev_null( d )
+          timer.stop( bytes )
+          processed = timer.timed_stats.count
+          if processed % 10_000 == 0 then
+            # the count lives on timed_stats, as used by the final log line
+            logger.info "  processed #{processed} at #{"%0.3f" % timer.rate} bytes/sec"
+          end
+        end
+        logger.info "  end test"
+        logger.info "  processed #{timer.timed_stats.count} at #{"%0.3f" % timer.rate} bytes/sec"
+        return timer
+      end
+      #
+      # call-seq:
+      #   test.drop_caches -> nil
+      #
+      # Drop the caches on a linux filesystem.
+      #
+      # See proc(5) and /proc/sys/vm/drop_caches
+      #
+      def drop_caches
+        # old habits die hard
+        logger.info "  dropping caches"
+        3.times { %x[ /bin/sync ] }
+        File.open( "/proc/sys/vm/drop_caches", "w" ) do |f|
+          f.puts 3
+        end
+      end
+      #
+      # call-seq:
+      #   test.dump_to_dev_null( Datum ) -> Integer
+      #
+      # Write the contents of the file info in Datum to /dev/null and return the
+      # number of bytes written.  Reading stops at end of file; any other error
+      # while copying is logged as a warning and also stops the copy, so the
+      # byte count returned covers what was written up to that point.
+      #
+      def dump_to_dev_null( datum )
+        bytes = 0
+        File.open( "/dev/null", "w+" ) do |writer|
+          File.open( datum.filename, "r") do |reader|
+            chunk_size = datum.stat.blksize || 4096
+            buf = String.new
+            loop do
+              begin
+                r = reader.sysread( chunk_size, buf )
+                bytes += writer.write( r )
+              rescue EOFError
+                # sysread signals end of file by raising
+                break
+              rescue => e
+                logger.warn "  stopped reading #{datum.filename} : #{e}"
+                break
+              end
+            end
+          end
+        end
+        return bytes
+      end
+    end
+  end
+end

data/lib/readorder/datum.rb ADDED Viewed

@@ -0,0 +1,181 @@
+require 'rbconfig'
+require 'pathname'
+module Readorder
+  #
+  # All the block, inode and stat information about one file
+  #
+  class Datum
+    # The fully qualified path of the file
+    attr_reader :filename
+    # The inode number of the file
+    attr_reader :inode_number
+    # The physical block number of the first disc block of the file.  This piece
+    # of data may not be gathered.  This will be nil if that is the case
+    attr_reader :first_physical_block_number
+    # if there is a reason this file is not eligible for analysis this explains
+    # why
+    attr_reader :error_reason
+    # File::Stat of the file
+    attr_reader :stat
+    # count of the number of physical disc blocks this file consumes.  This is
+    # only gathered if the *first_physical_block_number* is also gathered.
+    attr_reader :physical_block_count
+    # the original order in which the Datum was collected
+    attr_accessor :original_order
+    # Check if we are running on linux.  We use this to enable
+    # us to check the physical block id.
+    def self.is_linux?
+      @is_linux ||= ::Config::CONFIG['host_os'] =~ /linux/i
+    end
+    #
+    # call-seq:
+    #   Datum.new( filename ) -> Datum
+    #
+    # Create a new Datum instance for the given filename
+    #
+    def initialize( filename )
+      @filename = ::File.expand_path( filename.strip )
+      @inode_number = nil
+      @first_physical_block_number = nil
+      @physical_block_count = 0
+      @error_reason = nil
+      @original_order = 0
+      @stat = nil
+      @valid = false
+      @collected = false
+    end
+    #
+    # call-seq:
+    #   datum.size -> Integer
+    #
+    # The number of bytes the file consumes
+    #
+    def size
+      @stat.size
+    end
+    #
+    # call-seq:
+    #   datum.logger -> Logger
+    #
+    # The Logger for the instance
+    #
+    def logger
+      ::Logging::Logger[self]
+    end
+    #
+    # :call-seq:
+    #   datum.collect( get_physical = true ) -> true
+    #
+    # Collect all the information about the file we need.
+    # This includes:
+    #
+    # * making sure we have a valid file, this means the file exists
+    #   and is non-zero in size
+    # * getting the inode number of the file
+    # * getting the physical block number of the first block of the file
+    # * getting the device of the file
+    #
+    # If false is passed in, then the physical block number is not
+    # collected.
+    #
+    def collect( get_physical = true )
+      unless @collected then
+        begin
+          @stat = ::File.stat( @filename )
+          if not @stat.file? then
+            @valid = false
+            @error_reason = "Not a file"
+          elsif @stat.zero? then
+            @valid = false
+            @error_reason = "0 byte file"
+          else
+            @inode_number = @stat.ino
+            if get_physical then
+              @first_physical_block_number = self.find_first_physical_block_number
+            end
+            @valid = true
+          end
+        rescue => e
+          @error_reason = e.to_s
+          logger.warn e.to_s
+          @valid = false
+        ensure
+          @collected = true
+        end
+      end
+      return @collected
+    end
+    #
+    # call-seq:
+    #   datum.valid?
+    #
+    # Does this Datum represent a collection of valid data
+    #
+    def valid?
+      @valid
+    end
+    ####
+    # Not part of the public api
+    protected
+    # find the mountpoint for this datum.  We traverse up the Pathname
+    # of the datum until we get to a parent where #mountpoint? is true
+    #
+=begin
+    def find_mountpoint
+      p = Pathname.new( @filename ).parent
+      until p.mountpoint? do
+        p = p.parent
+      end
+      return p.to_s
+    end
+=end
+    #
+    # call-seq:
+    #   datum.find_first_physical_block_number -> Integer
+    #
+    # find the first physical block number, this only applies to linux
+    # machines.
+    #
+    # This is only called within the context of the #collect method
+    #
+    def find_first_physical_block_number
+      return nil unless Datum.is_linux?
+      first_block_num = 0
+      File.open( @filename ) do |f|
+        @stat.blocks.times do |i|
+          j = [i].pack("i")
+          # FIBMAP = 0x00000001
+          f.ioctl( 0x00000001, j )
+          block_id = j.unpack("i")[0]
+          if block_id > 0 then
+            first_block_num = block_id if block_id < first_block_num || first_block_num == 0
+            @physical_block_count += 1
+          end
+        end
+      end
+      return first_block_num
+    end
+  end
+end

data/lib/readorder/filelist.rb ADDED Viewed

@@ -0,0 +1,61 @@
+module Readorder
+  #
+  # An iterator over the contents of a bunch of files or IO objects
+  # depending on the initializer.
+  #
+  class Filelist
+    class Error < ::Readorder::Error; end
+    def initialize( sources = [] )
+      @sources = [ sources ].flatten
+      @current_source = nil
+      @sources.each do |s|
+        case s
+        when String
+          raise Error, "#{s} does not exist" unless File.exist?( s )
+          raise Error, "#{s} is not readable" unless File.readable?( s )
+        else
+          [ :gets, :close ].each do |meth|
+            raise Error, "#{s.inspect} does not respond to '#{meth}'" unless s.respond_to? meth
+          end
+        end
+      end
+    end
+    def current_source
+      if not @current_source then
+        cs = @sources.shift
+        case cs
+        when String
+          @current_source = File.open( cs )
+        else
+          # nil or respond_to? :gets
+          @current_source = cs
+        end
+      end
+      return @current_source
+    end
+    # return the next line from the sources, opening a new source if
+    # need be
+    def gets
+      loop do
+        return nil unless self.current_source
+        line = self.current_source.gets
+        return line if line
+        @current_source.close unless @current_source == $stdin
+        @current_source = nil
+      end
+    end
+    #
+    # Iterator yielding the line returned, stopping on no more lines
+    #
+    def each_line
+      while line = self.gets do
+        yield line
+      end
+    end
+  end
+end