readorder 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,147 @@
1
+ require 'readorder'
2
+
3
+ module Readorder
4
+ # The Command is the base class for any class that wants to implement a
5
+ # command line command for
6
+ #
7
+ # Inheriting from this class registers the subclass and makes it available
8
+ # for invocation from the Runner class
9
+ #
10
+ # The lifecycle of a command is:
11
+ #
12
+ # 1) instantiation with a hash parameter
13
+ # 2) before
14
+ # 3) run
15
+ # 4) after
16
+ # 5) error, called if the runner catches an exception from the command
17
+ #
18
+ class Command
19
+ class Error < ::Readorder::Error ; end
20
+
21
# The name a command is looked up by: the last "::"-separated segment of
# the class name, lowercased.
def self.command_name
  segments = name.split("::")
  segments[-1].downcase
end
24
+
25
+ attr_reader :options
26
+ attr_reader :filelist
27
+ attr_reader :analyzer
28
+ attr_reader :output
29
+
30
# opts - Hash of options for the command; stored untouched in @options.
#
# The file list, analyzer and output start out nil and are built on first
# use by their accessor methods.
def initialize(opts = {})
  @filelist = @analyzer = @output = nil
  @options  = opts
end
36
+
37
# Lazily build (and cache) the Filelist named by the 'filelist' option.
#
# Any StandardError raised while constructing the Filelist is reported as a
# Command::Error carrying a usage hint.
def filelist
  @filelist ||= Filelist.new(@options['filelist'])
rescue
  raise Error, "Invalid file list. The list of files containing filenames should be given on the commandline, or filenames should be sent in on stdin."
end
48
+
49
# Lazily build (and cache) the Analyzer over the command's file list; the
# second argument tells it whether physical block numbers can be gathered.
def analyzer
  return @analyzer if @analyzer

  @analyzer = Analyzer.new(filelist, get_physical?)
end
52
+
53
# The IO results are written to: the file named by the 'output' option
# (opened "w+" on first use) or $stdout when that option is not set.
def output
  return @output if @output

  if options['output']
    logger.info "output going to #{options['output']}"
    @output = File.open(options['output'], "w+")
  else
    @output = $stdout
  end
  @output
end
64
+
65
+
66
# Decide whether physical block numbers should be gathered.
#
# Returns false when the 'inode' option is set, when Datum reports the host
# is not linux, or when the effective user is not root (the last two cases
# log a warning); returns true otherwise.
def get_physical?
  return false if @options['inode']

  unless Datum.is_linux?
    # RbConfig replaces the top-level Config constant, which current Rubies
    # no longer define.
    logger.warn "unable to get physical block number, this is not a linux machine, it is #{RbConfig::CONFIG['host_os']}"
    return false
  end

  unless Process.euid == 0
    logger.warn "no permissions to get physical block number, try running as root."
    return false
  end

  true
end
78
+
79
# Instance-level shortcut for the class-level command_name.
def command_name
  klass = self.class
  klass.command_name
end
82
+
83
# The logger ::Logging::Logger hands back when indexed with this instance.
def logger
  registry = ::Logging::Logger
  registry[self]
end
86
+
87
+ # called by the Runner before the command, this can be used to setup
88
+ # additional items for the command
89
def before
  # intentionally empty: subclasses override this hook when they need setup
end
90
+
91
+ # called by the Runner to execute the command
92
def run
  # The base class has nothing to execute; subclasses override this method.
  message = "Unknown command `#{command_name}`"
  raise Error, message
end
95
+
96
+ # called by the Runner if an error is encountered during the run method
97
def error
  # default hook: nothing to do
  nil
end
98
+
99
+ # called by runner if a signal is hit
100
def shutdown
  # default hook: nothing to do
  nil
end
101
+
102
+ # called by runner when all is done
103
def after
  # Close the output unless it is $stdout.
  output.close unless output == $stdout

  # When an 'error-filelist' path was given and the analyzer recorded bad
  # data, dump that data to the path and log where it went.
  return unless options['error-filelist']
  return unless analyzer.bad_data.size > 0

  File.open(options['error-filelist'], "w+") { |f| analyzer.dump_bad_data_to(f) }
  logger.info "wrote error filelist to #{options['error-filelist']}"
end
116
+
117
class << self
  # Invoked by the Ruby interpreter whenever a class inherits from Command.
  # This is how commands register themselves so they can be invoked.
  #
  # Only real Class objects are registered, and each class at most once.
  def inherited(klass)
    return unless klass.instance_of?(Class)

    commands << klass unless commands.include?(klass)
  end

  # The list of commands registered so far (created empty on first use).
  def commands
    @commands ||= []
  end

  # Get the command class whose command_name equals +name+, or nil when no
  # such command is registered.
  #
  # Goes through #commands rather than @commands so that a lookup made
  # before any command has registered returns nil instead of failing on an
  # unset instance variable.
  def find(name)
    commands.find { |klass| klass.command_name == name }
  end
end
142
+ end
143
+ end
144
+
145
+ require 'readorder/commands/sort'
146
+ require 'readorder/commands/analyze'
147
+ require 'readorder/commands/test'
@@ -0,0 +1,17 @@
1
+ module Readorder
2
+ module Commands
3
+ #
4
+ # Analyze the list of files to sort and give a report
5
+ #
6
+ class Analyze < ::Readorder::Command
7
# Collect the data, print the analyzer's summary report to the output and,
# when the 'data-csv' option names a path, dump the good data there.
def run
  analyzer.collect_data
  output.puts @analyzer.summary_report

  csv_path = options['data-csv']
  return unless csv_path

  File.open(csv_path, "w+") do |csv|
    analyzer.dump_good_data_to(csv)
  end
  logger.info "dumped #{analyzer.good_data.size} rows to #{csv_path}"
end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,26 @@
1
+ require 'readorder/command'
2
+ module Readorder
3
+ module Commands
4
+ #
5
+ # Run an analyzer to gather all the information and then output the
6
+ # filenames to stdout or to the output file
7
+ #
8
+ class Sort < ::Readorder::Command
9
# Collect the data, log the summary report, then write every filename to
# the output in physical-block order when that is available and in inode
# order otherwise.
def run
  analyzer.collect_data
  analyzer.log_summary_report

  ordered =
    if get_physical?
      logger.info "using physical order"
      analyzer.physical_order
    else
      logger.info "using inode order"
      analyzer.inode_order
    end

  ordered.values.each { |datum| output.puts datum.filename }
end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,234 @@
1
+ require 'stringio'
2
+ module Readorder
3
+ module Commands
4
+ #
5
+ # Test reading all the contents of a subset of the files and report summary
6
+ # information on how long it takes to read the files given different
7
+ # reading orders.
8
+ #
9
+ class Test < ::Readorder::Command
10
+
11
+ #
12
+ # call-seq:
13
+ # test.before -> nil
14
+ #
15
+ # Part of the Command lifecycle. In the Test command this make sure we
16
+ # are on a Linux machine and running as root.
17
+ #
18
def before
  super

  # Both checks raise; nothing is returned when they pass.
  unless Datum.is_linux?
    raise Error, "Only able to perform testing on linux. I know how to dump the file sysem cache there."
  end

  raise Error, "Must be root to perform testing." unless Process.euid == 0
end
27
+
28
+ #
29
+ # call-seq:
30
+ # test.first_of( Filelist ) -> Filelist
31
+ #
32
+ # Use the *percentage* option to take the first *percentage* of the input
33
+ # Filelist and return a new Filelist object continaing that subjset.
34
+ #
35
def first_of( data )
  percentage = options['percentage']
  logger.info "gathering the first #{percentage}% of the data"

  lines = []
  data.each_line { |l| lines << l.strip }

  # Number of lines to keep, rounded up. An inclusive 0..count range would
  # return count + 1 lines (50% of 10 would yield 6, and 0% would still
  # yield 1), so take exactly `count` leading lines instead. The count is
  # clamped so out-of-range percentages cannot produce a negative or
  # oversized request.
  count = ( lines.size * percentage.to_f / 100.0 ).ceil
  count = [ [ count, 0 ].max, lines.size ].min

  subset = lines.first( count )
  return Filelist.new( StringIO.new( subset.join("\n") ) )
end
44
+
45
+ #
46
+ # call-seq:
47
+ # test.sample_from( Filelist ) -> Filelist
48
+ #
49
+ # Use the *percentage* option to take a random subsampling of data from
50
+ # the input Filelist and return an new Filelist object containing that
51
+ # subset.
52
+ #
53
def sample_from( data )
  percentage = options['percentage']
  logger.info "sampling a random #{percentage}% of the data"

  # Coerce with to_f, as first_of does, so a percentage that arrives as a
  # String still works instead of failing on String#/.
  fraction = percentage.to_f / 100.0

  total   = 0
  samples = []
  data.each_line do |l|
    total += 1
    # each line is kept independently with probability `fraction`
    samples << l.strip if rand < fraction
  end

  logger.info "sampled #{samples.size} of #{total}"
  return Filelist.new( StringIO.new( samples.join("\n") ) )
end
67
+
68
+ #
69
+ # call-seq:
70
+ # test.run -> nil
71
+ #
72
+ # Part of the Command lifecycle.
73
+ #
74
def run
  # two passes, each printing its own report: random sample first, then the
  # leading portion of the list
  test_using_random_sample()
  test_using_first_of()
end
78
+
79
+ #
80
+ # call-seq:
81
+ # test.test_using_random_sample
82
+ #
83
+ # Run the full test using a random subsample of the original Filelist
84
+ #
85
def test_using_random_sample
  # clear the cached file list before asking for it again
  @filelist = nil

  timings = test_using_sublist( sample_from( filelist ) )

  title = "Test Using Random Sample"
  output.puts title.center(72)
  output.puts "=" * 72
  report_results( timings )
end
94
+
95
+ #
96
+ # call-seq:
97
+ # test.test_using_first_of
98
+ #
99
+ # Run the full test using the first *percentage* of the original
100
+ # Filelist
101
+ #
102
def test_using_first_of
  # clear the cached file list before asking for it again
  @filelist = nil

  timings = test_using_sublist( first_of( filelist ) )

  title = "Test Using First Of"
  output.puts title.center(72)
  output.puts "=" * 72
  report_results( timings )
end
110
+
111
+ #
112
+ # call-seq:
113
+ # test.test_using_sublist( Filelist ) -> Array of TimedValueMetric
114
+ #
115
+ # given a Filelist of messages run the whole test on them all
116
+ #
117
def test_using_sublist( sublist )
  sub_analyzer = Analyzer.new( sublist )
  sub_analyzer.collect_data

  # One timed read pass per ordering; each pass keys the good data by the
  # named Datum attribute in a MultiRBTree and reads the files in the order
  # the tree's #values returns them.
  %w[ original_order inode_number first_physical_block_number ].map do |order|
    logger.info "ordering #{sub_analyzer.good_data.size} samples by #{order}"
    ranked = ::MultiRBTree.new
    sub_analyzer.good_data.each { |sample| ranked[sample.send( order )] = sample }
    run_test( order, ranked.values )
  end
end
133
+
134
+ #
135
+ # call-seq:
136
+ # test.report_results( results ) -> nil
137
+ #
138
+ # Write the report of the timings to output
139
+ #
140
def report_results( timings )
  # File size statistics are taken from the first timing only.
  sizes = timings.first.value_stats

  output.puts
  output.puts " Total files read : #{"%12d" % sizes.count}"
  output.puts " Total bytes read : #{"%12d" % sizes.sum}"
  output.puts " Minimum filesize : #{"%12d" % sizes.min}"
  output.puts " Average filesize : #{"%16.3f" % sizes.mean}"
  output.puts " Maximum filesize : #{"%12d" % sizes.max}"
  output.puts " Stddev of sizes : #{"%16.3f" % sizes.stddev}"
  output.puts

  header = [ "%28s" % "read order", "%20s" % "Elapsed time (sec)", "%22s" % "Read rate (bytes/sec)" ]
  output.puts header.join(" ")
  output.puts "-" * 72

  # one row per read order: name, total elapsed time, read rate
  timings.each do |timing|
    row = [
      "%28s" % timing.name,
      "%20.3f" % timing.timed_stats.sum,
      "%22.3f" % timing.rate
    ]
    output.puts row.join(" ")
  end
  output.puts
end
162
+ #
163
+ #
164
+ # call-seq:
165
+ # test.run_test( 'original', [ Datum, Datum, ... ]) -> Hitimes::TimedValueMetric
166
+ #
167
+ # Loop over all the Datum instances in the array and read the contents of
168
+ # the file dumping them to /dev/null. Timings of this process are recorded
169
+ # and a Hitimes::TimedValueMetric is returned which holds the results.
170
+ #
171
def run_test( test_name, data )
  logger.info "running #{test_name} test on #{data.size} files"
  drop_caches

  timer = ::Hitimes::TimedValueMetric.new( test_name )
  logger.info " begin test"
  data.each do |d|
    # time one whole-file read and record the bytes read as the value
    timer.start
    bytes = dump_to_dev_null( d )
    timer.stop( bytes )

    # Progress line every 10,000 files. The count is read from timed_stats,
    # the same accessor used for the final summary below, rather than from
    # the metric object itself.
    processed = timer.timed_stats.count
    if processed % 10_000 == 0 then
      logger.info " processed #{processed} at #{"%0.3f" % timer.rate} bytes/sec"
    end
  end
  logger.info " end test"
  logger.info " processed #{timer.timed_stats.count} at #{"%0.3f" % timer.rate} bytes/sec"
  return timer
end
189
+
190
+ #
191
+ # call-seq:
192
+ # test.drop_caches -> nil
193
+ #
194
+ # Drop the caches on a linux filesystem.
195
+ #
196
+ # See proc(5) and /proc/sys/vm/drop_caches
197
+ #
198
def drop_caches
  logger.info " dropping caches"

  # old habits die hard: sync three times before dropping
  3.times do
    %x[ /bin/sync ]
  end

  # write 3 to the drop_caches control file
  File.open( "/proc/sys/vm/drop_caches", "w" ) { |f| f.puts 3 }
end
206
+
207
+ #
208
+ # call-seq:
209
+ # test.dump_to_dev_null( Datum ) -> Integer
210
+ #
211
+ # Write the contents of the file info in Datum to /dev/null and return the
212
+ # number of bytes written.
213
+ #
214
def dump_to_dev_null( datum )
  bytes = 0
  File.open( "/dev/null", "w+" ) do |writer|
    File.open( datum.filename, "r") do |reader|
      chunk_size = datum.stat.blksize || 4096
      buf = String.new   # reused read buffer
      begin
        loop do
          bytes += writer.write( reader.sysread( chunk_size, buf ) )
        end
      rescue EOFError
        # sysread signals end of file by raising; this is the normal exit
      rescue => e
        # Still best-effort (the bytes copied so far are returned), but a
        # failure that is not end-of-file is logged instead of being
        # silently treated as one.
        logger.warn "stopped reading #{datum.filename} early: #{e}"
      end
    end
  end
  return bytes
end
232
+ end
233
+ end
234
+ end
@@ -0,0 +1,181 @@
1
+ require 'rbconfig'
2
+ require 'pathname'
3
+ module Readorder
4
+ #
5
+ # All the block, inode and stat information about one file
6
+ #
7
+ class Datum
8
+
9
+ # The fully qualified path of the file
10
+ attr_reader :filename
11
+
12
+ # The inode number of the file
13
+ attr_reader :inode_number
14
+
15
+ # The physical block number of the first disc block of the file. This piece
16
+ # of data may not be gathered. This will be nil if that is the case
17
+ attr_reader :first_physical_block_number
18
+
19
+ # if there is a reason this file is not eligible for analysis this explains
20
+ # why
21
+ attr_reader :error_reason
22
+
23
+ # File::Stat of the file
24
+ attr_reader :stat
25
+
26
+ # count of the number of physical disc blocks this file consumes. This is
27
+ # only gathered if the *first_physical_block_number* is also gathered.
28
+ attr_reader :physical_block_count
29
+
30
+ # the original order in which the Datum was collected
31
+ attr_accessor :original_order
32
+
33
+ # Check if we are running on linux. We use this to enable
34
+ # us to check the physical block id.
35
# Returns true when RbConfig's host_os matches /linux/i, false otherwise.
#
# The answer is computed once. `defined?` is used instead of `||=` so that
# a false result is cached too, and RbConfig is used because current Rubies
# no longer define the top-level Config constant.
def self.is_linux?
  unless defined?( @is_linux )
    @is_linux = !!( ::RbConfig::CONFIG['host_os'] =~ /linux/i )
  end
  @is_linux
end
38
+
39
+ #
40
+ # call-seq:
41
+ # Datum.new( filename ) -> Datum
42
+ #
43
+ # Create a new Datum instance for the given filename
44
+ #
45
def initialize( filename )
  # surrounding whitespace is stripped before the path is made absolute
  @filename = ::File.expand_path( filename.strip )

  # nothing is known about the file until #collect is called
  @stat = @inode_number = @first_physical_block_number = @error_reason = nil
  @physical_block_count = @original_order = 0
  @valid = @collected = false
end
57
+
58
+ #
59
+ # call-seq:
60
+ # datum.size -> Integer
61
+ #
62
+ # The number of bytes the file consumes
63
+ #
64
def size
  # @stat is only assigned by #collect; it is nil before that
  file_stat = @stat
  file_stat.size
end
67
+
68
+ #
69
+ # call-seq:
70
+ # datum.logger -> Logger
71
+ #
72
+ # The Logger for the instance
73
+ #
74
def logger
  registry = ::Logging::Logger
  registry[self]
end
77
+
78
+ #
79
+ # :call-seq:
80
+ # datum.collect( get_physical = true ) -> true
81
+ #
82
+ # Collect all the information about the file we need.
83
+ # This includes:
84
+ #
85
+ # * making sure we have a valid file, this means the file exists
86
+ # and is non-zero in size
87
+ # * getting the inode number of the file
88
+ # * getting the physical block number of the first block of the file
89
+ # * getting the device of the file
90
+ #
91
+ # If false is passed in, then the physical block number is not
92
+ # collected.
93
+ #
94
def collect( get_physical = true )
  # collection happens at most once per Datum
  return @collected if @collected

  begin
    @stat  = ::File.stat( @filename )
    @valid = @stat.file? && !@stat.zero?

    if @valid
      @inode_number = @stat.ino
      @first_physical_block_number = find_first_physical_block_number if get_physical
    else
      @error_reason = @stat.file? ? "0 byte file" : "Not a file"
    end
  rescue => e
    # any failure (missing file, permissions, ioctl error) marks the datum
    # invalid and records why
    @error_reason = e.to_s
    logger.warn e.to_s
    @valid = false
  ensure
    @collected = true
  end

  return @collected
end
121
+
122
+ #
123
+ # call-seq:
124
+ # datum.valid?
125
+ #
126
+ # Does this Datum represent a collection of valid data
127
+ #
128
def valid?
  # false until #collect has run and accepted the file
  return @valid
end
131
+
132
+ ####
133
+ # Not part of the public api
134
+ protected
135
+
136
+ # find the mountpoint for this datum. We traverse up the Pathname
137
+ # of the datum until we get to a parent where #mountpoint? is true
138
+ #
139
+ =begin
140
+ def find_mountpoint
141
+ p = Pathname.new( @filename ).parent
142
+ until p.mountpoint? do
143
+ p = p.parent
144
+ end
145
+ return p.to_s
146
+ end
147
+ =end
148
+
149
+ #
150
+ # call-seq:
151
+ # datum.find_first_physical_block_number -> Integer
152
+ #
153
+ # find the first physical block number, this only applies to linux
154
+ # machines.
155
+ #
156
+ # This is only called within the context of the #collect method
157
+ #
158
def find_first_physical_block_number
  return nil unless Datum.is_linux?

  fibmap = 0x00000001   # FIBMAP
  lowest = 0            # 0 doubles as "no mapped block seen yet"

  File.open( @filename ) do |f|
    @stat.blocks.times do |index|
      # the packed index is handed to ioctl and the answer is read back
      # from the same buffer afterwards
      arg = [index].pack("i")
      f.ioctl( fibmap, arg )
      block_id = arg.unpack("i").first

      next unless block_id > 0

      lowest = block_id if lowest == 0 || block_id < lowest
      @physical_block_count += 1
    end
  end

  return lowest
end
180
+ end
181
+ end
@@ -0,0 +1,61 @@
1
+ module Readorder
2
+ #
3
+ # An iterator over the contents of a bunch of files or IO objects
4
+ # depending on the initializer.
5
+ #
6
+ class Filelist
7
+ class Error < ::Readorder::Error; end
8
+
9
# sources - a single source or an (optionally nested) Array of sources.
#           A String is treated as a path that must exist and be readable;
#           anything else must respond to both gets and close.
#
# Raises Error for the first source that fails its check.
def initialize( sources = [] )
  @sources = [ sources ].flatten
  @current_source = nil

  @sources.each do |source|
    if source.is_a?( String )
      raise Error, "#{source} does not exist" unless File.exist?( source )
      raise Error, "#{source} is not readable" unless File.readable?( source )
    else
      missing = [ :gets, :close ].find { |meth| !source.respond_to?( meth ) }
      raise Error, "#{source.inspect} does not respond to '#{missing}'" if missing
    end
  end
end
24
+
25
# The source currently being read. When none is active the next entry is
# shifted off @sources: a String is opened as a file, anything else (an IO
# like object, or nil once the sources are exhausted) is used as-is.
def current_source
  return @current_source if @current_source

  upcoming = @sources.shift
  @current_source = upcoming.is_a?( String ) ? File.open( upcoming ) : upcoming
end
38
+
39
+ # return the next line from the sources, opening a new source if
40
+ # need be
41
def gets
  while ( source = current_source )
    line = source.gets
    return line if line

    # this source is exhausted: close it (never $stdin) and move on
    @current_source.close unless @current_source == $stdin
    @current_source = nil
  end

  # every source has been consumed
  nil
end
51
+
52
+ #
53
+ # Iterator yielding the line returned, stopping on no more lines
54
+ #
55
def each_line
  line = gets
  while line
    yield line
    line = gets
  end
end
60
+ end
61
+ end