file_sort 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/file_sort.rb ADDED
@@ -0,0 +1,163 @@
1
require "thread" # Mutex / ConditionVariable on Rubies that do not preload them

# Sorts a delimited text file that may be too large to fit in memory.
#
# The input is cut into fixed-size splits, each split is sorted by a child
# `sorter.rb` process, and sorted splits are merged pairwise by child
# `merger.rb` processes until one file remains. At most
# `options[:num_processes]` workers (splitter, sorters, mergers) run at once.
#
# Coordination uses a Mutex and a ConditionVariable: workers publish results
# and signal under the lock, and the scheduler re-checks state every time it
# wakes, so a notification can never be lost.
class FileSort
  DEFAULTS = {
    sort_column: 0,
    column_separator: "\t",
    num_processes: 3,
    parse_as: :int, # other options: :string
    lines_per_split: 1e6, # coerced to an Integer in #initialize
    replace_original: true,
    log_output: true
  }.freeze

  # @param filename [String] path of the file to sort
  # @param options [Hash] overrides for DEFAULTS
  def initialize(filename, options = {})
    @filename = filename
    @options = DEFAULTS.merge(options)
    @options[:lines_per_split] = @options[:lines_per_split].to_i

    @file_id_counter = 0
    @files_to_sort = []
    @files_to_merge = []
    @active_workers = 0
    @start_time = nil
    @failure = nil
    @lock = Mutex.new
    @state_changed = ConditionVariable.new
  end

  # Sorts the file. With `replace_original` the input is replaced by the
  # sorted result, otherwise the result is left in "<filename>.sorted".
  #
  # @raise [StandardError] the first error raised by a worker (unreadable
  #   input, failed child process, ...). The original file is left untouched
  #   in that case; temporary split files may remain on disk.
  def sort!
    @start_time = Time.now
    @lock.synchronize { @failure = nil }
    log("Sorting #{@filename} with up to #{@options[:num_processes]} processes.")
    make_splits

    loop do
      job, *files = @lock.synchronize { wait_for_job }
      break unless job

      # Jobs are started outside the lock: sort_split / merge_splits take the
      # lock themselves (worker_begin, next_filename) and Mutex is not reentrant.
      if job == :sort
        sort_split(*files)
      else
        merge_splits(*files, next_filename)
      end
    end

    # Take the result off the queue so the instance can be reused.
    sorted_file = @lock.synchronize { @files_to_merge.shift }
    final_name = "#{@filename}.sorted"
    File.rename(sorted_file, final_name)
    if @options[:replace_original]
      File.delete(@filename)
      File.rename(final_name, @filename)
    end
    log("#{@filename} sort complete.")
  end

  # Starts a background thread that copies the input into split files of at
  # most `lines_per_split` lines and queues each finished split for sorting.
  # At least one split is always written, even for an empty input.
  #
  # @return [Thread] the splitter thread
  def make_splits
    log("Splitting #{@filename} every #{@options[:lines_per_split]} lines")
    worker_begin
    Thread.new do
      run_worker do
        outfile = nil
        begin
          File.open(@filename) do |infile|
            output_filename = next_filename
            outfile = File.open(output_filename, "w")
            line_counter = 0
            infile.each_line do |line|
              if line_counter >= @options[:lines_per_split]
                outfile.close
                log("Split written: #{fid output_filename}")
                enqueue(@files_to_sort, output_filename)
                output_filename = next_filename
                outfile = File.open(output_filename, "w")
                line_counter = 0
              end
              outfile.print(line)
              line_counter += 1
            end
            outfile.close
            log("Split written: #{fid output_filename} (final split)")
            enqueue(@files_to_sort, output_filename)
          end
        ensure
          # Only reached with an open handle when something above raised.
          outfile.close if outfile && !outfile.closed?
        end
      end
    end
  end

  # Sorts one split in a child process running sorter.rb, then deletes the
  # unsorted split and queues the sorted file for merging.
  #
  # @param filename [String] path of the unsorted split
  # @return [Thread] the thread supervising the child process
  def sort_split(filename)
    sorted_filename = next_filename
    log("Sorting #{fid filename} as #{fid sorted_filename}")
    worker_begin
    Thread.new do
      run_worker do
        run_child("sorter.rb",
                  "input_filename" => filename,
                  "sorted_filename" => sorted_filename)
        File.delete(filename)
        log("Sort complete for #{fid filename} as #{fid sorted_filename}")
        enqueue(@files_to_merge, sorted_filename)
      end
    end
  end

  # Merges two sorted files in a child process running merger.rb, then
  # deletes both inputs and queues the merged file for further merging.
  #
  # @param filename1 [String] first sorted input
  # @param filename2 [String] second sorted input
  # @param output_filename [String] path of the merged output
  # @return [Thread] the thread supervising the child process
  def merge_splits(filename1, filename2, output_filename)
    log("Merging (#{fid filename1}, #{fid filename2}) => #{fid output_filename}")
    worker_begin
    Thread.new do
      run_worker do
        run_child("merger.rb",
                  "filename1" => filename1,
                  "filename2" => filename2,
                  "output_filename" => output_filename)
        File.delete(filename1)
        File.delete(filename2)
        log("Merge complete for (#{fid filename1}, #{fid filename2}) => #{fid output_filename}")
        enqueue(@files_to_merge, output_filename)
      end
    end
  end

  # Registers a worker as running. Called in the launching thread, before the
  # worker thread starts, so the scheduler never under-counts.
  def worker_begin
    @lock.synchronize { @active_workers += 1 }
  end

  # Registers a worker as finished and wakes the scheduler.
  def worker_done
    @lock.synchronize do
      @active_workers -= 1
      @state_changed.signal
    end
  end

  # @return [String] a fresh temporary file name, "<filename>.<n>"
  def next_filename
    id = @lock.synchronize { @file_id_counter += 1 }
    "#{@filename}.#{id}"
  end

  # @return [String] short log label built from the last dot-separated part
  #   of the file name, e.g. "F-12"
  def fid(filename)
    "F-#{filename.split(".").last}"
  end

  # Formats a duration as "H:MM:SS".
  #
  # The duration is rounded to whole seconds once, before it is broken down,
  # so a value such as 59.6 carries into the minutes ("0:01:00").
  def seconds_to_pretty_time(num_seconds)
    minutes, seconds = num_seconds.round.divmod(60)
    hours, minutes = minutes.divmod(60)
    format("%d:%02d:%02d", hours, minutes, seconds)
  end

  # Prints a message prefixed with the time elapsed since sort! started
  # (0:00:00 when sort! has not been called). No-op unless `log_output`.
  def log(message)
    return unless @options[:log_output]

    elapsed = @start_time ? Time.now - @start_time : 0
    puts "#{seconds_to_pretty_time(elapsed)} #{message}"
  end

  private

  # Scheduler step; must be called with @lock held. Blocks until there is
  # something to do.
  #
  # @return [Array, nil] [:sort, file] or [:merge, file1, file2]; nil once a
  #   single merged file remains and no worker is running
  # @raise [StandardError] a recorded worker failure, once all workers stopped
  def wait_for_job
    limit = [@options[:num_processes].to_i, 1].max
    loop do
      if @failure
        # Stop handing out work, but let running workers finish first.
        raise @failure if @active_workers.zero?
      elsif @active_workers.zero? && @files_to_sort.empty? && @files_to_merge.size == 1
        return nil
      elsif @active_workers < limit
        return [:sort, @files_to_sort.shift] unless @files_to_sort.empty?
        return [:merge, @files_to_merge.shift, @files_to_merge.shift] if @files_to_merge.size >= 2
      end
      @state_changed.wait(@lock)
    end
  end

  # Runs a worker body, recording the first failure for the scheduler and
  # always marking the worker as done so the scheduler cannot hang.
  def run_worker
    yield
  rescue StandardError => e
    @lock.synchronize { @failure ||= e }
  ensure
    worker_done
  end

  # Appends a file to one of the work queues and wakes the scheduler.
  def enqueue(queue, filename)
    @lock.synchronize do
      queue << filename
      @state_changed.signal
    end
  end

  # Runs a helper script that lives next to this file, passing its parameters
  # through the environment, and waits for it to finish.
  #
  # @raise [RuntimeError] if the child exits unsuccessfully, so that its
  #   inputs are never deleted after a failed sort or merge
  def run_child(script, env)
    child_env = {
      "sort_column" => @options[:sort_column].to_s,
      "sort_as_int" => (@options[:parse_as] == :int ? "true" : "false"),
      "column_separator" => @options[:column_separator]
    }.merge(env)
    script_path = File.join(File.dirname(__FILE__), script)
    # Separate arguments avoid the shell, so paths with spaces are safe.
    pid = Process.spawn(child_env, "ruby", script_path)
    _, status = Process.waitpid2(pid)
    raise "#{script} failed (#{status})" unless status.success?
  end
end
161
+
162
+ #Run as
163
+ #FileSort.new("large-file-1000000.csv", {parse_as: :string, sort_column: 1}).sort!
@@ -0,0 +1,24 @@
1
# Writes a file of random integer columns, useful as test input for sorting.
class LargeFileGenerator
  DEFAULTS = {
    num_columns: 10,
    num_rows: 1e6.to_i,
    column_separator: "\t"
  }.freeze

  # @param filename [String] path of the file to (over)write
  # @param options [Hash] overrides for DEFAULTS
  def initialize(filename, options = {})
    @filename = filename
    @options = DEFAULTS.merge(options)
  end

  # Writes `num_rows` lines, each holding `num_columns` random integers in
  # 0...num_rows joined by `column_separator`.
  #
  # Counts are coerced with to_i so Float values such as 1e6 are accepted
  # (Float does not respond to #times).
  #
  # @return [Integer] the number of rows written
  def generate!
    row_count = @options[:num_rows].to_i
    column_count = @options[:num_columns].to_i
    separator = @options[:column_separator]

    File.open(@filename, "w") do |file|
      row_count.times do
        file.puts(Array.new(column_count) { rand(row_count) }.join(separator))
      end
    end
  end
end
22
+
23
+ #Run as
24
+ #LargeFileGenerator.new("large-file-1000000.csv", { num_rows: 1000000 }).generate!
data/lib/merger.rb ADDED
@@ -0,0 +1,57 @@
1
# Merges two files that are already sorted on one column into a single
# sorted file. All parameters are read from environment variables.
class Merger
  # Reads filename1, filename2, output_filename, sort_column, sort_as_int
  # and column_separator from ENV.
  def initialize
    @filename1 = ENV["filename1"]
    @filename2 = ENV["filename2"]
    @output_filename = ENV["output_filename"]
    @sort_column = ENV["sort_column"].to_i
    @sort_as_int = ENV["sort_as_int"] == "true"
    @column_separator = ENV["column_separator"]
  end

  # Streams both inputs and writes their rows to the output in key order.
  #
  # Rows are written back exactly as read (minus the line ending): the sort
  # key is parsed from a copy, so trailing empty columns and the original
  # text of the key column are preserved. On equal keys the row from
  # filename1 is written first, which keeps the merge stable.
  def merge!
    File.open(@output_filename, "w") do |outfile|
      File.open(@filename1) do |f1|
        File.open(@filename2) do |f2|
          left = read_record(f1)
          right = read_record(f2)

          while left && right
            if left.first <= right.first
              outfile.puts(left.last)
              left = read_record(f1)
            else
              outfile.puts(right.last)
              right = read_record(f2)
            end
          end

          # At most one side still has rows; copy them through.
          [[left, f1], [right, f2]].each do |record, stream|
            while record
              outfile.puts(record.last)
              record = read_record(stream)
            end
          end
        end
      end
    end
    nil
  end

  # Legacy helper, no longer used by merge! because rebuilding a row with
  # split/join drops trailing empty columns and rewrites the key column.
  #
  # @return [Array, nil] the row's columns (key converted to Integer in int
  #   mode), or nil at end of file
  def get_line(stream)
    line = stream.gets
    return nil if line.nil?

    line = line.chomp.split(@column_separator)
    line[@sort_column] = line[@sort_column].to_i if @sort_as_int
    line
  end

  # Legacy helper, see get_line. Writes the columns joined by the separator.
  def write_line(stream, line)
    stream.puts line.join(@column_separator)
  end

  private

  # @return [Array, nil] [sort key, row text without line ending], or nil at
  #   end of file
  def read_record(stream)
    line = stream.gets
    return nil if line.nil?

    text = line.chomp
    [sort_key(text), text]
  end

  # Extracts the comparison key of a row. A missing column counts as 0 in
  # int mode and as "" in string mode, so short rows cannot break comparison.
  def sort_key(text)
    field = text.split(@column_separator, -1)[@sort_column]
    @sort_as_int ? field.to_i : field.to_s
  end
end
56
+
57
# Script entry point: builds a Merger (configured from ENV, see Merger#initialize) and runs the merge.
+ Merger.new.merge!
data/lib/sorter.rb ADDED
@@ -0,0 +1,27 @@
1
# Sorts one file in memory on a single column and writes the result to a
# new file. All parameters are read from environment variables.
class Sorter
  # Reads input_filename, sorted_filename, sort_column, sort_as_int and
  # column_separator from ENV.
  def initialize
    @input_filename = ENV["input_filename"]
    @sorted_filename = ENV["sorted_filename"]
    @sort_column = ENV["sort_column"].to_i
    @sort_as_int = ENV["sort_as_int"] == "true"
    @column_separator = ENV["column_separator"]
  end

  # Loads every row, sorts by the key column and writes the rows out.
  #
  # Rows are written back exactly as read (minus the line ending): the sort
  # key is parsed from a copy, so trailing empty columns and the original
  # text of the key column are preserved. Rows with equal keys keep their
  # input order.
  def sort!
    rows = []
    File.open(@input_filename) do |infile|
      infile.each_line do |line|
        text = line.chomp
        # The running index breaks ties, which makes the sort stable.
        rows << [sort_key(text), rows.size, text]
      end
    end

    rows.sort_by! { |key, index, _text| [key, index] }

    File.open(@sorted_filename, "w") do |outfile|
      rows.each { |_key, _index, text| outfile.puts(text) }
    end
    nil
  end

  private

  # Extracts the comparison key of a row. A missing column counts as 0 in
  # int mode and as "" in string mode, so short rows cannot break comparison.
  def sort_key(text)
    field = text.split(@column_separator, -1)[@sort_column]
    @sort_as_int ? field.to_i : field.to_s
  end
end
26
+
27
# Script entry point: builds a Sorter (configured from ENV, see Sorter#initialize) and runs the sort.
+ Sorter.new.sort!
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: file_sort
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Tom O'Neill
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-27 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Sorts large files using merge sort on temporary files on the hard drive.
15
+ email: tom.oneill@live.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/file_sort.rb
21
+ - lib/large_file_generator.rb
22
+ - lib/merger.rb
23
+ - lib/sorter.rb
24
+ homepage: https://github.com/mopatches/file_sort_ruby_gem
25
+ licenses: []
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 1.8.24
45
+ signing_key:
46
+ specification_version: 3
47
+ summary: FileSort - Sorts files too large to fit in RAM
48
+ test_files: []