file_sort 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/file_sort.rb +163 -0
- data/lib/large_file_generator.rb +24 -0
- data/lib/merger.rb +57 -0
- data/lib/sorter.rb +27 -0
- metadata +48 -0
data/lib/file_sort.rb
ADDED
@@ -0,0 +1,163 @@
|
|
1
|
+
require "thread" # Mutex / ConditionVariable on Rubies that do not preload them

# Sorts a delimited text file that may be too large to sort in memory.
#
# The input is cut into splits of at most :lines_per_split lines. Each split is
# sorted by a child `ruby sorter.rb` process and sorted files are merged
# pairwise by child `ruby merger.rb` processes (both scripts are looked up in
# the directory of this file) until a single sorted file remains.
#
# All bookkeeping shared between threads (the two work queues, the worker
# count, the file-id counter and the first recorded failure) is guarded by one
# mutex; the scheduler waits on a condition variable that is signalled whenever
# that state changes.
class FileSort

  DEFAULTS = {
    sort_column: 0,
    column_separator: "\t",
    num_processes: 3,
    parse_as: :int, # other options: :string
    lines_per_split: 1e6,
    replace_original: true,
    log_output: true
  }

  # @param filename [String] path of the file to sort
  # @param options [Hash] overrides for DEFAULTS
  # @raise [ArgumentError] if :num_processes is below 1 (no work could ever be scheduled)
  def initialize(filename, options = {})
    @filename = filename
    @options = DEFAULTS.merge(options)
    @options[:lines_per_split] = @options[:lines_per_split].to_i
    raise ArgumentError, "num_processes must be at least 1" if @options[:num_processes] < 1

    @file_id_counter = 0
    @files_to_sort = []
    @files_to_merge = []
    @active_workers = 0
    @start_time = nil
    @failure = nil
    @state_lock = Mutex.new
    @state_changed = ConditionVariable.new
  end

  # Runs the whole split / sort / merge pipeline and blocks until it finishes.
  # The result is written to "<filename>.sorted" and, when :replace_original is
  # set, moved over the input file.
  #
  # @raise [StandardError] the first error raised by any worker (for example a
  #   missing input file or a child process that exited unsuccessfully).
  #   Intermediate files are left on disk in that case.
  def sort!
    @start_time = Time.now
    log("Sorting #{@filename} with up to #{@options[:num_processes]} processes.")
    make_splits
    run_scheduler
    raise @failure if @failure

    final_name = "#{@filename}.sorted"
    File.rename(@files_to_merge.first, final_name)
    if @options[:replace_original]
      File.delete(@filename)
      File.rename(final_name, @filename)
    end
    log("#{@filename} sort complete.")
  end

  # Starts a background thread that cuts the input file into split files and
  # queues each finished split for sorting. The splitter counts as one worker.
  #
  # @return [Thread] the splitter thread
  def make_splits
    log("Splitting #{@filename} every #{@options[:lines_per_split]} lines")
    worker_begin
    run_worker { write_splits }
  end

  # Sorts one split in a child process, then queues the sorted file for merging
  # and deletes the unsorted split.
  #
  # @param filename [String] path of the split to sort
  # @return [Thread] the thread supervising the child process
  def sort_split(filename)
    sorted_filename = next_filename
    log("Sorting #{fid(filename)} as #{fid(sorted_filename)}")
    worker_begin
    run_worker do
      run_script("sorter.rb", { "input_filename" => filename, "sorted_filename" => sorted_filename })
      File.delete(filename)
      log("Sort complete for #{fid(filename)} as #{fid(sorted_filename)}")
      enqueue(@files_to_merge, sorted_filename)
    end
  end

  # Merges two sorted files in a child process, then queues the merged file for
  # further merging and deletes both inputs.
  #
  # @param filename1 [String] first sorted input
  # @param filename2 [String] second sorted input
  # @param output_filename [String] path the merged result is written to
  # @return [Thread] the thread supervising the child process
  def merge_splits(filename1, filename2, output_filename)
    log("Merging (#{fid(filename1)}, #{fid(filename2)}) => #{fid(output_filename)}")
    worker_begin
    run_worker do
      run_script("merger.rb", {
        "filename1" => filename1,
        "filename2" => filename2,
        "output_filename" => output_filename
      })
      File.delete(filename1)
      File.delete(filename2)
      log("Merge complete for (#{fid(filename1)}, #{fid(filename2)}) => #{fid(output_filename)}")
      enqueue(@files_to_merge, output_filename)
    end
  end

  # Registers one more running worker.
  def worker_begin
    @state_lock.synchronize { @active_workers += 1 }
  end

  # Registers that a worker finished and wakes the scheduler.
  def worker_done
    @state_lock.synchronize do
      @active_workers -= 1
      @state_changed.broadcast
    end
  end

  # @return [String] a fresh intermediate file name, "<filename>.<n>"
  def next_filename
    # Called from the splitter thread and the scheduler concurrently, so the
    # counter is bumped under the lock to keep the names unique.
    id = @state_lock.synchronize { @file_id_counter += 1 }
    "#{@filename}.#{id}"
  end

  # @param filename [String] an intermediate file name
  # @return [String] short log label built from the text after the last "."
  def fid(filename)
    "F-#{filename.split(".").last}"
  end

  # Formats a duration as "H:MM:SS".
  #
  # @param num_seconds [Numeric] duration in seconds
  # @return [String]
  def seconds_to_pretty_time(num_seconds)
    # Round once, then split: rounding only the seconds part made 59.6s print
    # as "0:00:00" because the minute was computed from the unrounded value.
    total = num_seconds.round
    hours, remainder = total.divmod(60**2)
    minutes, seconds = remainder.divmod(60)
    format("%d:%02d:%02d", hours, minutes, seconds)
  end

  # Prints a message prefixed with the time elapsed since sort! started.
  # Does nothing when :log_output is disabled.
  def log(message)
    return unless @options[:log_output]
    # @start_time is only set by sort!; fall back to zero when a step is run on its own.
    elapsed = @start_time ? Time.now - @start_time : 0
    puts "#{seconds_to_pretty_time(elapsed)} #{message}"
  end

  private

  # Hands out work until everything is merged into one file or a worker failed.
  def run_scheduler
    while (task = next_task)
      kind, *files = task
      if kind == :sort
        sort_split(*files)
      else
        merge_splits(*files, next_filename)
      end
    end
  end

  # Blocks until there is something to start and returns it as
  # [:sort, file] or [:merge, file1, file2]; returns nil when the run is over.
  def next_task
    @state_lock.synchronize do
      loop do
        if @failure
          # Stop scheduling, but let workers that are already running finish.
          return nil if @active_workers == 0
        elsif @active_workers == 0 && @files_to_sort.empty? && @files_to_merge.size == 1
          return nil
        elsif @active_workers < @options[:num_processes]
          return [:sort, @files_to_sort.shift] unless @files_to_sort.empty?
          return [:merge, @files_to_merge.shift, @files_to_merge.shift] if @files_to_merge.size >= 2
        end
        # The check above and this wait happen under the same lock, so a
        # notification cannot slip in between them and be lost.
        @state_changed.wait(@state_lock)
      end
    end
  end

  # Runs the block in a new thread. Any error is recorded as the run's failure
  # and the worker is always signed off, so the scheduler can never wait on a
  # worker that died.
  def run_worker(&work)
    Thread.new do
      begin
        work.call
      rescue StandardError => e
        @state_lock.synchronize { @failure ||= e }
      ensure
        worker_done
      end
    end
  end

  # Appends a file to one of the work queues and wakes the scheduler.
  def enqueue(queue, filename)
    @state_lock.synchronize do
      queue << filename
      @state_changed.broadcast
    end
  end

  # Body of the splitter thread: copies the input into consecutive split files.
  def write_splits
    outfile = nil
    File.open(@filename) do |infile|
      output_filename = next_filename
      outfile = File.open(output_filename, "w")
      line_counter = 0
      infile.each_line do |line|
        if line_counter >= @options[:lines_per_split]
          outfile.close
          log("Split written: #{fid(output_filename)}")
          enqueue(@files_to_sort, output_filename)
          output_filename = next_filename
          outfile = File.open(output_filename, "w")
          line_counter = 0
        end
        outfile.print(line)
        line_counter += 1
      end
      outfile.close
      log("Split written: #{fid(output_filename)} (final split)")
      enqueue(@files_to_sort, output_filename)
    end
  ensure
    outfile.close if outfile && !outfile.closed?
  end

  # Runs one of the companion scripts in a child ruby process and waits for it.
  # The sort settings are passed through environment variables.
  #
  # @param script [String] file name of the script, relative to this file's directory
  # @param env [Hash{String=>String}] script-specific environment variables
  # @raise [RuntimeError] if the child does not exit successfully
  def run_script(script, env)
    env = env.merge(
      "sort_column" => @options[:sort_column].to_s,
      "sort_as_int" => (@options[:parse_as] == :int ? "true" : "false"),
      "column_separator" => @options[:column_separator]
    )
    # Program and argument are passed separately so that a gem path containing
    # spaces is not split into several arguments.
    pid = Process.spawn(env, "ruby", File.join(File.dirname(__FILE__), script))
    _, status = Process.wait2(pid)
    raise "#{script} failed (#{status.inspect})" unless status.success?
  end

end
|
161
|
+
|
162
|
+
#Run as
|
163
|
+
#FileSort.new("large-file-1000000.csv", {parse_as: :string, sort_column: 1}).sort!
|
data/lib/large_file_generator.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# Writes a file of random non-negative integers, one row per line, with the
# columns joined by a separator. Every value is drawn with rand(num_rows).
class LargeFileGenerator

  DEFAULTS = {
    num_columns: 10,
    num_rows: 1e6.to_i,
    column_separator: "\t"
  }

  # @param filename [String] path of the file to create (overwritten if present)
  # @param options [Hash] overrides for DEFAULTS
  def initialize(filename, options = {})
    @filename = filename
    @options = DEFAULTS.merge(options)
    # Accept Float shorthand such as 1e6: Float has no #times, so an
    # un-coerced value would make generate! fail.
    @options[:num_rows] = @options[:num_rows].to_i
    @options[:num_columns] = @options[:num_columns].to_i
  end

  # Writes :num_rows lines of :num_columns random integers each.
  #
  # @return [Integer] the number of rows written
  def generate!
    num_rows = @options[:num_rows]
    num_columns = @options[:num_columns]
    separator = @options[:column_separator]
    File.open(@filename, "w") do |f|
      num_rows.times do
        f.puts((0...num_columns).map { rand(num_rows) }.join(separator))
      end
    end
  end
end
|
22
|
+
|
23
|
+
#Run as
|
24
|
+
#LargeFileGenerator.new("large-file-1000000.csv", { num_rows: 1000000 }).generate!
|
data/lib/merger.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# Merges two files, each already sorted on the configured column, into one
# sorted output file. Lines are written back with their original field text.
class Merger
  # Reads the configuration from environment variables:
  #   filename1, filename2 - the two sorted input files
  #   output_filename      - file the merged result is written to
  #   sort_column          - zero-based index of the column to compare
  #   sort_as_int          - "true" to compare the column as integers,
  #                          anything else to compare it as text
  #   column_separator     - string that separates columns
  def initialize
    @filename1 = ENV["filename1"]
    @filename2 = ENV["filename2"]
    @output_filename = ENV["output_filename"]
    @sort_column = ENV["sort_column"].to_i
    @sort_as_int = ENV["sort_as_int"] == "true"
    @column_separator = ENV["column_separator"]
  end

  # Performs the merge. The block forms of File.open close all three files
  # even when reading or writing raises.
  def merge!
    File.open(@output_filename, "w") do |outfile|
      File.open(@filename1) do |f1|
        File.open(@filename2) do |f2|
          merge_streams(f1, f2, outfile)
        end
      end
    end
  end

  # Reads the next line and splits it into its fields.
  #
  # @param stream [IO] stream to read from
  # @return [Array<String>, nil] the fields, untouched, or nil at end of file
  def get_line(stream)
    line = stream.gets
    return nil if line.nil?
    # Limit -1 keeps trailing empty fields, so joining the fields again
    # reproduces the line instead of silently dropping empty columns.
    line.chomp.split(@column_separator, -1)
  end

  # Writes the fields as one line, joined by the column separator.
  #
  # @param stream [IO] stream to write to
  # @param line [Array<String>] fields as returned by get_line
  def write_line(stream, line)
    stream.puts line.join(@column_separator)
  end

  private

  # Two-way merge of the already opened inputs into outfile.
  def merge_streams(f1, f2, outfile)
    left = next_record(f1)
    right = next_record(f2)
    while left && right
      # Strict "<" keeps the existing tie-breaking: on equal keys the line
      # from the second file is written first.
      if left.first < right.first
        write_line(outfile, left.last)
        left = next_record(f1)
      else
        write_line(outfile, right.last)
        right = next_record(f2)
      end
    end

    while left
      write_line(outfile, left.last)
      left = next_record(f1)
    end

    while right
      write_line(outfile, right.last)
      right = next_record(f2)
    end
  end

  # @return [Array(Object, Array<String>), nil] [comparison key, fields], or nil at end of file
  def next_record(stream)
    fields = get_line(stream)
    fields && [sort_key(fields), fields]
  end

  # The key is computed next to the fields rather than stored into them, so an
  # integer sort no longer rewrites "007" as "7" or turns a blank line into "0".
  # A line without the sort column compares as 0 (integer mode) or "" (text mode).
  def sort_key(fields)
    value = fields[@sort_column]
    @sort_as_int ? value.to_i : value.to_s
  end
end
|
56
|
+
|
57
|
+
# Run the merge only when this file is executed as a script (as FileSort does
# with `ruby merger.rb`). The file sits on the gem's require path, and merely
# requiring it must not start a merge with an empty configuration.
Merger.new.merge! if __FILE__ == $PROGRAM_NAME
|
data/lib/sorter.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# Sorts one file in memory on a single column and writes the result to a new
# file. Lines are written back with their original field text.
class Sorter

  # Reads the configuration from environment variables:
  #   input_filename   - file to sort
  #   sorted_filename  - file the sorted lines are written to
  #   sort_column      - zero-based index of the column to sort on
  #   sort_as_int      - "true" to compare the column as integers,
  #                      anything else to compare it as text
  #   column_separator - string that separates columns
  def initialize
    @input_filename = ENV["input_filename"]
    @sorted_filename = ENV["sorted_filename"]
    @sort_column = ENV["sort_column"].to_i
    @sort_as_int = ENV["sort_as_int"] == "true"
    @column_separator = ENV["column_separator"]
  end

  # Loads every line of the input, sorts by the configured column and writes
  # the sorted lines to the output file.
  def sort!
    # Limit -1 keeps trailing empty fields, so joining the fields again
    # reproduces each line instead of silently dropping empty columns.
    rows = File.foreach(@input_filename).map do |line|
      line.chomp.split(@column_separator, -1)
    end
    rows = rows.sort_by { |fields| sort_key(fields) }
    File.open(@sorted_filename, "w") do |outfile|
      rows.each { |fields| outfile.puts(fields.join(@column_separator)) }
    end
  end

  private

  # The key is computed next to the fields rather than stored into them, so an
  # integer sort no longer rewrites "007" as "7" or turns a blank line into "0".
  # A line without the sort column sorts as 0 (integer mode) or "" (text mode).
  def sort_key(fields)
    value = fields[@sort_column]
    @sort_as_int ? value.to_i : value.to_s
  end
end
|
26
|
+
|
27
|
+
# Run the sort only when this file is executed as a script (as FileSort does
# with `ruby sorter.rb`). The file sits on the gem's require path, and merely
# requiring it must not start a sort with an empty configuration.
Sorter.new.sort! if __FILE__ == $PROGRAM_NAME
|
metadata
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: file_sort
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Tom O'Neill
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-01-27 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Sorts large files using merge sort on temporary files on the hard drive.
|
15
|
+
email: tom.oneill@live.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/file_sort.rb
|
21
|
+
- lib/large_file_generator.rb
|
22
|
+
- lib/merger.rb
|
23
|
+
- lib/sorter.rb
|
24
|
+
homepage: https://github.com/mopatches/file_sort_ruby_gem
|
25
|
+
licenses: []
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ! '>='
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubyforge_project:
|
44
|
+
rubygems_version: 1.8.24
|
45
|
+
signing_key:
|
46
|
+
specification_version: 3
|
47
|
+
summary: FileSort - Sorts files too large to fit in RAM
|
48
|
+
test_files: []
|