DanaDanger-data_transport 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/data_transport.rb +30 -0
- data/lib/data_transport/data_store.rb +31 -0
- data/lib/data_transport/data_store/active_record.rb +146 -0
- data/lib/data_transport/data_store/file.rb +117 -0
- data/lib/data_transport/map.rb +21 -0
- data/lib/data_transport/record/destination.rb +28 -0
- data/lib/data_transport/record/source.rb +24 -0
- metadata +59 -0
@@ -0,0 +1,30 @@
|
|
1
|
+
require "data_transport/map"
|
2
|
+
require "data_transport/data_store"
|
3
|
+
|
4
|
+
module DataTransport
  # Number of records a data store handles per batch when the caller does not
  # supply an explicit batch size.
  def self.default_batch_size
    1000
  end

  # Copies every record from +input+ to +output+, passing each one through
  # the mapping block on the way.
  #
  # input   - object responding to +each_record+, which yields one record at
  #           a time.
  # output  - object responding to +write_record+ and +finalize+. If it also
  #           responds to +reset+, that is called before the first write.
  # options - Hash; the only recognized key is :ignore_duplicates, which
  #           requires +output+ to be a DataStore::ActiveRecord whose
  #           connection is a MySQL adapter.
  # block   - handed to DataTransport::Map.new; it receives the source and
  #           destination records for each input record.
  #
  # Returns whatever +output.finalize+ returns.
  # Raises ArgumentError for unrecognized options, or when
  # :ignore_duplicates is requested for an unsupported output.
  def self.map(input, output, options = {}, &block)
    # Work on a copy so the caller's hash is not emptied by #delete.
    options = options.dup
    ignore_duplicates = options.delete(:ignore_duplicates)
    unless options.empty?
      # Hash has no #join; list the offending option names explicitly.
      raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
    end

    # If ignore_duplicates is true, make sure the output is a MySQL database.
    if ignore_duplicates
      unless output.is_a?(DataStore::ActiveRecord) && output.klass.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
        raise ArgumentError, "ignore_duplicates can only be used with an ActiveRecord data store connected to a MySQL database"
      end
    end

    # Run the transport. Not every data store has per-run state to reset, so
    # only call the hook where it exists.
    output.reset if output.respond_to?(:reset)
    output.ignore_duplicates = true if ignore_duplicates
    map = DataTransport::Map.new(&block)
    input.each_record do |record|
      output.write_record(map.map(record))
    end
    output.finalize
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require "data_transport/data_store/file"
|
2
|
+
require "data_transport/data_store/active_record"
|
3
|
+
|
4
|
+
module DataTransport
  # Abstract base class for data stores. Subclasses provide the actual
  # reading (+count+, +each_record+) and writing (+write_record+) behavior;
  # +reset+ and +finalize+ are optional hooks with do-nothing defaults.
  class DataStore
    # Returns the number of records in the store. Must be overridden.
    #
    # Note: NotImplementedError is a ScriptError, so a bare +rescue+ will not
    # catch it.
    def count
      raise NotImplementedError
    end

    # Yields each record in the store. Must be overridden.
    #
    # batch_size - records to fetch per batch; nil means "use
    #              DataTransport.default_batch_size" once passed through
    #              check_batch_size, matching the subclasses' signatures.
    def each_record(batch_size = nil)
      raise NotImplementedError
    end

    # Writes one record to the store. Must be overridden.
    def write_record(record)
      raise NotImplementedError
    end

    # Hook called after the last record has been written.
    def finalize
      # Do nothing by default.
    end

    # Hook called by DataTransport.map before the first record is written.
    # Stores that keep per-run state override this.
    def reset
      # Do nothing by default.
    end

    protected

    # Validates a batch size and returns the value to use.
    #
    # batch_size - Integer greater than zero, or nil/false to use
    #              DataTransport.default_batch_size.
    #
    # Raises TypeError if the value is not an Integer and RangeError if it is
    # less than one.
    def check_batch_size(batch_size)
      batch_size ||= DataTransport.default_batch_size
      raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
      raise(RangeError, "batch size must be greater than zero") if batch_size < 1
      batch_size
    end
  end
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
module DataTransport
  class DataStore
    # Data store that reads from and writes to a database table through an
    # ActiveRecord class. Writes are batched into multi-row INSERT statements
    # when the database reports a max_allowed_packet value; otherwise every
    # record is sent as its own statement.
    class ActiveRecord < DataStore
      # When true, rows are written with INSERT IGNORE.
      attr_accessor :ignore_duplicates

      # options - Hash with the following keys:
      #           :class      - an ActiveRecord::Base subclass, or
      #           :connection - connection spec passed to
      #                         establish_connection, together with
      #           :table_name - name of the table to use
      #           :conditions - conditions applied when reading and counting
      #           :truncate   - if true, the table is emptied before the
      #                         first write is sent (or at finalize)
      #
      # Raises ArgumentError for missing or unrecognized options and
      # TypeError if :class is not an ActiveRecord::Base descendant.
      def initialize(options = {})
        super()
        # Work on a copy so the caller's hash is not emptied by #delete.
        options = options.dup
        @class      = options.delete(:class)
        @connection = options.delete(:connection)
        @table_name = options.delete(:table_name)
        @conditions = options.delete(:conditions)
        @truncate   = options.delete(:truncate)

        # Make sure a class or connection and table name was provided.
        if @class.nil? && (@connection.nil? || @table_name.nil?)
          raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
        end
        raise(TypeError, "class must be a class") if @class && !@class.is_a?(Class)

        # If connection specs were provided instead of a class, make an
        # anonymous ActiveRecord subclass.
        unless @class
          @class = Class.new(::ActiveRecord::Base)
          @class.set_table_name @table_name
          @class.establish_connection @connection
        end

        # Class#< is true only for strict descendants, so ActiveRecord::Base
        # itself is rejected.
        unless @class < ::ActiveRecord::Base
          raise(TypeError, "class must descend from ActiveRecord::Base")
        end

        # Check for unknown options. Hash has no #join, so list the keys.
        unless options.empty?
          raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
        end

        @max_allowed_packet = detect_max_allowed_packet

        # Fetch column information, keyed by column name as a Symbol.
        @columns = {}
        @class.columns.each { |c| @columns[c.name.to_sym] = c }
      end

      # Returns the ActiveRecord class backing this store.
      def klass
        @class
      end

      # Returns the number of rows matching the configured conditions.
      def count
        @class.count(:conditions => @conditions)
      end

      # Yields every row matching the configured conditions as a Hash of
      # Symbol column name => type-cast value, fetching +batch_size+ rows per
      # query.
      #
      # The same Hash instance is cleared and reused for every row, so
      # callers that keep records must copy them.
      #
      # NOTE(review): the paging query has no ORDER BY, so batches rely on
      # the database returning rows in a stable order - confirm.
      def each_record(batch_size = nil)
        batch_size = check_batch_size(batch_size)

        conn = @class.connection
        column_names = conn.columns(@class.table_name).collect { |c| c.name.to_sym }

        base_query = "SELECT * FROM #{conn.quote_table_name(@class.table_name)}"
        @class.send(:add_conditions!, base_query, @conditions) unless @conditions.nil?

        offset = 0
        record = {}
        while true
          sql = base_query.dup
          conn.add_limit_offset!(sql, :limit => batch_size, :offset => offset)
          offset += batch_size
          rows = conn.select_rows(sql)
          break if rows.empty?
          rows.each do |row|
            record.clear
            column_names.each_with_index do |column_name, i|
              record[column_name] = @columns[column_name].type_cast(row[i])
            end
            yield record
          end
        end
      end

      # Queues +record+ (Hash of column name => value) for insertion.
      #
      # Without packet size information the record is sent immediately. With
      # it, records accumulate in one multi-row INSERT until adding another
      # would exceed the packet limit, at which point the pending statement
      # is sent and a new one is started.
      def write_record(record)
        conn = @class.connection
        keys = record.keys
        # Convert the record into a parenthesized list of quoted values.
        values = "(" + keys.collect { |k| conn.quote(record[k], @columns[k]) }.join(",") + ")"

        # A multi-row INSERT has a single column list, so a record with a
        # different key list cannot join the pending statement.
        flush_buffer if @sql_buffer && @buffer_keys != keys

        # Compare byte lengths of SQL text (plus the separating comma), not
        # the number of fields in the record.
        if @sql_buffer && @max_allowed_packet &&
           @sql_buffer.length + values.length + 1 > @max_allowed_packet
          flush_buffer
        end

        if @sql_buffer
          @sql_buffer << "," << values
        else
          @sql_buffer = start_insert_sql(record) << values
          @buffer_keys = keys
        end

        # We have no information on the database's maximum allowed packet
        # size, so it's safest to write the record immediately.
        flush_buffer if @max_allowed_packet.nil?
      end

      # Sends any pending INSERT statement, truncating the table first if
      # that was requested and has not happened yet. Safe to call when
      # nothing has been written.
      def finalize
        flush_buffer
      end

      # Clears per-run state: the ignore_duplicates flag and any unsent SQL.
      def reset
        self.ignore_duplicates = false
        @sql_buffer = nil
        @buffer_keys = nil
      end

      private

      # Asks the database how much data it can handle in one query, leaving
      # 512 bytes of headroom. This only works on MySQL; any failure yields
      # nil, which makes write_record send each record on its own.
      def detect_max_allowed_packet
        rows = @class.connection.select_all("SHOW VARIABLES LIKE 'max_allowed_packet'")
        rows.first["Value"].to_i - 512
      rescue StandardError
        nil
      end

      # Executes the pending INSERT, if any, and clears the buffer so the
      # same rows are never sent twice.
      def flush_buffer
        truncate_table if @truncate
        return if @sql_buffer.nil?
        @class.connection.execute(@sql_buffer)
        @sql_buffer = nil
        @buffer_keys = nil
      end

      # Empties the table once, falling back to delete_all where TRUNCATE
      # fails.
      def truncate_table
        conn = @class.connection
        begin
          conn.execute("TRUNCATE TABLE #{conn.quote_table_name(@class.table_name)}")
        rescue StandardError
          @class.delete_all
        end
        @truncate = false
      end

      # Builds the head of an INSERT statement whose column list follows the
      # key order of +record+. Returns a new, mutable String.
      def start_insert_sql(record)
        conn = @class.connection
        column_list = record.keys.collect { |k| conn.quote_column_name(k.to_s) }.join(",")
        "INSERT #{ignore_duplicates ? "IGNORE " : ""}INTO " \
          "#{conn.quote_table_name(@class.table_name)} " \
          "(#{column_list}) VALUES "
      end
    end
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
module DataTransport
  class DataStore
    # Data store backed by a delimited text file with one record per line.
    # An instance is used either for input or for output, never both; the
    # mode is fixed by the first read or write.
    class File < DataStore
      # :input or :output once the store has been used, nil before that.
      attr_reader :mode

      # options - Hash with the following keys:
      #           :path      - path of the file (required)
      #           :header    - if true, the first line holds field names
      #           :delimiter - field separator, "\t" by default; a String is
      #                        matched literally when reading
      #           :null      - text written in place of nil, "" by default
      #           :keys      - explicit list of field names, in file order
      #
      # Raises ArgumentError if :path is missing or an option is not
      # recognized.
      def initialize(options = {})
        super()

        # Work on a copy so the caller's hash is not emptied by #delete.
        options = options.dup
        @header    = options.delete(:header)
        @delimiter = options.delete(:delimiter) || "\t"
        @path      = options.delete(:path)
        @null      = options.delete(:null) || ""
        @keys      = options.delete(:keys)

        raise(ArgumentError, "missing required option `path'") if @path.nil?
        unless options.empty?
          # Hash has no #join; list the offending option names explicitly.
          raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
        end
      end

      # Returns the number of data lines in the file (the header line, if
      # any, is not counted). The result is cached after the first call and
      # the current read position is preserved.
      def count
        return @count if @count
        self.mode = :input
        line_count = 0
        rewind_and_restore do
          # gets, unlike readline, tolerates an empty file.
          io.gets if @header
          line_count += 1 while io.gets
        end
        @count = line_count
      end

      # Yields each line of the file as a Hash of key => String value.
      #
      # batch_size - validated for interface compatibility with other data
      #              stores; the file is always read line by line.
      #
      # Raises RuntimeError if a line does not have one field per key.
      def each_record(batch_size = nil)
        self.mode = :input

        check_batch_size(batch_size)

        io.rewind
        io.gets if @header
        while (line = io.gets)
          line.chomp!
          # A limit of -1 keeps trailing empty fields, so rows whose last
          # values are empty (as write_record emits for nil) still match.
          values = line.split(delimiter_pattern, -1)
          # An empty line is a single empty field in a one-column file.
          values << "" if values.empty? && keys.length == 1
          # Surplus trailing empty fields (e.g. a trailing delimiter) have
          # always been ignored; keep tolerating them.
          values.pop while values.length > keys.length && values.last == ""
          if keys.length != values.length
            raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
          end
          record = {}
          keys.each_with_index { |key, i| record[key] = values[i] }
          yield record
        end
      end

      # Appends +record+ (Hash of key => value) to the file as one line.
      # The header line, if enabled, is written before the first record.
      def write_record(record)
        self.mode = :output
        # If no key order was ever specified, make one up.
        @keys ||= record.keys.sort { |a, b| a.to_s <=> b.to_s }
        # Write the header if this is the first record.
        if @header && io.pos == 0
          io.puts(keys.join(@delimiter))
        end
        # Write the values in a predictable order.
        values = keys.collect do |k|
          record[k].nil? ? @null : record[k]
        end
        io.puts(values.join(@delimiter))
      end

      # Flushes buffered output so everything written so far is on disk.
      # The file handle stays open so later writes continue where this run
      # left off. Returns nil.
      def finalize
        @io.flush if @io && mode == :output
        nil
      end

      # Called by DataTransport.map before writing; a file store has no
      # per-run state to clear.
      def reset
        # Nothing to do.
      end

      private

      # Fixes the store's mode on first use.
      #
      # Raises RuntimeError on an attempt to switch modes and ArgumentError
      # for a mode other than :input or :output.
      def mode=(new_mode)
        if !@mode.nil? && @mode != new_mode
          raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
        end
        unless [:input, :output].include?(new_mode)
          raise ArgumentError, "unknown mode `#{new_mode}'"
        end
        @mode = new_mode
      end

      # Lazily opens the file, for writing in output mode and for reading
      # otherwise.
      def io
        @io ||= ::File.open(@path, mode == :output ? "w" : "r")
      end

      # Pattern used to split lines. A String delimiter is escaped so that
      # characters such as "|" or "." are matched literally; a Regexp is
      # used as given.
      def delimiter_pattern
        @delimiter_pattern ||=
          @delimiter.is_a?(Regexp) ? @delimiter : Regexp.new(Regexp.escape(@delimiter))
      end

      # Returns the field names. In input mode they are derived from the
      # first line of the file when not given explicitly: the downcased
      # header fields, or column00, column01, ... when there is no header.
      def keys
        return @keys if @keys
        return [] if mode == :output
        line = rewind_and_restore { io.readline }
        line.chomp!
        fields = line.split(delimiter_pattern)
        if @header
          @keys = fields.collect { |hdr| hdr.downcase.to_sym }
        else
          @keys = (0...fields.length).collect do |i|
            sprintf("column%02d", i).to_sym
          end
        end
      end

      # Runs the block with the file rewound to the start, then restores the
      # previous position. Returns the block's result.
      def rewind_and_restore
        pos = io.pos
        io.rewind
        result = yield
        io.seek(pos)
        result
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "data_transport/record/source"
|
2
|
+
require "data_transport/record/destination"
|
3
|
+
|
4
|
+
module DataTransport
  # Applies a user-supplied mapping block to records, one at a time. The
  # block receives a source object wrapping the input record and a
  # destination object that collects the output fields.
  class Map
    attr_reader :source, :destination

    # block - called as block.call(source, destination) for every record
    #         passed to #map.
    #
    # Raises ArgumentError if no block is given, rather than failing on the
    # first call to #map.
    def initialize(&block)
      raise ArgumentError, "no block given" if block.nil?
      @block = block
      @source = DataTransport::Record::Source.new
      @destination = DataTransport::Record::Destination.new
    end

    # Runs the mapping block against +record+ and returns the destination
    # record it produced.
    #
    # The destination is reset before every call and its record is what is
    # returned, so callers that need to keep a result should copy it.
    def map(record)
      @source.record = record
      @destination.reset!
      @block.call(@source, @destination)
      @destination.record
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module DataTransport
  module Record
    # Collects the fields of an output record. Any call of the form
    # +destination.name = value+ stores +value+ under the key :name.
    class Destination
      # The Hash of fields assigned so far.
      attr_reader :record

      def initialize
        @record = {}
      end

      # Empties the record in place; the same Hash instance is reused.
      def reset!
        @record.clear
      end

      # Treats any unknown method whose name ends in "=" as a field
      # assignment. Everything else falls through to the default behavior
      # (NoMethodError).
      #
      # Raises ArgumentError unless exactly one value is given.
      def method_missing(name, *args)
        name_s = name.to_s
        if name_s[-1, 1] == "="
          unless args.length == 1
            raise ArgumentError, "wrong number of arguments (#{args.length} for 1)"
          end
          name_s.chop!
          @record[name_s.to_sym] = args.first
        else
          super
        end
      end

      # Keeps respond_to? in agreement with method_missing: every setter
      # name is accepted.
      def respond_to_missing?(name, include_private = false)
        name.to_s[-1, 1] == "=" || super
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module DataTransport
  module Record
    # Read-only view of an input record. Each key of the wrapped Hash can be
    # read as a method call, e.g. +source.name+ returns +record[:name]+.
    class Source
      # Sets the Hash that subsequent field lookups read from.
      attr_writer :record

      # Explicit reader for the :id field, routed through the same lookup as
      # every other field so that an inherited #id can never shadow it.
      def id
        method_missing :id
      end

      # Returns the value stored under +name+ when the current record has
      # that key. Anything else, including any call made before a record has
      # been assigned, falls through to the default behavior (NoMethodError
      # naming the method that was actually called).
      #
      # Raises ArgumentError if arguments are passed to a field reader.
      def method_missing(name, *args)
        if @record && @record.has_key?(name)
          unless args.empty?
            raise ArgumentError, "wrong number of arguments (#{args.length} for 0)"
          end
          @record[name]
        else
          super
        end
      end

      # Keeps respond_to? in agreement with method_missing: a name is
      # supported exactly when the current record has it as a key.
      def respond_to_missing?(name, include_private = false)
        (@record && @record.has_key?(name)) || super
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: DanaDanger-data_transport
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dana Danger
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-04-08 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description:
|
17
|
+
email:
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- lib/data_transport.rb
|
26
|
+
- lib/data_transport/map.rb
|
27
|
+
- lib/data_transport/data_store.rb
|
28
|
+
- lib/data_transport/data_store/active_record.rb
|
29
|
+
- lib/data_transport/data_store/file.rb
|
30
|
+
- lib/data_transport/record/destination.rb
|
31
|
+
- lib/data_transport/record/source.rb
|
32
|
+
has_rdoc: false
|
33
|
+
homepage: http://github.com/DanaDanger/data_transport
|
34
|
+
post_install_message:
|
35
|
+
rdoc_options: []
|
36
|
+
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "0"
|
50
|
+
version:
|
51
|
+
requirements: []
|
52
|
+
|
53
|
+
rubyforge_project:
|
54
|
+
rubygems_version: 1.2.0
|
55
|
+
signing_key:
|
56
|
+
specification_version: 2
|
57
|
+
summary: A gem for importing and exporting large quantities of data.
|
58
|
+
test_files: []
|
59
|
+
|