DanaDanger-data_transport 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/data_transport.rb +30 -0
- data/lib/data_transport/data_store.rb +31 -0
- data/lib/data_transport/data_store/active_record.rb +146 -0
- data/lib/data_transport/data_store/file.rb +117 -0
- data/lib/data_transport/map.rb +21 -0
- data/lib/data_transport/record/destination.rb +28 -0
- data/lib/data_transport/record/source.rb +24 -0
- metadata +59 -0
data/lib/data_transport.rb
ADDED
@@ -0,0 +1,30 @@
```ruby
require "data_transport/map"
require "data_transport/data_store"

module DataTransport
  def self.default_batch_size
    1000
  end

  def self.map(input, output, options = {}, &block)
    # Extract options.
    ignore_duplicates = options.delete(:ignore_duplicates)
    unless options.empty?
      raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
    end
    # If ignore_duplicates is true, make sure the output is a MySQL database.
    if ignore_duplicates
      unless output.is_a?(DataStore::ActiveRecord) && output.klass.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
        raise ArgumentError, "ignore_duplicates can only be used with an ActiveRecord data store connected to a MySQL database"
      end
    end
    # Run the transport.
    output.reset
    output.ignore_duplicates = true if ignore_duplicates
    map = DataTransport::Map.new(&block)
    input.each_record do |record|
      output.write_record(map.map(record))
    end
    output.finalize
  end
end
```
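For orientation, a minimal sketch of how this entry point is typically driven, assuming a hypothetical `Customer` model, connection settings, and a tab-delimited `customers.txt` with `name` and `email` header columns (none of these names come from the package itself):

```ruby
require "rubygems"
require "active_record"
require "data_transport"

# Hypothetical connection and model; any ActiveRecord::Base subclass works.
ActiveRecord::Base.establish_connection(:adapter  => "mysql",
                                        :database => "example",
                                        :username => "root")
class Customer < ActiveRecord::Base
end

# Copy tab-delimited rows from customers.txt into the customers table.
input  = DataTransport::DataStore::File.new(:path => "customers.txt", :header => true)
output = DataTransport::DataStore::ActiveRecord.new(:class => Customer, :truncate => true)

DataTransport.map(input, output) do |src, dst|
  dst.name  = src.name.strip
  dst.email = src.email.downcase
end
```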
data/lib/data_transport/data_store.rb
ADDED
@@ -0,0 +1,31 @@
```ruby
require "data_transport/data_store/file"
require "data_transport/data_store/active_record"

module DataTransport
  class DataStore
    def count
      raise NotImplementedError
    end

    def each_record(batch_size = 1000)
      raise NotImplementedError
    end

    def write_record(record)
      raise NotImplementedError
    end

    def finalize
      # Do nothing by default.
    end

    protected

    def check_batch_size(batch_size)
      batch_size ||= DataTransport.default_batch_size
      raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
      raise(RangeError, "batch size must be greater than zero") if batch_size < 1
      batch_size
    end
  end
end
```
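A hedged sketch of what a custom store could look like against this abstract interface; the `Memory` class below is purely hypothetical and not part of the gem:

```ruby
module DataTransport
  class DataStore
    # Hypothetical store that keeps records in memory; it fills in the three
    # methods the base class leaves as NotImplementedError.
    class Memory < DataStore
      attr_reader :records

      def initialize
        super()
        @records = []
      end

      def count
        @records.length
      end

      def each_record(batch_size = nil)
        # check_batch_size applies the gem-wide default and validates the value.
        batch_size = check_batch_size(batch_size)
        @records.each {|r| yield r}
      end

      def write_record(record)
        @records << record
      end
    end
  end
end
```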
data/lib/data_transport/data_store/active_record.rb
ADDED
@@ -0,0 +1,146 @@
```ruby
module DataTransport
  class DataStore
    class ActiveRecord < DataStore
      attr_accessor :ignore_duplicates

      def initialize(options = {})
        super()
        # Extract options.
        @class = options.delete(:class)
        @connection = options.delete(:connection)
        @table_name = options.delete(:table_name)
        @conditions = options.delete(:conditions)
        @truncate = options.delete(:truncate)
        # Make sure a class or connection and table name was provided.
        if @class.nil? && (@connection.nil? || @table_name.nil?)
          raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
        end
        raise(TypeError, "class must be a class") if @class && !@class.is_a?(Class)
        # If connection specs were provided instead of a class, make an
        # anonymous ActiveRecord subclass.
        unless @class
          @class = Class.new(::ActiveRecord::Base)
          @class.set_table_name @table_name
          @class.establish_connection @connection
        end
        # Make sure the class descends from ActiveRecord::Base.
        klass = @class.superclass
        is_active_record = false
        while klass
          if klass == ::ActiveRecord::Base
            is_active_record = true
            break
          end
          klass = klass.superclass
        end
        raise(TypeError, "class must descend from ActiveRecord::Base") unless is_active_record
        # Check for unknown options.
        unless options.empty?
          raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
        end
        # Ask the database how much data it can handle in one query. This only
        # works on MySQL.
        begin
          rows = @class.connection.select_all("SHOW VARIABLES LIKE 'max_allowed_packet'")
          @max_allowed_packet = rows.first["Value"].to_i - 512
        rescue
        end
        # Fetch column information
        @columns = {}
        @class.columns.each {|c| @columns[c.name.to_sym] = c}
      end

      def klass
        @class
      end

      def count
        @class.count(:conditions => @conditions)
      end

      def each_record(batch_size = nil)
        batch_size = check_batch_size(batch_size)

        conn = @class.connection
        column_names = conn.columns(@class.table_name).collect {|c| c.name}

        offset = 0
        record = {}
        base_query = "SELECT * FROM #{conn.quote_table_name(@class.table_name)}"
        @class.send(:add_conditions!, base_query, @conditions) unless @conditions.nil?
        while true
          sql = base_query.dup
          conn.add_limit_offset!(sql, :limit => batch_size, :offset => offset)
          offset += batch_size
          rows = conn.select_rows(sql)
          break if rows.empty?
          rows.each do |row|
            record.clear
            column_names.each_with_index do |column_name, i|
              column_name = column_name.to_sym
              record[column_name] = @columns[column_name].type_cast(row[i])
            end
            yield record
          end
        end
      end

      def write_record(record)
        conn = @class.connection
        # If no SQL has been produced yet, start an INSERT statement.
        @sql_buffer ||= start_insert_sql(record)
        # Convert the record into a string of quoted values.
        values = []
        record.each {|k, v| values << conn.quote(v, @columns[k])}
        values = "(#{values.join ","}),"
        # Write the record.
        if @max_allowed_packet.nil?
          # We have no information on the database's maximum allowed packet
          # size, so it's safest to write the record immediately.
          @sql_buffer << values
          finalize
        elsif @sql_buffer.length + record.length > @max_allowed_packet
          # Appending this record to the SQL buffer will exceed the maximum
          # allowed packet size. Send the buffer to the database and start a
          # new statement with this record.
          finalize
          @sql_buffer = start_insert_sql
          @sql_buffer << values
        else
          # This record will not cause the SQL buffer to exceed the maximum
          # allowed packet size. Append it to the SQL buffer.
          @sql_buffer << values
        end
      end

      def finalize
        if @truncate
          conn = @class.connection
          begin
            conn.execute("TRUNCATE TABLE #{conn.quote_table_name(@class.table_name)}")
          rescue
            @class.delete_all
          end
          @truncate = false
        end
        if @sql_buffer[-1,1] == ","
          @sql_buffer.chop!
          @class.connection.execute(@sql_buffer)
        end
      end

      def reset
        self.ignore_duplicates = false
        @sql_buffer = nil
      end

      private

      def start_insert_sql(record)
        "INSERT #{ignore_duplicates ? "IGNORE " : " "}INTO " +
          "#{@class.connection.quote_table_name(@class.table_name)} " +
          "(#{record.keys.join ","}) VALUES "
      end
    end
  end
end
```
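As the initializer above shows, the store can be built from an existing model class or from a connection spec plus table name (which yields an anonymous ActiveRecord::Base subclass). A sketch of both forms, with hypothetical model, condition, and connection details:

```ruby
# Backed by an existing model class; :conditions and :truncate are optional.
store = DataTransport::DataStore::ActiveRecord.new(
  :class      => Customer,          # hypothetical model
  :conditions => "active = 1",
  :truncate   => false
)

# Or backed by an anonymous class built from a connection spec and table name.
store = DataTransport::DataStore::ActiveRecord.new(
  :connection => {:adapter => "mysql", :host => "localhost",
                  :username => "root", :database => "legacy"},
  :table_name => "customers"
)
```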
data/lib/data_transport/data_store/file.rb
ADDED
@@ -0,0 +1,117 @@
```ruby
module DataTransport
  class DataStore
    class File < DataStore
      attr_reader :mode

      def initialize(options = {})
        super()

        @header = options.delete(:header)
        @delimiter = options.delete(:delimiter) || "\t"
        @path = options.delete(:path)
        @null = options.delete(:null) || ""
        @keys = options.delete(:keys)

        raise(ArgumentError, "missing required option `path'") if @path.nil?
        unless options.empty?
          raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
        end
      end

      def count
        return @count if @count
        self.mode = :input
        line_count = 0
        rewind_and_restore do
          io.readline if @header
          until io.eof?
            io.gets
            line_count += 1
          end
        end
        @count = line_count
      end

      def each_record(batch_size = nil)
        self.mode = :input

        batch_size = check_batch_size(batch_size)

        io.rewind
        io.readline if @header
        until io.eof?
          line = io.gets || break
          line.chomp!
          values = line.split(/#{@delimiter}/)
          if keys.length != values.length
            raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
          end
          record = {}
          keys.length.times {|i| record[keys[i]] = values[i]}
          yield record
        end
      end

      def write_record(record)
        self.mode = :output
        # If no key order was ever specified, make one up.
        @keys ||= record.keys.sort {|a,b| a.to_s <=> b.to_s}
        # Write the header if this is the first record.
        if @header && io.pos == 0
          io.puts(keys.join(@delimiter))
        end
        # Write the values in a predictable order.
        values = keys.collect do |k|
          record[k].nil?? @null : record[k]
        end
        io.puts(values.join(@delimiter))
      end

      private

      def mode=(new_mode)
        if !@mode.nil? && @mode != new_mode
          raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
        end
        unless [:input, :output].include?(new_mode)
          raise ArgumentError, "unknown mode `#{new_mode}'"
        end
        @mode = new_mode
      end

      def io
        return @io if @io
        if mode == :output
          @io = ::File.open(@path, "w")
          @io.rewind
          @io
        else
          @io = ::File.open(@path, "r")
        end
      end

      def keys
        return @keys if @keys
        return [] if mode == :output
        line = rewind_and_restore { io.readline }
        line.chomp!
        fields = line.split(/#{@delimiter}/)
        if @header
          @keys = fields.collect! {|hdr| hdr.downcase.to_sym}
        else
          @keys = (0..(fields.length - 1)).to_a.collect! do |i|
            sprintf("column%02d", i).to_sym
          end
        end
      end

      def rewind_and_restore
        pos = io.pos
        io.rewind
        result = yield
        io.seek(pos)
        result
      end
    end
  end
end
```
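A sketch of the options this file store accepts, as read from the initializer and the private `keys` method above; the paths, delimiter, and key names are illustrative only:

```ruby
# Reading: keys come from the header row (downcased symbols), or are
# generated as :column00, :column01, ... when there is no header.
input = DataTransport::DataStore::File.new(
  :path      => "export.csv",   # hypothetical file
  :delimiter => ",",
  :header    => true
)

# Writing: :keys fixes the column order; nil values are written as :null.
output = DataTransport::DataStore::File.new(
  :path => "out.tsv",
  :keys => [:id, :name, :email],
  :null => "\\N"
)
```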
data/lib/data_transport/map.rb
ADDED
@@ -0,0 +1,21 @@
```ruby
require "data_transport/record/source"
require "data_transport/record/destination"

module DataTransport
  class Map
    attr_reader :source, :destination

    def initialize(&block)
      @block = block
      @source = DataTransport::Record::Source.new
      @destination = DataTransport::Record::Destination.new
    end

    def map(record)
      @source.record = record
      @destination.reset!
      @block.call(@source, @destination)
      @destination.record
    end
  end
end
```
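A small sketch of using Map directly with hypothetical field names. Note that a Map reuses one destination record, so the hash returned by `map` is cleared on the next call; `DataTransport.map` writes each record to the output store before mapping the next one:

```ruby
map = DataTransport::Map.new do |src, dst|
  dst.full_name = "#{src.first_name} #{src.last_name}"
end

map.map(:first_name => "Ada", :last_name => "Lovelace")
# => {:full_name => "Ada Lovelace"}
```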
data/lib/data_transport/record/destination.rb
ADDED
@@ -0,0 +1,28 @@
```ruby
module DataTransport
  module Record
    class Destination
      attr_reader :record

      def initialize
        @record = {}
      end

      def reset!
        @record.clear
      end

      def method_missing(name, *args)
        name_s = name.to_s
        if name_s[-1,1] == "="
          unless args.length == 1
            raise ArgumentError, "wrong number of arguments (#{args.length} for 1)"
          end
          name_s.chop!
          @record[name_s.to_sym] = args.first
        else
          super
        end
      end
    end
  end
end
```
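Illustrative use of the setter-style interface above (the field name and value are hypothetical):

```ruby
dst = DataTransport::Record::Destination.new
dst.name = "Ada"   # method_missing catches name=, stores @record[:name]
dst.record         # => {:name => "Ada"}
dst.reset!         # clears the hash before the next record is mapped
```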
data/lib/data_transport/record/source.rb
ADDED
@@ -0,0 +1,24 @@
```ruby
module DataTransport
  module Record
    class Source
      def record=(record)
        @record = record
      end

      def id
        method_missing :id
      end

      def method_missing(name, *args)
        if @record.has_key?(name)
          unless args.empty?
            raise ArgumentError, "wrong number of arguments (#{args.length} for 0)"
          end
          @record[name]
        else
          super
        end
      end
    end
  end
end
```
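Illustrative reads against the source record above; the explicit `id` method exists so that lookups of `:id` reach the record instead of Ruby 1.8's `Object#id`:

```ruby
src = DataTransport::Record::Source.new
src.record = {:id => 7, :name => "Ada"}   # hypothetical record
src.name    # => "Ada" (looked up via method_missing)
src.id      # => 7, thanks to the explicit #id override
src.email   # raises NoMethodError, since the key is absent
```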
metadata
ADDED
@@ -0,0 +1,59 @@
```yaml
--- !ruby/object:Gem::Specification
name: DanaDanger-data_transport
version: !ruby/object:Gem::Version
  version: 0.1.1
platform: ruby
authors:
- Dana Danger
autorequire:
bindir: bin
cert_chain: []

date: 2009-04-08 00:00:00 -07:00
default_executable:
dependencies: []

description:
email:
executables: []

extensions: []

extra_rdoc_files: []

files:
- lib/data_transport.rb
- lib/data_transport/map.rb
- lib/data_transport/data_store.rb
- lib/data_transport/data_store/active_record.rb
- lib/data_transport/data_store/file.rb
- lib/data_transport/record/destination.rb
- lib/data_transport/record/source.rb
has_rdoc: false
homepage: http://github.com/DanaDanger/data_transport
post_install_message:
rdoc_options: []

require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project:
rubygems_version: 1.2.0
signing_key:
specification_version: 2
summary: A gem for importing and exporting large quantities of data.
test_files: []
```