DanaDanger-data_transport 0.1.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/data_transport.rb +36 -14
- data/lib/data_transport/data_store.rb +4 -8
- data/lib/data_transport/data_store/active_record.rb +61 -17
- data/lib/data_transport/data_store/file.rb +82 -13
- data/lib/data_transport/record/destination.rb +2 -2
- data/lib/data_transport/record/source.rb +2 -2
- metadata +2 -3
- data/lib/data_transport/map.rb +0 -21
data/lib/data_transport.rb
CHANGED
@@ -1,29 +1,51 @@
|
|
1
|
-
require "data_transport/map"
|
2
1
|
require "data_transport/data_store"
|
2
|
+
require "data_transport/record/source"
|
3
|
+
require "data_transport/record/destination"
|
3
4
|
|
4
5
|
module DataTransport
|
5
|
-
|
6
|
-
1000
|
7
|
-
end
|
6
|
+
DEFAULT_BATCH_SIZE = 1000 # :nodoc:
|
8
7
|
|
8
|
+
# Reads records from an input data source, processes them with the supplied
|
9
|
+
# block, and writes them to an output data source. Accepts the following
|
10
|
+
# options:
|
11
|
+
#
|
12
|
+
# batch_size:: Records are read from the input in batches. This option sets
|
13
|
+
# the number of records in a single batch. Default is 1000.
|
14
|
+
#
|
15
|
+
# The block is passed two objects that represent the source and destination
|
16
|
+
# record. These objects have methods that reflect the attributes of the
|
17
|
+
# records. The following example reads the +name+ and +price+ attributes from
|
18
|
+
# input records, downcases the name, multiplies the price by 100, and writes
|
19
|
+
# them to the output:
|
20
|
+
#
|
21
|
+
# # input = DataTransport::DataStore:: ...
|
22
|
+
# # output = DataTransport::DataStore:: ...
|
23
|
+
#
|
24
|
+
# DataTransport.map(input, output) do |src, dst|
|
25
|
+
# dst.name = src.name.downcase
|
26
|
+
# dst.price = (src.price * 100).to_i
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# The destination doesn't necessarily have to have the same attributes as the
|
30
|
+
# source (or even the same number of attributes). The transformations that
|
31
|
+
# can be accomplished are limited only by what you can do in a block of Ruby.
|
9
32
|
def self.map(input, output, options = {}, &block)
|
10
33
|
# Extract options.
|
11
|
-
|
34
|
+
batch_size = options[:batch_size] || DEFAULT_BATCH_SIZE
|
35
|
+
raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
|
36
|
+
raise(RangeError, "batch size must be greater than zero") if batch_size < 1
|
12
37
|
unless options.empty?
|
13
38
|
raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
|
14
39
|
end
|
15
|
-
# If ignore_duplicates is true, make sure the output is a MySQL database.
|
16
|
-
if ignore_duplicates
|
17
|
-
unless output.is_a?(DataStore::ActiveRecord) && output.klass.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
|
18
|
-
raise ArgumentError, "ignore_duplicates can only be used with an ActiveRecord data store connected to a MySQL database"
|
19
|
-
end
|
20
|
-
end
|
21
40
|
# Run the transport.
|
22
41
|
output.reset
|
23
|
-
|
24
|
-
|
42
|
+
source = DataTransport::Record::Source.new
|
43
|
+
destination = DataTransport::Record::Destination.new
|
25
44
|
input.each_record do |record|
|
26
|
-
|
45
|
+
source.record = record
|
46
|
+
destination.reset!
|
47
|
+
yield source, destination
|
48
|
+
output.write_record(destination.record)
|
27
49
|
end
|
28
50
|
output.finalize
|
29
51
|
end
|
@@ -1,8 +1,9 @@
|
|
1
1
|
require "data_transport/data_store/file"
|
2
|
+
require "data_transport/data_store/csv_file"
|
2
3
|
require "data_transport/data_store/active_record"
|
3
4
|
|
4
5
|
module DataTransport
|
5
|
-
class DataStore
|
6
|
+
class DataStore # :nodoc:
|
6
7
|
def count
|
7
8
|
raise NotImplementedError
|
8
9
|
end
|
@@ -19,13 +20,8 @@ module DataTransport
|
|
19
20
|
# Do nothing by default.
|
20
21
|
end
|
21
22
|
|
22
|
-
|
23
|
-
|
24
|
-
def check_batch_size(batch_size)
|
25
|
-
batch_size ||= DataTransport.default_batch_size
|
26
|
-
raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
|
27
|
-
raise(RangeError, "batch size must be greater than zero") if batch_size < 1
|
28
|
-
batch_size
|
23
|
+
def reset
|
24
|
+
# Do nothing by default.
|
29
25
|
end
|
30
26
|
end
|
31
27
|
end
|
@@ -1,16 +1,54 @@
|
|
1
1
|
module DataTransport
|
2
2
|
class DataStore
|
3
|
+
# Data store that reads and writes records in a database via ActiveRecord.
|
4
|
+
# This class is specifically optimized for reading and writing large
|
5
|
+
# numbers of records, providing a significant advantage over using
|
6
|
+
# ActiveRecord directly.
|
7
|
+
#
|
8
|
+
# On MySQL databases, records are written in batches of the largest size
|
9
|
+
# possible instead of being inserted one by one.
|
3
10
|
class ActiveRecord < DataStore
|
4
|
-
|
5
|
-
|
11
|
+
# There are two ways to initialize this data store. The first is by
|
12
|
+
# specifying one of your ActiveRecord models:
|
13
|
+
#
|
14
|
+
# DataTransport::DataStore::ActiveRecord.new :class => MyModel
|
15
|
+
#
|
16
|
+
# The second is by providing an ActiveRecord database specification (as
|
17
|
+
# read from database.yml, for example) and a table name:
|
18
|
+
#
|
19
|
+
# db_spec = ActiveRecord::Base.configurations["other_app_#{RAILS_ENV}"]
|
20
|
+
# DataTransport::DataStore::ActiveRecord.new(
|
21
|
+
# :connection => db_spec,
|
22
|
+
# :table_name => "sprockets"
|
23
|
+
# )
|
24
|
+
#
|
25
|
+
# The second form is useful for importing or exporting data in non-Rails
|
26
|
+
# applications.
|
27
|
+
#
|
28
|
+
# In addition, the following options are accepted:
|
29
|
+
#
|
30
|
+
# conditions:: Conditions describing which records to read. This can
|
31
|
+
# be anything that ActiveRecord will recognize, such as
|
32
|
+
# a hash table, an array with substitutions, or raw SQL.
|
33
|
+
# Default is nil (no conditions, read all records).
|
34
|
+
# truncate:: If true, the table will be truncated before any records
|
35
|
+
# are written. On MySQL databases, this is performed by
|
36
|
+
# executing a TRUNCATE TABLE query; all other databases
|
37
|
+
# use ActiveRecord's delete_all method.
|
38
|
+
# ignore_errors:: If true, errors that occur during record insertion will
|
39
|
+
# be ignored. This is useful if your table has a unique
|
40
|
+
# index and you want to silently drop records with
|
41
|
+
# duplicate keys. Currently this only works on MySQL.
|
42
|
+
# Default is false.
|
6
43
|
def initialize(options = {})
|
7
44
|
super()
|
8
45
|
# Extract options.
|
9
|
-
@class
|
10
|
-
@connection
|
11
|
-
@table_name
|
12
|
-
@conditions
|
13
|
-
@truncate
|
46
|
+
@class = options.delete(:class)
|
47
|
+
@connection = options.delete(:connection)
|
48
|
+
@table_name = options.delete(:table_name)
|
49
|
+
@conditions = options.delete(:conditions)
|
50
|
+
@truncate = options.delete(:truncate)
|
51
|
+
@ignore_errors = options.delete(:ignore_errors)
|
14
52
|
# Make sure either a class, or both a connection and a table name, was provided.
|
15
53
|
if @class.nil? && (@connection.nil? || @table_name.nil?)
|
16
54
|
raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
|
@@ -34,6 +72,13 @@ module DataTransport
|
|
34
72
|
klass = klass.superclass
|
35
73
|
end
|
36
74
|
raise(TypeError, "class must descend from ActiveRecord::Base") unless is_active_record
|
75
|
+
# If ignore_errors is true, make sure we're connected to a MySQL
|
76
|
+
# database.
|
77
|
+
if @ignore_errors
|
78
|
+
unless @class.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
|
79
|
+
raise ArgumentError, "ignore_errors can only be used with a MySQL database"
|
80
|
+
end
|
81
|
+
end
|
37
82
|
# Check for unknown options.
|
38
83
|
unless options.empty?
|
39
84
|
raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
|
@@ -50,17 +95,17 @@ module DataTransport
|
|
50
95
|
@class.columns.each {|c| @columns[c.name.to_sym] = c}
|
51
96
|
end
|
52
97
|
|
53
|
-
def klass
|
98
|
+
def klass # :nodoc:
|
54
99
|
@class
|
55
100
|
end
|
56
101
|
|
102
|
+
# Returns the number of records in the table that match the data store's
|
103
|
+
# conditions.
|
57
104
|
def count
|
58
105
|
@class.count(:conditions => @conditions)
|
59
106
|
end
|
60
107
|
|
61
|
-
def each_record(batch_size
|
62
|
-
batch_size = check_batch_size(batch_size)
|
63
|
-
|
108
|
+
def each_record(batch_size) # :nodoc:
|
64
109
|
conn = @class.connection
|
65
110
|
column_names = conn.columns(@class.table_name).collect {|c| c.name}
|
66
111
|
|
@@ -85,7 +130,7 @@ module DataTransport
|
|
85
130
|
end
|
86
131
|
end
|
87
132
|
|
88
|
-
def write_record(record)
|
133
|
+
def write_record(record) # :nodoc:
|
89
134
|
conn = @class.connection
|
90
135
|
# If no SQL has been produced yet, start an INSERT statement.
|
91
136
|
@sql_buffer ||= start_insert_sql(record)
|
@@ -113,7 +158,7 @@ module DataTransport
|
|
113
158
|
end
|
114
159
|
end
|
115
160
|
|
116
|
-
def finalize
|
161
|
+
def finalize # :nodoc:
|
117
162
|
if @truncate
|
118
163
|
conn = @class.connection
|
119
164
|
begin
|
@@ -123,21 +168,20 @@ module DataTransport
|
|
123
168
|
end
|
124
169
|
@truncate = false
|
125
170
|
end
|
126
|
-
if @sql_buffer[-1,1] == ","
|
171
|
+
if @sql_buffer && @sql_buffer[-1,1] == ","
|
127
172
|
@sql_buffer.chop!
|
128
173
|
@class.connection.execute(@sql_buffer)
|
129
174
|
end
|
130
175
|
end
|
131
176
|
|
132
|
-
def reset
|
133
|
-
self.ignore_duplicates = false
|
177
|
+
def reset # :nodoc:
|
134
178
|
@sql_buffer = nil
|
135
179
|
end
|
136
180
|
|
137
181
|
private
|
138
182
|
|
139
183
|
def start_insert_sql(record)
|
140
|
-
"INSERT #{
|
184
|
+
"INSERT #{@ignore_errors ? "IGNORE " : " "}INTO " +
|
141
185
|
"#{@class.connection.quote_table_name(@class.table_name)} " +
|
142
186
|
"(#{record.keys.join ","}) VALUES "
|
143
187
|
end
|
@@ -1,23 +1,65 @@
|
|
1
1
|
module DataTransport
|
2
2
|
class DataStore
|
3
|
+
# Data store that reads and writes records in a flat text file.
|
4
|
+
#
|
5
|
+
# Although this class can read and write CSV files, you should use the
|
6
|
+
# CSVFile data store for that instead of this one.
|
3
7
|
class File < DataStore
|
4
|
-
attr_reader :mode
|
8
|
+
attr_reader :mode # :nodoc:
|
5
9
|
|
10
|
+
# Accepts the following options:
|
11
|
+
#
|
12
|
+
# header:: If true, the file has a header row that contains the names
|
13
|
+
# of each field. Default is false.
|
14
|
+
# delimiter:: String that separates individual fields in a row. Default
|
15
|
+
# is "\t".
|
16
|
+
# enclosure:: String that encloses individual fields. For example, if
|
17
|
+
# this is set to "\"", fields will be enclosed in double
|
18
|
+
# quotes. Default is nil (no enclosure).
|
19
|
+
# escape:: Escape sequence for occurrences of the enclosure string in
|
20
|
+
# field values. Set this to the special value :double if
|
21
|
+
# enclosure characters are escaped by doubling them (like in
|
22
|
+
# CSV and SQL). Default is nil.
|
23
|
+
# path:: Path to the file.
|
24
|
+
# null:: String that represents fields whose value is nil (but not
|
25
|
+
# blank). Default is "".
|
26
|
+
# keys:: Array of field names. Not necessary for files with a header
|
27
|
+
# row. Default for files without a header row is fieldXX,
|
28
|
+
# where XX is numbered sequentially starting from 00.
|
6
29
|
def initialize(options = {})
|
7
30
|
super()
|
8
|
-
|
31
|
+
# Extract options.
|
9
32
|
@header = options.delete(:header)
|
10
33
|
@delimiter = options.delete(:delimiter) || "\t"
|
34
|
+
@enclosure = options.delete(:enclosure)
|
35
|
+
@escape = options.delete(:escape)
|
11
36
|
@path = options.delete(:path)
|
12
37
|
@null = options.delete(:null) || ""
|
13
38
|
@keys = options.delete(:keys)
|
14
|
-
|
39
|
+
# Validate options.
|
15
40
|
raise(ArgumentError, "missing required option `path'") if @path.nil?
|
41
|
+
if @escape && @enclosure.nil?
|
42
|
+
raise(ArgumentError, "`escape' cannot be used without `enclosure'")
|
43
|
+
end
|
16
44
|
unless options.empty?
|
17
45
|
raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
|
18
46
|
end
|
47
|
+
# Handle the special :double escape sequence.
|
48
|
+
@escape = @enclosure if @escape == :double
|
49
|
+
# Create an enclosure placeholder, which is used to avoid clobbering
|
50
|
+
# escaped enclosure characters during parsing.
|
51
|
+
if @escape
|
52
|
+
if @enclosure == 0.chr
|
53
|
+
safe_ch = 1.chr
|
54
|
+
else
|
55
|
+
safe_ch = 0.chr
|
56
|
+
end
|
57
|
+
@placeholder = "#{safe_ch}__ENCLOSURE_PLACEHOLDER__#{safe_ch}"
|
58
|
+
end
|
19
59
|
end
|
20
60
|
|
61
|
+
# Returns the number of lines in the file (not counting the header, if
|
62
|
+
# there is one).
|
21
63
|
def count
|
22
64
|
return @count if @count
|
23
65
|
self.mode = :input
|
@@ -32,17 +74,14 @@ module DataTransport
|
|
32
74
|
@count = line_count
|
33
75
|
end
|
34
76
|
|
35
|
-
def each_record(batch_size = nil)
|
77
|
+
def each_record(batch_size = nil) # :nodoc:
|
36
78
|
self.mode = :input
|
37
|
-
|
38
|
-
batch_size = check_batch_size(batch_size)
|
39
|
-
|
40
79
|
io.rewind
|
41
80
|
io.readline if @header
|
42
81
|
until io.eof?
|
43
82
|
line = io.gets || break
|
44
83
|
line.chomp!
|
45
|
-
values = line
|
84
|
+
values = values_from_s(line)
|
46
85
|
if keys.length != values.length
|
47
86
|
raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
|
48
87
|
end
|
@@ -52,23 +91,53 @@ module DataTransport
|
|
52
91
|
end
|
53
92
|
end
|
54
93
|
|
55
|
-
def write_record(record)
|
94
|
+
def write_record(record) # :nodoc:
|
56
95
|
self.mode = :output
|
57
96
|
# If no key order was ever specified, make one up.
|
58
97
|
@keys ||= record.keys.sort {|a,b| a.to_s <=> b.to_s}
|
59
98
|
# Write the header if this is the first record.
|
60
99
|
if @header && io.pos == 0
|
61
|
-
io.puts(keys
|
100
|
+
io.puts(values_to_s(keys))
|
62
101
|
end
|
63
102
|
# Write the values in a predictable order.
|
64
103
|
values = keys.collect do |k|
|
65
104
|
record[k].nil?? @null : record[k]
|
66
105
|
end
|
67
|
-
io.puts(values
|
106
|
+
io.puts(values_to_s(values))
|
68
107
|
end
|
69
108
|
|
70
109
|
private
|
71
110
|
|
111
|
+
def values_to_s(values)
|
112
|
+
if @escape
|
113
|
+
values = values.collect do |v|
|
114
|
+
@enclosure + v.to_s.gsub(/#{@enclosure}/, @escape + @enclosure) + @enclosure
|
115
|
+
end
|
116
|
+
elsif @enclosure
|
117
|
+
values = values.collect {|v| @enclosure + v.to_s + @enclosure}
|
118
|
+
end
|
119
|
+
values.join(@delimiter)
|
120
|
+
end
|
121
|
+
|
122
|
+
def values_from_s(str)
|
123
|
+
if @escape
|
124
|
+
str = str.gsub(/#{@escape}#{@enclosure}/, @placeholder)
|
125
|
+
values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
|
126
|
+
values.first.sub!(/^#{@enclosure}/, "")
|
127
|
+
values.last.sub!(/#{@enclosure}$/, "")
|
128
|
+
values.each do |v|
|
129
|
+
v.gsub!(/#{@placeholder}/, @enclosure)
|
130
|
+
end
|
131
|
+
elsif @enclosure
|
132
|
+
values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
|
133
|
+
values.first.sub!(/^#{@enclosure}/, "")
|
134
|
+
values.last.sub!(/#{@enclosure}$/, "")
|
135
|
+
else
|
136
|
+
values = str.split(/#{@delimiter}/)
|
137
|
+
end
|
138
|
+
values
|
139
|
+
end
|
140
|
+
|
72
141
|
def mode=(new_mode)
|
73
142
|
if !@mode.nil? && @mode != new_mode
|
74
143
|
raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
|
@@ -95,12 +164,12 @@ module DataTransport
|
|
95
164
|
return [] if mode == :output
|
96
165
|
line = rewind_and_restore { io.readline }
|
97
166
|
line.chomp!
|
98
|
-
fields = line
|
167
|
+
fields = values_from_s(line)
|
99
168
|
if @header
|
100
169
|
@keys = fields.collect! {|hdr| hdr.downcase.to_sym}
|
101
170
|
else
|
102
171
|
@keys = (0..(fields.length - 1)).to_a.collect! do |i|
|
103
|
-
sprintf("
|
172
|
+
sprintf("field%02d", i).to_sym
|
104
173
|
end
|
105
174
|
end
|
106
175
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: DanaDanger-data_transport
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: "0.2"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dana Danger
|
@@ -23,13 +23,12 @@ extra_rdoc_files: []
|
|
23
23
|
|
24
24
|
files:
|
25
25
|
- lib/data_transport.rb
|
26
|
-
- lib/data_transport/map.rb
|
27
26
|
- lib/data_transport/data_store.rb
|
28
27
|
- lib/data_transport/data_store/active_record.rb
|
29
28
|
- lib/data_transport/data_store/file.rb
|
30
29
|
- lib/data_transport/record/destination.rb
|
31
30
|
- lib/data_transport/record/source.rb
|
32
|
-
has_rdoc:
|
31
|
+
has_rdoc: true
|
33
32
|
homepage: http://github.com/DanaDanger/data_transport
|
34
33
|
post_install_message:
|
35
34
|
rdoc_options: []
|
data/lib/data_transport/map.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
require "data_transport/record/source"
|
2
|
-
require "data_transport/record/destination"
|
3
|
-
|
4
|
-
module DataTransport
|
5
|
-
class Map
|
6
|
-
attr_reader :source, :destination
|
7
|
-
|
8
|
-
def initialize(&block)
|
9
|
-
@block = block
|
10
|
-
@source = DataTransport::Record::Source.new
|
11
|
-
@destination = DataTransport::Record::Destination.new
|
12
|
-
end
|
13
|
-
|
14
|
-
def map(record)
|
15
|
-
@source.record = record
|
16
|
-
@destination.reset!
|
17
|
-
@block.call(@source, @destination)
|
18
|
-
@destination.record
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|