DanaDanger-data_transport 0.1.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/data_transport.rb +36 -14
- data/lib/data_transport/data_store.rb +4 -8
- data/lib/data_transport/data_store/active_record.rb +61 -17
- data/lib/data_transport/data_store/file.rb +82 -13
- data/lib/data_transport/record/destination.rb +2 -2
- data/lib/data_transport/record/source.rb +2 -2
- metadata +2 -3
- data/lib/data_transport/map.rb +0 -21
data/lib/data_transport.rb
CHANGED
@@ -1,29 +1,51 @@
|
|
1
|
-
require "data_transport/map"
|
2
1
|
require "data_transport/data_store"
|
2
|
+
require "data_transport/record/source"
|
3
|
+
require "data_transport/record/destination"
|
3
4
|
|
4
5
|
module DataTransport
|
5
|
-
|
6
|
-
1000
|
7
|
-
end
|
6
|
+
DEFAULT_BATCH_SIZE = 1000 # :nodoc:
|
8
7
|
|
8
|
+
# Reads records from an input data source, processes them with the supplied
|
9
|
+
# block, and writes them to an output data source. Accepts the following
|
10
|
+
# options:
|
11
|
+
#
|
12
|
+
# batch_size:: Records are read from the input in batches. This option sets
|
13
|
+
# the number of records in a single batch. Default is 1000.
|
14
|
+
#
|
15
|
+
# The block is passed two objects that represent the source and destination
|
16
|
+
# record. These objects have methods that reflect the attributes of the
|
17
|
+
# records. The following example reads the +name+ and +price+ attributes from
|
18
|
+
# input records, downcases the name, multiplies the price by 100, and writes
|
19
|
+
# them to the output:
|
20
|
+
#
|
21
|
+
# # input = DataTransport::DataStore:: ...
|
22
|
+
# # output = DataTransport::DataStore:: ...
|
23
|
+
#
|
24
|
+
# DataTransport.map(input, output) do |src, dst|
|
25
|
+
# dst.name = src.name.downcase
|
26
|
+
# dst.price = (src.price * 100).to_i
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# The destination doesn't necessarily have to have the same attributes as the
|
30
|
+
# source (or even the same number of attributes). The transformations that
|
31
|
+
# can be accomplished are limited only by what you can do in a block of Ruby.
|
9
32
|
def self.map(input, output, options = {}, &block)
|
10
33
|
# Extract options.
|
11
|
-
|
34
|
+
batch_size = options[:batch_size] || DEFAULT_BATCH_SIZE
|
35
|
+
raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
|
36
|
+
raise(RangeError, "batch size must be greater than zero") if batch_size < 1
|
12
37
|
unless options.empty?
|
13
38
|
raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
|
14
39
|
end
|
15
|
-
# If ignore_duplicates is true, make sure the output is a MySQL database.
|
16
|
-
if ignore_duplicates
|
17
|
-
unless output.is_a?(DataStore::ActiveRecord) && output.klass.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
|
18
|
-
raise ArgumentError, "ignore_duplicates can only be used with an ActiveRecord data store connected to a MySQL database"
|
19
|
-
end
|
20
|
-
end
|
21
40
|
# Run the transport.
|
22
41
|
output.reset
|
23
|
-
|
24
|
-
|
42
|
+
source = DataTransport::Record::Source.new
|
43
|
+
destination = DataTransport::Record::Destination.new
|
25
44
|
input.each_record do |record|
|
26
|
-
|
45
|
+
source.record = record
|
46
|
+
destination.reset!
|
47
|
+
yield source, destination
|
48
|
+
output.write_record(destination.record)
|
27
49
|
end
|
28
50
|
output.finalize
|
29
51
|
end
|
@@ -1,8 +1,9 @@
|
|
1
1
|
require "data_transport/data_store/file"
|
2
|
+
require "data_transport/data_store/csv_file"
|
2
3
|
require "data_transport/data_store/active_record"
|
3
4
|
|
4
5
|
module DataTransport
|
5
|
-
class DataStore
|
6
|
+
class DataStore # :nodoc:
|
6
7
|
def count
|
7
8
|
raise NotImplementedError
|
8
9
|
end
|
@@ -19,13 +20,8 @@ module DataTransport
|
|
19
20
|
# Do nothing by default.
|
20
21
|
end
|
21
22
|
|
22
|
-
|
23
|
-
|
24
|
-
def check_batch_size(batch_size)
|
25
|
-
batch_size ||= DataTransport.default_batch_size
|
26
|
-
raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
|
27
|
-
raise(RangeError, "batch size must be greater than zero") if batch_size < 1
|
28
|
-
batch_size
|
23
|
+
def reset
|
24
|
+
# Do nothing by default.
|
29
25
|
end
|
30
26
|
end
|
31
27
|
end
|
@@ -1,16 +1,54 @@
|
|
1
1
|
module DataTransport
|
2
2
|
class DataStore
|
3
|
+
# Data store that reads and writes records in a database via ActiveRecord.
|
4
|
+
# This class is specifically optimized for reading and writing large
|
5
|
+
# numbers of records, providing a significant advantage over using
|
6
|
+
# ActiveRecord directly.
|
7
|
+
#
|
8
|
+
# On MySQL databases, records are written in batches of the largest size
|
9
|
+
# possible instead of being inserted one by one.
|
3
10
|
class ActiveRecord < DataStore
|
4
|
-
|
5
|
-
|
11
|
+
# There are two ways to initialize this data store. The first is by
|
12
|
+
# specifying one of your ActiveRecord models:
|
13
|
+
#
|
14
|
+
# DataTransport::DataStore::ActiveRecord.new :class => MyModel
|
15
|
+
#
|
16
|
+
# The second is by providing an ActiveRecord database specification (as
|
17
|
+
# read from database.yml, for example) and a table name:
|
18
|
+
#
|
19
|
+
# db_spec = ActiveRecord::Base.configurations["other_app_#{RAILS_ENV}"]
|
20
|
+
# DataTransport::DataStore::ActiveRecord.new(
|
21
|
+
# :connection => db_spec,
|
22
|
+
# :table_name => "sprockets"
|
23
|
+
# )
|
24
|
+
#
|
25
|
+
# The second form is useful for importing or exporting data in non-Rails
|
26
|
+
# applications.
|
27
|
+
#
|
28
|
+
# In addition, the following options are accepted:
|
29
|
+
#
|
30
|
+
# conditions:: Conditions describing which records to read. This can
|
31
|
+
# be anything that ActiveRecord will recognize, such as
|
32
|
+
# a hash table, an array with substitutions, or raw SQL.
|
33
|
+
# Default is nil (no conditions, read all records).
|
34
|
+
# truncate:: If true, the table will be truncated before any records
|
35
|
+
# are written. On MySQL databases, this is performed by
|
36
|
+
# executing a TRUNCATE TABLE query; all other databases
|
37
|
+
# use ActiveRecord's delete_all method.
|
38
|
+
# ignore_errors:: If true, errors that occur during record insertion will
|
39
|
+
# be ignored. This is useful if your table has a unique
|
40
|
+
# index and you want to silently drop records with
|
41
|
+
# duplicate keys. Currently this only works on MySQL.
|
42
|
+
# Default is false.
|
6
43
|
def initialize(options = {})
|
7
44
|
super()
|
8
45
|
# Extract options.
|
9
|
-
@class
|
10
|
-
@connection
|
11
|
-
@table_name
|
12
|
-
@conditions
|
13
|
-
@truncate
|
46
|
+
@class = options.delete(:class)
|
47
|
+
@connection = options.delete(:connection)
|
48
|
+
@table_name = options.delete(:table_name)
|
49
|
+
@conditions = options.delete(:conditions)
|
50
|
+
@truncate = options.delete(:truncate)
|
51
|
+
@ignore_errors = options.delete(:ignore_errors)
|
14
52
|
# Make sure either a class, or both a connection and a table name, were provided.
|
15
53
|
if @class.nil? && (@connection.nil? || @table_name.nil?)
|
16
54
|
raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
|
@@ -34,6 +72,13 @@ module DataTransport
|
|
34
72
|
klass = klass.superclass
|
35
73
|
end
|
36
74
|
raise(TypeError, "class must descend from ActiveRecord::Base") unless is_active_record
|
75
|
+
# If ignore_errors is true, make sure we're connected to a MySQL
|
76
|
+
# database.
|
77
|
+
if @ignore_errors
|
78
|
+
unless @class.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
|
79
|
+
raise ArgumentError, "ignore_errors can only be used with a MySQL database"
|
80
|
+
end
|
81
|
+
end
|
37
82
|
# Check for unknown options.
|
38
83
|
unless options.empty?
|
39
84
|
raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
|
@@ -50,17 +95,17 @@ module DataTransport
|
|
50
95
|
@class.columns.each {|c| @columns[c.name.to_sym] = c}
|
51
96
|
end
|
52
97
|
|
53
|
-
def klass
|
98
|
+
def klass # :nodoc:
|
54
99
|
@class
|
55
100
|
end
|
56
101
|
|
102
|
+
# Returns the number of records in the table that match the data store's
|
103
|
+
# conditions.
|
57
104
|
def count
|
58
105
|
@class.count(:conditions => @conditions)
|
59
106
|
end
|
60
107
|
|
61
|
-
def each_record(batch_size
|
62
|
-
batch_size = check_batch_size(batch_size)
|
63
|
-
|
108
|
+
def each_record(batch_size) # :nodoc:
|
64
109
|
conn = @class.connection
|
65
110
|
column_names = conn.columns(@class.table_name).collect {|c| c.name}
|
66
111
|
|
@@ -85,7 +130,7 @@ module DataTransport
|
|
85
130
|
end
|
86
131
|
end
|
87
132
|
|
88
|
-
def write_record(record)
|
133
|
+
def write_record(record) # :nodoc:
|
89
134
|
conn = @class.connection
|
90
135
|
# If no SQL has been produced yet, start an INSERT statement.
|
91
136
|
@sql_buffer ||= start_insert_sql(record)
|
@@ -113,7 +158,7 @@ module DataTransport
|
|
113
158
|
end
|
114
159
|
end
|
115
160
|
|
116
|
-
def finalize
|
161
|
+
def finalize # :nodoc:
|
117
162
|
if @truncate
|
118
163
|
conn = @class.connection
|
119
164
|
begin
|
@@ -123,21 +168,20 @@ module DataTransport
|
|
123
168
|
end
|
124
169
|
@truncate = false
|
125
170
|
end
|
126
|
-
if @sql_buffer[-1,1] == ","
|
171
|
+
if @sql_buffer && @sql_buffer[-1,1] == ","
|
127
172
|
@sql_buffer.chop!
|
128
173
|
@class.connection.execute(@sql_buffer)
|
129
174
|
end
|
130
175
|
end
|
131
176
|
|
132
|
-
def reset
|
133
|
-
self.ignore_duplicates = false
|
177
|
+
def reset # :nodoc:
|
134
178
|
@sql_buffer = nil
|
135
179
|
end
|
136
180
|
|
137
181
|
private
|
138
182
|
|
139
183
|
def start_insert_sql(record)
|
140
|
-
"INSERT #{
|
184
|
+
"INSERT #{@ignore_errors ? "IGNORE " : " "}INTO " +
|
141
185
|
"#{@class.connection.quote_table_name(@class.table_name)} " +
|
142
186
|
"(#{record.keys.join ","}) VALUES "
|
143
187
|
end
|
@@ -1,23 +1,65 @@
|
|
1
1
|
module DataTransport
|
2
2
|
class DataStore
|
3
|
+
# Data store that reads and writes records in a flat text file.
|
4
|
+
#
|
5
|
+
# Although this class can read and write CSV files, you should use the
|
6
|
+
# CSVFile data store for that instead of this one.
|
3
7
|
class File < DataStore
|
4
|
-
attr_reader :mode
|
8
|
+
attr_reader :mode # :nodoc:
|
5
9
|
|
10
|
+
# Accepts the following options:
|
11
|
+
#
|
12
|
+
# header:: If true, the file has a header row that contains the names
|
13
|
+
# of each field. Default is false.
|
14
|
+
# delimiter:: String that separates individual fields in a row. Default
|
15
|
+
# is "\t".
|
16
|
+
# enclosure:: String that encloses individual fields. For example, if
|
17
|
+
# this is set to "\"", fields will be enclosed in double
|
18
|
+
# quotes. Default is nil (no enclosure).
|
19
|
+
# escape:: Escape sequence for occurrences of the enclosure string in
|
20
|
+
# field values. Set this to the special value :double if
|
21
|
+
# enclosure characters are escaped by doubling them (like in
|
22
|
+
# CSV and SQL). Default is nil.
|
23
|
+
# path:: Path to the file.
|
24
|
+
# null:: String that represents fields whose value is nil (but not
|
25
|
+
# blank). Default is "".
|
26
|
+
# keys:: Array of field names. Not necessary for files with a header
|
27
|
+
# row. Default for files without a header row is fieldXX,
|
28
|
+
# where XX is numbered sequentially starting from 00.
|
6
29
|
def initialize(options = {})
|
7
30
|
super()
|
8
|
-
|
31
|
+
# Extract options.
|
9
32
|
@header = options.delete(:header)
|
10
33
|
@delimiter = options.delete(:delimiter) || "\t"
|
34
|
+
@enclosure = options.delete(:enclosure)
|
35
|
+
@escape = options.delete(:escape)
|
11
36
|
@path = options.delete(:path)
|
12
37
|
@null = options.delete(:null) || ""
|
13
38
|
@keys = options.delete(:keys)
|
14
|
-
|
39
|
+
# Validate options.
|
15
40
|
raise(ArgumentError, "missing required option `path'") if @path.nil?
|
41
|
+
if @escape && @enclosure.nil?
|
42
|
+
raise(ArgumentError, "`escape' cannot be used without `enclosure'")
|
43
|
+
end
|
16
44
|
unless options.empty?
|
17
45
|
raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
|
18
46
|
end
|
47
|
+
# Handle the special :double escape sequence.
|
48
|
+
@escape = @enclosure if @escape == :double
|
49
|
+
# Create an enclosure placeholder, which is used to avoid clobbering
|
50
|
+
# escaped enclosure characters during parsing.
|
51
|
+
if @escape
|
52
|
+
if @enclosure == 0.chr
|
53
|
+
safe_ch = 1.chr
|
54
|
+
else
|
55
|
+
safe_ch = 0.chr
|
56
|
+
end
|
57
|
+
@placeholder = "#{safe_ch}__ENCLOSURE_PLACEHOLDER__#{safe_ch}"
|
58
|
+
end
|
19
59
|
end
|
20
60
|
|
61
|
+
# Returns the number of lines in the file (not counting the header, if
|
62
|
+
# there is one).
|
21
63
|
def count
|
22
64
|
return @count if @count
|
23
65
|
self.mode = :input
|
@@ -32,17 +74,14 @@ module DataTransport
|
|
32
74
|
@count = line_count
|
33
75
|
end
|
34
76
|
|
35
|
-
def each_record(batch_size = nil)
|
77
|
+
def each_record(batch_size = nil) # :nodoc:
|
36
78
|
self.mode = :input
|
37
|
-
|
38
|
-
batch_size = check_batch_size(batch_size)
|
39
|
-
|
40
79
|
io.rewind
|
41
80
|
io.readline if @header
|
42
81
|
until io.eof?
|
43
82
|
line = io.gets || break
|
44
83
|
line.chomp!
|
45
|
-
values = line
|
84
|
+
values = values_from_s(line)
|
46
85
|
if keys.length != values.length
|
47
86
|
raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
|
48
87
|
end
|
@@ -52,23 +91,53 @@ module DataTransport
|
|
52
91
|
end
|
53
92
|
end
|
54
93
|
|
55
|
-
def write_record(record)
|
94
|
+
def write_record(record) # :nodoc:
|
56
95
|
self.mode = :output
|
57
96
|
# If no key order was ever specified, make one up.
|
58
97
|
@keys ||= record.keys.sort {|a,b| a.to_s <=> b.to_s}
|
59
98
|
# Write the header if this is the first record.
|
60
99
|
if @header && io.pos == 0
|
61
|
-
io.puts(keys
|
100
|
+
io.puts(values_to_s(keys))
|
62
101
|
end
|
63
102
|
# Write the values in a predictable order.
|
64
103
|
values = keys.collect do |k|
|
65
104
|
record[k].nil?? @null : record[k]
|
66
105
|
end
|
67
|
-
io.puts(values
|
106
|
+
io.puts(values_to_s(values))
|
68
107
|
end
|
69
108
|
|
70
109
|
private
|
71
110
|
|
111
|
+
def values_to_s(values)
|
112
|
+
if @escape
|
113
|
+
values = values.collect do |v|
|
114
|
+
@enclosure + v.to_s.gsub(/#{@enclosure}/, @escape + @enclosure) + @enclosure
|
115
|
+
end
|
116
|
+
elsif @enclosure
|
117
|
+
values = values.collect {|v| @enclosure + v.to_s + @enclosure}
|
118
|
+
end
|
119
|
+
values.join(@delimiter)
|
120
|
+
end
|
121
|
+
|
122
|
+
def values_from_s(str)
|
123
|
+
if @escape
|
124
|
+
str = str.gsub(/#{@escape}#{@enclosure}/, @placeholder)
|
125
|
+
values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
|
126
|
+
values.first.sub!(/^#{@enclosure}/, "")
|
127
|
+
values.last.sub!(/#{@enclosure}$/, "")
|
128
|
+
values.each do |v|
|
129
|
+
v.gsub!(/#{@placeholder}/, @enclosure)
|
130
|
+
end
|
131
|
+
elsif @enclosure
|
132
|
+
values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
|
133
|
+
values.first.sub!(/^#{@enclosure}/, "")
|
134
|
+
values.last.sub!(/#{@enclosure}$/, "")
|
135
|
+
else
|
136
|
+
values = str.split(/#{@delimiter}/)
|
137
|
+
end
|
138
|
+
values
|
139
|
+
end
|
140
|
+
|
72
141
|
def mode=(new_mode)
|
73
142
|
if !@mode.nil? && @mode != new_mode
|
74
143
|
raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
|
@@ -95,12 +164,12 @@ module DataTransport
|
|
95
164
|
return [] if mode == :output
|
96
165
|
line = rewind_and_restore { io.readline }
|
97
166
|
line.chomp!
|
98
|
-
fields = line
|
167
|
+
fields = values_from_s(line)
|
99
168
|
if @header
|
100
169
|
@keys = fields.collect! {|hdr| hdr.downcase.to_sym}
|
101
170
|
else
|
102
171
|
@keys = (0..(fields.length - 1)).to_a.collect! do |i|
|
103
|
-
sprintf("
|
172
|
+
sprintf("field%02d", i).to_sym
|
104
173
|
end
|
105
174
|
end
|
106
175
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: DanaDanger-data_transport
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: "0.2"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dana Danger
|
@@ -23,13 +23,12 @@ extra_rdoc_files: []
|
|
23
23
|
|
24
24
|
files:
|
25
25
|
- lib/data_transport.rb
|
26
|
-
- lib/data_transport/map.rb
|
27
26
|
- lib/data_transport/data_store.rb
|
28
27
|
- lib/data_transport/data_store/active_record.rb
|
29
28
|
- lib/data_transport/data_store/file.rb
|
30
29
|
- lib/data_transport/record/destination.rb
|
31
30
|
- lib/data_transport/record/source.rb
|
32
|
-
has_rdoc:
|
31
|
+
has_rdoc: true
|
33
32
|
homepage: http://github.com/DanaDanger/data_transport
|
34
33
|
post_install_message:
|
35
34
|
rdoc_options: []
|
data/lib/data_transport/map.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
require "data_transport/record/source"
|
2
|
-
require "data_transport/record/destination"
|
3
|
-
|
4
|
-
module DataTransport
|
5
|
-
class Map
|
6
|
-
attr_reader :source, :destination
|
7
|
-
|
8
|
-
def initialize(&block)
|
9
|
-
@block = block
|
10
|
-
@source = DataTransport::Record::Source.new
|
11
|
-
@destination = DataTransport::Record::Destination.new
|
12
|
-
end
|
13
|
-
|
14
|
-
def map(record)
|
15
|
-
@source.record = record
|
16
|
-
@destination.reset!
|
17
|
-
@block.call(@source, @destination)
|
18
|
-
@destination.record
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|