data_transport 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,210 @@
1
module DataTransport
  class DataStore
    # Data store that reads and writes records in a database via ActiveRecord.
    # This class is specifically optimized for reading and writing large
    # numbers of records, providing a significant advantage over using
    # ActiveRecord directly.
    class ActiveRecord < DataStore
      # There are two ways to initialize this data store. The first is by
      # specifying one of your ActiveRecord models:
      #
      #   DataTransport::DataStore::ActiveRecord.new :class => MyModel
      #
      # The second is by providing an ActiveRecord database specification (as
      # read from database.yml, for example) and a table name:
      #
      #   db_spec = ActiveRecord::Base.configurations["other_app_#{RAILS_ENV}"]
      #   DataTransport::DataStore::ActiveRecord.new(
      #     :connection => db_spec,
      #     :table_name => "sprockets"
      #   )
      #
      # The second form is useful for importing or exporting data in non-Rails
      # applications.
      #
      # In addition, the following options are accepted:
      #
      # conditions::     Conditions describing which records to read. This can
      #                  be anything that ActiveRecord will recognize, such as
      #                  a hash table, an array with substitutions, or raw SQL.
      #                  Default is nil (no conditions, read all records).
      # truncate::       If true, the table will be truncated before any
      #                  records are written. On databases that support it,
      #                  this is performed by executing a TRUNCATE TABLE query;
      #                  all other databases use ActiveRecord's delete_all
      #                  method.
      # ignore_errors::  If true, errors that occur during record insertion
      #                  will be ignored. This is useful if your table has a
      #                  unique index and you want to silently drop records
      #                  with duplicate keys. Currently this only works on
      #                  MySQL. Default is false.
      # max_sql_length:: Maximum permissible length of an SQL query, in bytes.
      #                  Rows to be inserted are buffered until the largest
      #                  possible INSERT statement has been generated, at which
      #                  point the statement is executed and a new INSERT
      #                  statement begins. The default value varies depending
      #                  on what type of database you're connected to. With
      #                  SQLite, the default is 1,000,000. With MySQL, the
      #                  default is the value of the +max_allowed_packet+
      #                  variable minus 512. With all other databases, the
      #                  default is 16,777,216.
      def initialize(options = {})
        super()
        # Extract options.
        @class          = options.delete(:class)
        @connection     = options.delete(:connection)
        @table_name     = options.delete(:table_name)
        @conditions     = options.delete(:conditions)
        @truncate       = options.delete(:truncate)
        @ignore_errors  = options.delete(:ignore_errors)
        @max_sql_length = options.delete(:max_sql_length)
        # Make sure a class, or a connection and table name, was provided.
        if @class.nil? && (@connection.nil? || @table_name.nil?)
          raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
        end
        raise(TypeError, "class must be a class") if @class && !@class.is_a?(Class)
        # If connection specs were provided instead of a class, make an
        # anonymous ActiveRecord subclass.
        unless @class
          @class = Class.new(::ActiveRecord::Base)
          @class.set_table_name @table_name
          @class.establish_connection @connection
        end
        # Make sure the class descends from ActiveRecord::Base.
        klass = @class.superclass
        is_active_record = false
        while klass
          if klass == ::ActiveRecord::Base
            is_active_record = true
            break
          end
          klass = klass.superclass
        end
        raise(TypeError, "class must descend from ActiveRecord::Base") unless is_active_record
        # If ignore_errors is true, make sure we're connected to a MySQL
        # database. We don't use is_a? because if the MySQL adapter isn't
        # loaded, referencing its class throws a NameError.
        if @ignore_errors
          unless @class.connection.class.to_s ==
                 "ActiveRecord::ConnectionAdapters::MysqlAdapter"
            raise ArgumentError, "ignore_errors can only be used with a MySQL database"
          end
        end
        # Check for unknown options. (Bug fix: Hash has no #join — list the
        # leftover option *names* instead, matching DataTransport.map.)
        unless options.empty?
          raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
        end
        # Figure out how much data the database can handle in one query. See
        # the note above in the ignore_errors compatibility check about using
        # stringified class names.
        if @max_sql_length
          @max_sql_length = @max_sql_length.to_i
        else
          case @class.connection.class.to_s
          when "ActiveRecord::ConnectionAdapters::MysqlAdapter"
            rows = @class.connection.select_all("SHOW VARIABLES LIKE 'max_allowed_packet'")
            @max_sql_length = rows.first["Value"].to_i - 512
          when /\AActiveRecord::ConnectionAdapters::SQLite3?Adapter\Z/
            @max_sql_length = 1_000_000
          else
            @max_sql_length = 16_777_216
          end
        end
        # Fetch column information, keyed by column name symbol.
        @columns = {}
        @class.columns.each {|c| @columns[c.name.to_sym] = c}
      end

      # The ActiveRecord model class that records are read from and written to.
      def klass # :nodoc:
        @class
      end

      # Returns the number of records in the table that match the data store's
      # conditions.
      def count
        @class.count(:conditions => @conditions)
      end

      # Yields each matching row as a hash of column name (Symbol) =>
      # type-cast value, reading in batches of batch_size rows.
      def each_record(batch_size = nil) # :nodoc:
        # Bug fix: a nil batch_size previously crashed on `offset += nil`.
        # DataTransport.map always supplies one; default it for direct calls.
        batch_size ||= DataTransport::DEFAULT_BATCH_SIZE
        conn = @class.connection
        column_names = conn.columns(@class.table_name).collect {|c| c.name}

        offset = 0
        record = {}
        base_query = "SELECT * FROM #{conn.quote_table_name(@class.table_name)}"
        @class.send(:add_conditions!, base_query, @conditions) unless @conditions.nil?
        while true
          sql = base_query.dup
          conn.add_limit_offset!(sql, :limit => batch_size, :offset => offset)
          offset += batch_size
          rows = conn.select_rows(sql)
          break if rows.empty?
          rows.each do |row|
            record.clear
            column_names.each_with_index do |column_name, i|
              column_name = column_name.to_sym
              record[column_name] = @columns[column_name].type_cast(row[i])
            end
            yield record
          end
        end
      end

      # Buffers the record into a multi-row INSERT statement, flushing it to
      # the database whenever the statement would exceed @max_sql_length.
      def write_record(record) # :nodoc:
        conn = @class.connection
        # If no SQL has been produced yet, start an INSERT statement.
        @sql_buffer ||= start_insert_sql(record)
        # Convert the record into a string of quoted values.
        values = []
        record.each {|k, v| values << conn.quote(v, @columns[k])}
        values = "(#{values.join ","}),"
        # Write the record.
        if @max_sql_length.nil?
          # We have no information on the database's maximum allowed packet
          # size, so it's safest to write the record immediately.
          @sql_buffer << values
          finalize
        elsif @sql_buffer.length + values.length > @max_sql_length
          # Appending this record to the SQL buffer would exceed the maximum
          # allowed packet size. Send the buffer to the database and start a
          # new statement with this record.
          # (Bug fixes: the size check previously used record.length — the
          # number of fields, not the encoded SQL bytes — and
          # start_insert_sql was called without its required argument.)
          finalize
          @sql_buffer = start_insert_sql(record)
          @sql_buffer << values
        else
          # This record will not cause the SQL buffer to exceed the maximum
          # allowed packet size. Append it to the SQL buffer.
          @sql_buffer << values
        end
      end

      # Truncates the table on first use (if requested) and flushes any
      # buffered INSERT statement.
      def finalize # :nodoc:
        if @truncate
          conn = @class.connection
          begin
            conn.execute("TRUNCATE TABLE #{conn.quote_table_name(@class.table_name)}")
          rescue
            # Database doesn't support TRUNCATE TABLE; fall back to delete_all.
            @class.delete_all
          end
          @truncate = false
        end
        # Only execute when the buffer ends with a dangling comma, i.e. it
        # holds unflushed rows.
        if @sql_buffer && @sql_buffer[-1,1] == ","
          @sql_buffer.chop!
          @class.connection.execute(@sql_buffer)
        end
      end

      # Discards any buffered SQL so a new transport run starts clean.
      def reset # :nodoc:
        @sql_buffer = nil
      end

      private

      # Returns the opening of a multi-row INSERT statement for the given
      # record's columns. (Fix: no longer emits a double space after INSERT
      # when ignore_errors is off.)
      def start_insert_sql(record)
        "INSERT #{@ignore_errors ? "IGNORE " : ""}INTO " +
          "#{@class.connection.quote_table_name(@class.table_name)} " +
          "(#{record.keys.join ","}) VALUES "
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module DataTransport
  class DataStore
    # Identical to the File data store, except that it is preconfigured to read
    # and write CSV files.
    class CSVFile < File

      # Accepts the same options as the File data store, but with defaults
      # that describe the CSV format:
      #
      # delimiter:: ","
      # enclosure:: "\""
      # escape::    :double
      #
      # Any option passed by the caller overrides these defaults.
      def initialize(options = {})
        csv_defaults = {
          :delimiter => ",",
          :enclosure => "\"",
          :escape    => :double
        }
        super(csv_defaults.merge(options))
      end

    end
  end
end
@@ -0,0 +1,186 @@
1
module DataTransport
  class DataStore
    # Data store that reads and writes records in a flat text file.
    #
    # Although this class can read and write CSV files, you should use the
    # CSVFile data store for that instead of this one.
    class File < DataStore
      attr_reader :mode # :nodoc:

      # Accepts the following options:
      #
      # header::    If true, the file has a header row that contains the names
      #             of each field. Default is false.
      # delimiter:: String that separates individual fields in a row. Default
      #             is "\t".
      # enclosure:: String that encloses individual fields. For example, if
      #             this is set to "\"", fields will be enclosed in double
      #             quotes. Default is nil (no enclosure).
      # escape::    Escape sequence for occurrences of the enclosure string in
      #             field values. Set this to the special value :double if
      #             enclosure characters are escaped by doubling them (like in
      #             CSV and SQL). Default is nil.
      # path::      Path to the file.
      # null::      String that represents fields whose value is nil (but not
      #             blank). Default is "".
      # keys::      Array of field names. Not necessary for files with a header
      #             row. Default for files without a header row is fieldXX,
      #             where XX is numbered sequentially starting from 00.
      def initialize(options = {})
        super()
        # Extract options.
        @header    = options.delete(:header)
        @delimiter = options.delete(:delimiter) || "\t"
        @enclosure = options.delete(:enclosure)
        @escape    = options.delete(:escape)
        @path      = options.delete(:path)
        @null      = options.delete(:null) || ""
        @keys      = options.delete(:keys)
        # Validate options.
        raise(ArgumentError, "missing required option `path'") if @path.nil?
        if @escape && @enclosure.nil?
          raise(ArgumentError, "`escape' cannot be used without `enclosure'")
        end
        # (Bug fix: Hash has no #join — list the leftover option names.)
        unless options.empty?
          raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
        end
        # Handle the special :double escape sequence.
        @escape = @enclosure if @escape == :double
        # Create an enclosure placeholder, which is used to avoid clobbering
        # escaped enclosure characters during parsing. Pick a control
        # character that cannot collide with the enclosure itself.
        if @escape
          safe_ch = (@enclosure == 0.chr) ? 1.chr : 0.chr
          @placeholder = "#{safe_ch}__ENCLOSURE_PLACEHOLDER__#{safe_ch}"
        end
      end

      # Returns the number of lines in the file (not counting the header, if
      # there is one). Memoized after the first call.
      def count
        return @count if @count
        self.mode = :input
        line_count = 0
        rewind_and_restore do
          io.readline if @header
          until io.eof?
            io.gets
            line_count += 1
          end
        end
        @count = line_count
      end

      # Yields each data line as a hash of field name (Symbol) => String.
      # batch_size is accepted for interface compatibility and ignored (the
      # file is streamed line by line).
      def each_record(batch_size = nil) # :nodoc:
        self.mode = :input
        io.rewind
        io.readline if @header
        until io.eof?
          line = io.gets || break
          line.chomp!
          values = values_from_s(line)
          if keys.length != values.length
            raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
          end
          record = {}
          keys.length.times {|i| record[keys[i]] = values[i]}
          yield record
        end
      end

      # Writes one record as a delimited line, emitting the header row first
      # if requested and this is the first record.
      def write_record(record) # :nodoc:
        self.mode = :output
        # If no key order was ever specified, make one up.
        @keys ||= record.keys.sort {|a,b| a.to_s <=> b.to_s}
        # Write the header if this is the first record.
        if @header && io.pos == 0
          io.puts(values_to_s(keys))
        end
        # Write the values in a predictable order, substituting @null for nil.
        values = keys.collect do |k|
          record[k].nil?? @null : record[k]
        end
        io.puts(values_to_s(values))
      end

      private

      # Encodes an array of values as one delimited line, applying enclosure
      # and escape rules.
      def values_to_s(values)
        if @escape
          values = values.collect do |v|
            @enclosure + v.to_s.gsub(/#{@enclosure}/, @escape + @enclosure) + @enclosure
          end
        elsif @enclosure
          values = values.collect {|v| @enclosure + v.to_s + @enclosure}
        end
        values.join(@delimiter)
      end

      # Decodes one line into an array of field strings, undoing enclosure
      # and escape rules.
      def values_from_s(str)
        if @escape
          # Hide escaped enclosures behind the placeholder so splitting on
          # the enclosure+delimiter boundary can't clobber them.
          str = str.gsub(/#{@escape}#{@enclosure}/, @placeholder)
          values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
          # (Bug fix: guard against a blank line, which previously crashed
          # with NoMethodError on nil when an enclosure was configured.)
          unless values.empty?
            values.first.sub!(/^#{@enclosure}/, "")
            values.last.sub!(/#{@enclosure}$/, "")
          end
          values.each do |v|
            v.gsub!(/#{@placeholder}/, @enclosure)
          end
        elsif @enclosure
          values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
          unless values.empty?
            values.first.sub!(/^#{@enclosure}/, "")
            values.last.sub!(/#{@enclosure}$/, "")
          end
        else
          values = str.split(/#{@delimiter}/)
        end
        values
      end

      # Locks the data store into :input or :output mode; a File data store
      # cannot be both read and written.
      def mode=(new_mode)
        if !@mode.nil? && @mode != new_mode
          raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
        end
        unless [:input, :output].include?(new_mode)
          raise ArgumentError, "unknown mode `#{new_mode}'"
        end
        @mode = new_mode
      end

      # Lazily opens the underlying file in the direction dictated by mode.
      def io
        return @io if @io
        if mode == :output
          @io = ::File.open(@path, "w")
          @io.rewind
          @io
        else
          @io = ::File.open(@path, "r")
        end
      end

      # Returns the field names, deriving them from the header row (or
      # positional fieldXX names) on first use in input mode.
      def keys
        return @keys if @keys
        return [] if mode == :output
        line = rewind_and_restore { io.readline }
        line.chomp!
        fields = values_from_s(line)
        if @header
          @keys = fields.collect! {|hdr| hdr.downcase.to_sym}
        else
          @keys = (0..(fields.length - 1)).to_a.collect! do |i|
            sprintf("field%02d", i).to_sym
          end
        end
      end

      # Runs the block with the file rewound, then restores the previous
      # position and returns the block's result.
      def rewind_and_restore
        pos = io.pos
        io.rewind
        result = yield
        io.seek(pos)
        result
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require "data_transport/data_store/file"
2
+ require "data_transport/data_store/csv_file"
3
+ require "data_transport/data_store/active_record"
4
+
5
module DataTransport
  class DataStore # :nodoc:
    # Abstract base class for data stores. Concrete stores must implement
    # count, each_record, and write_record; finalize and reset are optional
    # lifecycle hooks that default to no-ops.

    # Number of records available — subclasses must implement.
    def count
      raise NotImplementedError
    end

    # Yield each record in turn — subclasses must implement.
    def each_record(batch_size = nil)
      raise NotImplementedError
    end

    # Persist one record — subclasses must implement.
    def write_record(record)
      raise NotImplementedError
    end

    # Hook invoked after the last record is written. No-op by default.
    def finalize
    end

    # Hook invoked before a transport run begins. No-op by default.
    def reset
    end
  end
end
@@ -0,0 +1,28 @@
1
module DataTransport
  module Record # :nodoc:
    # Accumulates attribute assignments (dst.foo = bar) into a hash that a
    # data store can write. Reused across records via reset!.
    class Destination # :nodoc:
      # The hash of attributes assigned so far.
      attr_reader :record

      def initialize
        @record = {}
      end

      # Clears the accumulated record so the object can be reused for the
      # next input record.
      def reset!
        @record.clear
      end

      # Treats any "name=" call as an attribute assignment, storing the value
      # under :name. All other missing methods fall through to the default
      # NoMethodError.
      def method_missing(name, *args)
        name_s = name.to_s
        if name_s[-1,1] == "="
          unless args.length == 1
            raise ArgumentError, "wrong number of arguments (#{args.length} for 1)"
          end
          name_s.chop!
          @record[name_s.to_sym] = args.first
        else
          super
        end
      end

      # Keep respond_to? consistent with method_missing: every setter-shaped
      # name is handled.
      def respond_to_missing?(name, include_private = false)
        name.to_s[-1,1] == "=" || super
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module DataTransport
  module Record # :nodoc:
    # Read-only view of the current input record: src.foo returns the value
    # stored under :foo in the record hash.
    class Source # :nodoc:
      # Installs the hash backing this record view.
      def record=(record)
        @record = record
      end

      # Object#id (on older Rubies) would shadow a record's :id attribute, so
      # route it through method_missing explicitly.
      def id
        method_missing :id
      end

      # Exposes each record key as a zero-argument reader; anything else
      # falls through to the default NoMethodError.
      def method_missing(name, *args)
        if @record.has_key?(name)
          unless args.empty?
            raise ArgumentError, "wrong number of arguments (#{args.length} for 0)"
          end
          @record[name]
        else
          super
        end
      end

      # Keep respond_to? consistent with method_missing: every record key is
      # a readable attribute.
      def respond_to_missing?(name, include_private = false)
        (!@record.nil? && @record.has_key?(name)) || super
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require "data_transport/data_store"
2
+ require "data_transport/record/source"
3
+ require "data_transport/record/destination"
4
+
5
module DataTransport
  # Default number of records read per batch when :batch_size is not given.
  DEFAULT_BATCH_SIZE = 100_000 # :nodoc:

  # Reads records from an input data store, processes them with the supplied
  # block, and writes them to an output data store. Accepts the following
  # options:
  #
  # batch_size:: Records are read from the input in batches. This option sets
  #              the number of records in a single batch. Default is 100,000
  #              (DEFAULT_BATCH_SIZE).
  #
  # The block is passed two objects that represent the source and destination
  # record. These objects have methods that reflect the attributes of the
  # records. The following example reads the +name+ and +price+ attributes from
  # input records, downcases the name, multiplies the price by 100, and writes
  # them to the output:
  #
  #   # input = DataTransport::DataStore:: ...
  #   # output = DataTransport::DataStore:: ...
  #
  #   DataTransport.map(input, output) do |src, dst|
  #     dst.name = src.name.downcase
  #     dst.price = (src.price * 100).to_i
  #   end
  #
  # The destination doesn't necessarily have to have the same attributes as the
  # source (or even the same number of attributes). The transformations that
  # can be accomplished are limited only by what you can do in a block of Ruby.
  #
  # Raises TypeError if batch_size is not an Integer, RangeError if it is
  # less than 1, and ArgumentError for any unrecognized option.
  def self.map(input, output, options = {}, &block)
    # Extract and validate options.
    batch_size = options.delete(:batch_size) || DEFAULT_BATCH_SIZE
    raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
    raise(RangeError, "batch size must be greater than zero") if batch_size < 1
    unless options.empty?
      raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
    end
    # Run the transport: one reusable Source/Destination pair, reset per row.
    output.reset
    source = DataTransport::Record::Source.new
    destination = DataTransport::Record::Destination.new
    input.each_record(batch_size) do |record|
      source.record = record
      destination.reset!
      yield source, destination
      output.write_record(destination.record)
    end
    output.finalize
  end
end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: data_transport
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 3
8
+ - 3
9
+ version: 0.3.3
10
+ platform: ruby
11
+ authors:
12
+ - Dana Contreras
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-27 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description:
22
+ email:
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - lib/data_transport.rb
31
+ - lib/data_transport/data_store.rb
32
+ - lib/data_transport/data_store/active_record.rb
33
+ - lib/data_transport/data_store/csv_file.rb
34
+ - lib/data_transport/data_store/file.rb
35
+ - lib/data_transport/record/destination.rb
36
+ - lib/data_transport/record/source.rb
37
+ has_rdoc: true
38
+ homepage: http://github.com/DanaDanger/data_transport
39
+ licenses: []
40
+
41
+ post_install_message:
42
+ rdoc_options: []
43
+
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ segments:
51
+ - 0
52
+ version: "0"
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ segments:
58
+ - 0
59
+ version: "0"
60
+ requirements: []
61
+
62
+ rubyforge_project:
63
+ rubygems_version: 1.3.6
64
+ signing_key:
65
+ specification_version: 3
66
+ summary: A gem for importing and exporting large quantities of data.
67
+ test_files: []
68
+