data_transport 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lib/data_transport/data_store/active_record.rb ADDED
@@ -0,0 +1,210 @@
+ module DataTransport
+   class DataStore
+     # Data store that reads and writes records in a database via ActiveRecord.
+     # This class is specifically optimized for reading and writing large
+     # numbers of records, providing a significant advantage over using
+     # ActiveRecord directly.
+     class ActiveRecord < DataStore
+       # There are two ways to initialize this data store. The first is by
+       # specifying one of your ActiveRecord models:
+       #
+       #   DataTransport::DataStore::ActiveRecord.new :class => MyModel
+       #
+       # The second is by providing an ActiveRecord database specification (as
+       # read from database.yml, for example) and a table name:
+       #
+       #   db_spec = ActiveRecord::Base.configurations["other_app_#{RAILS_ENV}"]
+       #   DataTransport::DataStore::ActiveRecord.new(
+       #     :connection => db_spec,
+       #     :table_name => "sprockets"
+       #   )
+       #
+       # The second form is useful for importing or exporting data in non-Rails
+       # applications.
+       #
+       # In addition, the following options are accepted:
+       #
+       # conditions::     Conditions describing which records to read. This can
+       #                  be anything that ActiveRecord will recognize, such as
+       #                  a hash table, an array with substitutions, or raw SQL.
+       #                  Default is nil (no conditions, read all records).
+       # truncate::       If true, the table will be truncated before any
+       #                  records are written. On databases that support it,
+       #                  this is performed by executing a TRUNCATE TABLE query;
+       #                  all other databases use ActiveRecord's delete_all
+       #                  method.
+       # ignore_errors::  If true, errors that occur during record insertion
+       #                  will be ignored. This is useful if your table has a
+       #                  unique index and you want to silently drop records
+       #                  with duplicate keys. Currently this only works on
+       #                  MySQL. Default is false.
+       # max_sql_length:: Maximum permissible length of an SQL query, in bytes.
+       #                  Rows to be inserted are buffered until the largest
+       #                  possible INSERT statement has been generated, at which
+       #                  point the statement is executed and a new INSERT
+       #                  statement begins. The default value varies depending
+       #                  on what type of database you're connected to. With
+       #                  SQLite, the default is 1,000,000. With MySQL, the
+       #                  default is the value of the +max_allowed_packet+
+       #                  variable minus 512. With all other databases, the
+       #                  default is 16,777,216.
+       def initialize(options = {})
+         super()
+         # Extract options.
+         @class = options.delete(:class)
+         @connection = options.delete(:connection)
+         @table_name = options.delete(:table_name)
+         @conditions = options.delete(:conditions)
+         @truncate = options.delete(:truncate)
+         @ignore_errors = options.delete(:ignore_errors)
+         @max_sql_length = options.delete(:max_sql_length)
+         # Make sure a class or connection and table name was provided.
+         if @class.nil? && (@connection.nil? || @table_name.nil?)
+           raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
+         end
+         raise(TypeError, "class must be a class") if @class && !@class.is_a?(Class)
+         # If connection specs were provided instead of a class, make an
+         # anonymous ActiveRecord subclass.
+         unless @class
+           @class = Class.new(::ActiveRecord::Base)
+           @class.set_table_name @table_name
+           @class.establish_connection @connection
+         end
+         # Make sure the class descends from ActiveRecord::Base.
+         klass = @class.superclass
+         is_active_record = false
+         while klass
+           if klass == ::ActiveRecord::Base
+             is_active_record = true
+             break
+           end
+           klass = klass.superclass
+         end
+         raise(TypeError, "class must descend from ActiveRecord::Base") unless is_active_record
+         # If ignore_errors is true, make sure we're connected to a MySQL
+         # database. We don't use is_a? because if the MySQL adapter isn't
+         # loaded, referencing its class throws a NameError.
+         if @ignore_errors
+           unless @class.connection.class.to_s ==
+                  "ActiveRecord::ConnectionAdapters::MysqlAdapter"
+             raise ArgumentError, "ignore_errors can only be used with a MySQL database"
+           end
+         end
+         # Check for unknown options.
+         unless options.empty?
+           raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
+         end
+         # Figure out how much data the database can handle in one query. See
+         # the note above in the ignore_errors compatibility check about using
+         # stringified class names.
+         if @max_sql_length
+           @max_sql_length = @max_sql_length.to_i
+         else
+           case @class.connection.class.to_s
+           when "ActiveRecord::ConnectionAdapters::MysqlAdapter"
+             rows = @class.connection.select_all("SHOW VARIABLES LIKE 'max_allowed_packet'")
+             @max_sql_length = rows.first["Value"].to_i - 512
+           when /\AActiveRecord::ConnectionAdapters::SQLite3?Adapter\Z/
+             @max_sql_length = 1_000_000
+           else
+             @max_sql_length = 16_777_216
+           end
+         end
+         # Fetch column information
+         @columns = {}
+         @class.columns.each {|c| @columns[c.name.to_sym] = c}
+       end
+
+       def klass # :nodoc:
+         @class
+       end
+
+       # Returns the number of records in the table that match the data store's
+       # conditions.
+       def count
+         @class.count(:conditions => @conditions)
+       end
+
+       def each_record(batch_size = nil) # :nodoc:
+         conn = @class.connection
+         column_names = conn.columns(@class.table_name).collect {|c| c.name}
+
+         offset = 0
+         record = {}
+         base_query = "SELECT * FROM #{conn.quote_table_name(@class.table_name)}"
+         @class.send(:add_conditions!, base_query, @conditions) unless @conditions.nil?
+         while true
+           sql = base_query.dup
+           conn.add_limit_offset!(sql, :limit => batch_size, :offset => offset)
+           offset += batch_size
+           rows = conn.select_rows(sql)
+           break if rows.empty?
+           rows.each do |row|
+             record.clear
+             column_names.each_with_index do |column_name, i|
+               column_name = column_name.to_sym
+               record[column_name] = @columns[column_name].type_cast(row[i])
+             end
+             yield record
+           end
+         end
+       end
+
+       def write_record(record) # :nodoc:
+         conn = @class.connection
+         # If no SQL has been produced yet, start an INSERT statement.
+         @sql_buffer ||= start_insert_sql(record)
+         # Convert the record into a string of quoted values.
+         values = []
+         record.each {|k, v| values << conn.quote(v, @columns[k])}
+         values = "(#{values.join ","}),"
+         # Write the record.
+         if @max_sql_length.nil?
+           # We have no information on the database's maximum allowed packet
+           # size, so it's safest to write the record immediately.
+           @sql_buffer << values
+           finalize
+         elsif @sql_buffer.length + values.length > @max_sql_length
+           # Appending this record to the SQL buffer will exceed the maximum
+           # allowed packet size. Send the buffer to the database and start a
+           # new statement with this record.
+           finalize
+           @sql_buffer = start_insert_sql(record)
+           @sql_buffer << values
+         else
+           # This record will not cause the SQL buffer to exceed the maximum
+           # allowed packet size. Append it to the SQL buffer.
+           @sql_buffer << values
+         end
+       end
+
+       def finalize # :nodoc:
+         if @truncate
+           conn = @class.connection
+           begin
+             conn.execute("TRUNCATE TABLE #{conn.quote_table_name(@class.table_name)}")
+           rescue
+             @class.delete_all
+           end
+           @truncate = false
+         end
+         if @sql_buffer && @sql_buffer[-1,1] == ","
+           @sql_buffer.chop!
+           @class.connection.execute(@sql_buffer)
+         end
+       end
+
+       def reset # :nodoc:
+         @sql_buffer = nil
+       end
+
+       private
+
+       def start_insert_sql(record)
+         "INSERT #{@ignore_errors ? "IGNORE " : " "}INTO " +
+           "#{@class.connection.quote_table_name(@class.table_name)} " +
+           "(#{record.keys.join ","}) VALUES "
+       end
+     end
+   end
+ end
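For orientation, here is a minimal sketch of how the options documented above might be combined. The Product model and the condition are hypothetical; only :class (or :connection plus :table_name) is actually required.

  store = DataTransport::DataStore::ActiveRecord.new(
    :class         => Product,                    # hypothetical ActiveRecord model
    :conditions    => {:discontinued => false},   # applies when reading
    :truncate      => true,                       # empty the table before writing
    :ignore_errors => true                        # MySQL only: silently drop duplicate-key rows
  )
  store.count   # number of rows matching :conditions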
lib/data_transport/data_store/csv_file.rb ADDED
@@ -0,0 +1,25 @@
+ module DataTransport
+   class DataStore
+     # Identical to the File data store, except that it is preconfigured to read
+     # and write CSV files.
+     class CSVFile < File
+
+       # Accepts the same options as the File data store, except that the
+       # following options have different defaults:
+       #
+       # delimiter:: ","
+       # enclosure:: "\""
+       # escape:: :double
+       #
+       # These defaults describe the CSV format.
+       def initialize(options = {})
+         super({
+           :delimiter => ",",
+           :enclosure => "\"",
+           :escape => :double
+         }.merge(options))
+       end
+
+     end
+   end
+ end
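Since CSVFile only changes the File defaults, constructing one is a matter of pointing it at a path; the file name below is hypothetical.

  csv = DataTransport::DataStore::CSVFile.new(
    :path   => "products.csv",   # hypothetical file
    :header => true              # first row holds the field names
  )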
lib/data_transport/data_store/file.rb ADDED
@@ -0,0 +1,186 @@
+ module DataTransport
+   class DataStore
+     # Data store that reads and writes records in a flat text file.
+     #
+     # Although this class can read and write CSV files, you should use the
+     # CSVFile data store for that instead of this one.
+     class File < DataStore
+       attr_reader :mode # :nodoc:
+
+       # Accepts the following options:
+       #
+       # header::    If true, the file has a header row that contains the names
+       #             of each field. Default is false.
+       # delimiter:: String that separates individual fields in a row. Default
+       #             is "\t".
+       # enclosure:: String that encloses individual fields. For example, if
+       #             this is set to "\"", fields will be enclosed in double
+       #             quotes. Default is nil (no enclosure).
+       # escape::    Escape sequence for occurrences of the enclosure string in
+       #             field values. Set this to the special value :double if
+       #             enclosure characters are escaped by doubling them (like in
+       #             CSV and SQL). Default is nil.
+       # path::      Path to the file.
+       # null::      String that represents fields whose value is nil (but not
+       #             blank). Default is "".
+       # keys::      Array of field names. Not necessary for files with a header
+       #             row. Default for files without a header row is fieldXX,
+       #             where XX is numbered sequentially starting from 00.
+       def initialize(options = {})
+         super()
+         # Extract options.
+         @header = options.delete(:header)
+         @delimiter = options.delete(:delimiter) || "\t"
+         @enclosure = options.delete(:enclosure)
+         @escape = options.delete(:escape)
+         @path = options.delete(:path)
+         @null = options.delete(:null) || ""
+         @keys = options.delete(:keys)
+         # Validate options.
+         raise(ArgumentError, "missing required option `path'") if @path.nil?
+         if @escape && @enclosure.nil?
+           raise(ArgumentError, "`escape' cannot be used without `enclosure'")
+         end
+         unless options.empty?
+           raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
+         end
+         # Handle the special :double escape sequence.
+         @escape = @enclosure if @escape == :double
+         # Create an enclosure placeholder, which is used to avoid clobbering
+         # escaped enclosure characters during parsing.
+         if @escape
+           if @enclosure == 0.chr
+             safe_ch = 1.chr
+           else
+             safe_ch = 0.chr
+           end
+           @placeholder = "#{safe_ch}__ENCLOSURE_PLACEHOLDER__#{safe_ch}"
+         end
+       end
+
+       # Returns the number of lines in the file (not counting the header, if
+       # there is one).
+       def count
+         return @count if @count
+         self.mode = :input
+         line_count = 0
+         rewind_and_restore do
+           io.readline if @header
+           until io.eof?
+             io.gets
+             line_count += 1
+           end
+         end
+         @count = line_count
+       end
+
+       def each_record(batch_size = nil) # :nodoc:
+         self.mode = :input
+         io.rewind
+         io.readline if @header
+         until io.eof?
+           line = io.gets || break
+           line.chomp!
+           values = values_from_s(line)
+           if keys.length != values.length
+             raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
+           end
+           record = {}
+           keys.length.times {|i| record[keys[i]] = values[i]}
+           yield record
+         end
+       end
+
+       def write_record(record) # :nodoc:
+         self.mode = :output
+         # If no key order was ever specified, make one up.
+         @keys ||= record.keys.sort {|a,b| a.to_s <=> b.to_s}
+         # Write the header if this is the first record.
+         if @header && io.pos == 0
+           io.puts(values_to_s(keys))
+         end
+         # Write the values in a predictable order.
+         values = keys.collect do |k|
+           record[k].nil? ? @null : record[k]
+         end
+         io.puts(values_to_s(values))
+       end
+
+       private
+
+       def values_to_s(values)
+         if @escape
+           values = values.collect do |v|
+             @enclosure + v.to_s.gsub(/#{@enclosure}/, @escape + @enclosure) + @enclosure
+           end
+         elsif @enclosure
+           values = values.collect {|v| @enclosure + v.to_s + @enclosure}
+         end
+         values.join(@delimiter)
+       end
+
+       def values_from_s(str)
+         if @escape
+           str = str.gsub(/#{@escape}#{@enclosure}/, @placeholder)
+           values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
+           values.first.sub!(/^#{@enclosure}/, "")
+           values.last.sub!(/#{@enclosure}$/, "")
+           values.each do |v|
+             v.gsub!(/#{@placeholder}/, @enclosure)
+           end
+         elsif @enclosure
+           values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
+           values.first.sub!(/^#{@enclosure}/, "")
+           values.last.sub!(/#{@enclosure}$/, "")
+         else
+           values = str.split(/#{@delimiter}/)
+         end
+         values
+       end
+
+       def mode=(new_mode)
+         if !@mode.nil? && @mode != new_mode
+           raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
+         end
+         unless [:input, :output].include?(new_mode)
+           raise ArgumentError, "unknown mode `#{new_mode}'"
+         end
+         @mode = new_mode
+       end
+
+       def io
+         return @io if @io
+         if mode == :output
+           @io = ::File.open(@path, "w")
+           @io.rewind
+           @io
+         else
+           @io = ::File.open(@path, "r")
+         end
+       end
+
+       def keys
+         return @keys if @keys
+         return [] if mode == :output
+         line = rewind_and_restore { io.readline }
+         line.chomp!
+         fields = values_from_s(line)
+         if @header
+           @keys = fields.collect! {|hdr| hdr.downcase.to_sym}
+         else
+           @keys = (0..(fields.length - 1)).to_a.collect! do |i|
+             sprintf("field%02d", i).to_sym
+           end
+         end
+       end
+
+       def rewind_and_restore
+         pos = io.pos
+         io.rewind
+         result = yield
+         io.seek(pos)
+         result
+       end
+     end
+   end
+ end
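As a rough illustration of the options documented above, this sketch builds a pipe-delimited store with quoted fields; the path and field names are hypothetical.

  flat = DataTransport::DataStore::File.new(
    :path      => "products.txt",        # hypothetical path
    :delimiter => "|",
    :enclosure => "\"",
    :escape    => :double,               # enclosure characters are doubled, CSV/SQL style
    :null      => "NULL",
    :keys      => [:id, :name, :price]   # needed when there is no header row
  )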
lib/data_transport/data_store.rb ADDED
@@ -0,0 +1,27 @@
+ require "data_transport/data_store/file"
+ require "data_transport/data_store/csv_file"
+ require "data_transport/data_store/active_record"
+
+ module DataTransport
+   class DataStore # :nodoc:
+     def count
+       raise NotImplementedError
+     end
+
+     def each_record(batch_size = nil)
+       raise NotImplementedError
+     end
+
+     def write_record(record)
+       raise NotImplementedError
+     end
+
+     def finalize
+       # Do nothing by default.
+     end
+
+     def reset
+       # Do nothing by default.
+     end
+   end
+ end
lib/data_transport/record/destination.rb ADDED
@@ -0,0 +1,28 @@
+ module DataTransport
+   module Record # :nodoc:
+     class Destination # :nodoc:
+       attr_reader :record
+
+       def initialize
+         @record = {}
+       end
+
+       def reset!
+         @record.clear
+       end
+
+       def method_missing(name, *args)
+         name_s = name.to_s
+         if name_s[-1,1] == "="
+           unless args.length == 1
+             raise ArgumentError, "wrong number of arguments (#{args.length} for 1)"
+           end
+           name_s.chop!
+           @record[name_s.to_sym] = args.first
+         else
+           super
+         end
+       end
+     end
+   end
+ end
lib/data_transport/record/source.rb ADDED
@@ -0,0 +1,24 @@
+ module DataTransport
+   module Record # :nodoc:
+     class Source # :nodoc:
+       def record=(record)
+         @record = record
+       end
+
+       def id
+         method_missing :id
+       end
+
+       def method_missing(name, *args)
+         if @record.has_key?(name)
+           unless args.empty?
+             raise ArgumentError, "wrong number of arguments (#{args.length} for 0)"
+           end
+           @record[name]
+         else
+           super
+         end
+       end
+     end
+   end
+ end
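These two small Record classes are what give the map block its attribute-style accessors: Source turns src.some_field into a lookup in the current record hash, and Destination turns dst.some_field = value into an assignment. A quick illustration with hypothetical field names:

  src = DataTransport::Record::Source.new
  src.record = {:name => "Widget", :price => 1.50}
  src.name     # => "Widget"

  dst = DataTransport::Record::Destination.new
  dst.name = "widget"
  dst.record   # => {:name => "widget"}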
lib/data_transport.rb ADDED
@@ -0,0 +1,52 @@
+ require "data_transport/data_store"
+ require "data_transport/record/source"
+ require "data_transport/record/destination"
+
+ module DataTransport
+   DEFAULT_BATCH_SIZE = 100_000 # :nodoc:
+
+   # Reads records from an input data source, processes them with the supplied
+   # block, and writes them to an output data source. Accepts the following
+   # options:
+   #
+   # batch_size:: Records are read from the input in batches. This option sets
+   #              the number of records in a single batch. Default is 100,000.
+   #
+   # The block is passed two objects that represent the source and destination
+   # record. These objects have methods that reflect the attributes of the
+   # records. The following example reads the +name+ and +price+ attributes from
+   # input records, downcases the name, multiplies the price by 100, and writes
+   # them to the output:
+   #
+   #   # input = DataTransport::DataStore:: ...
+   #   # output = DataTransport::DataStore:: ...
+   #
+   #   DataTransport.map(input, output) do |src, dst|
+   #     dst.name = src.name.downcase
+   #     dst.price = (src.price * 100).to_i
+   #   end
+   #
+   # The destination doesn't necessarily have to have the same attributes as the
+   # source (or even the same number of attributes). The transformations that
+   # can be accomplished are limited only by what you can do in a block of Ruby.
+   def self.map(input, output, options = {}, &block)
+     # Extract options.
+     batch_size = options.delete(:batch_size) || DEFAULT_BATCH_SIZE
+     raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
+     raise(RangeError, "batch size must be greater than zero") if batch_size < 1
+     unless options.empty?
+       raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
+     end
+     # Run the transport.
+     output.reset
+     source = DataTransport::Record::Source.new
+     destination = DataTransport::Record::Destination.new
+     input.each_record(batch_size) do |record|
+       source.record = record
+       destination.reset!
+       yield source, destination
+       output.write_record(destination.record)
+     end
+     output.finalize
+   end
+ end
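Putting the pieces together, a sketch of a complete transport from a CSV file into a database table; the file name, Product model, and field names are hypothetical.

  input  = DataTransport::DataStore::CSVFile.new(:path => "products.csv", :header => true)
  output = DataTransport::DataStore::ActiveRecord.new(:class => Product, :truncate => true)

  DataTransport.map(input, output, :batch_size => 10_000) do |src, dst|
    dst.name  = src.name.strip
    dst.price = (src.price.to_f * 100).to_i
  end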
metadata ADDED
@@ -0,0 +1,68 @@
+ --- !ruby/object:Gem::Specification
+ name: data_transport
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 0
+   - 3
+   - 3
+   version: 0.3.3
+ platform: ruby
+ authors:
+ - Dana Contreras
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-03-27 00:00:00 -04:00
+ default_executable:
+ dependencies: []
+
+ description:
+ email:
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - lib/data_transport.rb
+ - lib/data_transport/data_store.rb
+ - lib/data_transport/data_store/active_record.rb
+ - lib/data_transport/data_store/csv_file.rb
+ - lib/data_transport/data_store/file.rb
+ - lib/data_transport/record/destination.rb
+ - lib/data_transport/record/source.rb
+ has_rdoc: true
+ homepage: http://github.com/DanaDanger/data_transport
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.6
+ signing_key:
+ specification_version: 3
+ summary: A gem for importing and exporting large quantities of data.
+ test_files: []
+