DanaDanger-data_transport 0.1.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,29 +1,51 @@
1
- require "data_transport/map"
2
1
  require "data_transport/data_store"
2
+ require "data_transport/record/source"
3
+ require "data_transport/record/destination"
3
4
 
4
5
  module DataTransport
5
- def self.default_batch_size
6
- 1000
7
- end
6
+ DEFAULT_BATCH_SIZE = 1000 # :nodoc:
8
7
 
8
+ # Reads records from an input data source, processes them with the supplied
9
+ # block, and writes them to an output data source. Accepts the following
10
+ # options:
11
+ #
12
+ # batch_size:: Records are read from the input in batches. This option sets
13
+ # the number of records in a single batch. Default is 1000.
14
+ #
15
+ # The block is passed two objects that represent the source and destination
16
+ # records. These objects have methods that reflect the attributes of the
17
+ # records. The following example reads the +name+ and +price+ attributes from
18
+ # input records, downcases the name, multiplies the price by 100, and writes
19
+ # them to the output:
20
+ #
21
+ # # input = DataTransport::DataStore:: ...
22
+ # # output = DataTransport::DataStore:: ...
23
+ #
24
+ # DataTransport.map(input, output) do |src, dst|
25
+ # dst.name = src.name.downcase
26
+ # dst.price = (src.price * 100).to_i
27
+ # end
28
+ #
29
+ # The destination doesn't necessarily have to have the same attributes as the
30
+ # source (or even the same number of attributes). The transformations that
31
+ # can be accomplished are limited only by what you can do in a block of Ruby.
9
32
  def self.map(input, output, options = {}, &block)
10
33
  # Extract options.
11
- ignore_duplicates = options.delete(:ignore_duplicates)
34
+ batch_size = options[:batch_size] || DEFAULT_BATCH_SIZE
35
+ raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
36
+ raise(RangeError, "batch size must be greater than zero") if batch_size < 1
12
37
  unless options.empty?
13
38
  raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
14
39
  end
15
- # If ignore_duplicates is true, make sure the output is a MySQL database.
16
- if ignore_duplicates
17
- unless output.is_a?(DataStore::ActiveRecord) && output.klass.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
18
- raise ArgumentError, "ignore_duplicates can only be used with an ActiveRecord data store connected to a MySQL database"
19
- end
20
- end
21
40
  # Run the transport.
22
41
  output.reset
23
- output.ignore_duplicates = true if ignore_duplicates
24
- map = DataTransport::Map.new(&block)
42
+ source = DataTransport::Record::Source.new
43
+ destination = DataTransport::Record::Destination.new
25
44
  input.each_record do |record|
26
- output.write_record(map.map(record))
45
+ source.record = record
46
+ destination.reset!
47
+ yield source, destination
48
+ output.write_record(destination.record)
27
49
  end
28
50
  output.finalize
29
51
  end
@@ -1,8 +1,9 @@
1
1
  require "data_transport/data_store/file"
2
+ require "data_transport/data_store/csv_file"
2
3
  require "data_transport/data_store/active_record"
3
4
 
4
5
  module DataTransport
5
- class DataStore
6
+ class DataStore # :nodoc:
6
7
  def count
7
8
  raise NotImplementedError
8
9
  end
@@ -19,13 +20,8 @@ module DataTransport
19
20
  # Do nothing by default.
20
21
  end
21
22
 
22
- protected
23
-
24
- def check_batch_size(batch_size)
25
- batch_size ||= DataTransport.default_batch_size
26
- raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
27
- raise(RangeError, "batch size must be greater than zero") if batch_size < 1
28
- batch_size
23
+ def reset
24
+ # Do nothing by default.
29
25
  end
30
26
  end
31
27
  end
@@ -1,16 +1,54 @@
1
1
  module DataTransport
2
2
  class DataStore
3
+ # Data store that reads and writes records in a database via ActiveRecord.
4
+ # This class is specifically optimized for reading and writing large
5
+ # numbers of records, providing a significant advantage over using
6
+ # ActiveRecord directly.
7
+ #
8
+ # On MySQL databases, records are written in batches of the largest size
9
+ # possible instead of being inserted one by one.
3
10
  class ActiveRecord < DataStore
4
- attr_accessor :ignore_duplicates
5
-
11
+ # There are two ways to initialize this data store. The first is by
12
+ # specifying one of your ActiveRecord models:
13
+ #
14
+ # DataTransport::DataStore::ActiveRecord.new :class => MyModel
15
+ #
16
+ # The second is by providing an ActiveRecord database specification (as
17
+ # read from database.yml, for example) and a table name:
18
+ #
19
+ # db_spec = ActiveRecord::Base.configurations["other_app_#{RAILS_ENV}"]
20
+ # DataTransport::DataStore::ActiveRecord.new(
21
+ # :connection => db_spec,
22
+ # :table_name => "sprockets"
23
+ # )
24
+ #
25
+ # The second form is useful for importing or exporting data in non-Rails
26
+ # applications.
27
+ #
28
+ # In addition, the following options are accepted:
29
+ #
30
+ # conditions:: Conditions describing which records to read. This can
31
+ # be anything that ActiveRecord will recognize, such as
32
+ # a hash table, an array with substitutions, or raw SQL.
33
+ # Default is nil (no conditions, read all records).
34
+ # truncate:: If true, the table will be truncated before any records
35
+ # are written. On MySQL databases, this is performed by
36
+ # executing a TRUNCATE TABLE query; all other databases
37
+ # use ActiveRecord's delete_all method.
38
+ # ignore_errors:: If true, errors that occur during record insertion will
39
+ # be ignored. This is useful if your table has a unique
40
+ # index and you want to silently drop records with
41
+ # duplicate keys. Currently this only works on MySQL.
42
+ # Default is false.
6
43
  def initialize(options = {})
7
44
  super()
8
45
  # Extract options.
9
- @class = options.delete(:class)
10
- @connection = options.delete(:connection)
11
- @table_name = options.delete(:table_name)
12
- @conditions = options.delete(:conditions)
13
- @truncate = options.delete(:truncate)
46
+ @class = options.delete(:class)
47
+ @connection = options.delete(:connection)
48
+ @table_name = options.delete(:table_name)
49
+ @conditions = options.delete(:conditions)
50
+ @truncate = options.delete(:truncate)
51
+ @ignore_errors = options.delete(:ignore_errors)
14
52
  # Make sure a class or connection and table name was provided.
15
53
  if @class.nil? && (@connection.nil? || @table_name.nil?)
16
54
  raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
@@ -34,6 +72,13 @@ module DataTransport
34
72
  klass = klass.superclass
35
73
  end
36
74
  raise(TypeError, "class must descend from ActiveRecord::Base") unless is_active_record
75
+ # If ignore_errors is true, make sure we're connected to a MySQL
76
+ # database.
77
+ if @ignore_errors
78
+ unless @class.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
79
+ raise ArgumentError, "ignore_errors can only be used with a MySQL database"
80
+ end
81
+ end
37
82
  # Check for unknown options.
38
83
  unless options.empty?
39
84
  raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
@@ -50,17 +95,17 @@ module DataTransport
50
95
  @class.columns.each {|c| @columns[c.name.to_sym] = c}
51
96
  end
52
97
 
53
- def klass
98
+ def klass # :nodoc:
54
99
  @class
55
100
  end
56
101
 
102
+ # Returns the number of records in the table that match the data store's
103
+ # conditions.
57
104
  def count
58
105
  @class.count(:conditions => @conditions)
59
106
  end
60
107
 
61
- def each_record(batch_size = nil)
62
- batch_size = check_batch_size(batch_size)
63
-
108
+ def each_record(batch_size) # :nodoc:
64
109
  conn = @class.connection
65
110
  column_names = conn.columns(@class.table_name).collect {|c| c.name}
66
111
 
@@ -85,7 +130,7 @@ module DataTransport
85
130
  end
86
131
  end
87
132
 
88
- def write_record(record)
133
+ def write_record(record) # :nodoc:
89
134
  conn = @class.connection
90
135
  # If no SQL has been produced yet, start an INSERT statement.
91
136
  @sql_buffer ||= start_insert_sql(record)
@@ -113,7 +158,7 @@ module DataTransport
113
158
  end
114
159
  end
115
160
 
116
- def finalize
161
+ def finalize # :nodoc:
117
162
  if @truncate
118
163
  conn = @class.connection
119
164
  begin
@@ -123,21 +168,20 @@ module DataTransport
123
168
  end
124
169
  @truncate = false
125
170
  end
126
- if @sql_buffer[-1,1] == ","
171
+ if @sql_buffer && @sql_buffer[-1,1] == ","
127
172
  @sql_buffer.chop!
128
173
  @class.connection.execute(@sql_buffer)
129
174
  end
130
175
  end
131
176
 
132
- def reset
133
- self.ignore_duplicates = false
177
+ def reset # :nodoc:
134
178
  @sql_buffer = nil
135
179
  end
136
180
 
137
181
  private
138
182
 
139
183
  def start_insert_sql(record)
140
- "INSERT #{ignore_duplicates ? "IGNORE " : " "}INTO " +
184
+ "INSERT #{@ignore_errors ? "IGNORE " : " "}INTO " +
141
185
  "#{@class.connection.quote_table_name(@class.table_name)} " +
142
186
  "(#{record.keys.join ","}) VALUES "
143
187
  end
@@ -1,23 +1,65 @@
1
1
  module DataTransport
2
2
  class DataStore
3
+ # Data store that reads and writes records in a flat text file.
4
+ #
5
+ # Although this class can read and write CSV files, you should use the
6
+ # CSVFile data store for that instead of this one.
3
7
  class File < DataStore
4
- attr_reader :mode
8
+ attr_reader :mode # :nodoc:
5
9
 
10
+ # Accepts the following options:
11
+ #
12
+ # header:: If true, the file has a header row that contains the names
13
+ # of each field. Default is false.
14
+ # delimiter:: String that separates individual fields in a row. Default
15
+ # is "\t".
16
+ # enclosure:: String that encloses individual fields. For example, if
17
+ # this is set to "\"", fields will be enclosed in double
18
+ # quotes. Default is nil (no enclosure).
19
+ # escape:: Escape sequence for occurrences of the enclosure string in
20
+ # field values. Set this to the special value :double if
21
+ # enclosure characters are escaped by doubling them (like in
22
+ # CSV and SQL). Default is nil.
23
+ # path:: Path to the file.
24
+ # null:: String that represents fields whose value is nil (but not
25
+ # blank). Default is "".
26
+ # keys:: Array of field names. Not necessary for files with a header
27
+ # row. Default for files without a header row is fieldXX,
28
+ # where XX is numbered sequentially starting from 00.
6
29
  def initialize(options = {})
7
30
  super()
8
-
31
+ # Extract options.
9
32
  @header = options.delete(:header)
10
33
  @delimiter = options.delete(:delimiter) || "\t"
34
+ @enclosure = options.delete(:enclosure)
35
+ @escape = options.delete(:escape)
11
36
  @path = options.delete(:path)
12
37
  @null = options.delete(:null) || ""
13
38
  @keys = options.delete(:keys)
14
-
39
+ # Validate options.
15
40
  raise(ArgumentError, "missing required option `path'") if @path.nil?
41
+ if @escape && @enclosure.nil?
42
+ raise(ArgumentError, "`escape' cannot be used without `enclosure'")
43
+ end
16
44
  unless options.empty?
17
45
  raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
18
46
  end
47
+ # Handle the special :double escape sequence.
48
+ @escape = @enclosure if @escape == :double
49
+ # Create an enclosure placeholder, which is used to avoid clobbering
50
+ # escaped enclosure characters during parsing.
51
+ if @escape
52
+ if @enclosure == 0.chr
53
+ safe_ch = 1.chr
54
+ else
55
+ safe_ch = 0.chr
56
+ end
57
+ @placeholder = "#{safe_ch}__ENCLOSURE_PLACEHOLDER__#{safe_ch}"
58
+ end
19
59
  end
20
60
 
61
+ # Returns the number of lines in the file (not counting the header, if
62
+ # there is one).
21
63
  def count
22
64
  return @count if @count
23
65
  self.mode = :input
@@ -32,17 +74,14 @@ module DataTransport
32
74
  @count = line_count
33
75
  end
34
76
 
35
- def each_record(batch_size = nil)
77
+ def each_record(batch_size = nil) # :nodoc:
36
78
  self.mode = :input
37
-
38
- batch_size = check_batch_size(batch_size)
39
-
40
79
  io.rewind
41
80
  io.readline if @header
42
81
  until io.eof?
43
82
  line = io.gets || break
44
83
  line.chomp!
45
- values = line.split(/#{@delimiter}/)
84
+ values = values_from_s(line)
46
85
  if keys.length != values.length
47
86
  raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
48
87
  end
@@ -52,23 +91,53 @@ module DataTransport
52
91
  end
53
92
  end
54
93
 
55
- def write_record(record)
94
+ def write_record(record) # :nodoc:
56
95
  self.mode = :output
57
96
  # If no key order was ever specified, make one up.
58
97
  @keys ||= record.keys.sort {|a,b| a.to_s <=> b.to_s}
59
98
  # Write the header if this is the first record.
60
99
  if @header && io.pos == 0
61
- io.puts(keys.join(@delimiter))
100
+ io.puts(values_to_s(keys))
62
101
  end
63
102
  # Write the values in a predictable order.
64
103
  values = keys.collect do |k|
65
104
  record[k].nil?? @null : record[k]
66
105
  end
67
- io.puts(values.join(@delimiter))
106
+ io.puts(values_to_s(values))
68
107
  end
69
108
 
70
109
  private
71
110
 
111
+ def values_to_s(values)
112
+ if @escape
113
+ values = values.collect do |v|
114
+ @enclosure + v.to_s.gsub(/#{@enclosure}/, @escape + @enclosure) + @enclosure
115
+ end
116
+ elsif @enclosure
117
+ values = values.collect {|v| @enclosure + v.to_s + @enclosure}
118
+ end
119
+ values.join(@delimiter)
120
+ end
121
+
122
+ def values_from_s(str)
123
+ if @escape
124
+ str = str.gsub(/#{@escape}#{@enclosure}/, @placeholder)
125
+ values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
126
+ values.first.sub!(/^#{@enclosure}/, "")
127
+ values.last.sub!(/#{@enclosure}$/, "")
128
+ values.each do |v|
129
+ v.gsub!(/#{@placeholder}/, @enclosure)
130
+ end
131
+ elsif @enclosure
132
+ values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
133
+ values.first.sub!(/^#{@enclosure}/, "")
134
+ values.last.sub!(/#{@enclosure}$/, "")
135
+ else
136
+ values = str.split(/#{@delimiter}/)
137
+ end
138
+ values
139
+ end
140
+
72
141
  def mode=(new_mode)
73
142
  if !@mode.nil? && @mode != new_mode
74
143
  raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
@@ -95,12 +164,12 @@ module DataTransport
95
164
  return [] if mode == :output
96
165
  line = rewind_and_restore { io.readline }
97
166
  line.chomp!
98
- fields = line.split(/#{@delimiter}/)
167
+ fields = values_from_s(line)
99
168
  if @header
100
169
  @keys = fields.collect! {|hdr| hdr.downcase.to_sym}
101
170
  else
102
171
  @keys = (0..(fields.length - 1)).to_a.collect! do |i|
103
- sprintf("column%02d", i).to_sym
172
+ sprintf("field%02d", i).to_sym
104
173
  end
105
174
  end
106
175
  end
@@ -1,6 +1,6 @@
1
1
  module DataTransport
2
- module Record
3
- class Destination
2
+ module Record # :nodoc:
3
+ class Destination # :nodoc:
4
4
  attr_reader :record
5
5
 
6
6
  def initialize
@@ -1,6 +1,6 @@
1
1
  module DataTransport
2
- module Record
3
- class Source
2
+ module Record # :nodoc:
3
+ class Source # :nodoc:
4
4
  def record=(record)
5
5
  @record = record
6
6
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: DanaDanger-data_transport
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: "0.2"
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dana Danger
@@ -23,13 +23,12 @@ extra_rdoc_files: []
23
23
 
24
24
  files:
25
25
  - lib/data_transport.rb
26
- - lib/data_transport/map.rb
27
26
  - lib/data_transport/data_store.rb
28
27
  - lib/data_transport/data_store/active_record.rb
29
28
  - lib/data_transport/data_store/file.rb
30
29
  - lib/data_transport/record/destination.rb
31
30
  - lib/data_transport/record/source.rb
32
- has_rdoc: false
31
+ has_rdoc: true
33
32
  homepage: http://github.com/DanaDanger/data_transport
34
33
  post_install_message:
35
34
  rdoc_options: []
@@ -1,21 +0,0 @@
1
- require "data_transport/record/source"
2
- require "data_transport/record/destination"
3
-
4
- module DataTransport
5
- class Map
6
- attr_reader :source, :destination
7
-
8
- def initialize(&block)
9
- @block = block
10
- @source = DataTransport::Record::Source.new
11
- @destination = DataTransport::Record::Destination.new
12
- end
13
-
14
- def map(record)
15
- @source.record = record
16
- @destination.reset!
17
- @block.call(@source, @destination)
18
- @destination.record
19
- end
20
- end
21
- end