DanaDanger-data_transport 0.1.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,29 +1,51 @@
1
- require "data_transport/map"
2
1
  require "data_transport/data_store"
2
+ require "data_transport/record/source"
3
+ require "data_transport/record/destination"
3
4
 
4
5
  module DataTransport
5
- def self.default_batch_size
6
- 1000
7
- end
6
+ DEFAULT_BATCH_SIZE = 1000 # :nodoc:
8
7
 
8
+ # Reads records from an input data source, processes them with the supplied
9
+ # block, and writes them to an output data source. Accepts the following
10
+ # options:
11
+ #
12
+ # batch_size:: Records are read from the input in batches. This option sets
13
+ # the number of records in a single batch. Default is 1000.
14
+ #
15
+ # The block is passed two objects that represent the source and destination
16
+ # record. These objects have methods that reflect the attributes of the
17
+ # records. The following example reads the +name+ and +price+ attributes from
18
+ # input records, downcases the name, multiplies the price by 100, and writes
19
+ # them to the output:
20
+ #
21
+ # # input = DataTransport::DataSource:: ...
22
+ # # output = DataTransport::DataSource:: ...
23
+ #
24
+ # DataTransport.map(input, output) do |src, dst|
25
+ # dst.name = src.name.downcase
26
+ # dst.price = (src.price * 100).to_i
27
+ # end
28
+ #
29
+ # The destination doesn't necessarily have to have the same attributes as the
30
+ # source (or even the same number of attributes). The transformations that
31
+ # can be accomplished are limited only by what you can do in a block of Ruby.
9
32
  def self.map(input, output, options = {}, &block)
10
33
  # Extract options.
11
- ignore_duplicates = options.delete(:ignore_duplicates)
34
+ batch_size = options[:batch_size] || DEFAULT_BATCH_SIZE
35
+ raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
36
+ raise(RangeError, "batch size must be greater than zero") if batch_size < 1
12
37
  unless options.empty?
13
38
  raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
14
39
  end
15
- # If ignore_duplicates is true, make sure the output is a MySQL database.
16
- if ignore_duplicates
17
- unless output.is_a?(DataStore::ActiveRecord) && output.klass.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
18
- raise ArgumentError, "ignore_duplicates can only be used with an ActiveRecord data store connected to a MySQL database"
19
- end
20
- end
21
40
  # Run the transport.
22
41
  output.reset
23
- output.ignore_duplicates = true if ignore_duplicates
24
- map = DataTransport::Map.new(&block)
42
+ source = DataTransport::Record::Source.new
43
+ destination = DataTransport::Record::Destination.new
25
44
  input.each_record do |record|
26
- output.write_record(map.map(record))
45
+ source.record = record
46
+ destination.reset!
47
+ yield source, destination
48
+ output.write_record(destination.record)
27
49
  end
28
50
  output.finalize
29
51
  end
@@ -1,8 +1,9 @@
1
1
  require "data_transport/data_store/file"
2
+ require "data_transport/data_store/csv_file"
2
3
  require "data_transport/data_store/active_record"
3
4
 
4
5
  module DataTransport
5
- class DataStore
6
+ class DataStore # :nodoc:
6
7
  def count
7
8
  raise NotImplementedError
8
9
  end
@@ -19,13 +20,8 @@ module DataTransport
19
20
  # Do nothing by default.
20
21
  end
21
22
 
22
- protected
23
-
24
- def check_batch_size(batch_size)
25
- batch_size ||= DataTransport.default_batch_size
26
- raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
27
- raise(RangeError, "batch size must be greater than zero") if batch_size < 1
28
- batch_size
23
+ def reset
24
+ # Do nothing by default.
29
25
  end
30
26
  end
31
27
  end
@@ -1,16 +1,54 @@
1
1
  module DataTransport
2
2
  class DataStore
3
+ # Data store that reads and writes records in a database via ActiveRecord.
4
+ # This class is specifically optimized for reading and writing large
5
+ # numbers of records, providing a significant advantage over using
6
+ # ActiveRecord directly.
7
+ #
8
+ # On MySQL databases, records are written in batches of the largest size
9
+ # possible instead of being inserted one by one.
3
10
  class ActiveRecord < DataStore
4
- attr_accessor :ignore_duplicates
5
-
11
+ # There are two ways to initialize this data store. The first is by
12
+ # specifying one of your ActiveRecord models:
13
+ #
14
+ # DataTransport::DataStore::ActiveRecord.new :class => MyModel
15
+ #
16
+ # The second is by providing an ActiveRecord database specification (as
17
+ # read from database.yml, for example) and a table name:
18
+ #
19
+ # db_spec = ActiveRecord::Base.configurations["other_app_#{RAILS_ENV}"]
20
+ # DataTransport::DataStore::ActiveRecord.new(
21
+ # :connection => db_spec,
22
+ # :table_name => "sprockets"
23
+ # )
24
+ #
25
+ # The second form is useful for importing or exporting data in non-Rails
26
+ # applications.
27
+ #
28
+ # In addition, the following options are accepted:
29
+ #
30
+ # conditions:: Conditions describing which records to read. This can
31
+ # be anything that ActiveRecord will recognize, such as
32
+ # a hash table, an array with substitutions, or raw SQL.
33
+ # Default is nil (no conditions, read all records).
34
+ # truncate:: If true, the table will be truncated before any records
35
+ # are written. On MySQL databases, this is performed by
36
+ # executing a TRUNCATE TABLE query; all other databases
37
+ # use ActiveRecord's delete_all method.
38
+ # ignore_errors:: If true, errors that occur during record insertion will
39
+ # be ignored. This is useful if your table has a unique
40
+ # index and you want to silently drop records with
41
+ # duplicate keys. Currently this only works on MySQL.
42
+ # Default is false.
6
43
  def initialize(options = {})
7
44
  super()
8
45
  # Extract options.
9
- @class = options.delete(:class)
10
- @connection = options.delete(:connection)
11
- @table_name = options.delete(:table_name)
12
- @conditions = options.delete(:conditions)
13
- @truncate = options.delete(:truncate)
46
+ @class = options.delete(:class)
47
+ @connection = options.delete(:connection)
48
+ @table_name = options.delete(:table_name)
49
+ @conditions = options.delete(:conditions)
50
+ @truncate = options.delete(:truncate)
51
+ @ignore_errors = options.delete(:ignore_errors)
14
52
  # Make sure a class or connection and table name was provided.
15
53
  if @class.nil? && (@connection.nil? || @table_name.nil?)
16
54
  raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
@@ -34,6 +72,13 @@ module DataTransport
34
72
  klass = klass.superclass
35
73
  end
36
74
  raise(TypeError, "class must descend from ActiveRecord::Base") unless is_active_record
75
+ # If ignore_errors is true, make sure we're connected to a MySQL
76
+ # database.
77
+ if @ignore_errors
78
+ unless @class.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
79
+ raise ArgumentError, "ignore_errors can only be used with a MySQL database"
80
+ end
81
+ end
37
82
  # Check for unknown options.
38
83
  unless options.empty?
39
84
  raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
@@ -50,17 +95,17 @@ module DataTransport
50
95
  @class.columns.each {|c| @columns[c.name.to_sym] = c}
51
96
  end
52
97
 
53
- def klass
98
+ def klass # :nodoc:
54
99
  @class
55
100
  end
56
101
 
102
+ # Returns the number of records in the table that match the data store's
103
+ # conditions.
57
104
  def count
58
105
  @class.count(:conditions => @conditions)
59
106
  end
60
107
 
61
- def each_record(batch_size = nil)
62
- batch_size = check_batch_size(batch_size)
63
-
108
+ def each_record(batch_size) # :nodoc:
64
109
  conn = @class.connection
65
110
  column_names = conn.columns(@class.table_name).collect {|c| c.name}
66
111
 
@@ -85,7 +130,7 @@ module DataTransport
85
130
  end
86
131
  end
87
132
 
88
- def write_record(record)
133
+ def write_record(record) # :nodoc:
89
134
  conn = @class.connection
90
135
  # If no SQL has been produced yet, start an INSERT statement.
91
136
  @sql_buffer ||= start_insert_sql(record)
@@ -113,7 +158,7 @@ module DataTransport
113
158
  end
114
159
  end
115
160
 
116
- def finalize
161
+ def finalize # :nodoc:
117
162
  if @truncate
118
163
  conn = @class.connection
119
164
  begin
@@ -123,21 +168,20 @@ module DataTransport
123
168
  end
124
169
  @truncate = false
125
170
  end
126
- if @sql_buffer[-1,1] == ","
171
+ if @sql_buffer && @sql_buffer[-1,1] == ","
127
172
  @sql_buffer.chop!
128
173
  @class.connection.execute(@sql_buffer)
129
174
  end
130
175
  end
131
176
 
132
- def reset
133
- self.ignore_duplicates = false
177
+ def reset # :nodoc:
134
178
  @sql_buffer = nil
135
179
  end
136
180
 
137
181
  private
138
182
 
139
183
  def start_insert_sql(record)
140
- "INSERT #{ignore_duplicates ? "IGNORE " : " "}INTO " +
184
+ "INSERT #{@ignore_errors ? "IGNORE " : " "}INTO " +
141
185
  "#{@class.connection.quote_table_name(@class.table_name)} " +
142
186
  "(#{record.keys.join ","}) VALUES "
143
187
  end
@@ -1,23 +1,65 @@
1
1
  module DataTransport
2
2
  class DataStore
3
+ # Data store that reads and writes records in a flat text file.
4
+ #
5
+ # Although this class can read and write CSV files, you should use the
6
+ # CSVFile data store for that instead of this one.
3
7
  class File < DataStore
4
- attr_reader :mode
8
+ attr_reader :mode # :nodoc:
5
9
 
10
+ # Accepts the following options:
11
+ #
12
+ # header:: If true, the file has a header row that contains the names
13
+ # of each field. Default is false.
14
+ # delimiter:: String that separates individual fields in a row. Default
15
+ # is "\t".
16
+ # enclosure:: String that encloses individual fields. For example, if
17
+ # this is set to "\"", fields will be enclosed in double
18
+ # quotes. Default is nil (no enclosure).
19
+ # escape:: Escape sequence for occurrences of the enclosure string in
20
+ # field values. Set this to the special value :double if
21
+ # enclosure characters are escaped by doubling them (like in
22
+ # CSV and SQL). Default is nil.
23
+ # path:: Path to the file.
24
+ # null:: String that represents fields whose value is nil (but not
25
+ # blank). Default is "".
26
+ # keys:: Array of field names. Not necessary for files with a header
27
+ # row. Default for files without a header row is fieldXX,
28
+ # where XX is numbered sequentially starting from 00.
6
29
  def initialize(options = {})
7
30
  super()
8
-
31
+ # Extract options.
9
32
  @header = options.delete(:header)
10
33
  @delimiter = options.delete(:delimiter) || "\t"
34
+ @enclosure = options.delete(:enclosure)
35
+ @escape = options.delete(:escape)
11
36
  @path = options.delete(:path)
12
37
  @null = options.delete(:null) || ""
13
38
  @keys = options.delete(:keys)
14
-
39
+ # Validate options.
15
40
  raise(ArgumentError, "missing required option `path'") if @path.nil?
41
+ if @escape && @enclosure.nil?
42
+ raise(ArgumentError, "`escape' cannot be used without `enclosure'")
43
+ end
16
44
  unless options.empty?
17
45
  raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
18
46
  end
47
+ # Handle the special :double escape sequence.
48
+ @escape = @enclosure if @escape == :double
49
+ # Create an enclosure placeholder, which is used to avoid clobbering
50
+ # escaped enclosure characters during parsing.
51
+ if @escape
52
+ if @enclosure == 0.chr
53
+ safe_ch = 1.chr
54
+ else
55
+ safe_ch = 0.chr
56
+ end
57
+ @placeholder = "#{safe_ch}__ENCLOSURE_PLACEHOLDER__#{safe_ch}"
58
+ end
19
59
  end
20
60
 
61
+ # Returns the number of lines in the file (not counting the header, if
62
+ # there is one).
21
63
  def count
22
64
  return @count if @count
23
65
  self.mode = :input
@@ -32,17 +74,14 @@ module DataTransport
32
74
  @count = line_count
33
75
  end
34
76
 
35
- def each_record(batch_size = nil)
77
+ def each_record(batch_size = nil) # :nodoc:
36
78
  self.mode = :input
37
-
38
- batch_size = check_batch_size(batch_size)
39
-
40
79
  io.rewind
41
80
  io.readline if @header
42
81
  until io.eof?
43
82
  line = io.gets || break
44
83
  line.chomp!
45
- values = line.split(/#{@delimiter}/)
84
+ values = values_from_s(line)
46
85
  if keys.length != values.length
47
86
  raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
48
87
  end
@@ -52,23 +91,53 @@ module DataTransport
52
91
  end
53
92
  end
54
93
 
55
- def write_record(record)
94
+ def write_record(record) # :nodoc:
56
95
  self.mode = :output
57
96
  # If no key order was ever specified, make one up.
58
97
  @keys ||= record.keys.sort {|a,b| a.to_s <=> b.to_s}
59
98
  # Write the header if this is the first record.
60
99
  if @header && io.pos == 0
61
- io.puts(keys.join(@delimiter))
100
+ io.puts(values_to_s(keys))
62
101
  end
63
102
  # Write the values in a predictable order.
64
103
  values = keys.collect do |k|
65
104
  record[k].nil?? @null : record[k]
66
105
  end
67
- io.puts(values.join(@delimiter))
106
+ io.puts(values_to_s(values))
68
107
  end
69
108
 
70
109
  private
71
110
 
111
+ def values_to_s(values)
112
+ if @escape
113
+ values = values.collect do |v|
114
+ @enclosure + v.to_s.gsub(/#{@enclosure}/, @escape + @enclosure) + @enclosure
115
+ end
116
+ elsif @enclosure
117
+ values = values.collect {|v| @enclosure + v.to_s + @enclosure}
118
+ end
119
+ values.join(@delimiter)
120
+ end
121
+
122
+ def values_from_s(str)
123
+ if @escape
124
+ str = str.gsub(/#{@escape}#{@enclosure}/, @placeholder)
125
+ values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
126
+ values.first.sub!(/^#{@enclosure}/, "")
127
+ values.last.sub!(/#{@enclosure}$/, "")
128
+ values.each do |v|
129
+ v.gsub!(/#{@placeholder}/, @enclosure)
130
+ end
131
+ elsif @enclosure
132
+ values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
133
+ values.first.sub!(/^#{@enclosure}/, "")
134
+ values.last.sub!(/#{@enclosure}$/, "")
135
+ else
136
+ values = str.split(/#{@delimiter}/)
137
+ end
138
+ values
139
+ end
140
+
72
141
  def mode=(new_mode)
73
142
  if !@mode.nil? && @mode != new_mode
74
143
  raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
@@ -95,12 +164,12 @@ module DataTransport
95
164
  return [] if mode == :output
96
165
  line = rewind_and_restore { io.readline }
97
166
  line.chomp!
98
- fields = line.split(/#{@delimiter}/)
167
+ fields = values_from_s(line)
99
168
  if @header
100
169
  @keys = fields.collect! {|hdr| hdr.downcase.to_sym}
101
170
  else
102
171
  @keys = (0..(fields.length - 1)).to_a.collect! do |i|
103
- sprintf("column%02d", i).to_sym
172
+ sprintf("field%02d", i).to_sym
104
173
  end
105
174
  end
106
175
  end
@@ -1,6 +1,6 @@
1
1
  module DataTransport
2
- module Record
3
- class Destination
2
+ module Record # :nodoc:
3
+ class Destination # :nodoc:
4
4
  attr_reader :record
5
5
 
6
6
  def initialize
@@ -1,6 +1,6 @@
1
1
  module DataTransport
2
- module Record
3
- class Source
2
+ module Record # :nodoc:
3
+ class Source # :nodoc:
4
4
  def record=(record)
5
5
  @record = record
6
6
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: DanaDanger-data_transport
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: "0.2"
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dana Danger
@@ -23,13 +23,12 @@ extra_rdoc_files: []
23
23
 
24
24
  files:
25
25
  - lib/data_transport.rb
26
- - lib/data_transport/map.rb
27
26
  - lib/data_transport/data_store.rb
28
27
  - lib/data_transport/data_store/active_record.rb
29
28
  - lib/data_transport/data_store/file.rb
30
29
  - lib/data_transport/record/destination.rb
31
30
  - lib/data_transport/record/source.rb
32
- has_rdoc: false
31
+ has_rdoc: true
33
32
  homepage: http://github.com/DanaDanger/data_transport
34
33
  post_install_message:
35
34
  rdoc_options: []
@@ -1,21 +0,0 @@
1
- require "data_transport/record/source"
2
- require "data_transport/record/destination"
3
-
4
- module DataTransport
5
- class Map
6
- attr_reader :source, :destination
7
-
8
- def initialize(&block)
9
- @block = block
10
- @source = DataTransport::Record::Source.new
11
- @destination = DataTransport::Record::Destination.new
12
- end
13
-
14
- def map(record)
15
- @source.record = record
16
- @destination.reset!
17
- @block.call(@source, @destination)
18
- @destination.record
19
- end
20
- end
21
- end