DanaDanger-data_transport 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ require "data_transport/map"
2
+ require "data_transport/data_store"
3
+
4
module DataTransport
  # Default number of records to process per batch when a caller does not
  # specify one.
  def self.default_batch_size
    1000
  end

  # Transports records from one data store to another, passing each record
  # through the mapping block.
  #
  # input   - data store records are read from (must respond to each_record)
  # output  - data store mapped records are written to
  # options - :ignore_duplicates => true to skip duplicate rows; only valid
  #           for an ActiveRecord store on a MySQL connection (INSERT IGNORE)
  # block   - called with (source, destination) once per record
  #
  # Raises ArgumentError for unrecognized options or when :ignore_duplicates
  # is used with an unsupported output store.
  def self.map(input, output, options = {}, &block)
    # Extract options.
    ignore_duplicates = options.delete(:ignore_duplicates)
    # Report the offending option *keys*; Hash has no #join, so the original
    # options.join raised NoMethodError instead of the intended ArgumentError.
    unless options.empty?
      raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
    end
    # If ignore_duplicates is true, make sure the output is a MySQL database.
    if ignore_duplicates
      unless output.is_a?(DataStore::ActiveRecord) && output.klass.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
        raise ArgumentError, "ignore_duplicates can only be used with an ActiveRecord data store connected to a MySQL database"
      end
    end
    # Run the transport.
    output.reset
    output.ignore_duplicates = true if ignore_duplicates
    map = DataTransport::Map.new(&block)
    input.each_record do |record|
      output.write_record(map.map(record))
    end
    output.finalize
  end
end
@@ -0,0 +1,31 @@
1
+ require "data_transport/data_store/file"
2
+ require "data_transport/data_store/active_record"
3
+
4
module DataTransport
  # Abstract interface for record sources and sinks. Concrete stores
  # (DataStore::File, DataStore::ActiveRecord) override the reader and
  # writer methods below.
  class DataStore
    # Returns the number of records available. Must be overridden.
    def count
      raise NotImplementedError
    end

    # Yields each record in the store as a hash. Must be overridden.
    def each_record(batch_size = 1000)
      raise NotImplementedError
    end

    # Writes a single record (a hash) to the store. Must be overridden.
    def write_record(record)
      raise NotImplementedError
    end

    # Hook invoked after the last record is written; a no-op by default.
    def finalize
    end

    protected

    # Validates a batch size, substituting the library-wide default when nil.
    # Raises TypeError for non-integers and RangeError for values below one.
    def check_batch_size(batch_size)
      size = batch_size || DataTransport.default_batch_size
      raise TypeError, "batch size must be an integer" unless size.is_a?(Integer)
      raise RangeError, "batch size must be greater than zero" unless size >= 1
      size
    end
  end
end
@@ -0,0 +1,146 @@
1
module DataTransport
  class DataStore
    # Data store backed by an ActiveRecord model. Reads stream rows in
    # LIMIT/OFFSET batches; writes accumulate into a single multi-row INSERT
    # that is flushed whenever it approaches MySQL's max_allowed_packet.
    class ActiveRecord < DataStore
      # When true, rows are written with INSERT IGNORE so duplicate keys are
      # silently skipped (MySQL only).
      attr_accessor :ignore_duplicates

      # options:
      #   :class      - an ActiveRecord::Base subclass to read/write through
      #   :connection - connection spec (used with :table_name when no :class)
      #   :table_name - table name (used with :connection when no :class)
      #   :conditions - optional SQL conditions applied to reads
      #   :truncate   - if true, the table is emptied before the first flush
      def initialize(options = {})
        super()
        # Extract options.
        @class      = options.delete(:class)
        @connection = options.delete(:connection)
        @table_name = options.delete(:table_name)
        @conditions = options.delete(:conditions)
        @truncate   = options.delete(:truncate)
        # Make sure a class or connection and table name was provided.
        if @class.nil? && (@connection.nil? || @table_name.nil?)
          raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
        end
        raise(TypeError, "class must be a class") if @class && !@class.is_a?(Class)
        # If connection specs were provided instead of a class, make an
        # anonymous ActiveRecord subclass.
        unless @class
          @class = Class.new(::ActiveRecord::Base)
          @class.set_table_name @table_name
          @class.establish_connection @connection
        end
        # Make sure the class is a strict descendant of ActiveRecord::Base
        # (Class#< replaces the original manual superclass walk).
        unless @class < ::ActiveRecord::Base
          raise(TypeError, "class must descend from ActiveRecord::Base")
        end
        # Check for unknown options, reporting the offending keys (Hash has
        # no #join, so the original options.join raised NoMethodError).
        unless options.empty?
          raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
        end
        # Ask the database how much data it can handle in one query. This only
        # works on MySQL; on other adapters @max_allowed_packet stays nil and
        # every record is flushed immediately.
        begin
          rows = @class.connection.select_all("SHOW VARIABLES LIKE 'max_allowed_packet'")
          @max_allowed_packet = rows.first["Value"].to_i - 512
        rescue StandardError
          # Not MySQL (or the variable is unavailable); fall back to
          # per-record flushing.
        end
        # Fetch column information, keyed by symbolized column name.
        @columns = {}
        @class.columns.each { |c| @columns[c.name.to_sym] = c }
      end

      # The ActiveRecord class records are read from and written to.
      def klass
        @class
      end

      # Number of rows matching the configured conditions.
      def count
        @class.count(:conditions => @conditions)
      end

      # Yields each row as a hash of column-symbol => type-cast value.
      # NOTE(review): the yielded hash is reused between rows; callers must
      # dup it if they keep a reference.
      def each_record(batch_size = nil)
        batch_size = check_batch_size(batch_size)

        conn = @class.connection
        column_names = conn.columns(@class.table_name).collect { |c| c.name }

        offset = 0
        record = {}
        base_query = "SELECT * FROM #{conn.quote_table_name(@class.table_name)}"
        @class.send(:add_conditions!, base_query, @conditions) unless @conditions.nil?
        loop do
          sql = base_query.dup
          conn.add_limit_offset!(sql, :limit => batch_size, :offset => offset)
          offset += batch_size
          rows = conn.select_rows(sql)
          break if rows.empty?
          rows.each do |row|
            record.clear
            column_names.each_with_index do |column_name, i|
              column_name = column_name.to_sym
              record[column_name] = @columns[column_name].type_cast(row[i])
            end
            yield record
          end
        end
      end

      # Appends a record to the pending INSERT statement, flushing to the
      # database whenever the statement would exceed max_allowed_packet (or
      # immediately when the packet limit is unknown).
      def write_record(record)
        conn = @class.connection
        # If no SQL has been produced yet, start an INSERT statement.
        @sql_buffer ||= start_insert_sql(record)
        # Convert the record into a string of quoted values.
        values = []
        record.each { |k, v| values << conn.quote(v, @columns[k]) }
        values = "(#{values.join ","}),"
        # Write the record.
        if @max_allowed_packet.nil?
          # We have no information on the database's maximum allowed packet
          # size, so it's safest to write the record immediately.
          @sql_buffer << values
          finalize
        elsif @sql_buffer.length + values.length > @max_allowed_packet
          # Appending this record to the SQL buffer would exceed the maximum
          # allowed packet size. Send the buffer to the database and start a
          # new statement with this record. (Compare against the SQL
          # fragment's byte length, not the record hash's key count, and pass
          # the record so the new statement gets its column list.)
          finalize
          @sql_buffer = start_insert_sql(record)
          @sql_buffer << values
        else
          # This record will not cause the SQL buffer to exceed the maximum
          # allowed packet size. Append it to the SQL buffer.
          @sql_buffer << values
        end
      end

      # Truncates the table on first use (when requested) and flushes any
      # pending INSERT statement.
      def finalize
        if @truncate
          conn = @class.connection
          begin
            conn.execute("TRUNCATE TABLE #{conn.quote_table_name(@class.table_name)}")
          rescue StandardError
            # TRUNCATE is unsupported on some databases; fall back to DELETE.
            @class.delete_all
          end
          @truncate = false
        end
        # Guard against being called before any record was written, and clear
        # the buffer after executing so already-sent rows are never re-sent by
        # a subsequent write_record.
        if @sql_buffer && @sql_buffer[-1, 1] == ","
          @sql_buffer.chop!
          @class.connection.execute(@sql_buffer)
          @sql_buffer = nil
        end
      end

      # Prepares the store for a fresh transport run.
      def reset
        self.ignore_duplicates = false
        @sql_buffer = nil
      end

      private

      # Builds the head of a multi-row INSERT statement for the given
      # record's columns: "INSERT [IGNORE] INTO table (cols) VALUES ".
      def start_insert_sql(record)
        "INSERT #{ignore_duplicates ? "IGNORE " : ""}INTO " +
          "#{@class.connection.quote_table_name(@class.table_name)} " +
          "(#{record.keys.join ","}) VALUES "
      end
    end
  end
end
@@ -0,0 +1,117 @@
1
module DataTransport
  class DataStore
    # Data store backed by a delimited text file. An instance is either an
    # input (read) or an output (write) store; the first operation fixes the
    # mode and it cannot change afterwards.
    class File < DataStore
      # Either :input or :output once the first read/write happens, nil before.
      attr_reader :mode

      # options:
      #   :path      - path to the file (required)
      #   :header    - if true, the first line holds column names
      #   :delimiter - field separator, default "\t"; NOTE(review): it is
      #                interpolated into a regexp when splitting input lines,
      #                so regex metacharacters act as regex syntax
      #   :null      - string written in place of nil values (default "")
      #   :keys      - explicit ordered list of column keys
      def initialize(options = {})
        super()

        @header    = options.delete(:header)
        @delimiter = options.delete(:delimiter) || "\t"
        @path      = options.delete(:path)
        @null      = options.delete(:null) || ""
        @keys      = options.delete(:keys)

        raise(ArgumentError, "missing required option `path'") if @path.nil?
        # Report the offending option keys; Hash has no #join, so the
        # original options.join raised NoMethodError here.
        unless options.empty?
          raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
        end
      end

      # Number of data records in the file, excluding any header line.
      # Memoized after the first call.
      def count
        return @count if @count
        self.mode = :input
        line_count = 0
        rewind_and_restore do
          # gets (not readline) so an empty file counts as zero records
          # instead of raising EOFError.
          io.gets if @header
          until io.eof?
            io.gets
            line_count += 1
          end
        end
        @count = line_count
      end

      # Yields each line as a hash of key => String. batch_size is validated
      # for interface compatibility, but input is streamed line by line.
      def each_record(batch_size = nil)
        self.mode = :input

        check_batch_size(batch_size)

        io.rewind
        # Skip the header without raising on an empty file.
        io.gets if @header
        until io.eof?
          line = io.gets || break
          line.chomp!
          values = line.split(/#{@delimiter}/)
          if keys.length != values.length
            raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
          end
          record = {}
          keys.length.times { |i| record[keys[i]] = values[i] }
          yield record
        end
      end

      # Writes one record as a delimited line, emitting the header first if
      # requested and this is the first record.
      def write_record(record)
        self.mode = :output
        # If no key order was ever specified, make one up (sorted by name so
        # output is deterministic).
        @keys ||= record.keys.sort { |a, b| a.to_s <=> b.to_s }
        # Write the header if this is the first record.
        io.puts(keys.join(@delimiter)) if @header && io.pos == 0
        # Write the values in a predictable order, substituting @null for nil.
        values = keys.collect { |k| record[k].nil? ? @null : record[k] }
        io.puts(values.join(@delimiter))
      end

      private

      # Fixes the store's direction; raises if an instance that has already
      # read tries to write (or vice versa).
      def mode=(new_mode)
        if !@mode.nil? && @mode != new_mode
          raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
        end
        unless [:input, :output].include?(new_mode)
          raise ArgumentError, "unknown mode `#{new_mode}'"
        end
        @mode = new_mode
      end

      # Lazily opened file handle: truncating write for output, read for input.
      def io
        @io ||= ::File.open(@path, mode == :output ? "w" : "r")
      end

      # Ordered record keys. For input files these come from the header line,
      # or are synthesized as :column00, :column01, ... when there is none.
      def keys
        return @keys if @keys
        return [] if mode == :output
        line = rewind_and_restore { io.readline }
        line.chomp!
        fields = line.split(/#{@delimiter}/)
        @keys = if @header
          fields.collect { |hdr| hdr.downcase.to_sym }
        else
          (0...fields.length).collect { |i| sprintf("column%02d", i).to_sym }
        end
      end

      # Runs the block from the start of the file, restoring the previous
      # position even if the block raises.
      def rewind_and_restore
        pos = io.pos
        io.rewind
        yield
      ensure
        io.seek(pos)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require "data_transport/record/source"
2
+ require "data_transport/record/destination"
3
+
4
module DataTransport
  # Wraps the user-supplied transformation block. A single Source and
  # Destination proxy pair is reused across all records.
  class Map
    attr_reader :source, :destination

    # block - invoked as block.call(source, destination) once per record.
    def initialize(&block)
      @block = block
      @source = Record::Source.new
      @destination = Record::Destination.new
    end

    # Feeds +record+ through the mapping block and returns the hash the
    # block built on the destination proxy.
    def map(record)
      source.record = record
      destination.reset!
      @block.call(source, destination)
      destination.record
    end
  end
end
@@ -0,0 +1,28 @@
1
module DataTransport
  module Record
    # Proxy handed to mapping blocks for building an output record. Any
    # setter-style call (e.g. +dest.name = "x"+) stores the value under the
    # corresponding symbol key in #record.
    class Destination
      # The record built so far: a hash of symbol keys to values.
      attr_reader :record

      def initialize
        @record = {}
      end

      # Clears the record so the instance can be reused for the next row.
      def reset!
        @record.clear
      end

      # Treats any call whose name ends in "=" as an attribute assignment
      # into the record hash; everything else falls through to NoMethodError.
      def method_missing(name, *args)
        name_s = name.to_s
        if name_s[-1,1] == "="
          unless args.length == 1
            raise ArgumentError, "wrong number of arguments (#{args.length} for 1)"
          end
          name_s.chop!
          @record[name_s.to_sym] = args.first
        else
          super
        end
      end

      # Mirror method_missing so respond_to? reports setters as available.
      def respond_to_missing?(name, include_private = false)
        name.to_s[-1, 1] == "=" || super
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module DataTransport
  module Record
    # Read-only proxy handed to mapping blocks for the current input record.
    # Attribute reads (e.g. +src.name+) look up the corresponding symbol key
    # in the underlying hash.
    class Source
      # Sets the hash (symbol-keyed) this proxy reads from.
      def record=(record)
        @record = record
      end

      # Object#id exists on older Rubies and would shadow a record's :id
      # field; route it through method_missing like any other key.
      def id
        method_missing :id
      end

      # Treats any known record key as a zero-argument reader. The nil guard
      # avoids a confusing NoMethodError on nil when called before record=.
      def method_missing(name, *args)
        if @record && @record.key?(name)
          unless args.empty?
            raise ArgumentError, "wrong number of arguments (#{args.length} for 0)"
          end
          @record[name]
        else
          super
        end
      end

      # Mirror method_missing so respond_to? reports record keys as readable.
      def respond_to_missing?(name, include_private = false)
        (@record && @record.key?(name)) || super
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: DanaDanger-data_transport
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Dana Danger
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-04-08 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email:
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/data_transport.rb
26
+ - lib/data_transport/map.rb
27
+ - lib/data_transport/data_store.rb
28
+ - lib/data_transport/data_store/active_record.rb
29
+ - lib/data_transport/data_store/file.rb
30
+ - lib/data_transport/record/destination.rb
31
+ - lib/data_transport/record/source.rb
32
+ has_rdoc: false
33
+ homepage: http://github.com/DanaDanger/data_transport
34
+ post_install_message:
35
+ rdoc_options: []
36
+
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ required_rubygems_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: "0"
50
+ version:
51
+ requirements: []
52
+
53
+ rubyforge_project:
54
+ rubygems_version: 1.2.0
55
+ signing_key:
56
+ specification_version: 2
57
+ summary: A gem for importing and exporting large quantities of data.
58
+ test_files: []
59
+