DanaDanger-data_transport 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,30 @@
1
+ require "data_transport/map"
2
+ require "data_transport/data_store"
3
+
4
module DataTransport
  # Default number of records processed per batch when a caller does not
  # supply an explicit batch size.
  def self.default_batch_size
    1000
  end

  # Transports every record from +input+ to +output+, transforming each
  # record with the supplied block (see DataTransport::Map).
  #
  #   input   - a DataStore to read records from
  #   output  - a DataStore to write records to
  #   options - :ignore_duplicates => true emits INSERT IGNORE (MySQL only)
  #
  # Raises ArgumentError for unrecognized options, or when
  # :ignore_duplicates is used with anything but a MySQL-backed
  # ActiveRecord output store.
  def self.map(input, output, options = {}, &block)
    # Extract options.
    ignore_duplicates = options.delete(:ignore_duplicates)
    unless options.empty?
      # NOTE: options.keys, not options — Hash does not respond to #join.
      raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
    end
    # If ignore_duplicates is true, make sure the output is a MySQL database.
    if ignore_duplicates
      unless output.is_a?(DataStore::ActiveRecord) && output.klass.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
        raise ArgumentError, "ignore_duplicates can only be used with an ActiveRecord data store connected to a MySQL database"
      end
    end
    # Run the transport.
    output.reset
    output.ignore_duplicates = true if ignore_duplicates
    map = DataTransport::Map.new(&block)
    input.each_record do |record|
      output.write_record(map.map(record))
    end
    output.finalize
  end
end
@@ -0,0 +1,31 @@
1
+ require "data_transport/data_store/file"
2
+ require "data_transport/data_store/active_record"
3
+
4
module DataTransport
  # Abstract base class for record sources and sinks. Concrete stores
  # (DataStore::File, DataStore::ActiveRecord) override count, each_record,
  # and write_record; finalize is an optional hook.
  class DataStore
    # Returns the number of records in the store. Subclasses must implement.
    def count
      raise NotImplementedError
    end

    # Yields each record as a Hash, fetching batch_size records at a time
    # where the backend supports batching. A nil batch_size means "use
    # DataTransport.default_batch_size" (see check_batch_size) — default
    # changed from a hardcoded 1000 to nil for consistency with the
    # subclasses' signatures; the body raises regardless.
    def each_record(batch_size = nil)
      raise NotImplementedError
    end

    # Writes a single record (a Hash of column => value). Subclasses must
    # implement.
    def write_record(record)
      raise NotImplementedError
    end

    # Flushes any buffered output. Stores with nothing to flush inherit
    # this no-op.
    def finalize
      # Do nothing by default.
    end

    protected

    # Validates a batch size, substituting the library default when nil.
    # Raises TypeError unless the value is an Integer, RangeError unless it
    # is positive. Returns the validated size.
    def check_batch_size(batch_size)
      batch_size ||= DataTransport.default_batch_size
      raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
      raise(RangeError, "batch size must be greater than zero") if batch_size < 1
      batch_size
    end
  end
end
@@ -0,0 +1,146 @@
1
module DataTransport
  class DataStore
    # Data store backed by an ActiveRecord model, reading and writing rows
    # with hand-built SQL for speed. Writes are buffered into multi-row
    # INSERT statements sized to MySQL's max_allowed_packet when available;
    # on other databases each record is written immediately.
    class ActiveRecord < DataStore
      # When true, INSERTs are emitted as INSERT IGNORE (MySQL only).
      attr_accessor :ignore_duplicates

      # Options:
      #   :class      - an ActiveRecord::Base subclass to read/write
      #   :connection - connection spec (used with :table_name instead of :class)
      #   :table_name - table name (used with :connection)
      #   :conditions - conditions applied when reading and counting
      #   :truncate   - truncate the table before the first finalize
      def initialize(options = {})
        super()
        # Extract options.
        @class      = options.delete(:class)
        @connection = options.delete(:connection)
        @table_name = options.delete(:table_name)
        @conditions = options.delete(:conditions)
        @truncate   = options.delete(:truncate)
        # Make sure a class or connection and table name was provided.
        if @class.nil? && (@connection.nil? || @table_name.nil?)
          raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
        end
        raise(TypeError, "class must be a class") if @class && !@class.is_a?(Class)
        # If connection specs were provided instead of a class, make an
        # anonymous ActiveRecord subclass.
        unless @class
          @class = Class.new(::ActiveRecord::Base)
          @class.set_table_name @table_name
          @class.establish_connection @connection
        end
        # Make sure the class strictly descends from ActiveRecord::Base.
        # Class#< returns true only for proper subclasses, matching the
        # original superclass-chain walk.
        unless @class < ::ActiveRecord::Base
          raise(TypeError, "class must descend from ActiveRecord::Base")
        end
        # Check for unknown options. (options.keys, not options — Hash has
        # no #join.)
        unless options.empty?
          raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
        end
        # Ask the database how much data it can handle in one query. This
        # only works on MySQL; elsewhere @max_allowed_packet stays nil and
        # records are flushed one statement at a time.
        begin
          rows = @class.connection.select_all("SHOW VARIABLES LIKE 'max_allowed_packet'")
          @max_allowed_packet = rows.first["Value"].to_i - 512
        rescue
          # Non-MySQL adapters raise here; best-effort probe, ignore.
        end
        # Fetch column information, keyed by symbolized column name.
        @columns = {}
        @class.columns.each { |c| @columns[c.name.to_sym] = c }
      end

      # The ActiveRecord class records are read from / written to.
      def klass
        @class
      end

      # Number of records matching :conditions.
      def count
        @class.count(:conditions => @conditions)
      end

      # Yields each matching row as a Hash of column name (Symbol) to
      # type-cast value, fetching batch_size rows per query. The same Hash
      # object is reused across yields.
      def each_record(batch_size = nil)
        batch_size = check_batch_size(batch_size)

        conn = @class.connection
        column_names = conn.columns(@class.table_name).collect { |c| c.name }

        offset = 0
        record = {}
        base_query = "SELECT * FROM #{conn.quote_table_name(@class.table_name)}"
        @class.send(:add_conditions!, base_query, @conditions) unless @conditions.nil?
        while true
          sql = base_query.dup
          conn.add_limit_offset!(sql, :limit => batch_size, :offset => offset)
          offset += batch_size
          rows = conn.select_rows(sql)
          break if rows.empty?
          rows.each do |row|
            record.clear
            column_names.each_with_index do |column_name, i|
              column_name = column_name.to_sym
              record[column_name] = @columns[column_name].type_cast(row[i])
            end
            yield record
          end
        end
      end

      # Buffers a record for insertion, flushing whenever the buffered SQL
      # would exceed the server's max_allowed_packet.
      def write_record(record)
        conn = @class.connection
        # If no SQL has been produced yet, start an INSERT statement.
        @sql_buffer ||= start_insert_sql(record)
        # Convert the record into a string of quoted values.
        values = []
        record.each { |k, v| values << conn.quote(v, @columns[k]) }
        values = "(#{values.join ","}),"
        # Write the record.
        if @max_allowed_packet.nil?
          # We have no information on the database's maximum allowed packet
          # size, so it's safest to write the record immediately.
          @sql_buffer << values
          finalize
        elsif @sql_buffer.length + values.length > @max_allowed_packet
          # Appending this record would exceed the maximum allowed packet
          # size. Send the buffer to the database and start a new statement
          # with this record. (Fixed: compare the rendered values string,
          # not the hash's key count, and pass the record to
          # start_insert_sql, which requires it.)
          finalize
          @sql_buffer = start_insert_sql(record)
          @sql_buffer << values
        else
          # This record will not cause the SQL buffer to exceed the maximum
          # allowed packet size. Append it to the SQL buffer.
          @sql_buffer << values
        end
      end

      # Truncates the table on first call when :truncate was given, then
      # sends any buffered INSERT to the database.
      def finalize
        if @truncate
          conn = @class.connection
          begin
            conn.execute("TRUNCATE TABLE #{conn.quote_table_name(@class.table_name)}")
          rescue
            # Some databases can't TRUNCATE; fall back to DELETE.
            @class.delete_all
          end
          @truncate = false
        end
        # Guard against a nil buffer (finalize before any write_record),
        # and discard the executed statement afterwards so later writes
        # start a fresh INSERT instead of appending to executed SQL.
        if @sql_buffer && @sql_buffer[-1, 1] == ","
          @sql_buffer.chop!
          @class.connection.execute(@sql_buffer)
          @sql_buffer = nil
        end
      end

      # Clears buffered state so the store can be reused for a new
      # transport run.
      def reset
        self.ignore_duplicates = false
        @sql_buffer = nil
      end

      private

      # Returns the head of a (possibly multi-row) INSERT statement for the
      # given record's columns.
      def start_insert_sql(record)
        "INSERT #{ignore_duplicates ? "IGNORE " : " "}INTO " +
          "#{@class.connection.quote_table_name(@class.table_name)} " +
          "(#{record.keys.join ","}) VALUES "
      end
    end
  end
end
@@ -0,0 +1,117 @@
1
module DataTransport
  class DataStore
    # Data store backed by a delimited text file (e.g. TSV). An instance is
    # used for either reading or writing, never both; the first operation
    # fixes the mode and switching afterwards raises.
    class File < DataStore
      # Either :input or :output once the first read/write happens.
      attr_reader :mode

      # Options:
      #   :path      - path to the data file (required)
      #   :delimiter - field separator (default "\t")
      #   :header    - true if the first line names the columns
      #   :null      - string written for nil values (default "")
      #   :keys      - explicit column keys (Symbols), overriding the header
      def initialize(options = {})
        super()

        @header    = options.delete(:header)
        @delimiter = options.delete(:delimiter) || "\t"
        @path      = options.delete(:path)
        @null      = options.delete(:null) || ""
        @keys      = options.delete(:keys)

        raise(ArgumentError, "missing required option `path'") if @path.nil?
        unless options.empty?
          # options.keys, not options — Hash has no #join.
          raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
        end
      end

      # Number of data lines in the file (excluding the header). Cached
      # after the first call.
      def count
        return @count if @count
        self.mode = :input
        line_count = 0
        rewind_and_restore do
          # Skip the header line; the eof? guard keeps an empty file from
          # raising EOFError out of readline.
          io.readline if @header && !io.eof?
          until io.eof?
            io.gets
            line_count += 1
          end
        end
        @count = line_count
      end

      # Yields each line as a Hash of key (Symbol) => String field value.
      # batch_size is validated for interface consistency but unused: the
      # file is read line by line. Raises RuntimeError when a line's field
      # count doesn't match the key count.
      def each_record(batch_size = nil)
        self.mode = :input

        batch_size = check_batch_size(batch_size)

        io.rewind
        io.readline if @header
        until io.eof?
          line = io.gets || break
          line.chomp!
          # NOTE(review): the delimiter is interpolated into a regex
          # unescaped; a delimiter like "|" would split on every character.
          # Preserved as-is — confirm whether regex delimiters are intended.
          values = line.split(/#{@delimiter}/)
          if keys.length != values.length
            raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
          end
          record = {}
          keys.length.times { |i| record[keys[i]] = values[i] }
          yield record
        end
      end

      # Appends one record. Column order is taken from :keys, the header,
      # or (first write wins) the record's sorted keys; nil values are
      # written as @null. A header line is emitted before the first record
      # when :header is set.
      def write_record(record)
        self.mode = :output
        # If no key order was ever specified, make one up.
        @keys ||= record.keys.sort { |a, b| a.to_s <=> b.to_s }
        # Write the header if this is the first record.
        if @header && io.pos == 0
          io.puts(keys.join(@delimiter))
        end
        # Write the values in a predictable order.
        values = keys.collect do |k|
          record[k].nil?? @null : record[k]
        end
        io.puts(values.join(@delimiter))
      end

      private

      # Locks the store into :input or :output on first use; changing modes
      # afterwards is an error.
      def mode=(new_mode)
        if !@mode.nil? && @mode != new_mode
          raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
        end
        unless [:input, :output].include?(new_mode)
          raise ArgumentError, "unknown mode `#{new_mode}'"
        end
        @mode = new_mode
      end

      # Lazily opens the file: write mode opens (and truncates) for "w",
      # anything else opens for "r".
      def io
        return @io if @io
        if mode == :output
          @io = ::File.open(@path, "w")
          @io.rewind
          @io
        else
          @io = ::File.open(@path, "r")
        end
      end

      # Column keys: explicit :keys, else derived from the header line,
      # else synthesized as :column00, :column01, ... from the first line's
      # field count. Output mode with no keys yet returns [].
      def keys
        return @keys if @keys
        return [] if mode == :output
        line = rewind_and_restore { io.readline }
        line.chomp!
        fields = line.split(/#{@delimiter}/)
        if @header
          @keys = fields.collect! { |hdr| hdr.downcase.to_sym }
        else
          @keys = (0..(fields.length - 1)).to_a.collect! do |i|
            sprintf("column%02d", i).to_sym
          end
        end
      end

      # Runs the block from the start of the file, then restores the
      # previous file position. Returns the block's result.
      def rewind_and_restore
        pos = io.pos
        io.rewind
        result = yield
        io.seek(pos)
        result
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require "data_transport/record/source"
2
+ require "data_transport/record/destination"
3
+
4
module DataTransport
  # Applies a user-supplied block to translate input records into output
  # records. The block receives a Record::Source wrapping the input hash
  # and a Record::Destination that collects the output fields.
  class Map
    attr_reader :source, :destination

    # Stores the mapping block and builds the reusable source/destination
    # wrapper objects.
    def initialize(&block)
      @block       = block
      @source      = DataTransport::Record::Source.new
      @destination = DataTransport::Record::Destination.new
    end

    # Runs the block against one input record and returns the resulting
    # output hash. The destination is cleared first so fields never bleed
    # from one record into the next.
    def map(record)
      destination.reset!
      source.record = record
      @block.call(source, destination)
      destination.record
    end
  end
end
@@ -0,0 +1,28 @@
1
module DataTransport
  module Record
    # Write-side record wrapper used by Map. Attribute-style setter calls
    # (dest.foo = 1) are captured via method_missing and accumulated into a
    # Hash keyed by Symbol.
    class Destination
      # The accumulated output record (Hash of Symbol => value).
      attr_reader :record

      def initialize
        @record = {}
      end

      # Clears the record between mappings. The same Hash object is reused.
      def reset!
        @record.clear
      end

      # Captures any setter call (name ending in "=") and stores its single
      # argument under the corresponding Symbol key; any other missing
      # method falls through to NoMethodError.
      def method_missing(name, *args)
        name_s = name.to_s
        if name_s[-1, 1] == "="
          unless args.length == 1
            raise ArgumentError, "wrong number of arguments (#{args.length} for 1)"
          end
          name_s.chop!
          @record[name_s.to_sym] = args.first
        else
          super
        end
      end

      # Keep respond_to? consistent with method_missing: every setter name
      # is accepted. (Fixes the missing respond_to_missing? override.)
      def respond_to_missing?(name, include_private = false)
        name.to_s[-1, 1] == "=" || super
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module DataTransport
  module Record
    # Read-side record wrapper used by Map. Each key of the underlying
    # Hash is exposed as a zero-argument reader method via method_missing.
    class Source
      # Replaces the wrapped record (a Hash keyed by Symbol).
      def record=(record)
        @record = record
      end

      # Object#id (Ruby 1.8) would shadow an :id field; route it through
      # method_missing so source.id reads the record like any other field.
      def id
        method_missing :id
      end

      # Returns the record's value for +name+ when present; unknown names
      # fall through to NoMethodError. The nil guard gives a clean
      # NoMethodError (instead of nil.has_key?) when no record was set.
      def method_missing(name, *args)
        if @record && @record.has_key?(name)
          unless args.empty?
            raise ArgumentError, "wrong number of arguments (#{args.length} for 0)"
          end
          @record[name]
        else
          super
        end
      end

      # Keep respond_to? consistent with method_missing. (Fixes the missing
      # respond_to_missing? override.)
      def respond_to_missing?(name, include_private = false)
        (@record && @record.has_key?(name)) || super
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: DanaDanger-data_transport
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Dana Danger
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-04-08 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email:
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/data_transport.rb
26
+ - lib/data_transport/map.rb
27
+ - lib/data_transport/data_store.rb
28
+ - lib/data_transport/data_store/active_record.rb
29
+ - lib/data_transport/data_store/file.rb
30
+ - lib/data_transport/record/destination.rb
31
+ - lib/data_transport/record/source.rb
32
+ has_rdoc: false
33
+ homepage: http://github.com/DanaDanger/data_transport
34
+ post_install_message:
35
+ rdoc_options: []
36
+
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ required_rubygems_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: "0"
50
+ version:
51
+ requirements: []
52
+
53
+ rubyforge_project:
54
+ rubygems_version: 1.2.0
55
+ signing_key:
56
+ specification_version: 2
57
+ summary: A gem for importing and exporting large quantities of data.
58
+ test_files: []
59
+