mosql 0.2.0 → 0.3.0

data/.travis.yml CHANGED
@@ -1,6 +1,5 @@
 language: ruby
 rvm:
-- 1.8.7
 - 1.9.3
 services:
 - mongodb
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    mosql (0.1.2)
+    mosql (0.2.0)
       bson_ext
       json
       log4r
data/README.md CHANGED
@@ -92,7 +92,7 @@ MongoDB `_id` field will be mapped to an SQL `id` column.
 At present, MoSQL does not support using the dot notation to access
 elements inside arrays.
 
-As a shorthand, you can specify a one-elment hash of the form `name:
+As a shorthand, you can specify a one-element hash of the form `name:
 TYPE`, in which case `name` will be used for both the source attribute
 and the name of the destination column. You can see this shorthand for
 the `title` and `created` attributes, above.
@@ -127,7 +127,7 @@ command-line parameters.
 
 1. Create the appropriate SQL tables
 2. Import data from the Mongo database
-3. Start tailing the mongo oplog, propogating changes from MongoDB to SQL.
+3. Start tailing the mongo oplog, propagating changes from MongoDB to SQL.
 
 
 After the first run, `mosql` will store the status of the optailer in
@@ -143,13 +143,10 @@ You likely want to run `mosql` against a secondary node, at least for
 the initial import, which will cause large amounts of disk activity on
 the target node. One option is to specify this in your connect URI:
 
-    mosql --mongo mongodb://node1,node2,node3?slaveOk=true
+    mosql --mongo mongodb://node1,node2,node3?readPreference=secondary
 
-(You should be able to specify `?readPreference=secondary`, but the
-Mongo Ruby driver does not appear to support that usage. I've filed a
-[bug with 10gen][bug-read-pref] about this omission).
-
-[bug-read-pref]: https://jira.mongodb.org/browse/RUBY-547
+(Note that this requires you be using at least version 1.8.3 of
+`mongo-ruby-driver`)
 
 ## Advanced usage
 
@@ -185,6 +182,24 @@ written that code yet.
 
 [plv8]: http://code.google.com/p/plv8js/
 
+## Authentication
+
+At present, in order to use MoSQL with a MongoDB instance requiring
+authentication, you must:
+
+- Have a user with access to the admin database.
+- Specify the `admin` database in the `--mongo` argument
+- Specify the username and password in the `--mongo` argument
+
+e.g.
+
+```
+mosql --mongo mongodb://$USER:$PASSWORD@$HOST/admin
+```
+
+I have not yet tested using MoSQL with 2.4's "roles" support. Drop me
+a note if you figure out anything I should know.
+
 ## Sharded clusters
 
 MoSQL does not have special support for sharded Mongo clusters at this
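The README changes above lean on driver-side URI parsing. A minimal sketch of what the new options look like from Ruby, assuming the 1.8.x-series mongo-ruby-driver (1.8.3 or later for `readPreference` in URIs); hosts and credentials are placeholders, not part of this diff:

```
require 'mongo' # mongo-ruby-driver 1.8.x

# Read from a secondary during the heavy initial import:
client = Mongo::MongoClient.from_uri(
  'mongodb://node1,node2,node3?readPreference=secondary')

# Authenticated connection through the admin database, as the new
# README section describes:
client = Mongo::MongoClient.from_uri(
  "mongodb://#{ENV['MONGO_USER']}:#{ENV['MONGO_PASSWORD']}@#{ENV['MONGO_HOST']}/admin")
```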
data/lib/mosql/cli.rb CHANGED
@@ -27,7 +27,7 @@ module MoSQL
       %w[TERM INT USR2].each do |sig|
         Signal.trap(sig) do
           log.info("Got SIG#{sig}. Preparing to exit...")
-          @done = true
+          @streamer.stop
         end
       end
     end
@@ -90,6 +90,10 @@ module MoSQL
         opts.on("--no-drop-tables", "Don't drop the table if it exists during the initial import") do
           @options[:no_drop_tables] = true
         end
+
+        opts.on("--unsafe", "Ignore rows that cause errors on insert") do
+          @options[:unsafe] = true
+        end
       end
 
       optparse.parse!(@args)
@@ -104,7 +108,7 @@ module MoSQL
     end
 
     def connect_mongo
-      @mongo = Mongo::Connection.from_uri(options[:mongo])
+      @mongo = Mongo::MongoClient.from_uri(options[:mongo])
       config = @mongo['admin'].command(:ismaster => 1)
       if !config['setName'] && !options[:skip_tail]
         log.warn("`#{options[:mongo]}' is not a replset.")
@@ -116,7 +120,7 @@ module MoSQL
     end
 
     def connect_sql
-      @sql = MoSQL::SQLAdapter.new(@schemamap, options[:sql], options[:schema])
+      @sql = MoSQL::SQLAdapter.new(@schema, options[:sql], options[:schema])
       if options[:verbose] >= 2
         @sql.db.sql_log_level = :debug
         @sql.db.loggers << Logger.new($stderr)
@@ -125,7 +129,13 @@
 
     def load_collections
       collections = YAML.load_file(@options[:collections])
-      @schemamap = MoSQL::Schema.new(collections)
+      begin
+        @schema = MoSQL::Schema.new(collections)
+      rescue MoSQL::SchemaError => e
+        log.error("Error parsing collection map `#{@options[:collections]}':")
+        log.error(e.to_s)
+        exit(1)
+      end
     end
 
     def run
@@ -139,183 +149,16 @@ module MoSQL
       @tailer = MoSQL::Tailer.new([@mongo], :existing, metadata_table,
                                   :service => options[:service])
 
-      if options[:reimport] || tailer.read_timestamp.seconds == 0
-        initial_import
-      end
-
-      unless options[:skip_tail]
-        optail
-      end
-    end
-
-    # Helpers
-
-    def collection_for_ns(ns)
-      dbname, collection = ns.split(".", 2)
-      @mongo.db(dbname).collection(collection)
-    end
-
-    def bulk_upsert(table, ns, items)
-      begin
-        @schemamap.copy_data(table.db, ns, items)
-      rescue Sequel::DatabaseError => e
-        log.debug("Bulk insert error (#{e}), attempting invidual upserts...")
-        cols = @schemamap.all_columns(@schemamap.find_ns(ns))
-        items.each do |it|
-          h = {}
-          cols.zip(it).each { |k,v| h[k] = v }
-          @sql.upsert(table, @schemamap.primary_sql_key_for_ns(ns), h)
-        end
-      end
-    end
-
-    def with_retries(tries=10)
-      tries.times do |try|
-        begin
-          yield
-        rescue Mongo::ConnectionError, Mongo::ConnectionFailure, Mongo::OperationFailure => e
-          # Duplicate key error
-          raise if e.kind_of?(Mongo::OperationFailure) && [11000, 11001].include?(e.error_code)
-          # Cursor timeout
-          raise if e.kind_of?(Mongo::OperationFailure) && e.message =~ /^Query response returned CURSOR_NOT_FOUND/
-          delay = 0.5 * (1.5 ** try)
-          log.warn("Mongo exception: #{e}, sleeping #{delay}s...")
-          sleep(delay)
-        end
-      end
-    end
-
-    def track_time
-      start = Time.now
-      yield
-      Time.now - start
-    end
+      @streamer = Streamer.new(:options => @options,
+                               :tailer => @tailer,
+                               :mongo => @mongo,
+                               :sql => @sql,
+                               :schema => @schema)
 
-    def initial_import
-      @schemamap.create_schema(@sql.db, !options[:no_drop_tables])
+      @streamer.import
 
       unless options[:skip_tail]
-        start_ts = @mongo['local']['oplog.rs'].find_one({}, {:sort => [['$natural', -1]]})['ts']
-      end
-
-      want_dbs = @schemamap.all_mongo_dbs & @mongo.database_names
-      want_dbs.each do |dbname|
-        log.info("Importing for Mongo DB #{dbname}...")
-        db = @mongo.db(dbname)
-        want = Set.new(@schemamap.collections_for_mongo_db(dbname))
-        db.collections.select { |c| want.include?(c.name) }.each do |collection|
-          ns = "#{dbname}.#{collection.name}"
-          import_collection(ns, collection)
-          exit(0) if @done
-        end
-      end
-
-      tailer.write_timestamp(start_ts) unless options[:skip_tail]
-    end
-
-    def import_collection(ns, collection)
-      log.info("Importing for #{ns}...")
-      count = 0
-      batch = []
-      table = @sql.table_for_ns(ns)
-      table.truncate unless options[:no_drop_tables]
-
-      start = Time.now
-      sql_time = 0
-      collection.find(nil, :batch_size => BATCH) do |cursor|
-        with_retries do
-          cursor.each do |obj|
-            batch << @schemamap.transform(ns, obj)
-            count += 1
-
-            if batch.length >= BATCH
-              sql_time += track_time do
-                bulk_upsert(table, ns, batch)
-              end
-              elapsed = Time.now - start
-              log.info("Imported #{count} rows (#{elapsed}s, #{sql_time}s SQL)...")
-              batch.clear
-              exit(0) if @done
-            end
-          end
-        end
-      end
-
-      unless batch.empty?
-        bulk_upsert(table, ns, batch)
-      end
-    end
-
-    def optail
-      tailer.tail_from(options[:tail_from] ?
-                       BSON::Timestamp.new(options[:tail_from].to_i, 0) :
-                       nil)
-      until @done
-        tailer.stream(1000) do |op|
-          handle_op(op)
-        end
-      end
-    end
-
-    def sync_object(ns, _id)
-      primary_sql_key = @schemamap.primary_sql_key_for_ns(ns)
-      sqlid = @sql.transform_one_ns(ns, { '_id' => _id })[primary_sql_key]
-      obj = collection_for_ns(ns).find_one({:_id => _id})
-      if obj
-        @sql.upsert_ns(ns, obj)
-      else
-        @sql.table_for_ns(ns).where(primary_sql_key.to_sym => sqlid).delete()
-      end
-    end
-
-    def handle_op(op)
-      log.debug("processing op: #{op.inspect}")
-      unless op['ns'] && op['op']
-        log.warn("Weird op: #{op.inspect}")
-        return
-      end
-
-      unless @schemamap.find_ns(op['ns'])
-        log.debug("Skipping op for unknown ns #{op['ns']}...")
-        return
-      end
-
-      ns = op['ns']
-      dbname, collection_name = ns.split(".", 2)
-
-      case op['op']
-      when 'n'
-        log.debug("Skipping no-op #{op.inspect}")
-      when 'i'
-        if collection_name == 'system.indexes'
-          log.info("Skipping index update: #{op.inspect}")
-        else
-          @sql.upsert_ns(ns, op['o'])
-        end
-      when 'u'
-        selector = op['o2']
-        update = op['o']
-        if update.keys.any? { |k| k.start_with? '$' }
-          log.debug("resync #{ns}: #{selector['_id']} (update was: #{update.inspect})")
-          sync_object(ns, selector['_id'])
-        else
-          log.debug("upsert #{ns}: _id=#{selector['_id']}")
-
-          # The update operation replaces the existing object, but
-          # preserves its _id field, so grab the _id off of the
-          # 'query' field -- it's not guaranteed to be present on the
-          # update.
-          update = { '_id' => selector['_id'] }.merge(update)
-          @sql.upsert_ns(ns, update)
-        end
-      when 'd'
-        if options[:ignore_delete]
-          log.debug("Ignoring delete op on #{ns} as instructed.")
-        else
-          @sql.delete_ns(ns, op['o'])
-        end
-      else
-        log.info("Skipping unknown op #{op.inspect}")
+        @streamer.optail
       end
     end
   end
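The `Streamer` class that `run` now delegates to is not included in this diff (it lives in a new `lib/mosql/streamer.rb`). Going only by the calls made above (`stop`, `import`, `optail`, and the keyword hash passed to `Streamer.new`) and the `until @done` loop it replaces, a hedged sketch of the shape it must have; this is a hypothetical reconstruction, not the shipped code:

```
module MoSQL
  class Streamer
    def initialize(opts)
      @options = opts[:options]
      @tailer  = opts[:tailer]
      @mongo   = opts[:mongo]
      @sql     = opts[:sql]
      @schema  = opts[:schema]
      @done    = false
    end

    # Called from the CLI's signal handler in place of the old
    # `@done = true`, so shutdown is now the streamer's concern.
    def stop
      @done = true
    end

    def import
      # create the SQL schema and bulk-import, as the removed
      # initial_import/import_collection helpers did
    end

    def handle_op(op)
      # apply one oplog entry to SQL, as the removed CLI#handle_op did
    end

    def optail
      # same loop shape as the removed CLI#optail
      until @done
        @tailer.stream(1000) { |op| handle_op(op) }
      end
    end
  end
end
```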
data/lib/mosql/schema.rb CHANGED
@@ -10,9 +10,9 @@ module MoSQL
       if ent.is_a?(Hash) && ent[:source].is_a?(String) && ent[:type].is_a?(String)
         # new configuration format
         array << {
-          :source => ent.delete(:source),
-          :type => ent.delete(:type),
-          :name => ent.first.first,
+          :source => ent.fetch(:source),
+          :type => ent.fetch(:type),
+          :name => (ent.keys - [:source, :type]).first,
         }
       elsif ent.is_a?(Hash) && ent.keys.length == 1 && ent.values.first.is_a?(String)
         array << {
@@ -21,7 +21,7 @@ module MoSQL
           :type => ent.first.last
         }
       else
-        raise "Invalid ordered hash entry #{ent.inspect}"
+        raise SchemaError.new("Invalid ordered hash entry #{ent.inspect}")
       end
 
     end
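The `fetch`/key-subtraction rewrite makes parsing the long-form column entry non-destructive and order-independent. A standalone restatement of what the parser sees for one such entry and what it now extracts (`ent` is what YAML like `- id:` / `:source: _id` / `:type: TEXT` loads into):

```
# One long-form column entry as loaded from the collection map:
ent = { 'id' => nil, :source => '_id', :type => 'TEXT' }

{
  :source => ent.fetch(:source),  # KeyError if absent; initialize (below)
  :type   => ent.fetch(:type),    # rescues it and raises a SchemaError
  :name   => (ent.keys - [:source, :type]).first,
}
# => {:source=>"_id", :type=>"TEXT", :name=>"id"}
```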
@@ -32,7 +32,7 @@ module MoSQL
       seen = Set.new
       spec[:columns].each do |col|
         if seen.include?(col[:source])
-          raise "Duplicate source #{col[:source]} in column definition #{col[:name]} for #{ns}."
+          raise SchemaError.new("Duplicate source #{col[:source]} in column definition #{col[:name]} for #{ns}.")
         end
         seen.add(col[:source])
       end
@@ -40,44 +40,75 @@ module MoSQL
 
     def parse_spec(ns, spec)
       out = spec.dup
-      out[:columns] = to_array(spec[:columns])
+      out[:columns] = to_array(spec.fetch(:columns))
       check_columns!(ns, out)
       out
     end
 
+    def parse_meta(meta)
+      meta = {} if meta.nil?
+      meta[:alias] = [] unless meta.key?(:alias)
+      meta[:alias] = [meta[:alias]] unless meta[:alias].is_a?(Array)
+      meta[:alias] = meta[:alias].map { |r| Regexp.new(r) }
+      meta
+    end
+
     def initialize(map)
       @map = {}
       map.each do |dbname, db|
-        @map[dbname] ||= {}
+        @map[dbname] = { :meta => parse_meta(db[:meta]) }
        db.each do |cname, spec|
-          @map[dbname][cname] = parse_spec("#{dbname}.#{cname}", spec)
+          next unless cname.is_a?(String)
+          begin
+            @map[dbname][cname] = parse_spec("#{dbname}.#{cname}", spec)
+          rescue KeyError => e
+            raise SchemaError.new("In spec for #{dbname}.#{cname}: #{e}")
+          end
         end
       end
     end
 
     def create_schema(db, clobber=false)
-      @map.values.map(&:values).flatten.each do |collection|
-        meta = collection[:meta]
-        log.info("Creating table '#{meta[:table]}'...")
-        db.send(clobber ? :create_table! : :create_table?, meta[:table]) do
-          collection[:columns].each do |col|
-            column col[:name], col[:type]
-
-            if col[:source].to_sym == :_id
-              primary_key [col[:name].to_sym]
+      @map.values.each do |dbspec|
+        dbspec.each do |n, collection|
+          next unless n.is_a?(String)
+          meta = collection[:meta]
+          log.info("Creating table '#{meta[:table]}'...")
+          db.send(clobber ? :create_table! : :create_table?, meta[:table]) do
+            collection[:columns].each do |col|
+              opts = {}
+              if col[:source] == '$timestamp'
+                opts[:default] = Sequel.function(:now)
+              end
+              column col[:name], col[:type], opts
+
+              if col[:source].to_sym == :_id
+                primary_key [col[:name].to_sym]
+              end
+            end
+            if meta[:extra_props]
+              column '_extra_props', 'TEXT'
             end
           end
-          if meta[:extra_props]
-            column '_extra_props', 'TEXT'
-          end
         end
       end
     end
 
+    def find_db(db)
+      unless @map.key?(db)
+        @map[db] = @map.values.find do |spec|
+          spec && spec[:meta][:alias].any? { |a| a.match(db) }
+        end
+      end
+      @map[db]
+    end
+
     def find_ns(ns)
       db, collection = ns.split(".")
-      schema = (@map[db] || {})[collection]
-      if schema.nil?
+      unless spec = find_db(db)
+        return nil
+      end
+      unless schema = spec[collection]
         log.debug("No mapping for ns: #{ns}")
         return nil
       end
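The new db-level `:meta` block plus `find_db` exist so that many identically-shaped databases (say, one per customer) can share a single mapping through `:alias` regexps, without listing every database by name. A hedged sketch of a collection map exercising it; all database, table, and column names here are invented:

```
require 'yaml'
require 'mosql'

map = YAML.load(<<-EOS)
blog:
  :meta:
    :alias: "blog_[0-9]+"
  posts:
    :columns:
      - id:
        :source: _id
        :type: TEXT
      - title: TEXT
    :meta:
      :table: posts
EOS

schema = MoSQL::Schema.new(map)
schema.find_ns('blog.posts')    # direct lookup
schema.find_ns('blog_42.posts') # matched via the :alias regexp and cached
```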
@@ -109,6 +140,15 @@ module MoSQL
       val
     end
 
+    def fetch_special_source(obj, source)
+      case source
+      when "$timestamp"
+        Sequel.function(:now)
+      else
+        raise SchemaError.new("Unknown source: #{source}")
+      end
+    end
+
     def transform(ns, obj, schema=nil)
       schema ||= find_ns!(ns)
 
@@ -119,10 +159,16 @@ module MoSQL
         source = col[:source]
         type = col[:type]
 
-        v = fetch_and_delete_dotted(obj, source)
-        case v
-        when BSON::Binary, BSON::ObjectId
-          v = v.to_s
+        if source.start_with?("$")
+          v = fetch_special_source(obj, source)
+        else
+          v = fetch_and_delete_dotted(obj, source)
+          case v
+          when BSON::Binary, BSON::ObjectId, Symbol
+            v = v.to_s
+          when Hash, Array
+            v = JSON.dump(v)
+          end
         end
         row << v
       end
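Two behavioral widenings land in `transform` here: symbols now stringify the same way ObjectIds do, and embedded hashes/arrays are stored as JSON text instead of breaking the row. A standalone restatement of the new conversion logic:

```
require 'json'
require 'bson' # the 1.x-era bson gem used by mosql

def sql_value(v) # same case logic as the new transform branch
  case v
  when BSON::Binary, BSON::ObjectId, Symbol
    v.to_s
  when Hash, Array
    JSON.dump(v)
  else
    v
  end
end

sql_value(BSON::ObjectId.new)   # => hex string such as "4f2d8b..."
sql_value(:draft)               # => "draft"
sql_value('tags' => %w[a b])    # => "{\"tags\":[\"a\",\"b\"]}"
```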
@@ -130,10 +176,18 @@ module MoSQL
       if schema[:meta][:extra_props]
         # Kludgily delete binary blobs from _extra_props -- they may
         # contain invalid UTF-8, which to_json will not properly encode.
+        extra = {}
         obj.each do |k,v|
-          obj.delete(k) if v.is_a?(BSON::Binary)
+          case v
+          when BSON::Binary
+            next
+          when Float
+            # NaN is illegal in JSON. Translate into null.
+            v = nil if v.nan?
+          end
+          extra[k] = v
         end
-        row << obj.to_json
+        row << JSON.dump(extra)
       end
 
       log.debug { "Transformed: #{row.inspect}" }
@@ -141,10 +195,14 @@ module MoSQL
       row
     end
 
-    def all_columns(schema)
+    def copy_column?(col)
+      col[:source] != '$timestamp'
+    end
+
+    def all_columns(schema, copy=false)
       cols = []
       schema[:columns].each do |col|
-        cols << col[:name]
+        cols << col[:name] unless copy && !copy_column?(col)
       end
       if schema[:meta][:extra_props]
         cols << "_extra_props"
@@ -152,11 +210,15 @@ module MoSQL
       cols
     end
 
+    def all_columns_for_copy(schema)
+      all_columns(schema, true)
+    end
+
     def copy_data(db, ns, objs)
       schema = find_ns!(ns)
       db.synchronize do |pg|
         sql = "COPY \"#{schema[:meta][:table]}\" " +
-          "(#{all_columns(schema).map {|c| "\"#{c}\""}.join(",")}) FROM STDIN"
+          "(#{all_columns_for_copy(schema).map {|c| "\"#{c}\""}.join(",")}) FROM STDIN"
         pg.execute(sql)
         objs.each do |o|
           pg.put_copy_data(transform_to_copy(ns, o, schema) + "\n")
@@ -178,13 +240,15 @@ module MoSQL
         't'
       when false
         'f'
+      when Sequel::SQL::Function
+        nil
       else
         val.to_s.gsub(/([\\\t\n\r])/, '\\\\\\1')
       end
     end
 
     def transform_to_copy(ns, row, schema=nil)
-      row.map { |c| quote_copy(c) }.join("\t")
+      row.map { |c| quote_copy(c) }.compact.join("\t")
     end
 
     def table_for_ns(ns)
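With this hunk all the `$timestamp` pieces are in place: `create_schema` gives such a column a `DEFAULT now()`, `transform` emits `Sequel.function(:now)` for it, and the COPY path simply omits it (`all_columns_for_copy` drops it from the column list, `quote_copy` maps the function object to `nil`, and `transform_to_copy`'s `.compact` removes it), letting Postgres fill in the default. A standalone restatement; the `nil => \N` branch is COPY's text-format convention rather than something visible in this hunk:

```
require 'sequel'

def quote_copy(val) # same shape as the adapter's quote_copy
  case val
  when nil then "\\N"
  when true then 't'
  when false then 'f'
  when Sequel::SQL::Function then nil # dropped below; the column default fills it
  else
    val.to_s.gsub(/([\\\t\n\r])/, '\\\\\\1')
  end
end

row = ['someid', 'Hello', Sequel.function(:now)]
row.map { |c| quote_copy(c) }.compact.join("\t")
# => "someid\tHello" -- the function column vanishes from the COPY line,
#    matching the shortened column list, and Postgres applies now()
```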
data/lib/mosql/sql.rb CHANGED
@@ -35,7 +35,7 @@ module MoSQL
 
     def upsert_ns(ns, obj)
       h = transform_one_ns(ns, obj)
-      upsert(table_for_ns(ns), @schema.primary_sql_key_for_ns(ns), h)
+      upsert!(table_for_ns(ns), @schema.primary_sql_key_for_ns(ns), h)
     end
 
     # obj must contain an _id field. All other fields will be ignored.
@@ -46,26 +46,27 @@ module MoSQL
       table_for_ns(ns).where(primary_sql_key.to_sym => h[primary_sql_key]).delete
     end
 
-    def upsert(table, table_primary_key, item)
-      begin
-        upsert!(table, table_primary_key, item)
-      rescue Sequel::DatabaseError => e
-        wrapped = e.wrapped_exception
-        if wrapped.result
-          log.warn("Ignoring row (#{table_primary_key}=#{item[table_primary_key]}): #{e}")
-        else
-          raise e
+    def upsert!(table, table_primary_key, item)
+      rows = table.where(table_primary_key.to_sym => item[table_primary_key]).update(item)
+      if rows == 0
+        begin
+          table.insert(item)
+        rescue Sequel::DatabaseError => e
+          raise e unless self.class.duplicate_key_error?(e)
+          log.info("RACE during upsert: Upserting #{item} into #{table}: #{e}")
         end
+      elsif rows > 1
+        log.warn("Huh? Updated #{rows} > 1 rows: upsert(#{table}, #{item})")
       end
     end
 
-    def upsert!(table, table_primary_key, item)
-      begin
-        table.insert(item)
-      rescue Sequel::DatabaseError => e
-        raise e unless e.message =~ /duplicate key value violates unique constraint/
-        table.where(table_primary_key.to_sym => item[table_primary_key]).update(item)
-      end
+    def self.duplicate_key_error?(e)
+      # c.f. http://www.postgresql.org/docs/9.2/static/errcodes-appendix.html
+      # for the list of error codes.
+      #
+      # No thanks to Sequel and pg for making it easy to figure out
+      # how to get at this error code....
+      e.wrapped_exception.result.error_field(PG::Result::PG_DIAG_SQLSTATE) == "23505"
    end
   end
 end
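The new `upsert!` is update-first: UPDATE by primary key, INSERT only when no row matched, and treat a duplicate-key failure on that INSERT as a benign race with a concurrent writer, detected by SQLSTATE rather than by fragile message matching. A minimal sketch of the same idiom against a bare Sequel dataset; the connection string, table, and values are hypothetical:

```
require 'sequel'
require 'pg'

db = Sequel.connect('postgres:///mosql_test') # hypothetical database
posts = db[:blog_posts]

item = { 'id' => 'abc123', 'title' => 'Hello' }
rows = posts.where(:id => item['id']).update(item)
if rows == 0
  begin
    posts.insert(item)
  rescue Sequel::DatabaseError => e
    # SQLSTATE 23505 = unique_violation: another writer inserted the same
    # key between our UPDATE and INSERT; their row wins, which is fine.
    raise unless e.wrapped_exception.result.error_field(PG::Result::PG_DIAG_SQLSTATE) == '23505'
  end
end
```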