couchdb_to_sql 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,23 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command line entry point: evaluates a configuration file inside the
# CouchdbToSql module and then starts the configured changes feeds.

require 'rubygems'
require 'couchdb_to_sql'

configuration_file_name = ARGV[0]

# Without a configuration file there is nothing to run: print the usage text
# and exit with a failure status.
unless configuration_file_name
  puts
  puts "Syntax: #{$PROGRAM_NAME} <configuration_file.rb>\n"
  puts 'For the exact syntax of the configuration file, please consult the documentation or the web site: ' \
       "https://github.com/ecraft/couchdb_to_sql\n\n"
  exit 1
end

# Take in the arguments for the configuration file and try to run it
CouchdbToSql.logger.info "Reading configuration: #{configuration_file_name}"

# NOTE: the configuration file is evaluated as Ruby code, so it must come from
# a trusted source. File.read closes the file once its contents are loaded;
# the file name is passed along so errors raised by the configuration refer to it.
CouchdbToSql.module_eval(File.read(configuration_file_name), configuration_file_name)

# With the configuration loaded, start her up!
CouchdbToSql.start
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Low level requirements
4
+ require 'active_support/core_ext/object/blank'
5
+ require 'active_support/inflector'
6
+ require 'couchrest'
7
+ require 'httpclient'
8
+ require 'json'
9
+ require 'logging_library'
10
+ require 'set'
11
+ require 'sequel'
12
+
13
+ # Our stuff
14
+ require 'couchdb_to_sql/changes'
15
+ require 'couchdb_to_sql/schema'
16
+ require 'couchdb_to_sql/document_handler'
17
+ require 'couchdb_to_sql/table_builder'
18
+ require 'couchdb_to_sql/table_deleted_marker'
19
+ require 'couchdb_to_sql/table_destroyer'
20
+
21
+ module CouchdbToSql
22
+ extend LoggingLibrary::Loggable
23
+
24
+ Error = Class.new(StandardError)
25
+ InvalidDataError = Class.new(Error)
26
+
27
+ COUCHDB_TO_SQL_SEQUENCES_TABLE = :_couchdb_to_sql_sequences
28
+
29
+ module_function
30
+
31
# Registers a new Changes definition for `database`; the block is handed on
# to Changes.new. Returns the list of all definitions registered so far.
def changes(database, &block)
  @changes ||= []
  @changes.push(Changes.new(database, &block))
end
34
+
35
# Starts every registered changes definition in its own thread and waits for
# all of them to finish. Returns the list of threads, which is empty when no
# definition was registered.
def start
  # @changes is only assigned by #changes, so it is still nil when the
  # configuration did not define any feed.
  feeds = @changes || []
  threads = feeds.map { |feed| Thread.new(feed, &:start) }
  threads.each(&:join)
end
42
+ end
@@ -0,0 +1,286 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CouchdbToSql
4
+ class Changes
5
+ COUCHDB_HEARTBEAT = 30
6
+ INACTIVITY_TIMEOUT = 70
7
+ RECONNECT_TIMEOUT = 15
8
+
9
+ attr_reader :source, :schemas, :handlers
10
+
11
+ attr_accessor :highest_sequence
12
+
13
# Builds a new Changes instance for the CouchDB database described by `opts`
# and evaluates the given block against the instance to collect the DSL
# definitions.
#
# Raises a RuntimeError when no block is given.
def initialize(opts = '', &block)
  raise 'Block required for changes!' if block.nil?

  @source = CouchRest.database(opts)
  @http = HTTPClient.new
  @http.debug_dev = STDOUT if ENV.key?('DEBUG')
  @schemas = {}
  @handlers = []
  @skip_seqs = Set.new

  log_info 'Connected to CouchDB'

  # All DSL flags are off until the block switches them on
  @ember_pouch_mode = false
  @fail_on_unhandled_document = false
  @upsert_mode = false

  # The dual-purpose flag methods only act as setters while @dsl_mode is true
  @dsl_mode = true
  instance_eval(&block)
  @dsl_mode = false
end
36
+
37
+ #### DSL
38
+
39
# Dual-purpose accessor for the `ember_pouch_mode` flag. In `ember-pouch` mode, all the data fields are
# expected to reside within a `data` node in the document. More information on `ember-pouch` can be found
# [here](https://github.com/nolanlawson/ember-pouch).
#
# @note While the DSL block is being evaluated, calling this method switches the flag on; outside of the
#   DSL block it only returns the current value.
def ember_pouch_mode
  return @ember_pouch_mode unless @dsl_mode

  @ember_pouch_mode ||= true
end
51
+
52
# Dual-purpose accessor for the `upsert_mode` flag. When running in upsert mode, Sequel's insert_conflict
# mode is being used. More information about that can be found
# [here](http://sequel.jeremyevans.net/rdoc/files/doc/postgresql_rdoc.html#label-INSERT+ON+CONFLICT+Support)
#
# @note While the DSL block is being evaluated, calling this method switches the flag on; outside of the
#   DSL block it only returns the current value.
def upsert_mode
  return @upsert_mode unless @dsl_mode

  @upsert_mode ||= true
end
64
+
65
# Dual-purpose accessor for the "fail on unhandled document" flag, which turns the logged error for an
# unhandled document into a runtime exception.
#
# @note While the DSL block is being evaluated, calling this method switches the flag on; outside of the
#   DSL block it only returns the current value.
def fail_on_unhandled_document
  return @fail_on_unhandled_document unless @dsl_mode

  @fail_on_unhandled_document ||= true
end
76
+
77
# Dual-purpose method. With `opts`, connects to the SQL database through Sequel (only once; later calls
# reuse the first connection) and loads the stored sequence number. Without `opts`, returns the existing
# connection, or nil when none has been configured yet.
def database(opts = nil)
  return @database unless opts

  @database ||= begin
    connection = Sequel.connect(opts)
    # SQL logging is opt-in through the SEQUEL_LOG_LEVEL environment variable
    if ENV.key?('SEQUEL_LOG_LEVEL')
      sequel_logger = LoggingLibrary::LoggerFactory.create(self.class.name)
      sequel_logger.level = ENV['SEQUEL_LOG_LEVEL'].to_s.downcase.to_sym
      connection.logger = sequel_logger
    end
    connection
  end
  find_or_create_sequence_number
  @database
end
94
+
95
# DSL: registers a DocumentHandler for documents matching `filter`. The block
# is passed on to the handler. Returns the list of registered handlers.
def document(filter = {}, &block)
  handler = DocumentHandler.new(self, filter, &block)
  @handlers << handler
end
98
+
99
# DSL: reads a JSON file at `file_path` and adds the values it contains to
# the set of sequences to skip. Returns the updated set.
def skip_seqs_file(file_path)
  seqs_from_file = JSON.parse(File.read(file_path))
  @skip_seqs = @skip_seqs | Set.new(seqs_from_file)
end
104
+
105
+ #### END DSL
106
+
107
# Returns the Schema for the table `name`, creating it on first use and
# caching it under the symbolized table name.
def schema(name)
  key = name.to_sym
  @schemas[key] ||= Schema.new(database, name)
end
110
+
111
# Begin listening to the CouchDB changes feed. At this point a sequence id
# is expected to be known, so the feed has a place to start from, and all
# filters should have been prepared.
def start
  perform_request
end
117
+
118
# Logs `message` at debug level, prefixed with the name of the source database.
def log_debug(message)
  prefixed_message = "#{source.name}: #{message}"
  logger.debug prefixed_message
end
121
+
122
# Logs `message` at info level, prefixed with the name of the source database.
def log_info(message)
  prefixed_message = "#{source.name}: #{message}"
  logger.info prefixed_message
end
125
+
126
# Logs `message` at error level, prefixed with the name of the source database.
def log_error(message)
  prefixed_message = "#{source.name}: #{message}"
  logger.error prefixed_message
end
129
+
130
+ protected
131
+
132
# Listens to the continuous `_changes` feed of the source database and hands
# every received row to #process_row. Whenever the connection ends, or fails
# with a timeout or a bad response, it reconnects after RECONNECT_TIMEOUT
# seconds.
#
# Raises a RuntimeError when no highest sequence is known. Errors from
# JSON.parse and #process_row are not rescued here.
def perform_request
  raise 'Internal error: Highest_sequence is expected to be non-nil' unless highest_sequence
  log_info "listening to changes feed from sequence number: #{highest_sequence}"

  url = File.join(source.root.to_s, '_changes')
  uri = URI.parse(url)

  # Authenticate?
  if uri.user.present? && uri.password.present?
    @http.set_auth(source.root, uri.user, uri.password)
  end

  num_rows = 0

  loop do
    # Build the query for every connection so that a reconnect resumes from
    # the latest processed sequence, not from the one that was current when
    # listening started.
    query = {
      feed: 'continuous',
      heartbeat: COUCHDB_HEARTBEAT * 1000,
      include_docs: true,
      since: highest_sequence
    }

    # Chunk boundaries do not have to coincide with row boundaries, so the
    # unfinished tail of a chunk is kept here until its newline arrives.
    # Binary strings are used so that a multi-byte character split across two
    # chunks cannot cause an encoding error.
    buffer = String.new

    # Perform the actual request for chunked content
    @http.get_content(url, query) do |chunk|
      buffer << chunk.b

      while (newline_at = buffer.index("\n"))
        line = buffer.slice!(0..newline_at).strip
        # Blank lines carry no row (presumably the CouchDB heartbeat)
        next if line.empty?

        process_row(JSON.parse(line))

        num_rows += 1
        log_info "Processed #{num_rows} rows" if (num_rows % 10_000).zero?
      end
    end

    # Whatever is left in the buffer is an unterminated row; it is dropped
    # because the next request starts again from highest_sequence.
    log_error "connection ended, attempting to reconnect in #{RECONNECT_TIMEOUT}s..."
    sleep RECONNECT_TIMEOUT
  end
rescue HTTPClient::TimeoutError, HTTPClient::BadResponseError => e
  log_error "connection failed: #{e.message}, attempting to reconnect in #{RECONNECT_TIMEOUT}s..."
  sleep RECONNECT_TIMEOUT
  retry
end
174
+
175
# Processes one parsed row of the changes feed.
#
# Rows for design documents and rows whose sequence is in the skip set are
# ignored. A row with an id is handled inside a database transaction: deleted
# documents are passed to every handler's `mark_as_deleted`, other documents
# are deleted and re-inserted by each matching handler, and finally the stored
# sequence is updated. A row without an id is only logged when it carries
# `last_seq`.
#
# Raises InvalidDataError when no handler matches a document and
# `fail_on_unhandled_document` is set.
def process_row(row)
  id = row['id']
  seq = row['seq']

  # Anchor the check at the very start of the id; a `^` regexp would also
  # match after a newline inside the id.
  return if id.to_s.start_with?('_design')
  return if @skip_seqs.include?(seq)

  if id
    # Wrap the whole request in a transaction
    database.transaction do
      doc = fetch_document_from(row)

      if row['deleted']
        log_info "received DELETE seq. #{seq} id: #{id}"
        handlers.each { |handler| handler.mark_as_deleted(doc) }
      else
        log_debug "received CHANGE seq. #{seq} id: #{id}"

        document_handlers = find_document_handlers(doc)
        if document_handlers.empty?
          message = 'No document handlers found for document. ' \
                    "Document data: #{doc.inspect}, seq: #{seq}, source: #{@source.name}"
          raise InvalidDataError, message if fail_on_unhandled_document

          log_error message
        end

        document_handlers.each do |handler|
          # Delete all previous entries of doc, then re-create
          handler.delete(doc)
          handler.insert(doc)
        end
      end

      update_sequence_table(seq)
    end
  elsif row['last_seq']
    # Sometimes CouchDB will send an update to keep the connection alive
    log_info "received last seq: #{row['last_seq']}"
  end
end
216
+
217
# Extracts the document from a changes feed row, applying the ember-pouch
# transformation when `ember_pouch_mode` is enabled.
#
# Raises KeyError when the row has no 'doc' entry.
def fetch_document_from(row)
  doc = row.fetch('doc')
  ember_pouch_mode ? ember_pouch_transform_document(doc) : doc
end
226
+
227
# Flattens an ember-pouch style document. When the document has a 'data'
# node, its 'id' is set to the part of '_id' after the first '_2_' separator,
# the 'data' node is removed from the document, and a new hash with the data
# fields merged in is returned. Note that the given document is modified.
# Documents without a 'data' node are returned untouched.
def ember_pouch_transform_document(doc)
  return doc unless doc.key?('data')

  doc['id'] = doc['_id'].split('_2_', 2).last
  data_fields = doc.delete('data')
  doc.merge(data_fields)
end
235
+
236
# Returns the registered handlers whose `handles?` accepts `document`.
def find_document_handlers(document)
  @handlers.select do |handler|
    handler.handles?(document)
  end
end
239
+
240
# Loads the stored sequence for the source database into `highest_sequence`,
# creating the sequence table and the row for this source when they are
# missing.
def find_or_create_sequence_number
  table_name = CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE
  create_sequence_table unless database.table_exists?(table_name)

  row = sequence_table.where(couchdb_database_name: source.name).first

  # The row must exist even when the table was created earlier (e.g. for
  # another source): the plain UPDATE in #update_sequence_table would
  # otherwise match nothing and the sequence would never be stored.
  if row.nil?
    sequence_table.insert(couchdb_database_name: source.name, created_at: DateTime.now)
    row = sequence_table.where(couchdb_database_name: source.name).first
  end

  self.highest_sequence = (row ? row.fetch(:highest_sequence) : '0')
end
249
+
250
# Stores `new_highest_sequence` for the source database in the sequence table
# and remembers it in `highest_sequence`.
#
# In upsert mode the row is inserted, or updated when a row for the source
# already exists; otherwise the existing row is updated. Both paths set
# `updated_at`.
def update_sequence_table(new_highest_sequence)
  now = DateTime.now

  if upsert_mode
    data = {
      couchdb_database_name: source.name,
      highest_sequence: new_highest_sequence,
      updated_at: now
    }
    sequence_table
      .insert_conflict(target: :couchdb_database_name, update: data)
      .insert(data.merge(created_at: now))
  else
    sequence_table
      .where(couchdb_database_name: source.name)
      .update(highest_sequence: new_highest_sequence, updated_at: now)
  end

  self.highest_sequence = new_highest_sequence
end
268
+
269
# Creates the sequence table: one row per CouchDB database name (the primary
# key) holding the highest sequence as a string, plus creation and update
# timestamps.
def create_sequence_table
  table_name = CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE

  database.create_table(table_name) do
    column :couchdb_database_name, String, primary_key: true
    column :highest_sequence, String, default: '0', null: false
    column :created_at, DateTime
    column :updated_at, DateTime
  end
end
277
+
278
# Returns the sequence table as looked up on the configured database.
def sequence_table
  table_name = CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE
  database[table_name]
end
281
+
282
# The logger used by the log_* helpers: the one exposed by the CouchdbToSql
# module.
def logger
  CouchdbToSql.logger
end
285
+ end
286
+ end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
module CouchdbToSql
  #
  # Runs the insert, delete and 'mark as deleted' operations for documents
  # that match a filter.
  #
  # The operations themselves are carried out by the various `Table*`
  # classes, which are invoked from the `table` calls in the handler's block.
  #
  class DocumentHandler
    attr_reader :changes, :filter, :mode
    attr_accessor :document

    # `changes` is the owning object (used for `schema` and `database`),
    # `filter` maps document fields to required values, and the block holds
    # the definitions that are evaluated for every operation.
    def initialize(changes, filter = {}, &block)
      @changes = changes
      @filter = filter
      @_block = block
      @mode = nil
    end

    # True when every filter entry equals the document's value for that key
    # (keys are looked up as strings). An empty filter matches everything.
    def handles?(doc)
      @filter.all? { |key, expected| doc[key.to_s] == expected }
    end

    ### START DSL

    # Handle a table definition: what happens depends on the operation
    # currently being run. Nothing is done outside of an operation.
    def table(name, opts = {}, &block)
      case @mode
      when :delete
        TableDestroyer.new(self, name, opts).execute
      when :mark_as_deleted
        TableDeletedMarker.new(self, name, opts).execute
      when :insert
        TableBuilder.new(self, name, opts, &block).execute
      end
    end

    ### END DSL

    def handler
      self
    end

    def primary_keys
      []
    end

    def key_filter
      {}
    end

    # The `_id` of the current document
    def id
      document['_id']
    end

    # The `_rev` of the current document
    def rev
      document['_rev']
    end

    # Evaluates the handler block for `document` in insert mode
    def insert(document)
      evaluate_block_in_mode(:insert, document)
    end

    # Evaluates the handler block for `document` in delete mode
    def delete(document)
      evaluate_block_in_mode(:delete, document)
    end

    # Evaluates the handler block for `document` in 'mark as deleted' mode
    def mark_as_deleted(document)
      evaluate_block_in_mode(:mark_as_deleted, document)
    end

    def schema(name)
      changes.schema(name)
    end

    def database
      changes.database
    end

    private

    # Remembers the operation and the document, then evaluates the handler
    # block against this instance and returns the block's result.
    def evaluate_block_in_mode(new_mode, current_document)
      @mode = new_mode
      self.document = current_document
      instance_eval(&@_block)
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
module CouchdbToSql
  # Wraps the schema of a Sequel table so that its column names and column
  # details are easy to get at.
  class Schema
    attr_accessor :name, :database, :columns, :column_names

    # Reads the schema of table `name` from `database` right away.
    def initialize(database, name)
      self.name = name.to_sym
      self.database = database
      self.columns = {}
      self.column_names = []
      parse_schema
    end

    # The dataset for the table, as looked up on the database
    def dataset
      database[name]
    end

    protected

    # Collects the column names, in the order reported by the database, and
    # the details of every column keyed by its name.
    def parse_schema
      database.schema(name).each do |column_name, column_details|
        column_names.push(column_name)
        columns.store(column_name, column_details)
      end
    end
  end
end