couchdb_to_sql 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command line entry point: evaluates the configuration file given as the first argument inside the
# CouchdbToSql module and then starts processing.

require 'rubygems'
require 'couchdb_to_sql'

configuration_file_name = ARGV[0]

# Without a configuration file there is nothing to run: print the usage text and exit with status 1.
unless configuration_file_name
  puts
  puts "Syntax: #{$PROGRAM_NAME} <configuration_file.rb>\n"
  puts 'For the exact syntax of the configuration file, please consult the documentation or the web site: ' \
       "https://github.com/ecraft/couchdb_to_sql\n\n"
  exit 1
end

# Take in the arguments for the configuration file and try to run it
CouchdbToSql.logger.info "Reading configuration: #{configuration_file_name}"

# File.read opens, reads and closes the file in one call, so no file handle is left open.
# The file name is passed along so that errors raised by the configuration report its path.
CouchdbToSql.module_eval(File.read(configuration_file_name), configuration_file_name)

# With the configuration loaded, start her up!
CouchdbToSql.start
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Low level requirements
4
+ require 'active_support/core_ext/object/blank'
5
+ require 'active_support/inflector'
6
+ require 'couchrest'
7
+ require 'httpclient'
8
+ require 'json'
9
+ require 'logging_library'
10
+ require 'set'
11
+ require 'sequel'
12
+
13
+ # Our stuff
14
+ require 'couchdb_to_sql/changes'
15
+ require 'couchdb_to_sql/schema'
16
+ require 'couchdb_to_sql/document_handler'
17
+ require 'couchdb_to_sql/table_builder'
18
+ require 'couchdb_to_sql/table_deleted_marker'
19
+ require 'couchdb_to_sql/table_destroyer'
20
+
21
# Top-level namespace and entry point. Configuration files are evaluated in the context of this
# module, so `changes` is the method they call to declare a CouchDB source.
module CouchdbToSql
  extend LoggingLibrary::Loggable

  Error = Class.new(StandardError)
  InvalidDataError = Class.new(Error)

  # Name of the SQL table that stores the highest processed sequence per CouchDB database.
  COUCHDB_TO_SQL_SEQUENCES_TABLE = :_couchdb_to_sql_sequences

  module_function

  # Registers a CouchDB source.
  #
  # @param database passed on unchanged to Changes.new
  # @param block definition block, passed on unchanged to Changes.new
  # @return [Array<Changes>] every Changes instance registered so far
  def changes(database, &block)
    (@changes ||= []) << Changes.new(database, &block)
  end

  # Starts every registered Changes instance in a thread of its own and waits for all of them.
  #
  # @return [Array<Thread>] the joined threads; empty when no `changes` block was declared
  def start
    # @changes is still nil when the configuration declared no `changes` block at all.
    threads = (@changes || []).map { |changes| Thread.new(changes, &:start) }
    threads.each(&:join)
  end
end
@@ -0,0 +1,286 @@
1
+ # frozen_string_literal: true
2
+
3
module CouchdbToSql
  # Follows the continuous `_changes` feed of one CouchDB database, hands every received row to the
  # document handlers declared in the definition block and records the last processed sequence in
  # the SQL database.
  class Changes
    # Heartbeat interval in seconds; it is multiplied by 1000 before being sent as a query parameter.
    COUCHDB_HEARTBEAT = 30
    # Not referenced anywhere inside this class.
    INACTIVITY_TIMEOUT = 70
    # Seconds to pause before the changes feed is requested again.
    RECONNECT_TIMEOUT = 15

    attr_reader :source, :schemas, :handlers

    attr_accessor :highest_sequence

    # Start a new Changes instance and evaluate the definition block against it.
    #
    # @param opts passed on unchanged to CouchRest.database
    # @param block DSL block, evaluated with this instance as `self`
    # @raise [RuntimeError] when no block is given
    def initialize(opts = '', &block)
      raise 'Block required for changes!' unless block_given?

      @schemas = {}
      @handlers = []
      @source = CouchRest.database(opts)
      @http = HTTPClient.new
      @http.debug_dev = STDOUT if ENV.key?('DEBUG')
      @skip_seqs = Set.new

      log_info 'Connected to CouchDB'

      @ember_pouch_mode = false
      @fail_on_unhandled_document = false
      @upsert_mode = false

      # Prepare the definitions. While @dsl_mode is set, the flag methods below act as setters.
      @dsl_mode = true
      instance_eval(&block)
      @dsl_mode = false
    end

    #### DSL

    # Sets the `ember_pouch_mode` flag. In `ember-pouch` mode, all the data fields are expected to reside within a
    # `data` node in the document. More information on `ember-pouch` can be found
    # [here](https://github.com/nolanlawson/ember-pouch).
    #
    # @note Dual-purpose method, accepts configuration of setting or returns a previous definition.
    def ember_pouch_mode
      @ember_pouch_mode = true if @dsl_mode
      @ember_pouch_mode
    end

    # Sets the `upsert_mode` flag. When running in upsert mode, Sequel's insert_conflict mode is being used. More
    # information about that can be found
    # [here](http://sequel.jeremyevans.net/rdoc/files/doc/postgresql_rdoc.html#label-INSERT+ON+CONFLICT+Support)
    #
    # @note Dual-purpose method, accepts configuration of setting or returns a previous definition.
    def upsert_mode
      @upsert_mode = true if @dsl_mode
      @upsert_mode
    end

    # Sets the "fail on unhandled document" flag, which will turn log errors into runtime exceptions if an unhandled
    # document is encountered.
    #
    # @note Dual-purpose method, accepts configuration of setting or returns a previous definition.
    def fail_on_unhandled_document
      @fail_on_unhandled_document = true if @dsl_mode
      @fail_on_unhandled_document
    end

    # @note Dual-purpose method, accepts configuration of database
    #   or returns a previous definition.
    #
    # @param opts passed on to Sequel.connect; the connection is only opened on the first call
    # @return the Sequel connection, or nil when none has been configured yet
    def database(opts = nil)
      if opts
        @database ||= Sequel.connect(opts).tap do |conn|
          # A Sequel logger is only attached when a log level was requested through the environment.
          next unless ENV.key?('SEQUEL_LOG_LEVEL')

          conn.logger = LoggingLibrary::LoggerFactory.create(self.class.name).tap do |sequel_logger|
            sequel_logger.level = ENV['SEQUEL_LOG_LEVEL'].to_s.downcase.to_sym
          end
        end
        find_or_create_sequence_number
      end
      @database
    end

    # Registers a document handler for documents matching `filter`.
    #
    # @param filter [Hash] passed on to DocumentHandler.new
    # @return [Array<DocumentHandler>] all handlers registered so far
    def document(filter = {}, &block)
      @handlers << DocumentHandler.new(self, filter, &block)
    end

    # Adds the sequences listed in a JSON file to the set of sequences that are skipped.
    #
    # @param file_path [String] path of a file whose JSON content is a list of sequences
    # @return [Set] all sequences that will be skipped
    def skip_seqs_file(file_path)
      @skip_seqs.merge(JSON.parse(File.read(file_path)))
    end

    #### END DSL

    # Returns the (memoized) Schema wrapper for the given table name.
    def schema(name)
      @schemas[name.to_sym] ||= Schema.new(database, name)
    end

    # Start listening to the CouchDB changes feed. By this stage we should have
    # a sequence id so we know where to start from and all the filters should
    # have been prepared.
    def start
      perform_request
    end

    # The log helpers prefix every message with the name of the CouchDB source.
    def log_debug(message)
      logger.debug "#{source.name}: #{message}"
    end

    def log_info(message)
      logger.info "#{source.name}: #{message}"
    end

    def log_error(message)
      logger.error "#{source.name}: #{message}"
    end

    protected

    # Requests the continuous changes feed over and over again, processing each received row.
    #
    # @raise [RuntimeError] when no starting sequence has been determined yet
    def perform_request
      raise 'Internal error: Highest_sequence is expected to be non-nil' unless highest_sequence

      log_info "listening to changes feed from sequence number: #{highest_sequence}"

      url = File.join(source.root.to_s, '_changes')
      uri = URI.parse(url)

      # Authenticate?
      @http.set_auth(source.root, uri.user, uri.password) if uri.user.present? && uri.password.present?

      num_rows = 0

      loop do
        # Text received after the last newline; it is completed by the next chunk. Whatever is left
        # when the connection ends is dropped: the next request resumes from the stored sequence.
        remainder = nil

        # The query is rebuilt for every request so that a reconnect resumes from the latest
        # processed sequence instead of the one this method started with.
        @http.get_content(url, changes_query) do |chunk|
          data = remainder ? remainder + chunk : chunk
          *lines, remainder = data.split("\n", -1)

          lines.each do |line|
            # Blank lines carry no row and cannot be parsed as JSON.
            next if line.strip.empty?

            process_row(JSON.parse(line))

            num_rows += 1
            log_info "Processed #{num_rows} rows" if (num_rows % 10_000).zero?
          end
        end

        log_error "connection ended, attempting to reconnect in #{RECONNECT_TIMEOUT}s..."
        sleep RECONNECT_TIMEOUT
      end
    rescue HTTPClient::TimeoutError, HTTPClient::BadResponseError => e
      log_error "connection failed: #{e.message}, attempting to reconnect in #{RECONNECT_TIMEOUT}s..."
      sleep RECONNECT_TIMEOUT
      retry
    end

    # Query parameters for the changes feed, starting after the highest sequence processed so far.
    def changes_query
      {
        feed: 'continuous',
        heartbeat: COUCHDB_HEARTBEAT * 1000,
        include_docs: true,
        since: highest_sequence
      }
    end

    # Applies one parsed row of the changes feed. Design documents and explicitly skipped sequences
    # are ignored; rows without an id are only logged when they carry a `last_seq`.
    #
    # @param row [Hash] parsed JSON row
    # @raise [InvalidDataError] when no handler matches and `fail_on_unhandled_document` is set
    def process_row(row)
      id = row['id']
      seq = row['seq']

      return if id.to_s.start_with?('_design')
      return if @skip_seqs.include?(seq)

      unless id
        log_info "received last seq: #{row['last_seq']}" if row['last_seq']
        return
      end

      # Wrap the whole request in a transaction
      database.transaction do
        doc = fetch_document_from(row)

        if row['deleted']
          log_info "received DELETE seq. #{seq} id: #{id}"
          handlers.each { |handler| handler.mark_as_deleted(doc) }
        else
          log_debug "received CHANGE seq. #{seq} id: #{id}"
          replace_document(doc, seq)
        end

        update_sequence_table(seq)
      end
    end

    # Runs every matching handler for a changed document: previous entries are deleted, then
    # re-created. An unhandled document is logged, or raised when the configuration asks for it.
    def replace_document(doc, seq)
      document_handlers = find_document_handlers(doc)

      if document_handlers.empty?
        message = 'No document handlers found for document. ' \
                  "Document data: #{doc.inspect}, seq: #{seq}, source: #{@source.name}"
        raise InvalidDataError, message if fail_on_unhandled_document

        log_error message
      end

      document_handlers.each do |handler|
        handler.delete(doc)
        handler.insert(doc)
      end
    end

    # Extracts the document from a row, flattening it first when running in ember-pouch mode.
    #
    # @raise [KeyError] when the row has no 'doc' entry
    def fetch_document_from(row)
      doc = row.fetch('doc')
      ember_pouch_mode ? ember_pouch_transform_document(doc) : doc
    end

    # Moves the entries of the `data` node to the top level of the document and adds an `id` entry
    # holding the part of `_id` that follows the first '_2_'. Documents without `data` are returned
    # unchanged. Note that the given hash itself loses `data` and gains `id`.
    def ember_pouch_transform_document(doc)
      return doc unless doc.key?('data')

      doc['id'] = doc['_id'].split('_2_', 2).last
      doc.merge(doc.delete('data'))
    end

    def find_document_handlers(document)
      @handlers.select { |handler| handler.handles?(document) }
    end

    # Loads the stored sequence for this CouchDB database into `highest_sequence`. The sequence
    # table and the row for this database are created when missing; a row is needed because the
    # non-upsert update in `update_sequence_table` can only change an existing row.
    def find_or_create_sequence_number
      create_sequence_table unless database.table_exists?(CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE)

      row = sequence_table.where(couchdb_database_name: source.name).first
      if row
        self.highest_sequence = row.fetch(:highest_sequence)
      else
        sequence_table.insert(couchdb_database_name: source.name, highest_sequence: '0', created_at: DateTime.now)
        self.highest_sequence = '0'
      end
    end

    # Stores the new highest sequence in the SQL database and remembers it on this instance.
    def update_sequence_table(new_highest_sequence)
      now = DateTime.now

      if upsert_mode
        data = {
          couchdb_database_name: source.name,
          highest_sequence: new_highest_sequence,
          updated_at: now
        }
        sequence_table
          .insert_conflict(target: :couchdb_database_name, update: data)
          .insert(data.merge(created_at: now))
      else
        sequence_table
          .where(couchdb_database_name: source.name)
          .update(highest_sequence: new_highest_sequence, updated_at: now)
      end

      self.highest_sequence = new_highest_sequence
    end

    def create_sequence_table
      database.create_table CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE do
        String :couchdb_database_name, primary_key: true
        String :highest_sequence, default: '0', null: false
        DateTime :created_at
        DateTime :updated_at
      end
    end

    def sequence_table
      database[CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE]
    end

    def logger
      CouchdbToSql.logger
    end
  end
end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
module CouchdbToSql
  #
  # Runs a `document` definition block against a document in one of three modes: insert, delete or
  # "mark as deleted".
  #
  # The actual table work is delegated to the various `Table*` classes, selected by the current mode.
  #
  class DocumentHandler
    attr_reader :changes, :filter, :mode
    attr_accessor :document

    # @param changes the owner of this handler; `schema` and `database` are delegated to it
    # @param filter [Hash] field/value pairs a document must have to be handled
    # @param block definition block, evaluated against this handler once per operation
    def initialize(changes, filter = {}, &block)
      @changes = changes
      @filter = filter
      @_block = block
      @mode = nil
    end

    # True when every entry of the filter equals the document value stored under the stringified key.
    def handles?(doc)
      @filter.all? { |field, expected| doc[field.to_s] == expected }
    end

    ### START DSL

    # Handle a table definition by running the `Table*` class that belongs to the current mode.
    # Nothing happens (and nil is returned) while no mode has been set.
    def table(name, opts = {}, &block)
      case @mode
      when :delete
        TableDestroyer.new(self, name, opts).execute
      when :mark_as_deleted
        TableDeletedMarker.new(self, name, opts).execute
      when :insert
        TableBuilder.new(self, name, opts, &block).execute
      end
    end

    ### END DSL

    def handler
      self
    end

    def primary_keys
      []
    end

    def key_filter
      {}
    end

    # `_id` of the document currently being processed.
    def id
      document['_id']
    end

    # `_rev` of the document currently being processed.
    def rev
      document['_rev']
    end

    def insert(document)
      evaluate_definition(:insert, document)
    end

    def delete(document)
      evaluate_definition(:delete, document)
    end

    def mark_as_deleted(document)
      evaluate_definition(:mark_as_deleted, document)
    end

    def schema(name)
      changes.schema(name)
    end

    def database
      changes.database
    end

    private

    # Remembers the mode and the document, then evaluates the definition block against this handler.
    def evaluate_definition(new_mode, doc)
      @mode = new_mode
      self.document = doc
      instance_eval(&@_block)
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
module CouchdbToSql
  # Thin wrapper around the schema of one table, giving direct access to the
  # column names and to the details the database reports for each column.
  class Schema
    attr_accessor :name, :database, :columns, :column_names

    # @param database object answering `schema(name)` and `[name]`
    # @param name table name; stored as a symbol
    def initialize(database, name)
      @name = name.to_sym
      @database = database
      @columns = {}
      @column_names = []
      parse_schema
    end

    # The dataset of the wrapped table.
    def dataset
      database[name]
    end

    protected

    # Reads the table schema and records, for every row, its first element as the column name and
    # its second element as the column details.
    def parse_schema
      database.schema(name).each do |column_name, details|
        column_names.push(column_name)
        columns.store(column_name, details)
      end
    end
  end
end