couchdb_to_sql 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rubocop.yml +33 -0
- data/.rubocop_todo.yml +39 -0
- data/.ruby-version +1 -0
- data/.travis.yml +12 -0
- data/.vscode/launch.json +46 -0
- data/Gemfile +11 -0
- data/LICENSE +24 -0
- data/README.md +163 -0
- data/Rakefile +28 -0
- data/VERSION +1 -0
- data/couchdb_to_sql.gemspec +32 -0
- data/examples/feed.rb +22 -0
- data/exe/couchdb_to_sql +23 -0
- data/lib/couchdb_to_sql.rb +42 -0
- data/lib/couchdb_to_sql/changes.rb +286 -0
- data/lib/couchdb_to_sql/document_handler.rb +88 -0
- data/lib/couchdb_to_sql/schema.rb +30 -0
- data/lib/couchdb_to_sql/table_builder.rb +112 -0
- data/lib/couchdb_to_sql/table_deleted_marker.rb +49 -0
- data/lib/couchdb_to_sql/table_destroyer.rb +22 -0
- data/lib/couchdb_to_sql/table_operator.rb +36 -0
- data/test/functional/functional_changes_test.rb +36 -0
- data/test/test_helper.rb +30 -0
- data/test/unit/changes_test.rb +129 -0
- data/test/unit/document_handler_test.rb +79 -0
- data/test/unit/schema_test.rb +52 -0
- data/test/unit/table_builder_test.rb +199 -0
- data/test/unit/table_destroyer_test.rb +65 -0
- metadata +233 -0
data/exe/couchdb_to_sql
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'couchdb_to_sql'
|
6
|
+
|
7
|
+
# Path of the Ruby configuration file, taken from the first command line argument.
configuration_file_name = ARGV[0]

# Without a configuration file there is nothing to run: print the usage and exit with an error status.
unless configuration_file_name
  puts
  puts "Syntax: #{$PROGRAM_NAME} <configuration_file.rb>\n"
  puts 'For the exact syntax of the configuration file, please consult the documentation or the web site: ' \
       "https://github.com/ecraft/couchdb_to_sql\n\n"
  exit 1
end

# Take in the arguments for the configuration file and try to run it
CouchdbToSql.logger.info "Reading configuration: #{configuration_file_name}"

# The configuration is evaluated in the context of the CouchdbToSql module. File.read is used instead of
# File.open(...).read so that the file handle is closed once the contents have been loaded.
CouchdbToSql.module_eval(File.read(configuration_file_name), configuration_file_name)

# With the configuration loaded, start her up!
CouchdbToSql.start
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Low level requirements
|
4
|
+
require 'active_support/core_ext/object/blank'
|
5
|
+
require 'active_support/inflector'
|
6
|
+
require 'couchrest'
|
7
|
+
require 'httpclient'
|
8
|
+
require 'json'
|
9
|
+
require 'logging_library'
|
10
|
+
require 'set'
|
11
|
+
require 'sequel'
|
12
|
+
|
13
|
+
# Our stuff
|
14
|
+
require 'couchdb_to_sql/changes'
|
15
|
+
require 'couchdb_to_sql/schema'
|
16
|
+
require 'couchdb_to_sql/document_handler'
|
17
|
+
require 'couchdb_to_sql/table_builder'
|
18
|
+
require 'couchdb_to_sql/table_deleted_marker'
|
19
|
+
require 'couchdb_to_sql/table_destroyer'
|
20
|
+
|
21
|
+
module CouchdbToSql
|
22
|
+
extend LoggingLibrary::Loggable
|
23
|
+
|
24
|
+
Error = Class.new(StandardError)
|
25
|
+
InvalidDataError = Class.new(Error)
|
26
|
+
|
27
|
+
COUCHDB_TO_SQL_SEQUENCES_TABLE = :_couchdb_to_sql_sequences
|
28
|
+
|
29
|
+
module_function
|
30
|
+
|
31
|
+
# Registers one changes feed definition. `database` and the block are handed to Changes.new unchanged.
#
# Returns the list of all feed definitions registered so far.
def changes(database, &block)
  @changes ||= []
  definition = Changes.new(database, &block)
  @changes.push(definition)
end
|
34
|
+
|
35
|
+
# Starts every registered changes feed in its own thread and blocks until all of them have finished.
#
# When no feed has been registered, @changes is still nil; that case is treated as an empty list
# instead of failing with a NoMethodError.
#
# Returns the list of (joined) threads.
def start
  threads = Array(@changes).map { |changes| Thread.new(changes, &:start) }
  threads.each(&:join)
end
|
42
|
+
end
|
@@ -0,0 +1,286 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CouchdbToSql
|
4
|
+
class Changes
|
5
|
+
COUCHDB_HEARTBEAT = 30
|
6
|
+
INACTIVITY_TIMEOUT = 70
|
7
|
+
RECONNECT_TIMEOUT = 15
|
8
|
+
|
9
|
+
attr_reader :source, :schemas, :handlers
|
10
|
+
|
11
|
+
attr_accessor :highest_sequence
|
12
|
+
|
13
|
+
# Start a new Changes instance by connecting to the provided
|
14
|
+
# CouchDB to see if the database exists.
|
15
|
+
def initialize(opts = '', &block)
  raise 'Block required for changes!' unless block

  # Containers that the DSL block fills in.
  @schemas = {}
  @handlers = []
  @skip_seqs = Set.new

  # All DSL flags are off until the configuration block switches them on.
  @ember_pouch_mode = false
  @fail_on_unhandled_document = false
  @upsert_mode = false

  @source = CouchRest.database(opts)
  @http = HTTPClient.new
  @http.debug_dev = STDOUT if ENV.key?('DEBUG')

  log_info 'Connected to CouchDB'

  # Prepare the definitions; while @dsl_mode is set, the flag methods act as setters.
  @dsl_mode = true
  instance_eval(&block)
  @dsl_mode = false
end
|
36
|
+
|
37
|
+
#### DSL
|
38
|
+
|
39
|
+
# Sets the `ember_pouch_mode` flag. In `ember-pouch` mode, all the data fields are expected to reside within a
|
40
|
+
# `data` node in the document. More information on `ember-pouch` can be found
|
41
|
+
# [here](https://github.com/nolanlawson/ember-pouch).
|
42
|
+
#
|
43
|
+
# @note Dual-purpose method, accepts configuration of setting or returns a previous definition.
|
44
|
+
def ember_pouch_mode
  # Outside of the DSL block this is a plain reader.
  return @ember_pouch_mode unless @dsl_mode

  # Inside the DSL block, mentioning the flag switches it on.
  @ember_pouch_mode ||= true
end
|
51
|
+
|
52
|
+
# Sets the `upsert_mode` flag. When running in upsert mode, Sequel's insert_conflict mode is being used. More information
|
53
|
+
# about that can be found
|
54
|
+
# [here](http://sequel.jeremyevans.net/rdoc/files/doc/postgresql_rdoc.html#label-INSERT+ON+CONFLICT+Support)
|
55
|
+
#
|
56
|
+
# @note Dual-purpose method, accepts configuration of setting or returns a previous definition.
|
57
|
+
def upsert_mode
  # In DSL mode the call enables the flag; otherwise the current value is returned untouched.
  @dsl_mode ? (@upsert_mode ||= true) : @upsert_mode
end
|
64
|
+
|
65
|
+
# Sets the "fail on unhandled document" flag, which will turn log errors into runtime exceptions if an unhandled document is
|
66
|
+
# encountered.
|
67
|
+
#
|
68
|
+
# @note Dual-purpose method, accepts configuration of setting or returns a previous definition.
|
69
|
+
def fail_on_unhandled_document
  # Only a call made from within the DSL block switches the flag on.
  @fail_on_unhandled_document ||= true if @dsl_mode

  @fail_on_unhandled_document
end
|
76
|
+
|
77
|
+
# @note Dual-purpose method, accepts configuration of database
|
78
|
+
# or returns a previous definition.
|
79
|
+
def database(opts = nil)
  # Reader form: no options given, hand back whatever connection was configured earlier (possibly nil).
  return @database unless opts

  # Setter form: connect once and keep the connection for later calls.
  unless @database
    connection = Sequel.connect(opts)

    # SQL logging is opt-in via the SEQUEL_LOG_LEVEL environment variable.
    if ENV.key?('SEQUEL_LOG_LEVEL')
      sequel_logger = LoggingLibrary::LoggerFactory.create(self.class.name)
      sequel_logger.level = ENV['SEQUEL_LOG_LEVEL'].to_s.downcase.to_sym
      connection.logger = sequel_logger
    end

    @database = connection
  end

  # Load (or initialise) the sequence number this feed resumes from.
  find_or_create_sequence_number

  @database
end
|
94
|
+
|
95
|
+
# DSL: registers a document handler for documents matching `filter`; the block is passed on to the handler.
#
# Returns the list of all handlers registered so far.
def document(filter = {}, &block)
  handler = DocumentHandler.new(self, filter, &block)
  @handlers.push(handler)
end
|
98
|
+
|
99
|
+
# DSL: reads a JSON file and adds the sequences it lists to the set of sequences that process_row ignores.
#
# Returns the updated set.
def skip_seqs_file(file_path)
  listed_seqs = JSON.parse(File.read(file_path))
  @skip_seqs = @skip_seqs.union(Set.new(listed_seqs))
end
|
104
|
+
|
105
|
+
#### END DSL
|
106
|
+
|
107
|
+
# Returns the Schema wrapper for the table `name`, building it on first use and caching it per table name.
def schema(name)
  table_key = name.to_sym
  @schemas[table_key] ||= Schema.new(database, name)
end
|
110
|
+
|
111
|
+
# Start listening to the CouchDB changes feed. By this stage we should have
|
112
|
+
# a sequence id so we know where to start from and all the filters should
|
113
|
+
# have been prepared.
|
114
|
+
def start
  # The whole feed loop, including reconnects, lives in perform_request.
  perform_request
end
|
117
|
+
|
118
|
+
# Logs `message` at debug level, prefixed with the name of the CouchDB source database.
def log_debug(message)
  prefixed_message = "#{source.name}: #{message}"
  logger.debug prefixed_message
end
|
121
|
+
|
122
|
+
# Logs `message` at info level, prefixed with the name of the CouchDB source database.
def log_info(message)
  prefixed_message = "#{source.name}: #{message}"
  logger.info prefixed_message
end
|
125
|
+
|
126
|
+
# Logs `message` at error level, prefixed with the name of the CouchDB source database.
def log_error(message)
  prefixed_message = "#{source.name}: #{message}"
  logger.error prefixed_message
end
|
129
|
+
|
130
|
+
protected
|
131
|
+
|
132
|
+
# Streams the continuous `_changes` feed of the source database and hands every received row to
# process_row. When the connection ends or fails, the request is re-issued after RECONNECT_TIMEOUT seconds.
#
# Raises a RuntimeError when no highest_sequence has been loaded yet.
def perform_request
  raise 'Internal error: Highest_sequence is expected to be non-nil' unless highest_sequence

  url = File.join(source.root.to_s, '_changes')
  uri = URI.parse(url)

  # Authenticate?
  @http.set_auth(source.root, uri.user, uri.password) if uri.user.present? && uri.password.present?

  num_rows = 0

  loop do
    log_info "listening to changes feed from sequence number: #{highest_sequence}"

    # Build the query for every (re)connect so that `since` always carries the latest processed
    # sequence; a query built only once would replay already processed changes after a reconnect.
    query = {
      feed: 'continuous',
      heartbeat: COUCHDB_HEARTBEAT * 1000,
      include_docs: true,
      since: highest_sequence
    }

    # A chunk is not guaranteed to end on a row boundary, so the unfinished tail of each chunk is kept
    # here until the rest of that line arrives.
    buffer = String.new

    # Perform the actual request for chunked content
    @http.get_content(url, query) do |chunk|
      buffer << chunk
      lines = buffer.split("\n", -1)
      buffer = lines.pop || String.new

      lines.each do |line|
        row = line.strip
        # Empty lines (e.g. heartbeats) carry no JSON.
        next if row.empty?

        process_row(JSON.parse(row))

        num_rows += 1
        log_info "Processed #{num_rows} rows" if (num_rows % 10_000) == 0
      end
    end

    log_error "connection ended, attempting to reconnect in #{RECONNECT_TIMEOUT}s..."
    sleep RECONNECT_TIMEOUT
  end
rescue HTTPClient::TimeoutError, HTTPClient::BadResponseError => e
  log_error "connection failed: #{e.message}, attempting to reconnect in #{RECONNECT_TIMEOUT}s..."
  sleep RECONNECT_TIMEOUT
  # Deliberately unbounded: the feed is meant to keep running until the process is stopped.
  retry
end
|
174
|
+
|
175
|
+
# Applies a single parsed row of the changes feed to the SQL database.
#
# Design documents and sequences listed in @skip_seqs are ignored. Rows without an id are only logged
# when they carry a `last_seq`. Every other row is handled inside one database transaction, which also
# stores the row's sequence.
def process_row(row)
  id = row['id']
  seq = row['seq']

  return if id =~ /^_design/
  return if @skip_seqs.include?(seq)

  unless id
    return unless row['last_seq']

    # Sometimes CouchDB will send an update to keep the connection alive
    return log_info("received last seq: #{row['last_seq']}")
  end

  # Wrap the whole request in a transaction
  database.transaction do
    doc = fetch_document_from(row)

    if row['deleted']
      log_info "received DELETE seq. #{seq} id: #{id}"
      handlers.each { |handler| handler.mark_as_deleted(doc) }
    else
      log_debug "received CHANGE seq. #{seq} id: #{id}"

      matching_handlers = find_document_handlers(doc)
      if matching_handlers.empty?
        message = 'No document handlers found for document. ' \
                  "Document data: #{doc.inspect}, seq: #{seq}, source: #{@source.name}"
        raise InvalidDataError, message if fail_on_unhandled_document

        log_error message
      end

      matching_handlers.each do |handler|
        # Delete all previous entries of doc, then re-create
        handler.delete(doc)
        handler.insert(doc)
      end
    end

    update_sequence_table(seq)
  end
end
|
216
|
+
|
217
|
+
# Extracts the document from a changes row (KeyError when the row has no 'doc') and, in ember-pouch
# mode, passes it through ember_pouch_transform_document.
def fetch_document_from(row)
  raw_document = row.fetch('doc')
  ember_pouch_mode ? ember_pouch_transform_document(raw_document) : raw_document
end
|
226
|
+
|
227
|
+
# Flattens an ember-pouch style document: the fields of the nested 'data' node are merged into the
# top level, and an 'id' field is added holding the part of '_id' after the first '_2_' separator.
# Fields inside 'data' win over equally named top-level fields.
#
# Documents without a 'data' key are returned as they are. The passed-in hash is never modified; a
# new hash is returned instead. A 'data' node of nil is treated as empty.
def ember_pouch_transform_document(doc)
  return doc unless doc.key?('data')

  flattened = doc.reject { |key, _value| key == 'data' }
  flattened['id'] = doc['_id'].split('_2_', 2).last
  flattened.merge(doc['data'] || {})
end
|
235
|
+
|
236
|
+
# Returns the registered handlers whose filter accepts `document`, in registration order.
def find_document_handlers(document)
  @handlers.find_all { |candidate| candidate.handles?(document) }
end
|
239
|
+
|
240
|
+
# Loads the sequence this feed resumes from into highest_sequence, creating the bookkeeping table and
# this source database's row in it when they do not exist yet.
#
# The row is created whenever it is missing, not only when the table itself was just created:
# otherwise a second CouchDB database sharing the table would have no row, and update_sequence_table
# (outside of upsert mode) would update nothing.
def find_or_create_sequence_number
  create_sequence_table unless database.table_exists?(CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE)

  row = sequence_table.where(couchdb_database_name: source.name).first
  unless row
    sequence_table.insert(couchdb_database_name: source.name, created_at: DateTime.now)
    # Read the row back so that the column default for highest_sequence is picked up.
    row = sequence_table.where(couchdb_database_name: source.name).first
  end

  self.highest_sequence = (row ? row.fetch(:highest_sequence) : '0')
end
|
249
|
+
|
250
|
+
# Persists `new_highest_sequence` as the last processed sequence of the source database and mirrors it
# in highest_sequence.
#
# In upsert mode the row is inserted or, on conflict, updated; otherwise the existing row is updated.
# Both paths stamp `updated_at`, so the column no longer stays empty outside of upsert mode.
def update_sequence_table(new_highest_sequence)
  now = DateTime.now

  if upsert_mode
    data = {
      couchdb_database_name: source.name,
      highest_sequence: new_highest_sequence,
      updated_at: now
    }
    sequence_table
      .insert_conflict(target: :couchdb_database_name, update: data)
      .insert(data.merge(created_at: now))
  else
    sequence_table
      .where(couchdb_database_name: source.name)
      .update(highest_sequence: new_highest_sequence, updated_at: now)
  end

  self.highest_sequence = new_highest_sequence
end
|
268
|
+
|
269
|
+
# Creates the bookkeeping table: one row per CouchDB database name (the primary key) holding the
# highest processed sequence plus creation/update timestamps.
def create_sequence_table
  table_name = CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE

  database.create_table(table_name) do
    column :couchdb_database_name, String, primary_key: true
    column :highest_sequence, String, default: '0', null: false
    column :created_at, DateTime
    column :updated_at, DateTime
  end
end
|
277
|
+
|
278
|
+
# Looks up the bookkeeping table on the configured database connection.
def sequence_table
  table_name = CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE
  database[table_name]
end
|
281
|
+
|
282
|
+
def logger
  # All log helpers of this class write to the logger exposed by the CouchdbToSql module.
  CouchdbToSql.logger
end
|
285
|
+
end
|
286
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CouchdbToSql
  #
  # Handles document insertion, deletion and 'marking as deleted' operations.
  #
  # This class delegates the actual insertion, deletion etc to the various `Table*` classes.
  #
  # A handler is configured with a filter and a DSL block. For every operation the block is evaluated
  # in the context of the handler with the matching mode set, so each `table` call inside the block
  # turns into the corresponding table operation.
  #
  class DocumentHandler
    attr_reader :changes, :filter, :mode
    attr_accessor :document

    # @param changes the owning Changes instance; used to look up schemas and the database
    # @param filter [Hash] field name => required value; a document must match all entries
    # @param block the DSL block; may be omitted, in which case every operation is a no-op
    def initialize(changes, filter = {}, &block)
      @changes = changes
      @filter = filter
      @_block = block
      @mode = nil
    end

    # @return [Boolean] true when every filter entry equals the document field of the same name
    #   (filter keys are compared as strings); an empty filter matches every document
    def handles?(doc)
      @filter.all? { |field, expected| doc[field.to_s] == expected }
    end

    ### START DSL

    # Handle a table definition: dispatches to the table operation that belongs to the current mode.
    # Does nothing (and returns nil) when no mode has been set.
    def table(name, opts = {}, &block)
      case @mode
      when :delete
        TableDestroyer.new(self, name, opts).execute
      when :mark_as_deleted
        TableDeletedMarker.new(self, name, opts).execute
      when :insert
        TableBuilder.new(self, name, opts, &block).execute
      end
    end

    ### END DSL

    def handler
      self
    end

    def primary_keys
      []
    end

    def key_filter
      {}
    end

    def id
      document['_id']
    end

    def rev
      document['_rev']
    end

    # Evaluates the DSL block in insert mode for `document`.
    def insert(document)
      evaluate_in_mode(:insert, document)
    end

    # Evaluates the DSL block in delete mode for `document`.
    def delete(document)
      evaluate_in_mode(:delete, document)
    end

    # Evaluates the DSL block in mark-as-deleted mode for `document`.
    def mark_as_deleted(document)
      evaluate_in_mode(:mark_as_deleted, document)
    end

    def schema(name)
      changes.schema(name)
    end

    def database
      changes.database
    end

    private

    # Shared implementation of insert/delete/mark_as_deleted: remembers mode and document, then runs
    # the DSL block against this handler. Returns the block's result, or nil when the handler was
    # declared without a block.
    def evaluate_in_mode(mode, document)
      @mode = mode
      self.document = document
      return unless @_block

      instance_eval(&@_block)
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CouchdbToSql
  # Thin wrapper around one table of a Sequel database. On construction it reads the table's schema
  # and exposes the column names (in schema order) and the per-column details.
  class Schema
    attr_accessor :name, :database, :columns, :column_names

    # @param database the database object; must respond to `schema(table_name)` and `[]`
    # @param name the table name; stored as a symbol
    def initialize(database, name)
      @name = name.to_sym
      @database = database
      @columns = {}
      @column_names = []
      parse_schema
    end

    # @return the dataset of this table, as returned by `database[name]`
    def dataset
      database[name]
    end

    protected

    # Walks the schema rows reported by the database; the first element of each row is taken as the
    # column name and the second as its details.
    def parse_schema
      database.schema(name).each do |column_name, details|
        column_names.push(column_name)
        columns.store(column_name, details)
      end
    end
  end
end
|