couchdb_to_sql 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rubocop.yml +33 -0
- data/.rubocop_todo.yml +39 -0
- data/.ruby-version +1 -0
- data/.travis.yml +12 -0
- data/.vscode/launch.json +46 -0
- data/Gemfile +11 -0
- data/LICENSE +24 -0
- data/README.md +163 -0
- data/Rakefile +28 -0
- data/VERSION +1 -0
- data/couchdb_to_sql.gemspec +32 -0
- data/examples/feed.rb +22 -0
- data/exe/couchdb_to_sql +23 -0
- data/lib/couchdb_to_sql.rb +42 -0
- data/lib/couchdb_to_sql/changes.rb +286 -0
- data/lib/couchdb_to_sql/document_handler.rb +88 -0
- data/lib/couchdb_to_sql/schema.rb +30 -0
- data/lib/couchdb_to_sql/table_builder.rb +112 -0
- data/lib/couchdb_to_sql/table_deleted_marker.rb +49 -0
- data/lib/couchdb_to_sql/table_destroyer.rb +22 -0
- data/lib/couchdb_to_sql/table_operator.rb +36 -0
- data/test/functional/functional_changes_test.rb +36 -0
- data/test/test_helper.rb +30 -0
- data/test/unit/changes_test.rb +129 -0
- data/test/unit/document_handler_test.rb +79 -0
- data/test/unit/schema_test.rb +52 -0
- data/test/unit/table_builder_test.rb +199 -0
- data/test/unit/table_destroyer_test.rb +65 -0
- metadata +233 -0
data/exe/couchdb_to_sql
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'couchdb_to_sql'
|
6
|
+
|
7
|
+
# Command line entry point: expects the path of a Ruby configuration file as
# the first argument, evaluates it inside the CouchdbToSql module and then
# starts processing.
configuration_file_name = ARGV[0]

unless configuration_file_name
  puts
  puts "Syntax: #{$PROGRAM_NAME} <configuration_file.rb>\n"
  puts 'For the exact syntax of the configuration file, please consult the documentation or the web site: ' \
       "https://github.com/ecraft/couchdb_to_sql\n\n"
  exit 1
end

# Take in the arguments for the configuration file and try to run it
CouchdbToSql.logger.info "Reading configuration: #{configuration_file_name}"

# NOTE: the configuration file is executed as Ruby code in the context of the
# CouchdbToSql module, so it must come from a trusted location.
# File.read is used so that the file handle is closed as soon as the content
# has been read.
CouchdbToSql.module_eval(File.read(configuration_file_name), configuration_file_name)

# With the configuration loaded, start her up!
CouchdbToSql.start
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Low level requirements
|
4
|
+
require 'active_support/core_ext/object/blank'
|
5
|
+
require 'active_support/inflector'
|
6
|
+
require 'couchrest'
|
7
|
+
require 'httpclient'
|
8
|
+
require 'json'
|
9
|
+
require 'logging_library'
|
10
|
+
require 'set'
|
11
|
+
require 'sequel'
|
12
|
+
|
13
|
+
# Our stuff
|
14
|
+
require 'couchdb_to_sql/changes'
|
15
|
+
require 'couchdb_to_sql/schema'
|
16
|
+
require 'couchdb_to_sql/document_handler'
|
17
|
+
require 'couchdb_to_sql/table_builder'
|
18
|
+
require 'couchdb_to_sql/table_deleted_marker'
|
19
|
+
require 'couchdb_to_sql/table_destroyer'
|
20
|
+
|
21
|
+
module CouchdbToSql
|
22
|
+
extend LoggingLibrary::Loggable
|
23
|
+
|
24
|
+
Error = Class.new(StandardError)
|
25
|
+
InvalidDataError = Class.new(Error)
|
26
|
+
|
27
|
+
COUCHDB_TO_SQL_SEQUENCES_TABLE = :_couchdb_to_sql_sequences
|
28
|
+
|
29
|
+
module_function
|
30
|
+
|
31
|
+
# Registers a new Changes definition for the given CouchDB database.
#
# @param database the CouchDB database argument forwarded to Changes.new
# @param block the configuration block forwarded to Changes.new
# @return [Array] the list of all Changes instances registered so far
def changes(database, &block)
  @changes ||= []
  @changes << Changes.new(database, &block)
end
|
34
|
+
|
35
|
+
# Starts every registered Changes instance in its own thread and blocks until
# all of those threads have finished.
#
# When no `changes` definition has been registered there is nothing to run and
# the method returns immediately instead of failing on a nil list.
#
# @return [Array<Thread>] the threads that were started (empty when nothing
#   was registered)
def start
  workers = (@changes || []).map do |changes|
    Thread.new(changes, &:start)
  end
  workers.each(&:join)
end
|
42
|
+
end
|
@@ -0,0 +1,286 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CouchdbToSql
|
4
|
+
class Changes
|
5
|
+
COUCHDB_HEARTBEAT = 30
|
6
|
+
INACTIVITY_TIMEOUT = 70
|
7
|
+
RECONNECT_TIMEOUT = 15
|
8
|
+
|
9
|
+
attr_reader :source, :schemas, :handlers
|
10
|
+
|
11
|
+
attr_accessor :highest_sequence
|
12
|
+
|
13
|
+
# Sets up a new Changes instance: creates the CouchRest database handle for
# `opts`, prepares the HTTP client used for the changes feed and evaluates the
# configuration block against this instance.
#
# @param opts passed as-is to CouchRest.database
# @param block the DSL block; evaluated with instance_eval while DSL mode is on
# @raise [RuntimeError] if no block is given
def initialize(opts = '', &block)
  raise 'Block required for changes!' unless block_given?

  @schemas = {}
  @handlers = []
  @skip_seqs = Set.new

  @source = CouchRest.database(opts)
  @http = HTTPClient.new
  # HTTP traffic is dumped to STDOUT whenever the DEBUG environment variable exists
  @http.debug_dev = STDOUT if ENV.key?('DEBUG')

  log_info 'Connected to CouchDB'

  # All optional behaviours are off until the configuration block enables them
  @ember_pouch_mode = @fail_on_unhandled_document = @upsert_mode = false

  # Prepare the definitions; while @dsl_mode is true the dual-purpose flag
  # methods act as setters.
  @dsl_mode = true
  instance_eval(&block)
  @dsl_mode = false
end
|
36
|
+
|
37
|
+
#### DSL
|
38
|
+
|
39
|
+
# Enables or reports the `ember_pouch_mode` flag. In `ember-pouch` mode, all the data fields are expected to
# reside within a `data` node in the document. More information on `ember-pouch` can be found
# [here](https://github.com/nolanlawson/ember-pouch).
#
# @note Dual-purpose method: while the configuration block is being evaluated it switches the flag on,
#   afterwards it only returns the current value.
# @return [Boolean] the current value of the flag
def ember_pouch_mode
  return @ember_pouch_mode unless @dsl_mode

  @ember_pouch_mode ||= true
end
|
51
|
+
|
52
|
+
# Enables or reports the `upsert_mode` flag. When running in upsert mode, Sequel's insert_conflict mode is
# being used. More information about that can be found
# [here](http://sequel.jeremyevans.net/rdoc/files/doc/postgresql_rdoc.html#label-INSERT+ON+CONFLICT+Support)
#
# @note Dual-purpose method: while the configuration block is being evaluated it switches the flag on,
#   afterwards it only returns the current value.
# @return [Boolean] the current value of the flag
def upsert_mode
  return @upsert_mode unless @dsl_mode

  @upsert_mode ||= true
end
|
64
|
+
|
65
|
+
# Enables or reports the "fail on unhandled document" flag, which turns the error that is normally only
# logged for an unhandled document into a runtime exception.
#
# @note Dual-purpose method: while the configuration block is being evaluated it switches the flag on,
#   afterwards it only returns the current value.
# @return [Boolean] the current value of the flag
def fail_on_unhandled_document
  return @fail_on_unhandled_document unless @dsl_mode

  @fail_on_unhandled_document ||= true
end
|
76
|
+
|
77
|
+
# Configures or returns the SQL database connection.
#
# With `opts`, a Sequel connection is opened unless one already exists, and
# the stored sequence number is looked up (or created) afterwards. Without
# `opts` the previously configured connection is returned unchanged.
#
# @note Dual-purpose method, accepts configuration of database
#   or returns a previous definition.
# @param opts connection settings handed to Sequel.connect, or nil to only read
# @return the configured connection, or nil if none has been configured yet
def database(opts = nil)
  if opts
    unless @database
      connection = Sequel.connect(opts)

      # SQL logging is only attached when SEQUEL_LOG_LEVEL is present in the environment
      if ENV.key?('SEQUEL_LOG_LEVEL')
        sequel_logger = LoggingLibrary::LoggerFactory.create(self.class.name)
        sequel_logger.level = ENV['SEQUEL_LOG_LEVEL'].to_s.downcase.to_sym
        connection.logger = sequel_logger
      end

      @database = connection
    end

    find_or_create_sequence_number
  end

  @database
end
|
94
|
+
|
95
|
+
# Registers a handler for documents matching `filter`.
#
# @param filter [Hash] passed to DocumentHandler.new together with this instance
# @param block the handler definition forwarded to DocumentHandler.new
# @return [Array] all handlers registered so far
def document(filter = {}, &block)
  handler = DocumentHandler.new(self, filter, &block)
  @handlers << handler
end
|
98
|
+
|
99
|
+
# Adds the sequence numbers listed in a JSON file to the set of sequences that
# are ignored while processing the changes feed.
#
# @param file_path [String] path of a file whose content is parsed as JSON
# @return [Set] the combined set of sequences to skip
def skip_seqs_file(file_path)
  listed_seqs = Set.new(JSON.parse(File.read(file_path)))
  @skip_seqs = @skip_seqs | listed_seqs
end
|
104
|
+
|
105
|
+
#### END DSL
|
106
|
+
|
107
|
+
# Returns the Schema wrapper for the given table, creating and caching it on
# first use.
#
# @param name [String, Symbol] table name; cached under its symbol form
# @return [Schema] the cached schema for the table
def schema(name)
  cache_key = name.to_sym
  @schemas[cache_key] ||= Schema.new(database, name)
end
|
110
|
+
|
111
|
+
# Begins consuming the CouchDB changes feed by delegating to #perform_request.
# By this stage a sequence id should already be known, so the feed can be
# resumed from it, and all the document filters should have been prepared.
def start
  perform_request
end
|
117
|
+
|
118
|
+
# Logs `message` at debug level, prefixed with the CouchDB source name.
def log_debug(message)
  logger.debug format('%s: %s', source.name, message)
end

# Logs `message` at info level, prefixed with the CouchDB source name.
def log_info(message)
  logger.info format('%s: %s', source.name, message)
end

# Logs `message` at error level, prefixed with the CouchDB source name.
def log_error(message)
  logger.error format('%s: %s', source.name, message)
end
|
129
|
+
|
130
|
+
protected
|
131
|
+
|
132
|
+
# Consumes the continuous `_changes` feed of the source database and hands
# every received row to #process_row.
#
# The method keeps reconnecting: when the feed ends, or when HTTPClient
# reports a timeout or a bad response, it logs the event, pauses for
# RECONNECT_TIMEOUT seconds and connects again.
#
# @raise [RuntimeError] if no highest sequence is known yet
def perform_request
  raise 'Internal error: Highest_sequence is expected to be non-nil' unless highest_sequence

  log_info "listening to changes feed from sequence number: #{highest_sequence}"

  url = File.join(source.root.to_s, '_changes')
  uri = URI.parse(url)

  # Authenticate?
  @http.set_auth(source.root, uri.user, uri.password) if uri.user.present? && uri.password.present?

  num_rows = 0

  loop do
    # Built for every (re)connection so that `since` always carries the latest
    # processed sequence instead of the one known when the method was entered.
    query = {
      feed: 'continuous',
      heartbeat: COUCHDB_HEARTBEAT * 1000,
      include_docs: true,
      since: highest_sequence
    }

    # Rows are newline-terminated JSON documents. A chunk is not guaranteed to
    # end on a row boundary, so incomplete data is kept until its newline arrives.
    pending = +''

    # Perform the actual request for chunked content
    @http.get_content(url, query) do |chunk|
      pending << chunk

      while (newline_at = pending.index("\n"))
        line = pending.slice!(0..newline_at).chomp
        next if line.empty? # blank lines carry no row

        process_row(JSON.parse(line))

        num_rows += 1
        log_info "Processed #{num_rows} rows" if (num_rows % 10_000).zero?
      end
    end

    log_error "connection ended, attempting to reconnect in #{RECONNECT_TIMEOUT}s..."
    sleep RECONNECT_TIMEOUT
  end
rescue HTTPClient::TimeoutError, HTTPClient::BadResponseError => e
  log_error "connection failed: #{e.message}, attempting to reconnect in #{RECONNECT_TIMEOUT}s..."
  sleep RECONNECT_TIMEOUT
  retry
end
|
174
|
+
|
175
|
+
# Applies a single parsed row of the changes feed.
#
# Design documents and rows whose sequence is listed in the skip set are
# ignored. Rows without an id are only logged when they carry a `last_seq`.
# Every other row is processed inside one database transaction: deleted
# documents are marked as deleted by all handlers, changed documents are
# deleted and re-inserted by the matching handlers, and the stored sequence
# number is advanced.
#
# @param row [Hash] a parsed row of the changes feed
# @raise [InvalidDataError] if no handler matches a changed document and
#   `fail_on_unhandled_document` is enabled
def process_row(row)
  id = row['id']
  seq = row['seq']

  # Only ids that really begin with "_design" are design documents; a prefix
  # check (instead of a line-anchored regexp) avoids matching after a newline.
  return if id&.start_with?('_design')
  return if @skip_seqs.include?(seq)

  unless id
    # Sometimes CouchDB will send an update to keep the connection alive
    log_info "received last seq: #{row['last_seq']}" if row['last_seq']
    return
  end

  # Wrap the whole request in a transaction
  database.transaction do
    doc = fetch_document_from(row)

    if row['deleted']
      log_info "received DELETE seq. #{seq} id: #{id}"
      handlers.each { |handler| handler.mark_as_deleted(doc) }
    else
      log_debug "received CHANGE seq. #{seq} id: #{id}"

      document_handlers = find_document_handlers(doc)
      if document_handlers.empty?
        message = 'No document handlers found for document. ' \
                  "Document data: #{doc.inspect}, seq: #{seq}, source: #{@source.name}"
        raise InvalidDataError, message if fail_on_unhandled_document

        log_error message
      end

      document_handlers.each do |handler|
        # Delete all previous entries of doc, then re-create
        handler.delete(doc)
        handler.insert(doc)
      end
    end

    update_sequence_table(seq)
  end
end
|
216
|
+
|
217
|
+
# Extracts the document from a changes-feed row, applying the ember-pouch
# transformation when that mode is enabled.
#
# @param row [Hash] a parsed row; must contain the 'doc' key
# @return [Hash] the (possibly transformed) document
# @raise [KeyError] if the row has no 'doc' key
def fetch_document_from(row)
  raw_document = row.fetch('doc')
  ember_pouch_mode ? ember_pouch_transform_document(raw_document) : raw_document
end
|
226
|
+
|
227
|
+
# Flattens an ember-pouch style document.
#
# If the document has a 'data' key, an 'id' entry is added that holds the part
# of '_id' after the first '_2_' separator, the 'data' entry is removed from
# the document and its content is merged into the returned hash. Documents
# without a 'data' key are returned untouched.
#
# @param doc [Hash] the CouchDB document; modified in place when it has 'data'
# @return [Hash] the flattened document
def ember_pouch_transform_document(doc)
  return doc unless doc.key?('data')

  doc['id'] = doc['_id'].split('_2_', 2).last
  payload = doc.delete('data')
  doc.merge(payload)
end
|
235
|
+
|
236
|
+
# Selects the registered handlers that declare they handle the given document.
#
# @param document [Hash] the document to match against each handler
# @return [Array] the matching handlers, in registration order
def find_document_handlers(document)
  @handlers.select { |candidate| candidate.handles?(document) }
end
|
239
|
+
|
240
|
+
# Loads the highest processed sequence number of this source from the
# bookkeeping table, creating the table and/or the row for this source when
# they do not exist yet.
#
# The row is created whenever it is missing, not only together with the table:
# otherwise a second CouchDB source sharing the same SQL database would never
# get a row, and the plain UPDATE issued in non-upsert mode would silently
# persist nothing for it.
#
# Sets `highest_sequence` to the stored value, or to '0' for a new source.
def find_or_create_sequence_number
  create_sequence_table unless database.table_exists?(CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE)

  row = sequence_table.where(couchdb_database_name: source.name).first
  sequence_table.insert(couchdb_database_name: source.name, created_at: DateTime.now) unless row

  self.highest_sequence = (row ? row.fetch(:highest_sequence) : '0')
end
|
249
|
+
|
250
|
+
# Persists a new highest sequence number for this source and remembers it on
# the instance.
#
# In upsert mode the row is written through Sequel's `insert_conflict`, so it
# is created when missing. Otherwise the existing row of this source is
# updated. Both paths record the time of the change in `updated_at`.
#
# @param new_highest_sequence the sequence value of the row just processed
def update_sequence_table(new_highest_sequence)
  now = DateTime.now

  if upsert_mode
    data = {
      couchdb_database_name: source.name,
      highest_sequence: new_highest_sequence,
      updated_at: now
    }
    sequence_table
      .insert_conflict(target: :couchdb_database_name, update: data)
      .insert(data.merge(created_at: now))
  else
    sequence_table
      .where(couchdb_database_name: source.name)
      .update(highest_sequence: new_highest_sequence, updated_at: now)
  end

  self.highest_sequence = new_highest_sequence
end
|
268
|
+
|
269
|
+
# Creates the bookkeeping table that stores, per CouchDB database name, the
# highest sequence number processed so far together with two timestamps.
def create_sequence_table
  table_name = CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE

  database.create_table(table_name) do
    # The CouchDB database name identifies the row
    column :couchdb_database_name, String, primary_key: true
    column :highest_sequence, String, default: '0', null: false
    column :created_at, DateTime
    column :updated_at, DateTime
  end
end
|
277
|
+
|
278
|
+
# Looks up the bookkeeping table on the configured database.
#
# @return whatever `database[...]` yields for the sequences table name
def sequence_table
  table_name = CouchdbToSql::COUCHDB_TO_SQL_SEQUENCES_TABLE
  database[table_name]
end
|
281
|
+
|
282
|
+
# The logger used by the log_* helpers: the one exposed by the CouchdbToSql module.
def logger
  CouchdbToSql.logger
end
|
285
|
+
end
|
286
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CouchdbToSql
  #
  # Applies a user-supplied handler block to a document in one of three modes:
  # insertion, deletion or 'marking as deleted'.
  #
  # The block is evaluated in the context of the handler; its `table` calls are
  # delegated to the `Table*` class that matches the current mode.
  #
  class DocumentHandler
    attr_reader :changes, :filter, :mode
    attr_accessor :document

    # @param changes the owning Changes instance (provides schema and database)
    # @param filter [Hash] field values a document must have to be handled
    # @param block the handler definition, evaluated once per operation
    def initialize(changes, filter = {}, &block)
      @changes = changes
      @filter = filter
      @_block = block
      @mode = nil
    end

    # @param doc [Hash] document with string keys
    # @return [Boolean] true when every filter entry equals the document's value
    #   for the stringified key (always true for an empty filter)
    def handles?(doc)
      @filter.all? { |key, expected| doc[key.to_s] == expected }
    end

    ### START DSL

    # Handle a table definition by running the operation that belongs to the
    # current mode. Outside of any mode nothing happens and nil is returned.
    def table(name, opts = {}, &block)
      case @mode
      when :delete
        TableDestroyer.new(self, name, opts).execute
      when :mark_as_deleted
        TableDeletedMarker.new(self, name, opts).execute
      when :insert
        TableBuilder.new(self, name, opts, &block).execute
      end
    end

    ### END DSL

    # @return [DocumentHandler] the handler itself
    def handler
      self
    end

    # @return [Array] always empty for the top-level handler
    def primary_keys
      []
    end

    # @return [Hash] always empty for the top-level handler
    def key_filter
      {}
    end

    # @return the '_id' of the current document
    def id
      document['_id']
    end

    # @return the '_rev' of the current document
    def rev
      document['_rev']
    end

    # Runs the handler block in :insert mode for the given document.
    def insert(document)
      evaluate_block_in_mode(:insert, document)
    end

    # Runs the handler block in :delete mode for the given document.
    def delete(document)
      evaluate_block_in_mode(:delete, document)
    end

    # Runs the handler block in :mark_as_deleted mode for the given document.
    def mark_as_deleted(document)
      evaluate_block_in_mode(:mark_as_deleted, document)
    end

    # @return the schema for the named table, as provided by the owning Changes
    def schema(name)
      changes.schema(name)
    end

    # @return the database of the owning Changes instance
    def database
      changes.database
    end

    private

    # Remembers mode and document, then evaluates the handler block against
    # this instance and returns the block's result.
    def evaluate_block_in_mode(mode, document)
      @mode = mode
      self.document = document
      instance_eval(&@_block)
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CouchdbToSql
  # Thin wrapper around the schema information a database reports for one
  # table, giving direct access to the column names and their definitions.
  class Schema
    attr_accessor :name, :database, :columns, :column_names

    # Reads the table's schema from the database right away.
    #
    # @param database object answering `schema(table_name)` and `[table_name]`
    # @param name [String, Symbol] table name, stored as a symbol
    def initialize(database, name)
      @name = name.to_sym
      @database = database
      @columns = {}
      @column_names = []
      parse_schema
    end

    # @return the dataset the database returns for this table name
    def dataset
      database[name]
    end

    protected

    # Records, for every schema row, its first element as the column name and
    # its second element as that column's definition.
    def parse_schema
      database.schema(name).each do |row|
        column_name = row[0]
        column_names << column_name
        columns[column_name] = row[1]
      end
    end
  end
end
|