embulk-output-documentdb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +5 -0
- data/ChangeLog +4 -0
- data/Gemfile +2 -0
- data/LICENSE.txt +21 -0
- data/README.md +114 -0
- data/Rakefile +3 -0
- data/VERSION +1 -0
- data/embulk-output-documentdb.gemspec +21 -0
- data/lib/embulk/output/documentdb.rb +166 -0
- data/lib/embulk/output/documentdb/client.rb +167 -0
- data/lib/embulk/output/documentdb/constants.rb +10 -0
- data/lib/embulk/output/documentdb/header.rb +55 -0
- data/lib/embulk/output/documentdb/partitioned_coll_client.rb +62 -0
- data/lib/embulk/output/documentdb/resource.rb +40 -0
- data/samples/config-csv2docdb_partitionedcoll.yml +33 -0
- data/samples/config-csv2docdb_singlecoll.yml +31 -0
- data/samples/sample_01.csv +6 -0
- metadata +117 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cdf1c91d03258737edcf353cdfaa4ad87ddba0b2
|
4
|
+
data.tar.gz: 4a96c42c177b9693c66bfa412f720880073b16d7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d632dcfc63aa637e55838ac223327f9b5fc884ba0684edcb408dfab5ab3882ef237950941c158c7c25544c7969ed3b611ebd537aa00c8a62b3431905ddf9e6ec
|
7
|
+
data.tar.gz: a1ded833cbd20cb47dcc417fdea5c3c37aedfbfe169d2008a631bc9c800953d2d8b1b136cac661416d352cc51c57b2d8e7849e3e9ff0ff88eb5d75b52720ec9f
|
data/ChangeLog
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
|
2
|
+
MIT License
|
3
|
+
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
a copy of this software and associated documentation files (the
|
6
|
+
"Software"), to deal in the Software without restriction, including
|
7
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
# Azure DocumentDB output plugin for Embulk
|
2
|
+
|
3
|
+
embulk-output-documentdb is an Embulk output plugin that dumps records to Azure DocumentDB.
|
4
|
+
|
5
|
+
## Overview
|
6
|
+
|
7
|
+
* **Plugin type**: output
|
8
|
+
* **Load all or nothing**: no
|
9
|
+
* **Resume supported**: no
|
10
|
+
* **Cleanup supported**: yes
|
11
|
+
|
12
|
+
## Installation
|
13
|
+
|
14
|
+
$ gem install embulk-output-documentdb
|
15
|
+
|
16
|
+
## Configuration
|
17
|
+
|
18
|
+
### DocumentDB
|
19
|
+
|
20
|
+
To use Microsoft Azure DocumentDB, you must create a DocumentDB database account using either the Azure portal, Azure Resource Manager templates, or the Azure command-line interface (CLI). In addition, you must have a database and a collection to which embulk-output-documentdb writes records. Here are instructions:
|
21
|
+
|
22
|
+
* Create a DocumentDB database account using [the Azure portal](https://azure.microsoft.com/en-us/documentation/articles/documentdb-create-account/), or [Azure Resource Manager templates and Azure CLI](https://azure.microsoft.com/en-us/documentation/articles/documentdb-automation-resource-manager-cli/)
|
23
|
+
* [How to create a database for DocumentDB](https://azure.microsoft.com/en-us/documentation/articles/documentdb-create-database/)
|
24
|
+
* [Create a DocumentDB collection](https://azure.microsoft.com/en-us/documentation/articles/documentdb-create-collection/)
|
25
|
+
* [Partitioning and scaling in Azure DocumentDB](https://azure.microsoft.com/en-us/documentation/articles/documentdb-partition-data/)
|
26
|
+
|
27
|
+
## Configuration
|
28
|
+
|
29
|
+
```yaml
|
30
|
+
out:
|
31
|
+
type: documentdb
|
32
|
+
docdb_endpoint: https://yoichikademo0.documents.azure.com:443/
|
33
|
+
docdb_account_key: EMwUa3EzsAtJ1qYfzxo9nQ3KudofsXNm3xLh1SLffKkUHMFl80OZRZIVu4lxdKRKxkgVAj0c2mv9BZSyMN7tdg==
|
34
|
+
docdb_database: myembulkdb
|
35
|
+
docdb_collection: myembulkcoll
|
36
|
+
auto_create_database: true
|
37
|
+
auto_create_collection: true
|
38
|
+
partitioned_collection: false
|
39
|
+
key_column: id
|
40
|
+
```
|
41
|
+
|
42
|
+
* **docdb\_endpoint (required)** - Azure DocumentDB Account endpoint URI
|
43
|
+
* **docdb\_account\_key (required)** - Azure DocumentDB Account key (master key). You must NOT set a read-only key
|
44
|
+
* **docdb\_database (required)** - DocumentDB database name
|
45
|
+
* **docdb\_collection (required)** - DocumentDB collection name
|
46
|
+
* **auto\_create\_database (optional)** - Default:true. By default, DocumentDB database named **docdb\_database** will be automatically created if it does not exist
|
47
|
+
* **auto\_create\_collection (optional)** - Default:true. By default, DocumentDB collection named **docdb\_collection** will be automatically created if it does not exist
|
48
|
+
* **partitioned\_collection (optional)** - Default:false. Set true if you want to create and/or store records to partitioned collection. Set false for single-partition collection
|
49
|
+
* **partition\_key (optional)** - Default:nil. Partition key must be specified for a partitioned collection (i.e. when partitioned\_collection is set to true)
|
50
|
+
* **offer\_throughput (optional)** - Default:10100. Throughput for the collection expressed in units of 100 request units per second. This is only effective when you newly create a partitioned collection (i.e. both auto\_create\_collection and partitioned\_collection are set to true)
|
51
|
+
* **key\_column (required)** - Column name to be inserted to DocumentDB as primary key. If it's not named "id", the column name is converted into "id" (string).
|
52
|
+
|
53
|
+
## Configuration examples
|
54
|
+
|
55
|
+
Here are two example plugin configurations: one for a single-partition collection and one for a partitioned collection.
|
56
|
+
|
57
|
+
### (1) Single-Partition Collection Case
|
58
|
+
|
59
|
+
```yaml
|
60
|
+
out:
|
61
|
+
type: documentdb
|
62
|
+
docdb_endpoint: https://yoichikademo0.documents.azure.com:443/
|
63
|
+
docdb_account_key: EMwUa3EzsAtJ1qYfzxo9nQ3KudofsXNm3xLh1SLffKkUHMFl80OZRZIVu4lxdKRKxkgVAj0c2mv9BZSyMN7tdg==
|
64
|
+
docdb_database: myembulkdb
|
65
|
+
docdb_collection: myembulkcoll
|
66
|
+
auto_create_database: true
|
67
|
+
auto_create_collection: true
|
68
|
+
partitioned_collection: false
|
69
|
+
key_column: id
|
70
|
+
```
|
71
|
+
|
72
|
+
### (2) Partitioned Collection Case
|
73
|
+
|
74
|
+
```yaml
|
75
|
+
type: documentdb
|
76
|
+
docdb_endpoint: https://yoichikademo0.documents.azure.com:443/
|
77
|
+
docdb_account_key: EMwUa3EzsAtJ1qYfzxo9nQ3KudofsXNm3xLh1SLffKkUHMFl80OZRZIVu4lxdKRKxkgVAj0c2mv9BZSyMN7tdg==
|
78
|
+
docdb_database: myembulkdb
|
79
|
+
docdb_collection: myembulkcoll
|
80
|
+
auto_create_database: true
|
81
|
+
auto_create_collection: true
|
82
|
+
partitioned_collection: true
|
83
|
+
partition_key: account
|
84
|
+
offer_throughput: 10100
|
85
|
+
key_column: id
|
86
|
+
```
|
87
|
+
|
88
|
+
## Build, Install, and Run
|
89
|
+
|
90
|
+
```
|
91
|
+
$ rake
|
92
|
+
|
93
|
+
$ embulk gem install pkg/embulk-output-documentdb-0.1.0.gem
|
94
|
+
|
95
|
+
$ embulk preview config.yml
|
96
|
+
|
97
|
+
$ embulk run config.yml
|
98
|
+
|
99
|
+
```
|
100
|
+
|
101
|
+
## Contributing
|
102
|
+
|
103
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/yokawasa/embulk-output-documentdb.
|
104
|
+
|
105
|
+
## Copyright
|
106
|
+
|
107
|
+
<table>
|
108
|
+
<tr>
|
109
|
+
<td>Copyright</td><td>Copyright (c) 2016- Yoichi Kawasaki</td>
|
110
|
+
</tr>
|
111
|
+
<tr>
|
112
|
+
<td>License</td><td>MIT</td>
|
113
|
+
</tr>
|
114
|
+
</table>
|
data/Rakefile
ADDED
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# Gem specification for the embulk-output-documentdb plugin.
Gem::Specification.new do |gem|
  # Identity. The version string is read from the VERSION file, resolved
  # relative to the current working directory.
  gem.name        = "embulk-output-documentdb"
  gem.version     = File.read("VERSION").strip
  gem.authors     = ["Yoichi Kawasaki"]
  gem.email       = ["yoichi.kawasaki@outlook.com"]
  gem.summary     = "Azure DocumentDB output plugin for Embulk"
  gem.description = "Dumps records to Azure DocumentDB"
  gem.licenses    = ["MIT"]
  gem.homepage    = "https://github.com/yoichika/embulk-output-documentdb"

  # Packaged files are whatever git tracks; test files are the subset that
  # lives under test/ or spec/.
  gem.files         = `git ls-files`.split("\n")
  gem.test_files    = gem.files.grep(%r{^(test|spec)/})
  gem.require_paths = ["lib"]

  # Runtime dependency: HTTP client used for the REST calls.
  gem.add_dependency "rest-client"

  # Development-only dependencies.
  gem.add_development_dependency 'embulk',  ['>= 0.8.13']
  gem.add_development_dependency 'bundler', ['>= 1.10.6']
  gem.add_development_dependency 'rake',    ['>= 10.0']
end
|
@@ -0,0 +1,166 @@
|
|
1
|
+
module Embulk
  module Output

    require 'time'
    require 'securerandom'
    require_relative 'documentdb/client'
    require_relative 'documentdb/partitioned_coll_client'
    require_relative 'documentdb/header'
    require_relative 'documentdb/resource'

    # Embulk output plugin that writes every input record to an Azure
    # DocumentDB collection as one JSON document.
    class Documentdb < OutputPlugin
      Plugin.register_output("documentdb", self)

      # Builds the task hash from the plugin configuration, validates it and
      # runs the output tasks.
      #
      # config  - plugin configuration, read through config.param
      # schema  - input schema (not used in this method)
      # count   - task count (not used in this method)
      # control - block that receives the task hash and returns the task reports
      #
      # Raises ConfigError when a required string is empty, when partition_key
      # is missing in partitioned collection mode, or when offer_throughput is
      # below the minimum while auto_create_collection is enabled.
      # Returns an empty config diff.
      def self.transaction(config, schema, count, &control)
        # configuration code:
        task = {
          'docdb_endpoint'         => config.param('docdb_endpoint',         :string),
          'docdb_account_key'      => config.param('docdb_account_key',      :string),
          'docdb_database'         => config.param('docdb_database',         :string),
          'docdb_collection'       => config.param('docdb_collection',       :string),
          'auto_create_database'   => config.param('auto_create_database',   :bool, :default => true),
          'auto_create_collection' => config.param('auto_create_collection', :bool, :default => true),
          'partitioned_collection' => config.param('partitioned_collection', :bool, :default => false),
          'partition_key'          => config.param('partition_key',          :string, :default => nil),
          'offer_throughput'       => config.param('offer_throughput',       :integer, :default => AzureDocumentDB::PARTITIONED_COLL_MIN_THROUGHPUT),
          'key_column'             => config.param('key_column',             :string),
        }
        Embulk.logger.info "transaction start"

        # param validation
        raise ConfigError, 'no docdb_endpoint' if task['docdb_endpoint'].empty?
        raise ConfigError, 'no docdb_account_key' if task['docdb_account_key'].empty?
        raise ConfigError, 'no docdb_database' if task['docdb_database'].empty?
        raise ConfigError, 'no docdb_collection' if task['docdb_collection'].empty?
        raise ConfigError, 'no key_column' if task['key_column'].empty?

        if task['partitioned_collection']
          # partition_key defaults to nil, so check for nil before empty?.
          # (The value lives in the task hash; there is no @partition_key in
          # this class-level method.)
          partition_key = task['partition_key']
          if partition_key.nil? || partition_key.empty?
            raise ConfigError, 'partition_key must be set in partitioned collection mode'
          end
          if (task['auto_create_collection'] && task['offer_throughput'] < AzureDocumentDB::PARTITIONED_COLL_MIN_THROUGHPUT)
            raise ConfigError, sprintf("offer_throughput must be more than and equals to %s",
                      AzureDocumentDB::PARTITIONED_COLL_MIN_THROUGHPUT)
          end
        end

        # resumable output:
        # resume(task, schema, count, &control)

        # non-resumable output:
        Embulk.logger.info "Documentdb output start"
        task_reports = yield(task)
        Embulk.logger.info "Documentdb output finished. Task reports = #{task_reports.to_json}"

        next_config_diff = {}
        return next_config_diff
      end

      #def self.resume(task, schema, count, &control)
      #  task_reports = yield(task)
      #
      #  next_config_diff = {}
      #  return next_config_diff
      #end


      # init is called in initialize(task, schema, index)
      #
      # Resets the record counters, creates the DocumentDB client matching the
      # collection mode, makes sure the database and the collection exist
      # (creating them when the corresponding auto_create_* option is true) and
      # stores the collection resource in @coll_resource.
      def init
        # initialization code:
        @recordnum = 0
        @successnum = 0

        begin
          @client = nil
          if task['partitioned_collection']
            @client = AzureDocumentDB::PartitionedCollectionClient.new(task['docdb_account_key'],task['docdb_endpoint'])
          else
            @client = AzureDocumentDB::Client.new(task['docdb_account_key'],task['docdb_endpoint'])
          end

          # initial operations for database
          res = @client.find_databases_by_name(task['docdb_database'])
          if( res[:body]["_count"].to_i == 0 )
            # The name comes from the task hash; a bare docdb_database local
            # does not exist here.
            raise "No database (#{task['docdb_database']})! Enable auto_create_database or create it by yourself" if !task['auto_create_database']
            # create new database as it doesn't exists
            @client.create_database(task['docdb_database'])
          end

          # initial operations for collection
          database_resource = @client.get_database_resource(task['docdb_database'])
          res = @client.find_collections_by_name(database_resource, task['docdb_collection'])
          if( res[:body]["_count"].to_i == 0 )
            raise "No collection (#{task['docdb_collection']})! Enable auto_create_collection or create it by yourself" if !task['auto_create_collection']
            # create new collection as it doesn't exists
            if task['partitioned_collection']
              partition_key_paths = ["/#{task['partition_key']}"]
              @client.create_collection(database_resource,
                          task['docdb_collection'], partition_key_paths, task['offer_throughput'])
            else
              @client.create_collection(database_resource, task['docdb_collection'])
            end
          end
          @coll_resource = @client.get_collection_resource(database_resource, task['docdb_collection'])

        rescue Exception =>ex
          Embulk.logger.error { "Error: init: '#{ex}'" }
          # NOTE(review): exit! ends the process immediately without running
          # exit handlers; re-raising may be preferable - confirm against how
          # Embulk handles plugin initialization errors.
          exit!
        end
      end


      # Nothing to release.
      def close
      end

      # called for each page in each task
      #
      # Converts every record of the page into a hash keyed by column name,
      # forces the key column into a string "id" field and creates one document
      # per record. Records without the key column are skipped with a warning;
      # failed inserts are logged and do not stop the page.
      def add(page)
        # output code:
        page.each do |record|
          hash = Hash[schema.names.zip(record)]
          @recordnum += 1
          if !hash.key?(@task['key_column'])
            Embulk.logger.warn { "Skip Invalid Record: no key_column, data=>" + hash.to_json }
            next
          end
          unique_doc_id = "#{hash[@task['key_column']]}"
          if @task['key_column'] != 'id'
            hash.delete(@task['key_column'])
          end
          # force primary key to be both named "id" and "string" type
          hash['id'] = unique_doc_id

          begin
            if @task['partitioned_collection']
              @client.create_document(@coll_resource, unique_doc_id, hash, @task['partition_key'])
            else
              @client.create_document(@coll_resource, unique_doc_id, hash)
            end
            @successnum += 1
          rescue RestClient::ExceptionWithResponse => rcex
            # Parse defensively: an exception raised inside this rescue clause
            # would escape add and abort the whole page.
            if docdb_error_code(rcex.response) == 'Conflict'
              Embulk.logger.error { "Duplicate Error: doc id (#{unique_doc_id}) already exists, data=>" + hash.to_json }
            else
              Embulk.logger.error { "RestClient Error: '#{rcex.response}', data=>" + hash.to_json }
            end
          rescue => ex
            Embulk.logger.error { "UnknownError: '#{ex}', doc id=>#{unique_doc_id}, data=>" + hash.to_json }
          end
        end
      end

      # Nothing to flush.
      def finish
      end

      # Nothing to roll back.
      def abort
      end

      # Returns the task report: total records seen, successful inserts and the
      # difference between the two (skipped or failed records).
      def commit
        task_report = {
          "total_records" => @recordnum,
          "success" => @successnum,
          "skip_or_error" => (@recordnum - @successnum),
        }
        return task_report
      end

      private

      # Returns the "code" field of a JSON error response body, or nil when the
      # response is absent, is not valid JSON, or is not a JSON object.
      def docdb_error_code(response)
        parsed = JSON.parse(response.to_s)
        parsed.is_a?(Hash) ? parsed['code'] : nil
      rescue JSON::ParserError, TypeError
        nil
      end
    end

  end
end
|
@@ -0,0 +1,167 @@
|
|
1
|
+
require 'rest-client'
|
2
|
+
require 'json'
|
3
|
+
require_relative 'constants'
|
4
|
+
require_relative 'header'
|
5
|
+
require_relative 'resource'
|
6
|
+
|
7
|
+
module AzureDocumentDB

  # Minimal client for the DocumentDB REST endpoints used by this plugin
  # (databases, collections, documents). Requests are sent with RestClient and
  # their headers are produced by AzureDocumentDB::Header.
  class Client

    # master_key   - account key handed to the header generator
    # url_endpoint - base URL that resource paths are appended to
    def initialize (master_key, url_endpoint)
      @master_key, @url_endpoint = master_key, url_endpoint
      @header = AzureDocumentDB::Header.new(@master_key)
    end

    # Creates a database with the given id and returns the parsed JSON response.
    def create_database (database_name)
      request_headers = @header.generate('post', AzureDocumentDB::RESOURCE_TYPE_DATABASE, '',
                                         {'Content-Type' => 'application/json'})
      payload = { 'id' => database_name }.to_json
      response = RestClient.post("#{@url_endpoint}/dbs", payload, request_headers)
      JSON.parse(response)
    end

    # Queries databases whose id equals database_name.
    # Returns the {:header, :body} hash produced by _query.
    def find_databases_by_name (database_name)
      params = [ {:name=>"@id", :value=> database_name } ]
      _query(AzureDocumentDB::RESOURCE_TYPE_DATABASE, '',
             sprintf("%s/dbs", @url_endpoint),
             "SELECT * FROM root r WHERE r.id=@id", params)
    end

    # Returns a DatabaseResource for database_name, or nil (after printing a
    # notice) when the lookup reports zero matches.
    def get_database_resource (database_name)
      lookup = find_databases_by_name(database_name)
      if lookup[:body]["_count"].to_i == 0
        p "no #{database_name} database exists"
        return nil
      end

      resource = nil
      lookup[:body]['Databases'].each do |db|
        next unless db['id'] == database_name
        resource = AzureDocumentDB::DatabaseResource.new(db['_rid'])
      end
      resource
    end

    # Creates a collection inside the given database and returns the parsed
    # JSON response. Only the 'indexingPolicy' and 'partitionKey' entries of
    # colls_options are sent. Sets 'Content-Type' on the custom_headers hash
    # that was passed in.
    #
    # Raises ArgumentError when database_resource is nil/false.
    def create_collection(database_resource, collection_name, colls_options={}, custom_headers={} )
      raise ArgumentError.new 'No database_resource!' unless database_resource

      colls_url = sprintf("%s/dbs/%s/colls", @url_endpoint, database_resource.database_rid )
      custom_headers['Content-Type'] = 'application/json'
      request_headers = @header.generate('post',
                                         AzureDocumentDB::RESOURCE_TYPE_COLLECTION,
                                         database_resource.database_rid, custom_headers )
      accepted = colls_options.select { |key, _| key == 'indexingPolicy' || key == 'partitionKey' }
      payload = { 'id' => collection_name }.merge(accepted)
      response = RestClient.post(colls_url, payload.to_json, request_headers)
      JSON.parse(response)
    end

    # Queries collections of the given database whose id equals
    # collection_name. Returns the {:header, :body} hash produced by _query.
    #
    # Raises ArgumentError when database_resource is nil/false.
    def find_collections_by_name(database_resource, collection_name)
      raise ArgumentError.new 'No database_resource!' unless database_resource

      params = [ {:name=>"@id", :value=> collection_name } ]
      colls_url = sprintf("%s/dbs/%s/colls", @url_endpoint, database_resource.database_rid)
      _query(AzureDocumentDB::RESOURCE_TYPE_COLLECTION,
             database_resource.database_rid, colls_url,
             "SELECT * FROM root r WHERE r.id=@id", params)
    end

    # Returns a CollectionResource for collection_name, or nil (after printing
    # a notice) when no returned collection has that id.
    #
    # Raises ArgumentError when database_resource is nil/false.
    def get_collection_resource (database_resource, collection_name)
      raise ArgumentError.new 'No database_resource!' unless database_resource

      rid = ''
      lookup = find_collections_by_name(database_resource, collection_name)
      lookup[:body]['DocumentCollections'].each do |col|
        rid = col['_rid'] if col['id'] == collection_name
      end
      if rid.empty?
        p "no #{collection_name} collection exists"
        return nil
      end
      AzureDocumentDB::CollectionResource.new(database_resource.database_rid, rid)
    end

    # Creates a document in the collection and returns the parsed JSON
    # response. Sets 'Content-Type' on the custom_headers hash that was
    # passed in.
    #
    # Raises ArgumentError when collection_resource is nil/false or when the
    # document carries an 'id' different from document_id.
    def create_document(collection_resource, document_id, document, custom_headers={} )
      raise ArgumentError.new 'No collection_resource!' unless collection_resource
      if document['id'] && document_id != document['id']
        raise ArgumentError.new "Document id mismatch error (#{document_id})!"
      end

      payload = { 'id' => document_id }.merge(document)
      docs_url = _docs_url(collection_resource)
      custom_headers['Content-Type'] = 'application/json'
      request_headers = @header.generate('post', AzureDocumentDB::RESOURCE_TYPE_DOCUMENT,
                                         collection_resource.collection_rid, custom_headers )
      response = RestClient.post(docs_url, payload.to_json, request_headers)
      JSON.parse(response)
    end

    # Queries documents of the collection whose id equals document_id.
    # Returns the {:header, :body} hash produced by _query.
    #
    # Raises ArgumentError when collection_resource is nil/false.
    def find_documents(collection_resource, document_id, custom_headers={})
      raise ArgumentError.new 'No collection_resource!' unless collection_resource

      params = [ {:name=>"@id", :value=> document_id } ]
      _query(AzureDocumentDB::RESOURCE_TYPE_DOCUMENT,
             collection_resource.collection_rid, _docs_url(collection_resource),
             "SELECT * FROM c WHERE c.id=@id", params, custom_headers)
    end

    # Runs an arbitrary parameterized query against the collection's documents.
    # Returns the {:header, :body} hash produced by _query.
    #
    # Raises ArgumentError when collection_resource is nil/false.
    def query_documents( collection_resource, query_text, query_params, custom_headers={} )
      raise ArgumentError.new 'No collection_resource!' unless collection_resource

      _query(AzureDocumentDB::RESOURCE_TYPE_DOCUMENT,
             collection_resource.collection_rid, _docs_url(collection_resource),
             query_text, query_params, custom_headers)
    end

    protected

    # Documents URL of the given collection.
    def _docs_url(collection_resource)
      sprintf("%s/dbs/%s/colls/%s/docs",
              @url_endpoint, collection_resource.database_rid, collection_resource.collection_rid)
    end

    # Posts a query to url and returns { :header => response headers,
    # :body => parsed JSON body }. Entries in custom_headers override the
    # query-specific defaults.
    def _query( resource_type, parent_resource_id, url, query_text, query_params, custom_headers={} )
      query_headers = {
        'x-ms-documentdb-isquery' => 'True',
        'Content-Type' => 'application/query+json',
        'Accept' => 'application/json'
      }.merge(custom_headers)
      request_headers = @header.generate('post', resource_type, parent_resource_id, query_headers)
      payload = { :query => query_text, :parameters => query_params }.to_json

      response = RestClient.post(url, payload, request_headers)
      { :header => response.headers, :body => JSON.parse(response.body) }
    end
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# Shared constants for the DocumentDB client classes.
module AzureDocumentDB
  # REST API version string.
  API_VERSION = '2015-12-16'.freeze

  # Resource type names.
  RESOURCE_TYPE_DATABASE   = 'dbs'.freeze
  RESOURCE_TYPE_COLLECTION = 'colls'.freeze
  RESOURCE_TYPE_DOCUMENT   = 'docs'.freeze

  # Authorization token version and token types.
  AUTH_TOKEN_VERSION       = '1.0'.freeze
  AUTH_TOKEN_TYPE_MASTER   = 'master'.freeze
  AUTH_TOKEN_TYPE_RESOURCE = 'resource'.freeze

  # Lower bound for the offer throughput of a partitioned collection.
  PARTITIONED_COLL_MIN_THROUGHPUT = 10100
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'time'
|
2
|
+
require 'openssl'
|
3
|
+
require 'base64'
|
4
|
+
require 'erb'
|
5
|
+
|
6
|
+
module AzureDocumentDB

  # Produces the request headers (API version, date, authorization token)
  # for a DocumentDB REST call, signed with the account master key.
  class Header

    # master_key - Base64-encoded account key used to sign requests
    def initialize (master_key)
      @master_key = master_key
    end

    # Returns a new hash with x-ms-version, x-ms-date and authorization,
    # merged with api_specific_headers (which win on key collisions).
    #
    # verb               - HTTP verb used in the signature
    # resource_type      - resource type used in the signature
    # parent_resource_id - resource id used in the signature
    def generate (verb, resource_type, parent_resource_id, api_specific_headers = {} )
      request_date = get_httpdate
      signed = {
        'x-ms-version' => AzureDocumentDB::API_VERSION,
        'x-ms-date' => request_date,
        'authorization' => generate_auth_token(verb, resource_type, parent_resource_id, request_date)
      }
      signed.merge(api_specific_headers)
    end

    private

    # Builds the URL-encoded "type=...&ver=...&sig=..." token. The signed
    # payload is verb, resource type, resource id, date and an empty field,
    # each terminated by a newline.
    def generate_auth_token ( verb, resource_type, resource_id, utc_date)
      string_to_sign = [verb, resource_type, resource_id, utc_date, ""].join("\n") + "\n"
      signature = hmac_base64encode(string_to_sign)

      token = sprintf("type=%s&ver=%s&sig=%s",
                      AzureDocumentDB::AUTH_TOKEN_TYPE_MASTER,
                      AzureDocumentDB::AUTH_TOKEN_VERSION,
                      signature)
      ERB::Util.url_encode(token)
    end

    # Current time in HTTP-date format.
    def get_httpdate
      Time.now.httpdate
    end

    # HMAC-SHA256 of the lower-cased text, keyed with the decoded master key,
    # returned as Base64 (the 32-byte digest fits on a single line).
    def hmac_base64encode( text )
      secret = Base64.urlsafe_decode64(@master_key)
      digest = OpenSSL::HMAC.digest('sha256', secret, text.downcase)
      Base64.strict_encode64(digest)
    end

  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'rest-client'
|
2
|
+
require 'json'
|
3
|
+
require_relative 'constants'
|
4
|
+
require_relative 'header'
|
5
|
+
require_relative 'resource'
|
6
|
+
|
7
|
+
module AzureDocumentDB

  # Client variant for partitioned collections: collections are created with
  # a partition key and an offer throughput, and document requests carry the
  # partition key value in a request header.
  class PartitionedCollectionClient < Client

    # Creates a partitioned collection and returns the parsed JSON response.
    #
    # partition_key_paths - non-empty array of partition key paths
    # offer_throughput    - value sent as x-ms-offer-throughput; must not be
    #                       below PARTITIONED_COLL_MIN_THROUGHPUT
    #
    # Raises ArgumentError when the throughput is too low or no path is given.
    def create_collection(database_resource, collection_name,
                partition_key_paths, offer_throughput = AzureDocumentDB::PARTITIONED_COLL_MIN_THROUGHPUT )

      if (offer_throughput < AzureDocumentDB::PARTITIONED_COLL_MIN_THROUGHPUT)
        # The minimum itself is accepted, so the message says "at least".
        raise ArgumentError.new sprintf("Offer throughput must be at least %d!",
                AzureDocumentDB::PARTITIONED_COLL_MIN_THROUGHPUT)
      end
      if (partition_key_paths.nil? || partition_key_paths.length < 1 )
        raise ArgumentError.new "No PartitionKey paths!"
      end
      colls_options = {
        'indexingPolicy' => { 'indexingMode' => "consistent", 'automatic'=>true },
        'partitionKey' => { "paths" => partition_key_paths, "kind" => "Hash" }
      }
      custom_headers= {'x-ms-offer-throughput' => offer_throughput }
      super(database_resource, collection_name, colls_options, custom_headers)
    end


    # Creates a document, sending the document's partition key value in the
    # x-ms-documentdb-partitionkey header. Returns the parsed JSON response.
    #
    # partitioned_key - name of the document field holding the partition key
    #
    # Raises ArgumentError when the key name is nil/empty or the document has
    # no such field.
    def create_document(collection_resource, document_id, document, partitioned_key )
      if partitioned_key.nil? || partitioned_key.empty?
        raise ArgumentError.new "No partitioned key!"
      end
      if !document.key?(partitioned_key)
        raise ArgumentError.new "No partitioned key in your document!"
      end
      partitioned_key_value = document[partitioned_key]
      custom_headers = {
        # JSON-encode the one-element array so the value keeps its JSON type
        # (numbers stay numbers) and special characters are escaped.
        'x-ms-documentdb-partitionkey' => [partitioned_key_value].to_json
      }
      super(collection_resource, document_id, document, custom_headers)
    end

    # Queries documents whose id equals document_id and whose partition key
    # field equals partitioned_key_value. Returns the {:header, :body} hash
    # produced by the inherited _query.
    #
    # Raises ArgumentError when collection_resource is nil/false.
    def find_documents(collection_resource, document_id,
                partitioned_key, partitioned_key_value, custom_headers={})
      if !collection_resource
        raise ArgumentError.new "No collection_resource!"
      end
      query_params = []
      query_text = sprintf("SELECT * FROM c WHERE c.id=@id AND c.%s=@value", partitioned_key)
      query_params.push( {:name=>"@id", :value=> document_id } )
      query_params.push( {:name=>"@value", :value=> partitioned_key_value } )
      url = sprintf("%s/dbs/%s/colls/%s/docs",
                @url_endpoint, collection_resource.database_rid, collection_resource.collection_rid)
      # The inherited query helper is named _query.
      _query(AzureDocumentDB::RESOURCE_TYPE_DOCUMENT,
                collection_resource.collection_rid, url, query_text, query_params, custom_headers)
    end

  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module AzureDocumentDB

  # Base class for resource handles; identifiers are kept in the @r hash.
  class Resource
    def initialize
      @r = Hash.new
    end

    protected

    attr_accessor :r
  end

  # Handle for a database, identified by its resource id.
  class DatabaseResource < Resource

    def initialize (database_rid)
      super()
      @r.store('database_rid', database_rid)
    end

    # Resource id of the database.
    def database_rid
      @r['database_rid']
    end
  end

  # Handle for a collection, identified by the resource ids of its database
  # and of the collection itself.
  class CollectionResource < Resource

    def initialize (database_rid, collection_rid)
      super()
      @r.store('database_rid', database_rid)
      @r.store('collection_rid', collection_rid)
    end

    # Resource id of the owning database.
    def database_rid
      @r['database_rid']
    end

    # Resource id of the collection.
    def collection_rid
      @r['collection_rid']
    end
  end

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: samples/sample_01.csv
|
4
|
+
parser:
|
5
|
+
charset: UTF-8
|
6
|
+
newline: CRLF
|
7
|
+
type: csv
|
8
|
+
delimiter: ','
|
9
|
+
quote: '"'
|
10
|
+
escape: '"'
|
11
|
+
null_string: 'NULL'
|
12
|
+
trim_if_not_quoted: false
|
13
|
+
skip_header_lines: 1
|
14
|
+
allow_extra_columns: false
|
15
|
+
allow_optional_columns: false
|
16
|
+
columns:
|
17
|
+
- {name: id, type: long}
|
18
|
+
- {name: account, type: long}
|
19
|
+
- {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
|
20
|
+
- {name: purchase, type: timestamp, format: '%Y%m%d'}
|
21
|
+
- {name: comment, type: string}
|
22
|
+
out:
|
23
|
+
type: documentdb
|
24
|
+
docdb_endpoint: https://yoichikademo1.documents.azure.com:443/
|
25
|
+
docdb_account_key: EMwUa3EzsAtJ1qYfzwo9nQ3xxxfsXNm3xLh1SLffKkUHMFl80OZRZIVu4lxdKRKxkgVAj0c2mv9BZSyMN7tdg==
|
26
|
+
docdb_database: myembulkdb
|
27
|
+
docdb_collection: myembulkcoll
|
28
|
+
auto_create_database: true
|
29
|
+
auto_create_collection: true
|
30
|
+
partitioned_collection: true
|
31
|
+
partition_key: account
|
32
|
+
offer_throughput: 10100
|
33
|
+
key_column: id
|
@@ -0,0 +1,31 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: samples/sample_01.csv
|
4
|
+
parser:
|
5
|
+
charset: UTF-8
|
6
|
+
newline: CRLF
|
7
|
+
type: csv
|
8
|
+
delimiter: ','
|
9
|
+
quote: '"'
|
10
|
+
escape: '"'
|
11
|
+
null_string: 'NULL'
|
12
|
+
trim_if_not_quoted: false
|
13
|
+
skip_header_lines: 1
|
14
|
+
allow_extra_columns: false
|
15
|
+
allow_optional_columns: false
|
16
|
+
columns:
|
17
|
+
- {name: id, type: long}
|
18
|
+
- {name: account, type: long}
|
19
|
+
- {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
|
20
|
+
- {name: purchase, type: timestamp, format: '%Y%m%d'}
|
21
|
+
- {name: comment, type: string}
|
22
|
+
out:
|
23
|
+
type: documentdb
|
24
|
+
docdb_endpoint: https://yoichikademo1.documents.azure.com:443/
|
25
|
+
docdb_account_key: EMwUa3EzsAtJ1qYfzwo9nQ3xxxfsXNm3xLh1SLffKkUHMFl80OZRZIVu4lxdKRKxkgVAj0c2mv9BZSyMN7tdg==
|
26
|
+
docdb_database: myembulkdb
|
27
|
+
docdb_collection: myembulkcoll
|
28
|
+
auto_create_database: true
|
29
|
+
auto_create_collection: true
|
30
|
+
partitioned_collection: false
|
31
|
+
key_column: id
|
@@ -0,0 +1,6 @@
|
|
1
|
+
id,account,time,purchase,comment
|
2
|
+
0,21123,2016-08-27 19:23:49,20160127,java
|
3
|
+
1,32864,2016-08-27 19:23:49,20160127,embulk
|
4
|
+
2,14824,2016-08-27 19:01:23,20160127,embulk jruby
|
5
|
+
3,27559,2016-08-28 02:20:02,20160128,"Embulk ""csv"" parser plugin"
|
6
|
+
4,11270,2016-08-29 11:54:36,20160129,NULL
|
metadata
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: embulk-output-documentdb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Yoichi Kawasaki
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-08-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rest-client
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: embulk
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.8.13
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.8.13
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.10.6
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.10.6
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.0'
|
69
|
+
description: Dumps records to Azure DocumentDB
|
70
|
+
email:
|
71
|
+
- yoichi.kawasaki@outlook.com
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files: []
|
75
|
+
files:
|
76
|
+
- ".gitignore"
|
77
|
+
- ChangeLog
|
78
|
+
- Gemfile
|
79
|
+
- LICENSE.txt
|
80
|
+
- README.md
|
81
|
+
- Rakefile
|
82
|
+
- VERSION
|
83
|
+
- embulk-output-documentdb.gemspec
|
84
|
+
- lib/embulk/output/documentdb.rb
|
85
|
+
- lib/embulk/output/documentdb/client.rb
|
86
|
+
- lib/embulk/output/documentdb/constants.rb
|
87
|
+
- lib/embulk/output/documentdb/header.rb
|
88
|
+
- lib/embulk/output/documentdb/partitioned_coll_client.rb
|
89
|
+
- lib/embulk/output/documentdb/resource.rb
|
90
|
+
- samples/config-csv2docdb_partitionedcoll.yml
|
91
|
+
- samples/config-csv2docdb_singlecoll.yml
|
92
|
+
- samples/sample_01.csv
|
93
|
+
homepage: https://github.com/yoichika/embulk-output-documentdb
|
94
|
+
licenses:
|
95
|
+
- MIT
|
96
|
+
metadata: {}
|
97
|
+
post_install_message:
|
98
|
+
rdoc_options: []
|
99
|
+
require_paths:
|
100
|
+
- lib
|
101
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
requirements: []
|
112
|
+
rubyforge_project:
|
113
|
+
rubygems_version: 2.6.2
|
114
|
+
signing_key:
|
115
|
+
specification_version: 4
|
116
|
+
summary: Azure DocumentDB output plugin for Embulk
|
117
|
+
test_files: []
|