akane-bigquery 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +40 -0
- data/Rakefile +2 -0
- data/akane-bigquery.gemspec +30 -0
- data/bin/akane-bigquery +4 -0
- data/lib/akane-bigquery.rb +42 -0
- data/lib/akane-bigquery/cli.rb +218 -0
- data/lib/akane-bigquery/schema.rb +93 -0
- data/lib/akane-bigquery/version.rb +3 -0
- data/lib/akane/storages/bigquery.rb +273 -0
- metadata +169 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 7fc97e06744000530649b80d35908244a7003ff5
+  data.tar.gz: 7e8348c2df4d4a66eaa4048902229a6e69a2e9bf
+SHA512:
+  metadata.gz: 1d206afead4eada0ac898352277af4ba8b276e22da99ed4adc388d0d053781a48c189b23fd64e92e4b1a64a2a913a706f81ec6e2baf3611740a0e90f883058b4
+  data.tar.gz: b98ab328c7b3661ab0f623f8082979685c50f0654a9a651817296599dc89510e0dcc95130c8e7b91b797fd1aec9c254a950d726de82f2bd0f8470ba831ed16df
data/.gitignore
ADDED
@@ -0,0 +1,25 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+*.bundle
+*.so
+*.o
+*.a
+mkmf.log
+config.yml
+akane.yml
+*.p12
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2014 Shota Fukumori (sora_h)
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,40 @@
+# akane-bigquery - Storage engine for akane.gem that streams tweets to Google BigQuery
+
+Storage plugin gem for [akane](https://github.com/sorah/akane) that lets you use Google BigQuery as akane's storage engine.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'akane-bigquery'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install akane-bigquery
+
+## Loading past data
+
+If you have past data in akane's `file` storage, `akane-bigquery prepare` converts it into files you can load into BigQuery:
+
+```
+$ mkdir /tmp/akane-bigquery
+$ akane-bigquery prepare /path/to/your/file-storage /tmp/akane-bigquery
+$ gsutil -m cp /tmp/akane-bigquery/* gs://YOUR_BUCKET/
+$ bq load --source_format=NEWLINE_DELIMITED_JSON YOUR_DATASET_ID.tweets "$(gsutil ls gs://YOUR_BUCKET/ | ruby -e 'ARGF.readlines.map(&:chomp).reject(&:empty?).join(",").display')"
+```
+
+## Usage
+
+TODO: Write usage instructions here
+
+## Contributing
+
+1. Fork it ( https://github.com/sorah/akane-bigquery/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request
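
The README's Usage section is still a TODO; for orientation only, the configuration keys read by `lib/akane-bigquery.rb` and `lib/akane/storages/bigquery.rb` (shown later in this diff) can be sketched as a Ruby hash. Every name and value below is a placeholder, not part of the released files:

```ruby
# Hypothetical storage configuration for the bigquery adapter; all values are
# placeholders. The keys mirror what AkaneBigquery.make_client and
# Akane::Storages::Bigquery#initialize read from their config hash.
config = {
  'project_id'      => 'your-gcp-project',
  'dataset_id'      => 'akane',
  'client_id'       => 'xxxxxxxx.apps.googleusercontent.com',
  'service_email'   => 'xxxxxxxx@developer.gserviceaccount.com',
  'key' => {
    'path'       => '/path/to/service-account.p12',
    'passphrase' => 'notasecret',
  },
  'flush_interval'  => 60,    # seconds between streaming-insert flushes (adapter default)
  'flush_threshold' => 1000,  # buffered rows that force an early flush (adapter default)
}
```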
data/Rakefile
ADDED
data/akane-bigquery.gemspec
ADDED
@@ -0,0 +1,30 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'akane-bigquery/version'
+
+Gem::Specification.new do |spec|
+  spec.name = "akane-bigquery"
+  spec.version = AkaneBigquery::VERSION
+  spec.authors = ["Shota Fukumori (sora_h)"]
+  spec.email = ["her@sorah.jp"]
+  spec.summary = %q{akane.gem Google Bigquery storage adapter}
+  spec.description = %q{Google Bigquery storage adapter for akane.gem}
+  spec.homepage = "https://github.com/sorah/akane-bigquery"
+  spec.license = "MIT"
+
+  spec.files = `git ls-files -z`.split("\x0")
+  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_dependency "akane", ">= 0.2.0"
+  spec.add_dependency 'google-api-client', '>= 0.7.1'
+  spec.add_dependency 'thor', '>= 0.19.1'
+  spec.add_dependency 'oj'
+
+  spec.add_development_dependency "bundler", "~> 1.6"
+  spec.add_development_dependency "rspec", "~> 3.0.0"
+  spec.add_development_dependency "webmock", "~> 1.17.3"
+  spec.add_development_dependency "rake"
+end
data/bin/akane-bigquery
ADDED
data/lib/akane-bigquery.rb
ADDED
@@ -0,0 +1,42 @@
+require 'akane/storages/bigquery'
+require 'akane-bigquery/version'
+require 'akane-bigquery/schema'
+
+require 'google/api_client'
+
+module AkaneBigquery
+  def self.make_client(config)
+    raise ArgumentError, "missing config['key']" unless config['key']
+    raise ArgumentError, "missing config['key']['path']" unless config['key']['path']
+    raise ArgumentError, "missing config['key']['passphrase']" unless config['key']['passphrase']
+    raise ArgumentError, "missing config['client_id']" unless config['client_id']
+    raise ArgumentError, "missing config['service_email']" unless config['service_email']
+
+    client = Google::APIClient.new(
+      application_name: config["app_name"] || 'akane',
+      application_version: AkaneBigquery::VERSION,
+    )
+
+    key = Google::APIClient::KeyUtils.load_from_pkcs12(
+      config['key']['path'],
+      config['key']['passphrase']
+    )
+
+    client.authorization = Signet::OAuth2::Client.new(
+      token_credential_uri: 'https://accounts.google.com/o/oauth2/token',
+      audience: 'https://accounts.google.com/o/oauth2/token',
+      scope: 'https://www.googleapis.com/auth/bigquery',
+      issuer: config['service_email'],
+      signing_key: key,
+    )
+
+    client.authorization.fetch_access_token!
+
+    return client
+  end
+
+  def self.make_bigquery_client(config)
+    client = make_client(config)
+    [client, client.discovered_api("bigquery", "v2")]
+  end
+end
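
A minimal sketch of how these helpers are used, mirroring the calls made by `cli.rb` and the storage class later in this diff. The `config` hash is the hypothetical one sketched after the README above:

```ruby
require 'akane-bigquery'

# Build an OAuth2-authorized Google::APIClient plus the discovered
# BigQuery v2 API object, then issue a datasets.get as cli.rb does.
client, api = AkaneBigquery.make_bigquery_client(config)

result = client.execute(
  api_method: api.datasets.get,
  parameters: {
    'projectId' => config['project_id'],
    'datasetId' => config['dataset_id'],
  }
)
puts result.error? ? result.error_message : 'dataset is reachable'
```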
data/lib/akane-bigquery/cli.rb
ADDED
@@ -0,0 +1,218 @@
+require 'akane-bigquery'
+require 'yaml'
+require 'thor'
+require 'oj'
+
+module AkaneBigquery
+  class CLI < Thor
+    desc "init", 'creates table on bigquery'
+    method_option :config,
+      required: true, aliases: %w(-c),
+      desc: "path to akane config file (yml)"
+    method_option :config_name,
+      desc: "select bigquery configuration by name key. use this if you have multiple bigquery storages in config file"
+
+
+    def init
+      # check dataset existence
+      dataset = client.execute(
+        api_method: api.datasets.get,
+        parameters: {
+          'projectId' => config['project_id'],
+          'datasetId' => config['dataset_id'],
+        }
+      )
+
+      if dataset.error?
+        if dataset.error_message =~ /^Not Found:/i
+          puts "Creating dataset #{config['dataset_id']} ..."
+          dataset = client.execute(
+            api_method: api.datasets.insert,
+            parameters: {
+              'projectId' => config['project_id'],
+            },
+            body_object: {
+              'datasetReference' => {
+                'datasetId' => config['dataset_id'],
+              },
+              'description' => 'akane',
+            }
+          )
+
+          raise dataset.error_message if dataset.error?
+        else
+          raise dataset.error_message
+        end
+      end
+
+      schemas = AkaneBigquery::Schema::SCHEMA
+
+      schemas.each do |table_id, schema|
+        table = client.execute(
+          api_method: api.tables.get,
+          parameters: {
+            'projectId' => config['project_id'],
+            'datasetId' => config['dataset_id'],
+            'tableId' => table_id,
+          },
+        )
+
+        if table.error?
+          if table.error_message =~ /^Not Found:/i
+            puts "Creating table #{table_id} ..."
+            table = client.execute(
+              api_method: api.tables.insert,
+              parameters: {
+                'projectId' => config['project_id'],
+                'datasetId' => config['dataset_id'],
+              },
+              body_object: {
+                'tableReference' => {
+                  'projectId' => config['project_id'],
+                  'datasetId' => config['dataset_id'],
+                  'tableId' => table_id,
+                },
+                'friendlyName' => table_id,
+                'schema' => schema,
+              }
+            )
+            raise table.error_message if table.error?
+          else
+            raise table.error_message
+          end
+        end
+
+      end
+    end
+
+    desc "prepare SOURCE DEST", "prepare JSONs for loading into BigQuery from existing file storage data"
+    method_option :months, desc: "Names of months to process. Separated by comma."
+    method_option :before, desc: "Dump only data before specified datetime. Value will be parsed by `Time.parse` of Ruby."
+    def prepare(source, prefix)
+      limit = 524288000 # 500MBytes
+
+      count = -1
+      bytes = 0
+
+      new_io = lambda do
+        bytes = 0
+        count += 1
+        path = File.join(prefix, "tweets.#{count.to_s.rjust(4,'0')}.txt")
+        puts "=> Using #{path}"
+        File.open(path, 'w')
+      end
+      io = new_io.call
+
+      months = options[:months] && options[:months].split(/,/)
+      before = options[:before] && Time.parse(options[:before])
+
+      userdirs = Dir.entries(File.join(source, "users"))
+      userdirs.each_with_index do |user_dirname, index|
+        next if user_dirname == "." || user_dirname == ".."
+        puts " * #{user_dirname} (#{index.succ}/#{userdirs.size}, #{((index.succ/userdirs.size.to_f)*100).to_i}%)"
+
+        userdir = File.join(source, "users", user_dirname)
+
+        tweet_filepaths = if options[:months]
+          months.map { |_| File.join(userdir, "tweets.#{_}.txt") }
+        else
+          Dir[File.join(userdir, 'tweets.*.txt')]
+        end
+        tweet_filepaths.each do |file|
+          begin
+            File.open(file, 'r') do |tweets_io|
+              tweets_io.each_line do |line|
+                json = line.chomp
+
+                tweet = Oj.load(json)
+
+                created_at = Time.parse(tweet['created_at'.freeze])
+                next if before && before <= created_at
+
+                new_json = {
+                  'json'.freeze => json,
+                  'id_str'.freeze => tweet['id_str'.freeze],
+                  'id'.freeze => tweet['id'.freeze],
+                  'text'.freeze => tweet['text'.freeze],
+                  'lang'.freeze => tweet['lang'.freeze],
+                  'source'.freeze => tweet['source'.freeze],
+                  'in_reply_to_status_id'.freeze => tweet['in_reply_to_status_id'.freeze],
+                  'in_reply_to_status_id_str'.freeze => tweet['in_reply_to_status_id_str'.freeze],
+                  'in_reply_to_user_id'.freeze => tweet['in_reply_to_user_id'.freeze],
+                  'in_reply_to_user_id_str'.freeze => tweet['in_reply_to_user_id_str'.freeze],
+                  'in_reply_to_screen_name'.freeze => tweet['in_reply_to_screen_name'.freeze],
+                  'user'.freeze => {
+                    'id_str'.freeze => tweet['user'.freeze]['id_str'.freeze],
+                    'id'.freeze => tweet['user'.freeze]['id'.freeze],
+                    'name'.freeze => tweet['user'.freeze]['name'.freeze],
+                    'screen_name'.freeze => tweet['user'.freeze]['screen_name'.freeze],
+                    'protected'.freeze => tweet['user'.freeze]['protected'.freeze],
+                  },
+                  'created_at'.freeze => created_at.to_i
+                }
+
+                if tweet['coordinates'.freeze]
+                  new_json['coordinates_longitude'.freeze] = tweet['coordinates'.freeze]['coordinates'.freeze][0]
+                  new_json['coordinates_latitude'.freeze] = tweet['coordinates'.freeze]['coordinates'.freeze][1]
+                end
+
+                if tweet['place'.freeze]
+                  place = tweet['place'.freeze]
+                  new_json['place'.freeze] = {
+                    'id'.freeze => place['id'.freeze],
+                    'country'.freeze => place['country'.freeze],
+                    'country_code'.freeze => place['country_code'.freeze],
+                    'name'.freeze => place['name'.freeze],
+                    'full_name'.freeze => place['full_name'.freeze],
+                    'place_type'.freeze => place['place_type'.freeze],
+                    'url'.freeze => place['url'.freeze],
+                  }
+                end
+
+                new_json_str = Oj.dump(new_json)
+                io.puts new_json_str
+                bytes += new_json_str.size + 1
+                io = new_io.call if limit <= bytes
+              end
+            end
+          rescue Errno::ENOENT
+          end
+
+        end
+
+      end
+    end
+
+    private
+
+    def config
+      @config ||= begin
+        storages = YAML.load_file(options[:config])['storages']
+
+        conf = if options[:config_name]
+          storages.find { |_| _['bigquery'] && _['bigquery']['name'] == options[:config_name] }
+        else
+          storages.find { |_| _['bigquery'] }
+        end
+
+        (conf && conf['bigquery']) or \
+          abort 'error: bigquery storage configuration not found'
+      end
+    end
+
+    def client
+      client_and_api; @client
+    end
+
+    def api
+      client_and_api; @api
+    end
+
+    def client_and_api
+      return @client_and_api if @client_and_api
+
+      @client_and_api = AkaneBigquery.make_bigquery_client(config)
+      @client, @api = @client_and_api
+    end
+  end
+end
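
For orientation only: each line `prepare` writes is a newline-delimited JSON object matching the `tweets` table schema defined in `schema.rb` below, with the original tweet kept verbatim in `json` and `created_at` converted to epoch seconds. A hand-written, hedged illustration with placeholder values (not taken from any real dataset):

```ruby
require 'oj'

# Placeholder row shaped like one output line of `akane-bigquery prepare`.
row = {
  'json'       => '{"id_str":"1","id":1,"text":"hello"}', # original tweet JSON (abbreviated)
  'id_str'     => '1',
  'id'         => 1,
  'text'       => 'hello',
  'lang'       => 'en',
  'source'     => 'web',
  'user'       => { 'id_str' => '2', 'id' => 2, 'name' => 'Example',
                    'screen_name' => 'example', 'protected' => false },
  'created_at' => Time.utc(2014, 7, 26).to_i,
}
puts Oj.dump(row) # one line of NEWLINE_DELIMITED_JSON, ready for `bq load`
```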
data/lib/akane-bigquery/schema.rb
ADDED
@@ -0,0 +1,93 @@
+module AkaneBigquery
+  module Schema
+
+    #STRING, INTEGER, FLOAT, BOOLEAN, TIMESTAMP or RECORD
+    #NULLABLE, REQUIRED and REPEATED.
+    SCHEMAS = {
+      '0' => {
+        'tweets' => {
+          'fields' => [
+            {'name' => 'json', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+            {'name' => 'id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+            {'name' => 'id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+
+            {'name' => 'text', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+            {'name' => 'lang', 'type' => 'STRING'},
+            {'name' => 'source', 'type' => 'STRING'},
+
+            {'name' => 'in_reply_to_status_id_str', 'type' => 'STRING'},
+            {'name' => 'in_reply_to_status_id', 'type' => 'INTEGER'},
+            {'name' => 'in_reply_to_user_id_str', 'type' => 'STRING'},
+            {'name' => 'in_reply_to_user_id', 'type' => 'INTEGER'},
+            {'name' => 'in_reply_to_screen_name', 'type' => 'STRING'},
+
+            {'name' => 'retweeted_status_id_str', 'type' => 'STRING'},
+            {'name' => 'retweeted_status_id', 'type' => 'INTEGER'},
+
+            {'name' => 'created_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
+
+            {
+              'name' => 'user', 'type' => 'RECORD', 'mode' => 'REQUIRED',
+              'fields' => [
+                {'name' => 'id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+                {'name' => 'id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+                {'name' => 'name', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+                {'name' => 'screen_name', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+                {'name' => 'protected', 'type' => 'BOOLEAN', 'mode' => 'NULLABLE'},
+              ],
+            },
+
+            {'name' => 'coordinates_longitude', 'type' => 'FLOAT'},
+            {'name' => 'coordinates_latitude', 'type' => 'FLOAT'},
+
+            {
+              'name' => 'place', 'type' => 'RECORD',
+              'fields' => [
+                {'name' => 'id', 'type' => 'STRING'},
+                {'name' => 'country', 'type' => 'STRING'},
+                {'name' => 'country_code', 'type' => 'STRING'},
+                {'name' => 'name', 'type' => 'STRING'},
+                {'name' => 'full_name', 'type' => 'STRING'},
+                {'name' => 'place_type', 'type' => 'STRING'},
+                {'name' => 'url', 'type' => 'STRING'},
+              ],
+            },
+          ],
+        },
+        'deletions' => {
+          'fields' => [
+            {'name' => 'user_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+            {'name' => 'tweet_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+            {'name' => 'user_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+            {'name' => 'tweet_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+            {'name' => 'deleted_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
+          ],
+        },
+        'events' => {
+          'fields' => [
+            {'name' => 'json', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+            {'name' => 'event', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+            {'name' => 'source_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+            {'name' => 'target_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+            {'name' => 'source_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+            {'name' => 'target_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+            {'name' => 'target_object_id', 'type' => 'INTEGER'},
+            {'name' => 'target_object_id_str', 'type' => 'STRING'},
+
+            {'name' => 'created_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
+          ],
+        },
+      }.freeze,
+    }.freeze
+
+    VERSION = '0'
+    SCHEMA = SCHEMAS[VERSION]
+
+  end
+end
data/lib/akane/storages/bigquery.rb
ADDED
@@ -0,0 +1,273 @@
+require 'akane/storages/abstract_storage'
+require 'akane-bigquery'
+
+require 'thread'
+
+module Akane
+  module Storages
+    class Bigquery < AbstractStorage
+      class Stop < Exception; end # :nodoc:
+
+      def initialize(*)
+        super
+
+        @client, @api = AkaneBigquery.make_bigquery_client(@config)
+
+        @project_id = @config['project_id']
+        @dataset_id = @config['dataset_id']
+
+        @lock = Mutex.new
+        @thread = nil
+
+        @flush_interval = @config['flush_interval'] ? @config['flush_interval'].to_i : 60
+        @flush_threshold = @config['flush_threshold'] ? @config['flush_threshold'].to_i : 1000
+
+        @pending_inserts = []
+        @failing_inserts = []
+        @pending_inserts_lock = Mutex.new
+
+        swap_buffers # initialize
+        start
+      end
+
+      def name
+        @name ||= "bigquery:#{@project_id}/#{@dataset_id}"
+      end
+
+      def bq_insert(table, row)
+        @lock.synchronize do
+          @buffers[table] << row
+        end
+        self
+      end
+
+      def start
+        @lock.synchronize do
+          unless @thread
+            @thread = Thread.new(&method(:worker_loop))
+            @stop = false
+          end
+        end
+      end
+
+      def exitable?
+        @stop && (@thread ? @thread.alive? : true)
+      end
+
+      def stop!
+        @lock.synchronize do
+          super
+          @thread.raise(Stop) if @thread
+        end
+      end
+
+      def record_tweet(account, tweet)
+        hash = tweet.attrs
+        row = {
+          'json'.freeze => hash.to_json,
+          'id_str'.freeze => hash[:id_str],
+          'id'.freeze => hash[:id],
+          'text'.freeze => hash[:text],
+          'lang'.freeze => hash[:lang],
+          'source'.freeze => hash[:source],
+          'in_reply_to_status_id'.freeze => hash[:in_reply_to_status_id],
+          'in_reply_to_status_id_str'.freeze => hash[:in_reply_to_status_id_str],
+          'in_reply_to_user_id'.freeze => hash[:in_reply_to_user_id],
+          'in_reply_to_user_id_str'.freeze => hash[:in_reply_to_user_id_str],
+          'in_reply_to_screen_name'.freeze => hash[:in_reply_to_screen_name],
+          'user'.freeze => {
+            'id_str'.freeze => hash[:user][:id_str],
+            'id'.freeze => hash[:user][:id],
+            'name'.freeze => hash[:user][:name],
+            'screen_name'.freeze => hash[:user][:screen_name],
+            'protected'.freeze => hash[:user][:protected],
+          },
+          'created_at'.freeze => Time.parse(hash[:created_at]).to_i
+        }
+
+        if hash[:coordinates]
+          row['coordinates_longitude'.freeze], row['coordinates_latitude'.freeze] = \
+            hash[:coordinates][:coordinates]
+        end
+
+        if hash[:place]
+          place = hash[:place]
+          row['place'.freeze] = {
+            'id'.freeze => place[:id],
+            'country'.freeze => place[:country],
+            'country_code'.freeze => place[:country_code],
+            'name'.freeze => place[:name],
+            'full_name'.freeze => place[:full_name],
+            'place_type'.freeze => place[:place_type],
+            'url'.freeze => place[:url],
+          }
+        end
+
+        bq_insert :tweets, row
+      end
+
+      def mark_as_deleted(account, user_id, tweet_id)
+        bq_insert(:deletions,
+          'user_id'.freeze => user_id,
+          'user_id_str'.freeze => user_id.to_s,
+          'tweet_id'.freeze => tweet_id,
+          'tweet_id_str'.freeze => tweet_id.to_s,
+          'deleted_at'.freeze => Time.now.to_i,
+        )
+      end
+
+      def record_event(account, event)
+        source = event['source'.freeze]
+        target = event['target'.freeze]
+        target_object = event['target_object'.freeze]
+
+        source_id = source[:id]
+        target_id = target[:id]
+
+        unless source_id && target_id
+          @logger.warn "Discarding event because source and target id is missing: #{event.inspect}"
+          return
+        end
+
+        hash = Hash[
+          event.map { |k,v| [k, v && v.respond_to?(:attrs) ? v.attrs : nil] }
+        ]
+
+        row = {
+          'json'.freeze => hash.to_json,
+          'event'.freeze => event['event'.freeze],
+          'source_id'.freeze => source_id,
+          'source_id_str'.freeze => source_id.to_s,
+          'target_id'.freeze => target_id,
+          'target_id_str'.freeze => target_id.to_s,
+          'created_at'.freeze => Time.now.to_i
+        }
+
+        if target_object && target_object[:id]
+          id = target_object[:id]
+          row['target_object_id'.freeze] = id
+          row['target_object_id_str'.freeze] = id.to_s
+        end
+
+        p row
+        bq_insert :events, row
+      end
+
+      def record_message(account, message)
+      end
+
+      def status
+        @buffers ? @buffers.map{ |table, buf| "#{table}=#{buf.size}" }.join(', ') + " | #{@failing_inserts.size} failures, #{@pending_inserts.size} inserts" : "-"
+      end
+
+      private
+
+      def swap_buffers
+        @lock.synchronize do
+          old_buffers = @buffers
+          @buffers = {tweets: [], messages: [], deletions: [], events: []}
+
+          old_buffers
+        end
+      end
+
+      def worker_loop
+        @last_flush = Time.now
+        retry_interval = 1
+
+        begin
+          flush_pending_inserts
+
+          loop do
+            if @flush_interval <= (Time.now - @last_flush) || @flush_threshold <= @buffers.values.map(&:size).inject(:+)
+              flush_buffer
+            end
+
+            flush_pending_inserts
+
+            sleep 1
+          end
+        rescue Stop
+          @logger.info "Flushing buffer for graceful quit"
+          flush_buffer
+          until @pending_inserts.empty? && @failing_inserts.empty?
+            flush_pending_inserts(true)
+            sleep 10 unless @failing_inserts.empty?
+          end
+        rescue Exception => e
+          @logger.error "#{name} - Encountered error on buffer worker"
+          @logger.error e.inspect
+          @logger.error e.backtrace.join("\n")
+
+          @logger.error "Retrying after #{retry_interval.to_i}"
+          sleep retry_interval.to_i
+          retry_interval *= 1.8
+          retry
+        end
+      end
+
+      def flush_buffer
+        prev_buffers = swap_buffers()
+
+        prev_buffers.each do |table, rows|
+          next if rows.empty?
+
+          insert_id_base = "#{Time.now.to_f}:#{rows.__id__}:#{table}"
+          request = {
+            api_method: @api.tabledata.insert_all,
+            parameters: {
+              'datasetId' => @dataset_id,
+              'projectId' => @project_id,
+              'tableId' => table.to_s,
+            },
+            body_object: {
+              'rows' => rows.map.with_index { |row, index|
+                {
+                  'insertId'.freeze => "#{insert_id_base}:#{index}",
+                  'json'.freeze => row,
+                }
+              }
+            }
+          }
+          @pending_inserts_lock.synchronize do
+            @logger.debug "Adding pending inserts for #{table}, #{rows.size} rows"
+            @pending_inserts << {request: request, insert_id: insert_id_base}
+          end
+        end
+
+        @last_flush = Time.now
+      end
+
+      def flush_pending_inserts(do_failures = false)
+        while failing_request = @failing_inserts.shift
+          if do_failures || Time.now <= failing_request[:next_try]
+            @logger.info "[#{name}] Retrying #{failing_request[:insert_id]}"
+            @pending_inserts_lock.synchronize { @pending_inserts.push(failing_request) }
+          end
+        end
+
+        while request = @pending_inserts_lock.synchronize { @pending_inserts.shift }
+          table = request[:request][:parameters]['tableId']
+          result = @client.execute(request[:request])
+
+          if result.error?
+            if request[:retry]
+              request[:retry] *= 1.8
+            else
+              request[:retry] = 5
+            end
+
+            request[:next_try] = Time.now + request[:retry]
+
+            @logger.error "[#{name}] Failed #{table} to insert: #{result.error_message} (#{request[:insert_id]}); retrying in #{request[:retry]} seconds"
+            @failing_inserts << request
+          else
+            @logger.debug "[#{name}] Inserted records in #{table}"
+          end
+        end
+      end
+
+    end
+  end
+end
+
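
The storage above batches rows per table and submits them through `tabledata.insertAll`; every row carries an `insertId` so BigQuery can deduplicate requests retried by `flush_pending_inserts`. A minimal sketch of the request body that `flush_buffer` assembles, with placeholder row values:

```ruby
# Placeholder rows for the :deletions buffer; shapes match the schema above.
rows = [
  { 'user_id' => 1, 'user_id_str' => '1',
    'tweet_id' => 2, 'tweet_id_str' => '2',
    'deleted_at' => Time.now.to_i },
]

# Each buffered row gets a unique insertId derived from a per-batch base.
insert_id_base = "#{Time.now.to_f}:#{rows.__id__}:deletions"
body_object = {
  'rows' => rows.map.with_index { |row, index|
    { 'insertId' => "#{insert_id_base}:#{index}", 'json' => row }
  },
}
```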
metadata
ADDED
@@ -0,0 +1,169 @@
+--- !ruby/object:Gem::Specification
+name: akane-bigquery
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Shota Fukumori (sora_h)
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-07-26 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: akane
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+- !ruby/object:Gem::Dependency
+  name: google-api-client
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.1
+- !ruby/object:Gem::Dependency
+  name: thor
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.19.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.19.1
+- !ruby/object:Gem::Dependency
+  name: oj
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 3.0.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 3.0.0
+- !ruby/object:Gem::Dependency
+  name: webmock
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.17.3
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.17.3
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Google Bigquery storage adapter for akane.gem
+email:
+- her@sorah.jp
+executables:
+- akane-bigquery
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- akane-bigquery.gemspec
+- bin/akane-bigquery
+- lib/akane-bigquery.rb
+- lib/akane-bigquery/cli.rb
+- lib/akane-bigquery/schema.rb
+- lib/akane-bigquery/version.rb
+- lib/akane/storages/bigquery.rb
+homepage: https://github.com/sorah/akane-bigquery
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: akane.gem Google Bigquery storage adapter
+test_files: []