akane-bigquery 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +40 -0
- data/Rakefile +2 -0
- data/akane-bigquery.gemspec +30 -0
- data/bin/akane-bigquery +4 -0
- data/lib/akane-bigquery.rb +42 -0
- data/lib/akane-bigquery/cli.rb +218 -0
- data/lib/akane-bigquery/schema.rb +93 -0
- data/lib/akane-bigquery/version.rb +3 -0
- data/lib/akane/storages/bigquery.rb +273 -0
- metadata +169 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 7fc97e06744000530649b80d35908244a7003ff5
  data.tar.gz: 7e8348c2df4d4a66eaa4048902229a6e69a2e9bf
SHA512:
  metadata.gz: 1d206afead4eada0ac898352277af4ba8b276e22da99ed4adc388d0d053781a48c189b23fd64e92e4b1a64a2a913a706f81ec6e2baf3611740a0e90f883058b4
  data.tar.gz: b98ab328c7b3661ab0f623f8082979685c50f0654a9a651817296599dc89510e0dcc95130c8e7b91b797fd1aec9c254a950d726de82f2bd0f8470ba831ed16df
data/.gitignore
ADDED
@@ -0,0 +1,25 @@
*.gem
*.rbc
.bundle
.config
.yardoc
Gemfile.lock
InstalledFiles
_yardoc
coverage
doc/
lib/bundler/man
pkg
rdoc
spec/reports
test/tmp
test/version_tmp
tmp
*.bundle
*.so
*.o
*.a
mkmf.log
config.yml
akane.yml
*.p12
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2014 Shota Fukumori (sora_h)

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,40 @@
# akane-bigquery - Storage engine for akane.gem that streams tweets to Google BigQuery

Storage plugin gem for [akane](https://github.com/sorah/akane) that allows you to use BigQuery as akane's storage engine.

## Installation

Add this line to your application's Gemfile:

    gem 'akane-bigquery'

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install akane-bigquery

## Loading past data

If you're using akane's `file` storage for past data, `akane-bigquery prepare` allows you to load it into BigQuery.

```
$ mkdir /tmp/akane-bigquery
$ akane-bigquery prepare /path/to/your/file-storage /tmp/akane-bigquery
$ gsutil -m cp /tmp/akane-bigquery/* gs://YOUR_BUCKET/
$ bq load --source_format=NEWLINE_DELIMITED_JSON YOUR_DATASET_ID.tweets "$(gsutil ls gs://YOUR_BUCKET/ | ruby -e 'ARGF.readlines.map(&:chomp).reject(&:empty?).join(",").display')"
```

## Usage

TODO: Write usage instructions here

## Contributing

1. Fork it ( https://github.com/sorah/akane-bigquery/fork )
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create a new Pull Request
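
The README's Usage section is still a TODO. Based on the keys the code below actually reads (`AkaneBigquery.make_client` in lib/akane-bigquery.rb and the `config` helper in lib/akane-bigquery/cli.rb), a bigquery storage entry in akane's config might look like the following sketch; the `storages:` list layout is inferred from the CLI, and every value is a placeholder:

```
storages:
  - bigquery:
      name: main                    # optional; matched by --config_name when you have several bigquery storages
      project_id: your-project-id
      dataset_id: akane
      client_id: 000000000000.apps.googleusercontent.com
      service_email: 000000000000@developer.gserviceaccount.com
      key:
        path: /path/to/service_account.p12
        passphrase: notasecret
```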
data/Rakefile
ADDED
data/akane-bigquery.gemspec
ADDED
@@ -0,0 +1,30 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'akane-bigquery/version'

Gem::Specification.new do |spec|
  spec.name          = "akane-bigquery"
  spec.version       = AkaneBigquery::VERSION
  spec.authors       = ["Shota Fukumori (sora_h)"]
  spec.email         = ["her@sorah.jp"]
  spec.summary       = %q{akane.gem Google Bigquery storage adapter}
  spec.description   = %q{Google Bigquery storage adapter for akane.gem}
  spec.homepage      = "https://github.com/sorah/akane-bigquery"
  spec.license       = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_dependency "akane", ">= 0.2.0"
  spec.add_dependency 'google-api-client', '>= 0.7.1'
  spec.add_dependency 'thor', '>= 0.19.1'
  spec.add_dependency 'oj'

  spec.add_development_dependency "bundler", "~> 1.6"
  spec.add_development_dependency "rspec", "~> 3.0.0"
  spec.add_development_dependency "webmock", "~> 1.17.3"
  spec.add_development_dependency "rake"
end
data/bin/akane-bigquery
ADDED
data/lib/akane-bigquery.rb
ADDED
@@ -0,0 +1,42 @@
require 'akane/storages/bigquery'
require 'akane-bigquery/version'
require 'akane-bigquery/schema'

require 'google/api_client'

module AkaneBigquery
  def self.make_client(config)
    raise ArgumentError, "missing config['key']" unless config['key']
    raise ArgumentError, "missing config['key']['path']" unless config['key']['path']
    raise ArgumentError, "missing config['key']['passphrase']" unless config['key']['passphrase']
    raise ArgumentError, "missing config['client_id']" unless config['client_id']
    raise ArgumentError, "missing config['service_email']" unless config['service_email']

    client = Google::APIClient.new(
      application_name: config["app_name"] || 'akane',
      application_version: AkaneBigquery::VERSION,
    )

    key = Google::APIClient::KeyUtils.load_from_pkcs12(
      config['key']['path'],
      config['key']['passphrase']
    )

    client.authorization = Signet::OAuth2::Client.new(
      token_credential_uri: 'https://accounts.google.com/o/oauth2/token',
      audience: 'https://accounts.google.com/o/oauth2/token',
      scope: 'https://www.googleapis.com/auth/bigquery',
      issuer: config['service_email'],
      signing_key: key,
    )

    client.authorization.fetch_access_token!

    return client
  end

  def self.make_bigquery_client(config)
    client = make_client(config)
    [client, client.discovered_api("bigquery", "v2")]
  end
end
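
A minimal usage sketch (not part of the gem) for the helpers above: the config hash mirrors the keys that `make_client` validates, and all values are placeholders. `make_bigquery_client` returns the authorized `Google::APIClient` plus the discovered BigQuery v2 API, which is exactly what the CLI and the storage class below consume.

```
require 'akane-bigquery'

config = {
  'project_id'    => 'your-project-id',
  'dataset_id'    => 'akane',
  'client_id'     => '000000000000.apps.googleusercontent.com',
  'service_email' => '000000000000@developer.gserviceaccount.com',
  'key'           => { 'path' => 'service_account.p12', 'passphrase' => 'notasecret' },
}

client, api = AkaneBigquery.make_bigquery_client(config)

# Same call pattern the CLI uses: check that the configured dataset exists.
result = client.execute(
  api_method: api.datasets.get,
  parameters: {
    'projectId' => config['project_id'],
    'datasetId' => config['dataset_id'],
  }
)
puts result.error? ? result.error_message : 'dataset is reachable'
```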
data/lib/akane-bigquery/cli.rb
ADDED
@@ -0,0 +1,218 @@
require 'akane-bigquery'
require 'yaml'
require 'thor'
require 'oj'

module AkaneBigquery
  class CLI < Thor
    desc "init", 'creates table on bigquery'
    method_option :config,
      required: true, aliases: %w(-c),
      desc: "path to akane config file (yml)"
    method_option :config_name,
      desc: "select bigquery configuration by name key. use this if you have multiple bigquery storages in config file"


    def init
      # check dataset existence
      dataset = client.execute(
        api_method: api.datasets.get,
        parameters: {
          'projectId' => config['project_id'],
          'datasetId' => config['dataset_id'],
        }
      )

      if dataset.error?
        if dataset.error_message =~ /^Not Found:/i
          puts "Creating dataset #{config['dataset_id']} ..."
          dataset = client.execute(
            api_method: api.datasets.insert,
            parameters: {
              'projectId' => config['project_id'],
            },
            body_object: {
              'datasetReference' => {
                'datasetId' => config['dataset_id'],
              },
              'description' => 'akane',
            }
          )

          raise dataset.error_message if dataset.error?
        else
          raise dataset.error_message
        end
      end

      schemas = AkaneBigquery::Schema::SCHEMA

      schemas.each do |table_id, schema|
        table = client.execute(
          api_method: api.tables.get,
          parameters: {
            'projectId' => config['project_id'],
            'datasetId' => config['dataset_id'],
            'tableId' => table_id,
          },
        )

        if table.error?
          if table.error_message =~ /^Not Found:/i
            puts "Creating table #{table_id} ..."
            table = client.execute(
              api_method: api.tables.insert,
              parameters: {
                'projectId' => config['project_id'],
                'datasetId' => config['dataset_id'],
              },
              body_object: {
                'tableReference' => {
                  'projectId' => config['project_id'],
                  'datasetId' => config['dataset_id'],
                  'tableId' => table_id,
                },
                'friendlyName' => table_id,
                'schema' => schema,
              }
            )
            raise table.error_message if table.error?
          else
            raise table.error_message
          end
        end

      end
    end

    desc "prepare SOURCE DEST", "prepare JSONs for loading into BigQuery from existing file storage data"
    method_option :months, desc: "Names of months to process. Separated by comma."
    method_option :before, desc: "Dump only data before specified datetime. Value will be parsed by `Time.parse` of Ruby."
    def prepare(source, prefix)
      limit = 524288000 # 500MBytes

      count = -1
      bytes = 0

      new_io = lambda do
        bytes = 0
        count += 1
        path = File.join(prefix, "tweets.#{count.to_s.rjust(4,'0')}.txt")
        puts "=> Using #{path}"
        File.open(path, 'w')
      end
      io = new_io.call

      months = options[:months] && options[:months].split(/,/)
      before = options[:before] && Time.parse(options[:before])

      userdirs = Dir.entries(File.join(source, "users"))
      userdirs.each_with_index do |user_dirname, index|
        next if user_dirname == "." || user_dirname == ".."
        puts " * #{user_dirname} (#{index.succ}/#{userdirs.size}, #{((index.succ/userdirs.size.to_f)*100).to_i}%)"

        userdir = File.join(source, "users", user_dirname)

        tweet_filepaths = if options[:months]
          months.map { |_| File.join(userdir, "tweets.#{_}.txt") }
        else
          Dir[File.join(userdir, 'tweets.*.txt')]
        end
        tweet_filepaths.each do |file|
          begin
            File.open(file, 'r') do |tweets_io|
              tweets_io.each_line do |line|
                json = line.chomp

                tweet = Oj.load(json)

                created_at = Time.parse(tweet['created_at'.freeze])
                next if before && before <= created_at

                new_json = {
                  'json'.freeze => json,
                  'id_str'.freeze => tweet['id_str'.freeze],
                  'id'.freeze => tweet['id'.freeze],
                  'text'.freeze => tweet['text'.freeze],
                  'lang'.freeze => tweet['lang'.freeze],
                  'source'.freeze => tweet['source'.freeze],
                  'in_reply_to_status_id'.freeze => tweet['in_reply_to_status_id'.freeze],
                  'in_reply_to_status_id_str'.freeze => tweet['in_reply_to_status_id_str'.freeze],
                  'in_reply_to_user_id'.freeze => tweet['in_reply_to_user_id'.freeze],
                  'in_reply_to_user_id_str'.freeze => tweet['in_reply_to_user_id_str'.freeze],
                  'in_reply_to_screen_name'.freeze => tweet['in_reply_to_screen_name'.freeze],
                  'user'.freeze => {
                    'id_str'.freeze => tweet['user'.freeze]['id_str'.freeze],
                    'id'.freeze => tweet['user'.freeze]['id'.freeze],
                    'name'.freeze => tweet['user'.freeze]['name'.freeze],
                    'screen_name'.freeze => tweet['user'.freeze]['screen_name'.freeze],
                    'protected'.freeze => tweet['user'.freeze]['protected'.freeze],
                  },
                  'created_at'.freeze => created_at.to_i
                }

                if tweet['coordinates'.freeze]
                  new_json['coordinates_longitude'.freeze] = tweet['coordinates'.freeze]['coordinates'.freeze][0]
                  new_json['coordinates_latitude'.freeze] = tweet['coordinates'.freeze]['coordinates'.freeze][1]
                end

                if tweet['place'.freeze]
                  place = tweet['place'.freeze]
                  new_json['place'.freeze] = {
                    'id'.freeze => place['id'.freeze],
                    'country'.freeze => place['country'.freeze],
                    'country_code'.freeze => place['country_code'.freeze],
                    'name'.freeze => place['name'.freeze],
                    'full_name'.freeze => place['full_name'.freeze],
                    'place_type'.freeze => place['place_type'.freeze],
                    'url'.freeze => place['url'.freeze],
                  }
                end

                new_json_str = Oj.dump(new_json)
                io.puts new_json_str
                bytes += new_json_str.size + 1
                io = new_io.call if limit <= bytes
              end
            end
          rescue Errno::ENOENT
          end

        end

      end
    end

    private

    def config
      @config ||= begin
        storages = YAML.load_file(options[:config])['storages']

        conf = if options[:config_name]
          storages.find { |_| _['bigquery'] && _['bigquery']['name'] == options[:config_name] }
        else
          storages.find { |_| _['bigquery'] }
        end

        (conf && conf['bigquery']) or \
          abort 'error: bigquery storage configuration not found'
      end
    end

    def client
      client_and_api; @client
    end

    def api
      client_and_api; @api
    end

    def client_and_api
      return @client_and_api if @client_and_api

      @client_and_api = AkaneBigquery.make_bigquery_client(config)
      @client, @api = @client_and_api
    end
  end
end
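
The two Thor commands above can be driven like this (a sketch; file and directory names are placeholders, and `-c` is the alias defined for `--config`). `init` needs the config file because it creates the dataset and tables via the API, while `prepare` only reads akane's file storage, so it takes no config; the `--months` values must match the `tweets.<month>.txt` suffixes used by that storage, and `--before` is parsed with Ruby's `Time.parse`.

```
$ akane-bigquery init -c akane.yml
$ akane-bigquery prepare ~/akane-file-storage /tmp/akane-bigquery --months 2014-06,2014-07 --before "2014-07-01 00:00:00"
```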
data/lib/akane-bigquery/schema.rb
ADDED
@@ -0,0 +1,93 @@
module AkaneBigquery
  module Schema

    # STRING, INTEGER, FLOAT, BOOLEAN, TIMESTAMP or RECORD
    # NULLABLE, REQUIRED and REPEATED.
    SCHEMAS = {
      '0' => {
        'tweets' => {
          'fields' => [
            {'name' => 'json', 'type' => 'STRING', 'mode' => 'REQUIRED'},

            {'name' => 'id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
            {'name' => 'id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},

            {'name' => 'text', 'type' => 'STRING', 'mode' => 'REQUIRED'},

            {'name' => 'lang', 'type' => 'STRING'},
            {'name' => 'source', 'type' => 'STRING'},

            {'name' => 'in_reply_to_status_id_str', 'type' => 'STRING'},
            {'name' => 'in_reply_to_status_id', 'type' => 'INTEGER'},
            {'name' => 'in_reply_to_user_id_str', 'type' => 'STRING'},
            {'name' => 'in_reply_to_user_id', 'type' => 'INTEGER'},
            {'name' => 'in_reply_to_screen_name', 'type' => 'STRING'},

            {'name' => 'retweeted_status_id_str', 'type' => 'STRING'},
            {'name' => 'retweeted_status_id', 'type' => 'INTEGER'},

            {'name' => 'created_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},

            {
              'name' => 'user', 'type' => 'RECORD', 'mode' => 'REQUIRED',
              'fields' => [
                {'name' => 'id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
                {'name' => 'id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
                {'name' => 'name', 'type' => 'STRING', 'mode' => 'REQUIRED'},
                {'name' => 'screen_name', 'type' => 'STRING', 'mode' => 'REQUIRED'},
                {'name' => 'protected', 'type' => 'BOOLEAN', 'mode' => 'NULLABLE'},
              ],
            },

            {'name' => 'coordinates_longitude', 'type' => 'FLOAT'},
            {'name' => 'coordinates_latitude', 'type' => 'FLOAT'},

            {
              'name' => 'place', 'type' => 'RECORD',
              'fields' => [
                {'name' => 'id', 'type' => 'STRING'},
                {'name' => 'country', 'type' => 'STRING'},
                {'name' => 'country_code', 'type' => 'STRING'},
                {'name' => 'name', 'type' => 'STRING'},
                {'name' => 'full_name', 'type' => 'STRING'},
                {'name' => 'place_type', 'type' => 'STRING'},
                {'name' => 'url', 'type' => 'STRING'},
              ],
            },
          ],
        },
        'deletions' => {
          'fields' => [
            {'name' => 'user_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
            {'name' => 'tweet_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
            {'name' => 'user_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
            {'name' => 'tweet_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},

            {'name' => 'deleted_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
          ],
        },
        'events' => {
          'fields' => [
            {'name' => 'json', 'type' => 'STRING', 'mode' => 'REQUIRED'},

            {'name' => 'event', 'type' => 'STRING', 'mode' => 'REQUIRED'},

            {'name' => 'source_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
            {'name' => 'target_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
            {'name' => 'source_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
            {'name' => 'target_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},

            {'name' => 'target_object_id', 'type' => 'INTEGER'},
            {'name' => 'target_object_id_str', 'type' => 'STRING'},

            {'name' => 'created_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
          ],
        },
      }.freeze,
    }.freeze

    VERSION = '0'
    SCHEMA = SCHEMAS[VERSION]

  end
end
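
The `fields` arrays above are already in BigQuery's JSON table-schema shape, so if you prefer creating or loading tables with the `bq` CLI (as in the README's "Loading past data" section) instead of `akane-bigquery init`, you can dump them to JSON. A sketch, assuming the gem is installed; the output file names are arbitrary:

```
require 'akane-bigquery/schema'
require 'oj'

# Write one schema JSON file per table ('tweets', 'deletions', 'events').
AkaneBigquery::Schema::SCHEMA.each do |table_id, table|
  File.write("#{table_id}_schema.json", Oj.dump(table['fields'], mode: :compat))
end
```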
data/lib/akane/storages/bigquery.rb
ADDED
@@ -0,0 +1,273 @@
require 'akane/storages/abstract_storage'
require 'akane-bigquery'

require 'thread'

module Akane
  module Storages
    class Bigquery < AbstractStorage
      class Stop < Exception; end # :nodoc:

      def initialize(*)
        super

        @client, @api = AkaneBigquery.make_bigquery_client(@config)

        @project_id = @config['project_id']
        @dataset_id = @config['dataset_id']

        @lock = Mutex.new
        @thread = nil

        @flush_interval = @config['flush_interval'] ? @config['flush_interval'].to_i : 60
        @flush_threshold = @config['flush_threshold'] ? @config['flush_threshold'].to_i : 1000

        @pending_inserts = []
        @failing_inserts = []
        @pending_inserts_lock = Mutex.new

        swap_buffers # initialize
        start
      end

      def name
        @name ||= "bigquery:#{@project_id}/#{@dataset_id}"
      end

      def bq_insert(table, row)
        @lock.synchronize do
          @buffers[table] << row
        end
        self
      end

      def start
        @lock.synchronize do
          unless @thread
            @thread = Thread.new(&method(:worker_loop))
            @stop = false
          end
        end
      end

      def exitable?
        @stop && (@thread ? @thread.alive? : true)
      end

      def stop!
        @lock.synchronize do
          super
          @thread.raise(Stop) if @thread
        end
      end

      def record_tweet(account, tweet)
        hash = tweet.attrs
        row = {
          'json'.freeze => hash.to_json,
          'id_str'.freeze => hash[:id_str],
          'id'.freeze => hash[:id],
          'text'.freeze => hash[:text],
          'lang'.freeze => hash[:lang],
          'source'.freeze => hash[:source],
          'in_reply_to_status_id'.freeze => hash[:in_reply_to_status_id],
          'in_reply_to_status_id_str'.freeze => hash[:in_reply_to_status_id_str],
          'in_reply_to_user_id'.freeze => hash[:in_reply_to_user_id],
          'in_reply_to_user_id_str'.freeze => hash[:in_reply_to_user_id_str],
          'in_reply_to_screen_name'.freeze => hash[:in_reply_to_screen_name],
          'user'.freeze => {
            'id_str'.freeze => hash[:user][:id_str],
            'id'.freeze => hash[:user][:id],
            'name'.freeze => hash[:user][:name],
            'screen_name'.freeze => hash[:user][:screen_name],
            'protected'.freeze => hash[:user][:protected],
          },
          'created_at'.freeze => Time.parse(hash[:created_at]).to_i
        }

        if hash['coordinates'.freeze]
          row['coordinates_longitude'.freeze], row['coordinates_latitude'.freeze] = \
            hash[:coordinates][:coordinates]
        end

        if hash[:place]
          place = hash[:place]
          row['place'.freeze] = {
            'id'.freeze => place[:id],
            'country'.freeze => place[:country],
            'country_code'.freeze => place[:country_code],
            'name'.freeze => place[:name],
            'full_name'.freeze => place[:full_name],
            'place_type'.freeze => place[:place_type],
            'url'.freeze => place[:url],
          }
        end

        bq_insert :tweets, row
      end

      def mark_as_deleted(account, user_id, tweet_id)
        bq_insert(:deletions,
          'user_id'.freeze => user_id,
          'user_id_str'.freeze => user_id.to_s,
          'tweet_id'.freeze => tweet_id,
          'tweet_id_str'.freeze => tweet_id.to_s,
          'deleted_at'.freeze => Time.now.to_i,
        )
      end

      def record_event(account, event)
        source = event['source'.freeze]
        target = event['target'.freeze]
        target_object = event['target_object'.freeze]

        source_id = source[:id]
        target_id = target[:id]

        unless source_id && target_id
          @logger.warn "Discarding event because source and target id is missing: #{event.inspect}"
          return
        end

        hash = Hash[
          event.map { |k,v| [k, v && v.respond_to?(:attrs) ? v.attrs : nil] }
        ]

        row = {
          'json'.freeze => hash.to_json,
          'event'.freeze => event['event'.freeze],
          'source_id'.freeze => source_id,
          'source_id_str'.freeze => source_id.to_s,
          'target_id'.freeze => target_id,
          'target_id_str'.freeze => target_id.to_s,
          'created_at'.freeze => Time.now.to_i
        }

        if target_object && target_object[:id]
          id = target_object[:id]
          row['target_object_id'.freeze] = id
          row['target_object_id_str'.freeze] = id.to_s
        end

        p row
        bq_insert :events, row
      end

      def record_message(account, message)
      end

      def status
        @buffers ? @buffers.map{ |table, buf| "#{table}=#{buf.size}" }.join(', ') + " | #{@failing_inserts.size} failures, #{@pending_inserts.size} inserts" : "-"
      end

      private

      def swap_buffers
        @lock.synchronize do
          old_buffers = @buffers
          @buffers = {tweets: [], messages: [], deletions: [], events: []}

          old_buffers
        end
      end

      def worker_loop
        @last_flush = Time.now
        retry_interval = 1

        begin
          flush_pending_inserts

          loop do
            if @flush_interval <= (Time.now - @last_flush) || @flush_threshold <= @buffers.values.map(&:size).inject(:+)
              flush_buffer
            end

            flush_pending_inserts

            sleep 1
          end
        rescue Stop
          @logger.info "Flushing buffer for graceful quit"
          flush_buffer
          until @pending_inserts.empty? && @failing_inserts.empty?
            flush_pending_inserts(true)
            sleep 10 unless @failing_inserts.empty?
          end
        rescue Exception => e
          @logger.error "#{name} - Encountered error on buffer worker"
          @logger.error e.inspect
          @logger.error e.backtrace.join("\n")

          @logger.error "Retrying after #{retry_interval.to_i}"
          sleep retry_interval.to_i
          retry_interval *= 1.8
          retry
        end
      end

      def flush_buffer
        prev_buffers = swap_buffers()

        prev_buffers.each do |table, rows|
          next if rows.empty?

          insert_id_base = "#{Time.now.to_f}:#{rows.__id__}:#{table}"
          request = {
            api_method: @api.tabledata.insert_all,
            parameters: {
              'datasetId' => @dataset_id,
              'projectId' => @project_id,
              'tableId' => table.to_s,
            },
            body_object: {
              'rows' => rows.map.with_index { |row, index|
                {
                  'insertId'.freeze => "#{insert_id_base}:#{index}",
                  'json'.freeze => row,
                }
              }
            }
          }
          @pending_inserts_lock.synchronize do
            @logger.debug "Adding pending inserts for #{table}, #{rows.size} rows"
            @pending_inserts << {request: request, insert_id: insert_id_base}
          end
        end

        @last_flush = Time.now
      end

      def flush_pending_inserts(do_failures = false)
        while failing_request = @failing_inserts.shift
          if do_failures || Time.now <= failing_request[:next_try]
            @logger.info "[#{name}] Retrying #{failing_request[:insert_id]}"
            @pending_inserts_lock.synchronize { @pending_inserts.push(failing_request) }
          end
        end

        while request = @pending_inserts_lock.synchronize { @pending_inserts.shift }
          table = request[:request][:parameters]['tableId']
          result = @client.execute(request[:request])

          if result.error?
            if request[:retry]
              request[:retry] *= 1.8
            else
              request[:retry] = 5
            end

            request[:next_try] = Time.now + request[:retry]

            @logger.error "[#{name}] Failed #{table} to insert: #{result.error_message} (#{request[:insert_id]}); retrying in #{request[:retry]} seconds"
            @failing_inserts << request
          else
            @logger.debug "[#{name}] Inserted records in #{table}"
          end
        end
      end

    end
  end
end

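
The storage above buffers rows in memory and a background worker thread streams them to BigQuery with `tabledata.insertAll`, flushing either every `flush_interval` seconds or as soon as `flush_threshold` rows are buffered, and retrying failed inserts with exponential back-off. Both knobs can be set in the same `bigquery` config entry sketched after the README; a sketch with illustrative values:

```
bigquery:
  # in addition to the credential keys shown earlier
  flush_interval: 30     # seconds between flushes (code default: 60)
  flush_threshold: 500   # buffered rows that trigger an early flush (code default: 1000)
```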
metadata
ADDED
@@ -0,0 +1,169 @@
--- !ruby/object:Gem::Specification
name: akane-bigquery
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- Shota Fukumori (sora_h)
autorequire:
bindir: bin
cert_chain: []
date: 2014-07-26 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: akane
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.2.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.2.0
- !ruby/object:Gem::Dependency
  name: google-api-client
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.7.1
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.7.1
- !ruby/object:Gem::Dependency
  name: thor
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.19.1
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.19.1
- !ruby/object:Gem::Dependency
  name: oj
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
- !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 3.0.0
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 3.0.0
- !ruby/object:Gem::Dependency
  name: webmock
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 1.17.3
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 1.17.3
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
description: Google Bigquery storage adapter for akane.gem
email:
- her@sorah.jp
executables:
- akane-bigquery
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- Gemfile
- LICENSE.txt
- README.md
- Rakefile
- akane-bigquery.gemspec
- bin/akane-bigquery
- lib/akane-bigquery.rb
- lib/akane-bigquery/cli.rb
- lib/akane-bigquery/schema.rb
- lib/akane-bigquery/version.rb
- lib/akane/storages/bigquery.rb
homepage: https://github.com/sorah/akane-bigquery
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.2.2
signing_key:
specification_version: 4
summary: akane.gem Google Bigquery storage adapter
test_files: []