akane-bigquery 0.1.0

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 7fc97e06744000530649b80d35908244a7003ff5
+   data.tar.gz: 7e8348c2df4d4a66eaa4048902229a6e69a2e9bf
+ SHA512:
+   metadata.gz: 1d206afead4eada0ac898352277af4ba8b276e22da99ed4adc388d0d053781a48c189b23fd64e92e4b1a64a2a913a706f81ec6e2baf3611740a0e90f883058b4
+   data.tar.gz: b98ab328c7b3661ab0f623f8082979685c50f0654a9a651817296599dc89510e0dcc95130c8e7b91b797fd1aec9c254a950d726de82f2bd0f8470ba831ed16df
data/.gitignore ADDED
@@ -0,0 +1,25 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ *.bundle
+ *.so
+ *.o
+ *.a
+ mkmf.log
+ config.yml
+ akane.yml
+ *.p12
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in akane-bigquery.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Shota Fukumori (sora_h)
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,40 @@
+ # akane-bigquery - Storage engine for akane.gem that streams tweets to Google BigQuery
+
+ Storage plugin gem for [akane](https://github.com/sorah/akane) that allows you to use Google BigQuery as akane's storage engine.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'akane-bigquery'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install akane-bigquery
+
+ ## Loading past data
+
+ If you're using akane's `file` storage, `akane-bigquery prepare` lets you load its past data into BigQuery.
+
+ ```
+ $ mkdir /tmp/akane-bigquery
+ $ akane-bigquery prepare /path/to/your/file-storage /tmp/akane-bigquery
+ $ gsutil -m cp /tmp/akane-bigquery/* gs://YOUR_BUCKET/
+ $ bq load --source_format=NEWLINE_DELIMITED_JSON YOUR_DATASET_ID.tweets "$(gsutil ls gs://YOUR_BUCKET/ | ruby -e 'ARGF.readlines.map(&:chomp).reject(&:empty?).join(",").display')"
+ ```
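+
+ `prepare` also accepts `--months`, a comma-separated list of month names as they appear in the `tweets.<month>.txt` file names, and `--before '2014-07-01 00:00:00'` to skip tweets newer than the given datetime (parsed by Ruby's `Time.parse`).
+
+ The quoted subshell in the last command only builds a comma-separated list of the uploaded `gs://` URIs; an equivalent sketch, assuming a POSIX `paste` is available:
+
+ ```
+ $ bq load --source_format=NEWLINE_DELIMITED_JSON YOUR_DATASET_ID.tweets \
+     "$(gsutil ls gs://YOUR_BUCKET/ | paste -sd, -)"
+ ```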
+
+ ## Usage
+
+ TODO: Write usage instructions here
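+
+ Until then, here is a minimal sketch of a `bigquery` storage entry for your akane config file. The key names follow the configuration checks in `lib/akane-bigquery.rb` and `lib/akane/storages/bigquery.rb`; every value below is a placeholder:
+
+ ```
+ storages:
+   - bigquery:
+       project_id: your-project-id
+       dataset_id: akane
+       client_id: "123456789.apps.googleusercontent.com"
+       service_email: "123456789@developer.gserviceaccount.com"
+       key:
+         path: /path/to/service-account.p12
+         passphrase: notasecret
+       # name: main             # optional; matched by --config-name
+       # flush_interval: 60     # optional; seconds between buffer flushes (default 60)
+       # flush_threshold: 1000  # optional; buffered rows that trigger an early flush (default 1000)
+ ```
+
+ Once the config is in place, `akane-bigquery init -c /path/to/config.yml` creates the dataset and the `tweets`, `deletions`, and `events` tables defined in `AkaneBigquery::Schema`.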
+
+ ## Contributing
+
+ 1. Fork it ( https://github.com/sorah/akane-bigquery/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+
data/akane-bigquery.gemspec ADDED
@@ -0,0 +1,30 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'akane-bigquery/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "akane-bigquery"
+   spec.version       = AkaneBigquery::VERSION
+   spec.authors       = ["Shota Fukumori (sora_h)"]
+   spec.email         = ["her@sorah.jp"]
+   spec.summary       = %q{akane.gem Google BigQuery storage adapter}
+   spec.description   = %q{Google BigQuery storage adapter for akane.gem}
+   spec.homepage      = "https://github.com/sorah/akane-bigquery"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files -z`.split("\x0")
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "akane", ">= 0.2.0"
+   spec.add_dependency 'google-api-client', '>= 0.7.1'
+   spec.add_dependency 'thor', '>= 0.19.1'
+   spec.add_dependency 'oj'
+
+   spec.add_development_dependency "bundler", "~> 1.6"
+   spec.add_development_dependency "rspec", "~> 3.0.0"
+   spec.add_development_dependency "webmock", "~> 1.17.3"
+   spec.add_development_dependency "rake"
+ end
data/bin/akane-bigquery ADDED
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ require 'akane-bigquery/cli'
+
+ AkaneBigquery::CLI.start
data/lib/akane-bigquery.rb ADDED
@@ -0,0 +1,42 @@
+ require 'akane/storages/bigquery'
+ require 'akane-bigquery/version'
+ require 'akane-bigquery/schema'
+
+ require 'google/api_client'
+
+ module AkaneBigquery
+   # Builds an authorized Google::APIClient from a service account's
+   # PKCS#12 key, as given in akane's storage configuration.
+   def self.make_client(config)
+     raise ArgumentError, "missing config['key']" unless config['key']
+     raise ArgumentError, "missing config['key']['path']" unless config['key']['path']
+     raise ArgumentError, "missing config['key']['passphrase']" unless config['key']['passphrase']
+     raise ArgumentError, "missing config['client_id']" unless config['client_id']
+     raise ArgumentError, "missing config['service_email']" unless config['service_email']
+
+     client = Google::APIClient.new(
+       application_name: config["app_name"] || 'akane',
+       application_version: AkaneBigquery::VERSION,
+     )
+
+     key = Google::APIClient::KeyUtils.load_from_pkcs12(
+       config['key']['path'],
+       config['key']['passphrase']
+     )
+
+     client.authorization = Signet::OAuth2::Client.new(
+       token_credential_uri: 'https://accounts.google.com/o/oauth2/token',
+       audience: 'https://accounts.google.com/o/oauth2/token',
+       scope: 'https://www.googleapis.com/auth/bigquery',
+       issuer: config['service_email'],
+       signing_key: key,
+     )
+
+     client.authorization.fetch_access_token!
+
+     client
+   end
+
+   # Returns [client, discovered BigQuery v2 API] for convenience.
+   def self.make_bigquery_client(config)
+     client = make_client(config)
+     [client, client.discovered_api("bigquery", "v2")]
+   end
+ end
data/lib/akane-bigquery/cli.rb ADDED
@@ -0,0 +1,218 @@
+ require 'akane-bigquery'
+ require 'yaml'
+ require 'time'
+ require 'thor'
+ require 'oj'
+
+ module AkaneBigquery
+   class CLI < Thor
+     desc "init", 'creates the dataset and tables on BigQuery'
+     method_option :config,
+       required: true, aliases: %w(-c),
+       desc: "path to akane config file (yml)"
+     method_option :config_name,
+       desc: "select bigquery configuration by its name key; use this if you have multiple bigquery storages in your config file"
+
+     def init
+       # check dataset existence
+       dataset = client.execute(
+         api_method: api.datasets.get,
+         parameters: {
+           'projectId' => config['project_id'],
+           'datasetId' => config['dataset_id'],
+         }
+       )
+
+       if dataset.error?
+         if dataset.error_message =~ /^Not Found:/i
+           puts "Creating dataset #{config['dataset_id']} ..."
+           dataset = client.execute(
+             api_method: api.datasets.insert,
+             parameters: {
+               'projectId' => config['project_id'],
+             },
+             body_object: {
+               'datasetReference' => {
+                 'datasetId' => config['dataset_id'],
+               },
+               'description' => 'akane',
+             }
+           )
+
+           raise dataset.error_message if dataset.error?
+         else
+           raise dataset.error_message
+         end
+       end
+
+       schemas = AkaneBigquery::Schema::SCHEMA
+
+       schemas.each do |table_id, schema|
+         table = client.execute(
+           api_method: api.tables.get,
+           parameters: {
+             'projectId' => config['project_id'],
+             'datasetId' => config['dataset_id'],
+             'tableId' => table_id,
+           },
+         )
+
+         if table.error?
+           if table.error_message =~ /^Not Found:/i
+             puts "Creating table #{table_id} ..."
+             table = client.execute(
+               api_method: api.tables.insert,
+               parameters: {
+                 'projectId' => config['project_id'],
+                 'datasetId' => config['dataset_id'],
+               },
+               body_object: {
+                 'tableReference' => {
+                   'projectId' => config['project_id'],
+                   'datasetId' => config['dataset_id'],
+                   'tableId' => table_id,
+                 },
+                 'friendlyName' => table_id,
+                 'schema' => schema,
+               }
+             )
+             raise table.error_message if table.error?
+           else
+             raise table.error_message
+           end
+         end
+       end
+     end
+
+     desc "prepare SOURCE DEST", "prepares JSON files for loading into BigQuery from existing file storage data"
+     method_option :months, desc: "Names of months to process, separated by commas."
+     method_option :before, desc: "Dump only data before the specified datetime. The value is parsed by Ruby's `Time.parse`."
+     def prepare(source, prefix)
+       limit = 524288000 # rotate output files at 500 MiB
+
+       count = -1
+       bytes = 0
+
+       new_io = lambda do
+         bytes = 0
+         count += 1
+         path = File.join(prefix, "tweets.#{count.to_s.rjust(4, '0')}.txt")
+         puts "=> Using #{path}"
+         File.open(path, 'w')
+       end
+       io = new_io.call
+
+       months = options[:months] && options[:months].split(/,/)
+       before = options[:before] && Time.parse(options[:before])
+
+       userdirs = Dir.entries(File.join(source, "users"))
+       userdirs.each_with_index do |user_dirname, index|
+         next if user_dirname == "." || user_dirname == ".."
+         puts " * #{user_dirname} (#{index.succ}/#{userdirs.size}, #{((index.succ / userdirs.size.to_f) * 100).to_i}%)"
+
+         userdir = File.join(source, "users", user_dirname)
+
+         tweet_filepaths = if options[:months]
+           months.map { |_| File.join(userdir, "tweets.#{_}.txt") }
+         else
+           Dir[File.join(userdir, 'tweets.*.txt')]
+         end
+         tweet_filepaths.each do |file|
+           begin
+             File.open(file, 'r') do |tweets_io|
+               tweets_io.each_line do |line|
+                 json = line.chomp
+
+                 tweet = Oj.load(json)
+
+                 created_at = Time.parse(tweet['created_at'.freeze])
+                 next if before && before <= created_at
+
+                 new_json = {
+                   'json'.freeze => json,
+                   'id_str'.freeze => tweet['id_str'.freeze],
+                   'id'.freeze => tweet['id'.freeze],
+                   'text'.freeze => tweet['text'.freeze],
+                   'lang'.freeze => tweet['lang'.freeze],
+                   'source'.freeze => tweet['source'.freeze],
+                   'in_reply_to_status_id'.freeze => tweet['in_reply_to_status_id'.freeze],
+                   'in_reply_to_status_id_str'.freeze => tweet['in_reply_to_status_id_str'.freeze],
+                   'in_reply_to_user_id'.freeze => tweet['in_reply_to_user_id'.freeze],
+                   'in_reply_to_user_id_str'.freeze => tweet['in_reply_to_user_id_str'.freeze],
+                   'in_reply_to_screen_name'.freeze => tweet['in_reply_to_screen_name'.freeze],
+                   'user'.freeze => {
+                     'id_str'.freeze => tweet['user'.freeze]['id_str'.freeze],
+                     'id'.freeze => tweet['user'.freeze]['id'.freeze],
+                     'name'.freeze => tweet['user'.freeze]['name'.freeze],
+                     'screen_name'.freeze => tweet['user'.freeze]['screen_name'.freeze],
+                     'protected'.freeze => tweet['user'.freeze]['protected'.freeze],
+                   },
+                   'created_at'.freeze => created_at.to_i
+                 }
+
+                 if tweet['coordinates'.freeze]
+                   new_json['coordinates_longitude'.freeze] = tweet['coordinates'.freeze]['coordinates'.freeze][0]
+                   new_json['coordinates_latitude'.freeze] = tweet['coordinates'.freeze]['coordinates'.freeze][1]
+                 end
+
+                 if tweet['place'.freeze]
+                   place = tweet['place'.freeze]
+                   new_json['place'.freeze] = {
+                     'id'.freeze => place['id'.freeze],
+                     'country'.freeze => place['country'.freeze],
+                     'country_code'.freeze => place['country_code'.freeze],
+                     'name'.freeze => place['name'.freeze],
+                     'full_name'.freeze => place['full_name'.freeze],
+                     'place_type'.freeze => place['place_type'.freeze],
+                     'url'.freeze => place['url'.freeze],
+                   }
+                 end
+
+                 new_json_str = Oj.dump(new_json)
+                 io.puts new_json_str
+                 bytes += new_json_str.size + 1
+                 if limit <= bytes
+                   io.close
+                   io = new_io.call
+                 end
+               end
+             end
+           rescue Errno::ENOENT
+             # skip month files this user doesn't have
+           end
+         end
+       end
+
+       io.close
+     end
+
+     private
+
+     def config
+       @config ||= begin
+         storages = YAML.load_file(options[:config])['storages']
+
+         conf = if options[:config_name]
+           storages.find { |_| _['bigquery'] && _['bigquery']['name'] == options[:config_name] }
+         else
+           storages.find { |_| _['bigquery'] }
+         end
+
+         (conf && conf['bigquery']) or \
+           abort 'error: bigquery storage configuration not found'
+       end
+     end
+
+     def client
+       client_and_api; @client
+     end
+
+     def api
+       client_and_api; @api
+     end
+
+     # Memoizes the [client, api] pair from AkaneBigquery.make_bigquery_client.
+     def client_and_api
+       return @client_and_api if @client_and_api
+
+       @client_and_api = AkaneBigquery.make_bigquery_client(config)
+       @client, @api = @client_and_api
+     end
+   end
+ end
data/lib/akane-bigquery/schema.rb ADDED
@@ -0,0 +1,93 @@
+ module AkaneBigquery
+   module Schema
+
+     # Field types: STRING, INTEGER, FLOAT, BOOLEAN, TIMESTAMP or RECORD
+     # Field modes: NULLABLE, REQUIRED and REPEATED
+     SCHEMAS = {
+       '0' => {
+         'tweets' => {
+           'fields' => [
+             {'name' => 'json', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+             {'name' => 'id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+
+             {'name' => 'text', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'lang', 'type' => 'STRING'},
+             {'name' => 'source', 'type' => 'STRING'},
+
+             {'name' => 'in_reply_to_status_id_str', 'type' => 'STRING'},
+             {'name' => 'in_reply_to_status_id', 'type' => 'INTEGER'},
+             {'name' => 'in_reply_to_user_id_str', 'type' => 'STRING'},
+             {'name' => 'in_reply_to_user_id', 'type' => 'INTEGER'},
+             {'name' => 'in_reply_to_screen_name', 'type' => 'STRING'},
+
+             {'name' => 'retweeted_status_id_str', 'type' => 'STRING'},
+             {'name' => 'retweeted_status_id', 'type' => 'INTEGER'},
+
+             {'name' => 'created_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
+
+             {
+               'name' => 'user', 'type' => 'RECORD', 'mode' => 'REQUIRED',
+               'fields' => [
+                 {'name' => 'id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+                 {'name' => 'id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+                 {'name' => 'name', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+                 {'name' => 'screen_name', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+                 {'name' => 'protected', 'type' => 'BOOLEAN', 'mode' => 'NULLABLE'},
+               ],
+             },
+
+             {'name' => 'coordinates_longitude', 'type' => 'FLOAT'},
+             {'name' => 'coordinates_latitude', 'type' => 'FLOAT'},
+
+             {
+               'name' => 'place', 'type' => 'RECORD',
+               'fields' => [
+                 {'name' => 'id', 'type' => 'STRING'},
+                 {'name' => 'country', 'type' => 'STRING'},
+                 {'name' => 'country_code', 'type' => 'STRING'},
+                 {'name' => 'name', 'type' => 'STRING'},
+                 {'name' => 'full_name', 'type' => 'STRING'},
+                 {'name' => 'place_type', 'type' => 'STRING'},
+                 {'name' => 'url', 'type' => 'STRING'},
+               ],
+             },
+           ],
+         },
+         'deletions' => {
+           'fields' => [
+             {'name' => 'user_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+             {'name' => 'tweet_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+             {'name' => 'user_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+             {'name' => 'tweet_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'deleted_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
+           ],
+         },
+         'events' => {
+           'fields' => [
+             {'name' => 'json', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'event', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'source_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+             {'name' => 'target_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+             {'name' => 'source_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+             {'name' => 'target_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'target_object_id', 'type' => 'INTEGER'},
+             {'name' => 'target_object_id_str', 'type' => 'STRING'},
+
+             {'name' => 'created_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
+           ],
+         },
+       }.freeze,
+     }.freeze
+
+     # Schema version currently in use; SCHEMA is what `akane-bigquery init` creates.
+     VERSION = '0'
+     SCHEMA = SCHEMAS[VERSION]
+
+   end
+ end
data/lib/akane-bigquery/version.rb ADDED
@@ -0,0 +1,3 @@
+ module AkaneBigquery
+   VERSION = "0.1.0"
+ end
data/lib/akane/storages/bigquery.rb ADDED
@@ -0,0 +1,273 @@
+ require 'akane/storages/abstract_storage'
+ require 'akane-bigquery'
+
+ require 'thread'
+ require 'time'
+ require 'json'
+
+ module Akane
+   module Storages
+     class Bigquery < AbstractStorage
+       class Stop < Exception; end # :nodoc:
+
+       def initialize(*)
+         super
+
+         @client, @api = AkaneBigquery.make_bigquery_client(@config)
+
+         @project_id = @config['project_id']
+         @dataset_id = @config['dataset_id']
+
+         @lock = Mutex.new
+         @thread = nil
+
+         @flush_interval = @config['flush_interval'] ? @config['flush_interval'].to_i : 60
+         @flush_threshold = @config['flush_threshold'] ? @config['flush_threshold'].to_i : 1000
+
+         @pending_inserts = []
+         @failing_inserts = []
+         @pending_inserts_lock = Mutex.new
+
+         swap_buffers # initialize @buffers
+         start
+       end
+
+       def name
+         @name ||= "bigquery:#{@project_id}/#{@dataset_id}"
+       end
+
+       # Queues a row for the given table; the worker thread flushes queued
+       # rows to BigQuery's streaming insert API.
+       def bq_insert(table, row)
+         @lock.synchronize do
+           @buffers[table] << row
+         end
+         self
+       end
+
+       def start
+         @lock.synchronize do
+           unless @thread
+             @thread = Thread.new(&method(:worker_loop))
+             @stop = false
+           end
+         end
+       end
+
+       # True once stop has been requested and the worker thread has finished
+       # flushing its buffers.
+       def exitable?
+         @stop && (@thread ? !@thread.alive? : true)
+       end
+
+       def stop!
+         @lock.synchronize do
+           super
+           @thread.raise(Stop) if @thread
+         end
+       end
+
+       def record_tweet(account, tweet)
+         hash = tweet.attrs
+         row = {
+           'json'.freeze => hash.to_json,
+           'id_str'.freeze => hash[:id_str],
+           'id'.freeze => hash[:id],
+           'text'.freeze => hash[:text],
+           'lang'.freeze => hash[:lang],
+           'source'.freeze => hash[:source],
+           'in_reply_to_status_id'.freeze => hash[:in_reply_to_status_id],
+           'in_reply_to_status_id_str'.freeze => hash[:in_reply_to_status_id_str],
+           'in_reply_to_user_id'.freeze => hash[:in_reply_to_user_id],
+           'in_reply_to_user_id_str'.freeze => hash[:in_reply_to_user_id_str],
+           'in_reply_to_screen_name'.freeze => hash[:in_reply_to_screen_name],
+           'user'.freeze => {
+             'id_str'.freeze => hash[:user][:id_str],
+             'id'.freeze => hash[:user][:id],
+             'name'.freeze => hash[:user][:name],
+             'screen_name'.freeze => hash[:user][:screen_name],
+             'protected'.freeze => hash[:user][:protected],
+           },
+           'created_at'.freeze => Time.parse(hash[:created_at]).to_i
+         }
+
+         if hash[:coordinates]
+           row['coordinates_longitude'.freeze], row['coordinates_latitude'.freeze] = \
+             hash[:coordinates][:coordinates]
+         end
+
+         if hash[:place]
+           place = hash[:place]
+           row['place'.freeze] = {
+             'id'.freeze => place[:id],
+             'country'.freeze => place[:country],
+             'country_code'.freeze => place[:country_code],
+             'name'.freeze => place[:name],
+             'full_name'.freeze => place[:full_name],
+             'place_type'.freeze => place[:place_type],
+             'url'.freeze => place[:url],
+           }
+         end
+
+         bq_insert :tweets, row
+       end
+
+       def mark_as_deleted(account, user_id, tweet_id)
+         bq_insert(:deletions,
+           'user_id'.freeze => user_id,
+           'user_id_str'.freeze => user_id.to_s,
+           'tweet_id'.freeze => tweet_id,
+           'tweet_id_str'.freeze => tweet_id.to_s,
+           'deleted_at'.freeze => Time.now.to_i,
+         )
+       end
+
+       def record_event(account, event)
+         source = event['source'.freeze]
+         target = event['target'.freeze]
+         target_object = event['target_object'.freeze]
+
+         source_id = source[:id]
+         target_id = target[:id]
+
+         unless source_id && target_id
+           @logger.warn "Discarding event because its source or target id is missing: #{event.inspect}"
+           return
+         end
+
+         hash = Hash[
+           event.map { |k, v| [k, v && v.respond_to?(:attrs) ? v.attrs : nil] }
+         ]
+
+         row = {
+           'json'.freeze => hash.to_json,
+           'event'.freeze => event['event'.freeze],
+           'source_id'.freeze => source_id,
+           'source_id_str'.freeze => source_id.to_s,
+           'target_id'.freeze => target_id,
+           'target_id_str'.freeze => target_id.to_s,
+           'created_at'.freeze => Time.now.to_i
+         }
+
+         if target_object && target_object[:id]
+           id = target_object[:id]
+           row['target_object_id'.freeze] = id
+           row['target_object_id_str'.freeze] = id.to_s
+         end
+
+         bq_insert :events, row
+       end
+
+       def record_message(account, message)
+         # no-op: direct messages are not stored in BigQuery
+       end
+
+       def status
+         @buffers ? @buffers.map { |table, buf| "#{table}=#{buf.size}" }.join(', ') + " | #{@failing_inserts.size} failures, #{@pending_inserts.size} inserts" : "-"
+       end
+
+       private
+
+       # Atomically replaces the per-table buffers and returns the old ones.
+       def swap_buffers
+         @lock.synchronize do
+           old_buffers = @buffers
+           @buffers = {tweets: [], messages: [], deletions: [], events: []}
+
+           old_buffers
+         end
+       end
+
+       def worker_loop
+         @last_flush = Time.now
+         retry_interval = 1
+
+         begin
+           flush_pending_inserts
+
+           loop do
+             if @flush_interval <= (Time.now - @last_flush) || @flush_threshold <= @buffers.values.map(&:size).inject(:+)
+               flush_buffer
+             end
+
+             flush_pending_inserts
+
+             sleep 1
+           end
+         rescue Stop
+           @logger.info "Flushing buffer for graceful quit"
+           flush_buffer
+           until @pending_inserts.empty? && @failing_inserts.empty?
+             flush_pending_inserts(true)
+             sleep 10 unless @failing_inserts.empty?
+           end
+         rescue Exception => e
+           @logger.error "#{name} - Encountered error on buffer worker"
+           @logger.error e.inspect
+           @logger.error e.backtrace.join("\n")
+
+           @logger.error "Retrying after #{retry_interval.to_i} seconds"
+           sleep retry_interval.to_i
+           retry_interval *= 1.8
+           retry
+         end
+       end
+
+       # Turns each non-empty buffer into a tabledata.insertAll request and
+       # queues it; the per-row insertId lets BigQuery deduplicate retried inserts.
+       def flush_buffer
+         prev_buffers = swap_buffers
+
+         prev_buffers.each do |table, rows|
+           next if rows.empty?
+
+           insert_id_base = "#{Time.now.to_f}:#{rows.__id__}:#{table}"
+           request = {
+             api_method: @api.tabledata.insert_all,
+             parameters: {
+               'datasetId' => @dataset_id,
+               'projectId' => @project_id,
+               'tableId' => table.to_s,
+             },
+             body_object: {
+               'rows' => rows.map.with_index { |row, index|
+                 {
+                   'insertId'.freeze => "#{insert_id_base}:#{index}",
+                   'json'.freeze => row,
+                 }
+               }
+             }
+           }
+           @pending_inserts_lock.synchronize do
+             @logger.debug "Adding pending inserts for #{table}, #{rows.size} rows"
+             @pending_inserts << {request: request, insert_id: insert_id_base}
+           end
+         end
+
+         @last_flush = Time.now
+       end
+
+       def flush_pending_inserts(do_failures = false)
+         # Requeue failed requests whose backoff has elapsed (or all of them
+         # when do_failures is set); keep the rest for a later pass.
+         not_ready = []
+         while failing_request = @failing_inserts.shift
+           if do_failures || failing_request[:next_try] <= Time.now
+             @logger.info "[#{name}] Retrying #{failing_request[:insert_id]}"
+             @pending_inserts_lock.synchronize { @pending_inserts.push(failing_request) }
+           else
+             not_ready << failing_request
+           end
+         end
+         @failing_inserts.concat(not_ready)
+
+         while request = @pending_inserts_lock.synchronize { @pending_inserts.shift }
+           table = request[:request][:parameters]['tableId']
+           result = @client.execute(request[:request])
+
+           if result.error?
+             # Exponential backoff per request: 5s, 9s, 16.2s, ...
+             if request[:retry]
+               request[:retry] *= 1.8
+             else
+               request[:retry] = 5
+             end
+
+             request[:next_try] = Time.now + request[:retry]
+
+             @logger.error "[#{name}] Failed to insert into #{table}: #{result.error_message} (#{request[:insert_id]}); retrying in #{request[:retry]} seconds"
+             @failing_inserts << request
+           else
+             @logger.debug "[#{name}] Inserted records into #{table}"
+           end
+         end
+       end
+
+     end
+   end
+ end
+
metadata ADDED
@@ -0,0 +1,169 @@
+ --- !ruby/object:Gem::Specification
+ name: akane-bigquery
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Shota Fukumori (sora_h)
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-07-26 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: akane
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.2.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.2.0
+ - !ruby/object:Gem::Dependency
+   name: google-api-client
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.7.1
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.7.1
+ - !ruby/object:Gem::Dependency
+   name: thor
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.19.1
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.19.1
+ - !ruby/object:Gem::Dependency
+   name: oj
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 3.0.0
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 3.0.0
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.17.3
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.17.3
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Google BigQuery storage adapter for akane.gem
+ email:
+ - her@sorah.jp
+ executables:
+ - akane-bigquery
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - akane-bigquery.gemspec
+ - bin/akane-bigquery
+ - lib/akane-bigquery.rb
+ - lib/akane-bigquery/cli.rb
+ - lib/akane-bigquery/schema.rb
+ - lib/akane-bigquery/version.rb
+ - lib/akane/storages/bigquery.rb
+ homepage: https://github.com/sorah/akane-bigquery
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: akane.gem Google BigQuery storage adapter
+ test_files: []