akane-bigquery 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 7fc97e06744000530649b80d35908244a7003ff5
+   data.tar.gz: 7e8348c2df4d4a66eaa4048902229a6e69a2e9bf
+ SHA512:
+   metadata.gz: 1d206afead4eada0ac898352277af4ba8b276e22da99ed4adc388d0d053781a48c189b23fd64e92e4b1a64a2a913a706f81ec6e2baf3611740a0e90f883058b4
+   data.tar.gz: b98ab328c7b3661ab0f623f8082979685c50f0654a9a651817296599dc89510e0dcc95130c8e7b91b797fd1aec9c254a950d726de82f2bd0f8470ba831ed16df
data/.gitignore ADDED
@@ -0,0 +1,25 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ *.bundle
+ *.so
+ *.o
+ *.a
+ mkmf.log
+ config.yml
+ akane.yml
+ *.p12
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in akane-bigquery.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Shota Fukumori (sora_h)
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,40 @@
+ # akane-bigquery - Storage engine for akane.gem that streams tweets to Google BigQuery
+
+ Storage plugin gem for [akane](https://github.com/sorah/akane) that lets you use Google BigQuery as akane's storage engine.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'akane-bigquery'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install akane-bigquery
+
+ ## Loading past data
+
+ If you're using akane's `file` storage, `akane-bigquery prepare` lets you load its past data into BigQuery:
+
+ ```
+ $ mkdir /tmp/akane-bigquery
+ $ akane-bigquery prepare /path/to/your/file-storage /tmp/akane-bigquery
+ $ gsutil -m cp /tmp/akane-bigquery/* gs://YOUR_BUCKET/
+ $ bq load --source_format=NEWLINE_DELIMITED_JSON YOUR_DATASET_ID.tweets "$(gsutil ls gs://YOUR_BUCKET/ | ruby -e 'ARGF.readlines.map(&:chomp).reject(&:empty?).join(",").display')"
+ ```
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it ( https://github.com/sorah/akane-bigquery/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
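
The Usage section above is still a TODO in this release. Judging from the validation in `AkaneBigquery.make_client` and the lookup in `AkaneBigquery::CLI#config` below, a minimal sketch of the `bigquery` storage entry in akane's config file could look like the following. The key names come from that code; the file name, the `name` value, and all credential values are placeholders:

```ruby
require 'yaml'

# Hypothetical akane config with a single bigquery storage entry.
# Key names mirror the checks in AkaneBigquery.make_client and the
# lookup in AkaneBigquery::CLI#config; every value is a placeholder.
config = {
  'storages' => [
    {
      'bigquery' => {
        'name'          => 'main', # optional; matched by --config-name
        'project_id'    => 'your-gcp-project',
        'dataset_id'    => 'akane',
        'client_id'     => '123456789.apps.googleusercontent.com',
        'service_email' => '123456789@developer.gserviceaccount.com',
        'key'           => {
          'path'       => '/path/to/service-account.p12',
          'passphrase' => 'notasecret',
        },
      },
    },
  ],
}
File.write('config.yml', YAML.dump(config))
```

With such a file in place, `akane-bigquery init -c config.yml` creates the dataset and the tables defined in `AkaneBigquery::Schema`.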
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+
data/akane-bigquery.gemspec ADDED
@@ -0,0 +1,30 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'akane-bigquery/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "akane-bigquery"
+   spec.version       = AkaneBigquery::VERSION
+   spec.authors       = ["Shota Fukumori (sora_h)"]
+   spec.email         = ["her@sorah.jp"]
+   spec.summary       = %q{akane.gem Google Bigquery storage adapter}
+   spec.description   = %q{Google Bigquery storage adapter for akane.gem}
+   spec.homepage      = "https://github.com/sorah/akane-bigquery"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files -z`.split("\x0")
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "akane", ">= 0.2.0"
+   spec.add_dependency 'google-api-client', '>= 0.7.1'
+   spec.add_dependency 'thor', '>= 0.19.1'
+   spec.add_dependency 'oj'
+
+   spec.add_development_dependency "bundler", "~> 1.6"
+   spec.add_development_dependency "rspec", "~> 3.0.0"
+   spec.add_development_dependency "webmock", "~> 1.17.3"
+   spec.add_development_dependency "rake"
+ end
data/bin/akane-bigquery ADDED
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ require 'akane-bigquery/cli'
+
+ AkaneBigquery::CLI.start
data/lib/akane-bigquery.rb ADDED
@@ -0,0 +1,42 @@
+ require 'akane/storages/bigquery'
+ require 'akane-bigquery/version'
+ require 'akane-bigquery/schema'
+
+ require 'google/api_client'
+
+ module AkaneBigquery
+   def self.make_client(config)
+     raise ArgumentError, "missing config['key']" unless config['key']
+     raise ArgumentError, "missing config['key']['path']" unless config['key']['path']
+     raise ArgumentError, "missing config['key']['passphrase']" unless config['key']['passphrase']
+     raise ArgumentError, "missing config['client_id']" unless config['client_id']
+     raise ArgumentError, "missing config['service_email']" unless config['service_email']
+
+     client = Google::APIClient.new(
+       application_name: config["app_name"] || 'akane',
+       application_version: AkaneBigquery::VERSION,
+     )
+
+     key = Google::APIClient::KeyUtils.load_from_pkcs12(
+       config['key']['path'],
+       config['key']['passphrase']
+     )
+
+     client.authorization = Signet::OAuth2::Client.new(
+       token_credential_uri: 'https://accounts.google.com/o/oauth2/token',
+       audience: 'https://accounts.google.com/o/oauth2/token',
+       scope: 'https://www.googleapis.com/auth/bigquery',
+       issuer: config['service_email'],
+       signing_key: key,
+     )
+
+     client.authorization.fetch_access_token!
+
+     return client
+   end
+
+   def self.make_bigquery_client(config)
+     client = make_client(config)
+     [client, client.discovered_api("bigquery", "v2")]
+   end
+ end
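
`AkaneBigquery.make_bigquery_client` returns a two-element array (the authorized `Google::APIClient` and the discovered BigQuery v2 API document), which callers destructure. A minimal usage sketch, assuming a config hash shaped like the one validated above and the hypothetical `config.yml` from the README note:

```ruby
require 'yaml'
require 'akane-bigquery'

# Pick the first bigquery storage entry out of the config file.
config = YAML.load_file('config.yml')['storages']
             .find { |s| s['bigquery'] }['bigquery']

client, api = AkaneBigquery.make_bigquery_client(config)

# The discovered API object addresses REST methods such as
# datasets.get or tabledata.insert_all, exactly as the CLI and the
# storage engine below use them.
result = client.execute(
  api_method: api.datasets.get,
  parameters: {
    'projectId' => config['project_id'],
    'datasetId' => config['dataset_id'],
  }
)
puts result.error? ? result.error_message : "dataset #{config['dataset_id']} exists"
```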
data/lib/akane-bigquery/cli.rb ADDED
@@ -0,0 +1,219 @@
+ require 'akane-bigquery'
+ require 'yaml'
+ require 'time'
+ require 'thor'
+ require 'oj'
+
+ module AkaneBigquery
+   class CLI < Thor
+     desc "init", 'creates the dataset and tables on BigQuery'
+     method_option :config,
+       required: true, aliases: %w(-c),
+       desc: "path to akane config file (yml)"
+     method_option :config_name,
+       desc: "select bigquery configuration by name key; use this if you have multiple bigquery storages in the config file"
+
+
+     def init
+       # check dataset existence
+       dataset = client.execute(
+         api_method: api.datasets.get,
+         parameters: {
+           'projectId' => config['project_id'],
+           'datasetId' => config['dataset_id'],
+         }
+       )
+
+       if dataset.error?
+         if dataset.error_message =~ /^Not Found:/i
+           puts "Creating dataset #{config['dataset_id']} ..."
+           dataset = client.execute(
+             api_method: api.datasets.insert,
+             parameters: {
+               'projectId' => config['project_id'],
+             },
+             body_object: {
+               'datasetReference' => {
+                 'datasetId' => config['dataset_id'],
+               },
+               'description' => 'akane',
+             }
+           )
+
+           raise dataset.error_message if dataset.error?
+         else
+           raise dataset.error_message
+         end
+       end
+
+       schemas = AkaneBigquery::Schema::SCHEMA
+
+       schemas.each do |table_id, schema|
+         table = client.execute(
+           api_method: api.tables.get,
+           parameters: {
+             'projectId' => config['project_id'],
+             'datasetId' => config['dataset_id'],
+             'tableId' => table_id,
+           },
+         )
+
+         if table.error?
+           if table.error_message =~ /^Not Found:/i
+             puts "Creating table #{table_id} ..."
+             table = client.execute(
+               api_method: api.tables.insert,
+               parameters: {
+                 'projectId' => config['project_id'],
+                 'datasetId' => config['dataset_id'],
+               },
+               body_object: {
+                 'tableReference' => {
+                   'projectId' => config['project_id'],
+                   'datasetId' => config['dataset_id'],
+                   'tableId' => table_id,
+                 },
+                 'friendlyName' => table_id,
+                 'schema' => schema,
+               }
+             )
+             raise table.error_message if table.error?
+           else
+             raise table.error_message
+           end
+         end
+
+       end
+     end
+
+     desc "prepare SOURCE DEST", "prepares JSON files for loading into BigQuery from existing file storage data"
+     method_option :months, desc: "Names of months to process, separated by commas."
+     method_option :before, desc: "Dump only data before the specified datetime. The value is parsed by Ruby's `Time.parse`."
+     def prepare(source, prefix)
+       limit = 524288000 # 500 MiB
+
+       count = -1
+       bytes = 0
+
+       new_io = lambda do
+         bytes = 0
+         count += 1
+         path = File.join(prefix, "tweets.#{count.to_s.rjust(4, '0')}.txt")
+         puts "=> Using #{path}"
+         File.open(path, 'w')
+       end
+       io = new_io.call
+
+       months = options[:months] && options[:months].split(/,/)
+       before = options[:before] && Time.parse(options[:before])
+
+       userdirs = Dir.entries(File.join(source, "users"))
+       userdirs.each_with_index do |user_dirname, index|
+         next if user_dirname == "." || user_dirname == ".."
+         puts " * #{user_dirname} (#{index.succ}/#{userdirs.size}, #{((index.succ / userdirs.size.to_f) * 100).to_i}%)"
+
+         userdir = File.join(source, "users", user_dirname)
+
+         tweet_filepaths = if options[:months]
+           months.map { |_| File.join(userdir, "tweets.#{_}.txt") }
+         else
+           Dir[File.join(userdir, 'tweets.*.txt')]
+         end
+         tweet_filepaths.each do |file|
+           begin
+             File.open(file, 'r') do |tweets_io|
+               tweets_io.each_line do |line|
+                 json = line.chomp
+
+                 tweet = Oj.load(json)
+
+                 created_at = Time.parse(tweet['created_at'.freeze])
+                 next if before && before <= created_at
+
+                 new_json = {
+                   'json'.freeze => json,
+                   'id_str'.freeze => tweet['id_str'.freeze],
+                   'id'.freeze => tweet['id'.freeze],
+                   'text'.freeze => tweet['text'.freeze],
+                   'lang'.freeze => tweet['lang'.freeze],
+                   'source'.freeze => tweet['source'.freeze],
+                   'in_reply_to_status_id'.freeze => tweet['in_reply_to_status_id'.freeze],
+                   'in_reply_to_status_id_str'.freeze => tweet['in_reply_to_status_id_str'.freeze],
+                   'in_reply_to_user_id'.freeze => tweet['in_reply_to_user_id'.freeze],
+                   'in_reply_to_user_id_str'.freeze => tweet['in_reply_to_user_id_str'.freeze],
+                   'in_reply_to_screen_name'.freeze => tweet['in_reply_to_screen_name'.freeze],
+                   'user'.freeze => {
+                     'id_str'.freeze => tweet['user'.freeze]['id_str'.freeze],
+                     'id'.freeze => tweet['user'.freeze]['id'.freeze],
+                     'name'.freeze => tweet['user'.freeze]['name'.freeze],
+                     'screen_name'.freeze => tweet['user'.freeze]['screen_name'.freeze],
+                     'protected'.freeze => tweet['user'.freeze]['protected'.freeze],
+                   },
+                   'created_at'.freeze => created_at.to_i
+                 }
+
+                 if tweet['coordinates'.freeze]
+                   new_json['coordinates_longitude'.freeze] = tweet['coordinates'.freeze]['coordinates'.freeze][0]
+                   new_json['coordinates_latitude'.freeze] = tweet['coordinates'.freeze]['coordinates'.freeze][1]
+                 end
+
+                 if tweet['place'.freeze]
+                   place = tweet['place'.freeze]
+                   new_json['place'.freeze] = {
+                     'id'.freeze => place['id'.freeze],
+                     'country'.freeze => place['country'.freeze],
+                     'country_code'.freeze => place['country_code'.freeze],
+                     'name'.freeze => place['name'.freeze],
+                     'full_name'.freeze => place['full_name'.freeze],
+                     'place_type'.freeze => place['place_type'.freeze],
+                     'url'.freeze => place['url'.freeze],
+                   }
+                 end
+
+                 new_json_str = Oj.dump(new_json)
+                 io.puts new_json_str
+                 bytes += new_json_str.size + 1
+                 io = new_io.call if limit <= bytes
+               end
+             end
+           rescue Errno::ENOENT # the month file may not exist for this user; skip it
+           end
+
+         end
+
+       end
+     end
+
+     private
+
+     def config
+       @config ||= begin
+         storages = YAML.load_file(options[:config])['storages']
+
+         conf = if options[:config_name]
+           storages.find { |_| _['bigquery'] && _['bigquery']['name'] == options[:config_name] }
+         else
+           storages.find { |_| _['bigquery'] }
+         end
+
+         (conf && conf['bigquery']) or \
+           abort 'error: bigquery storage configuration not found'
+       end
+     end
+
+     def client
+       client_and_api; @client
+     end
+
+     def api
+       client_and_api; @api
+     end
+
+     def client_and_api
+       return @client_and_api if @client_and_api
+
+       @client_and_api = AkaneBigquery.make_bigquery_client(config)
+       @client, @api = @client_and_api
+     end
+   end
+ end
data/lib/akane-bigquery/schema.rb ADDED
@@ -0,0 +1,93 @@
+ module AkaneBigquery
+   module Schema
+
+     # STRING, INTEGER, FLOAT, BOOLEAN, TIMESTAMP or RECORD
+     # NULLABLE, REQUIRED and REPEATED.
+     SCHEMAS = {
+       '0' => {
+         'tweets' => {
+           'fields' => [
+             {'name' => 'json', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+             {'name' => 'id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+
+             {'name' => 'text', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'lang', 'type' => 'STRING'},
+             {'name' => 'source', 'type' => 'STRING'},
+
+             {'name' => 'in_reply_to_status_id_str', 'type' => 'STRING'},
+             {'name' => 'in_reply_to_status_id', 'type' => 'INTEGER'},
+             {'name' => 'in_reply_to_user_id_str', 'type' => 'STRING'},
+             {'name' => 'in_reply_to_user_id', 'type' => 'INTEGER'},
+             {'name' => 'in_reply_to_screen_name', 'type' => 'STRING'},
+
+             {'name' => 'retweeted_status_id_str', 'type' => 'STRING'},
+             {'name' => 'retweeted_status_id', 'type' => 'INTEGER'},
+
+             {'name' => 'created_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
+
+             {
+               'name' => 'user', 'type' => 'RECORD', 'mode' => 'REQUIRED',
+               'fields' => [
+                 {'name' => 'id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+                 {'name' => 'id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+                 {'name' => 'name', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+                 {'name' => 'screen_name', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+                 {'name' => 'protected', 'type' => 'BOOLEAN', 'mode' => 'NULLABLE'},
+               ],
+             },
+
+             {'name' => 'coordinates_longitude', 'type' => 'FLOAT'},
+             {'name' => 'coordinates_latitude', 'type' => 'FLOAT'},
+
+             {
+               'name' => 'place', 'type' => 'RECORD',
+               'fields' => [
+                 {'name' => 'id', 'type' => 'STRING'},
+                 {'name' => 'country', 'type' => 'STRING'},
+                 {'name' => 'country_code', 'type' => 'STRING'},
+                 {'name' => 'name', 'type' => 'STRING'},
+                 {'name' => 'full_name', 'type' => 'STRING'},
+                 {'name' => 'place_type', 'type' => 'STRING'},
+                 {'name' => 'url', 'type' => 'STRING'},
+               ],
+             },
+           ],
+         },
+         'deletions' => {
+           'fields' => [
+             {'name' => 'user_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+             {'name' => 'tweet_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+             {'name' => 'user_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+             {'name' => 'tweet_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'deleted_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
+           ],
+         },
+         'events' => {
+           'fields' => [
+             {'name' => 'json', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'event', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'source_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+             {'name' => 'target_id', 'type' => 'INTEGER', 'mode' => 'REQUIRED'},
+             {'name' => 'source_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+             {'name' => 'target_id_str', 'type' => 'STRING', 'mode' => 'REQUIRED'},
+
+             {'name' => 'target_object_id', 'type' => 'INTEGER'},
+             {'name' => 'target_object_id_str', 'type' => 'STRING'},
+
+             {'name' => 'created_at', 'type' => 'TIMESTAMP', 'mode' => 'REQUIRED'},
+           ],
+         },
+       }.freeze,
+     }.freeze
+
+     VERSION = '0'
+     SCHEMA = SCHEMAS[VERSION]
+
+   end
+ end
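
`SCHEMAS` keys schema revisions by a version string and `SCHEMA` always points at the current revision; this is the hash `akane-bigquery init` passes to `tables.insert`. A quick sketch of reading it programmatically:

```ruby
require 'akane-bigquery/schema'

# Summarize each table's schema as defined in this release.
AkaneBigquery::Schema::SCHEMA.each do |table_id, table_schema|
  fields   = table_schema['fields']
  required = fields.count { |f| f['mode'] == 'REQUIRED' }
  puts "#{table_id}: #{fields.size} top-level fields, #{required} required"
end
# => tweets: 18 top-level fields, 6 required
#    deletions: 5 top-level fields, 5 required
#    events: 9 top-level fields, 7 required
```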
data/lib/akane-bigquery/version.rb ADDED
@@ -0,0 +1,3 @@
+ module AkaneBigquery
+   VERSION = "0.1.0"
+ end
data/lib/akane/storages/bigquery.rb ADDED
@@ -0,0 +1,281 @@
+ require 'akane/storages/abstract_storage'
+ require 'akane-bigquery'
+
+ require 'json'
+ require 'time'
+ require 'thread'
+
+ module Akane
+   module Storages
+     class Bigquery < AbstractStorage
+       class Stop < Exception; end # :nodoc:
+
+       def initialize(*)
+         super
+
+         @client, @api = AkaneBigquery.make_bigquery_client(@config)
+
+         @project_id = @config['project_id']
+         @dataset_id = @config['dataset_id']
+
+         @lock = Mutex.new
+         @thread = nil
+
+         @flush_interval = @config['flush_interval'] ? @config['flush_interval'].to_i : 60
+         @flush_threshold = @config['flush_threshold'] ? @config['flush_threshold'].to_i : 1000
+
+         @pending_inserts = []
+         @failing_inserts = []
+         @pending_inserts_lock = Mutex.new
+
+         swap_buffers # initialize @buffers
+         start
+       end
+
+       def name
+         @name ||= "bigquery:#{@project_id}/#{@dataset_id}"
+       end
+
+       def bq_insert(table, row)
+         @lock.synchronize do
+           @buffers[table] << row
+         end
+         self
+       end
+
+       def start
+         @lock.synchronize do
+           unless @thread
+             @thread = Thread.new(&method(:worker_loop))
+             @stop = false
+           end
+         end
+       end
+
+       def exitable?
+         # exitable once stop has been requested and the worker thread has finished
+         @stop && (@thread ? !@thread.alive? : true)
+       end
+
+       def stop!
+         @lock.synchronize do
+           super
+           @thread.raise(Stop) if @thread
+         end
+       end
+
+       def record_tweet(account, tweet)
+         hash = tweet.attrs
+         row = {
+           'json'.freeze => hash.to_json,
+           'id_str'.freeze => hash[:id_str],
+           'id'.freeze => hash[:id],
+           'text'.freeze => hash[:text],
+           'lang'.freeze => hash[:lang],
+           'source'.freeze => hash[:source],
+           'in_reply_to_status_id'.freeze => hash[:in_reply_to_status_id],
+           'in_reply_to_status_id_str'.freeze => hash[:in_reply_to_status_id_str],
+           'in_reply_to_user_id'.freeze => hash[:in_reply_to_user_id],
+           'in_reply_to_user_id_str'.freeze => hash[:in_reply_to_user_id_str],
+           'in_reply_to_screen_name'.freeze => hash[:in_reply_to_screen_name],
+           'user'.freeze => {
+             'id_str'.freeze => hash[:user][:id_str],
+             'id'.freeze => hash[:user][:id],
+             'name'.freeze => hash[:user][:name],
+             'screen_name'.freeze => hash[:user][:screen_name],
+             'protected'.freeze => hash[:user][:protected],
+           },
+           'created_at'.freeze => Time.parse(hash[:created_at]).to_i
+         }
+
+         if hash[:coordinates] # the attrs hash is symbol-keyed
+           row['coordinates_longitude'.freeze], row['coordinates_latitude'.freeze] = \
+             hash[:coordinates][:coordinates]
+         end
+
+         if hash[:place]
+           place = hash[:place]
+           row['place'.freeze] = {
+             'id'.freeze => place[:id],
+             'country'.freeze => place[:country],
+             'country_code'.freeze => place[:country_code],
+             'name'.freeze => place[:name],
+             'full_name'.freeze => place[:full_name],
+             'place_type'.freeze => place[:place_type],
+             'url'.freeze => place[:url],
+           }
+         end
+
+         bq_insert :tweets, row
+       end
+
+       def mark_as_deleted(account, user_id, tweet_id)
+         bq_insert(:deletions,
+           'user_id'.freeze => user_id,
+           'user_id_str'.freeze => user_id.to_s,
+           'tweet_id'.freeze => tweet_id,
+           'tweet_id_str'.freeze => tweet_id.to_s,
+           'deleted_at'.freeze => Time.now.to_i,
+         )
+       end
+
+       def record_event(account, event)
+         source = event['source'.freeze]
+         target = event['target'.freeze]
+         target_object = event['target_object'.freeze]
+
+         source_id = source[:id]
+         target_id = target[:id]
+
+         unless source_id && target_id
+           @logger.warn "Discarding event because source or target id is missing: #{event.inspect}"
+           return
+         end
+
+         hash = Hash[
+           event.map { |k, v| [k, v && v.respond_to?(:attrs) ? v.attrs : nil] }
+         ]
+
+         row = {
+           'json'.freeze => hash.to_json,
+           'event'.freeze => event['event'.freeze],
+           'source_id'.freeze => source_id,
+           'source_id_str'.freeze => source_id.to_s,
+           'target_id'.freeze => target_id,
+           'target_id_str'.freeze => target_id.to_s,
+           'created_at'.freeze => Time.now.to_i
+         }
+
+         if target_object && target_object[:id]
+           id = target_object[:id]
+           row['target_object_id'.freeze] = id
+           row['target_object_id_str'.freeze] = id.to_s
+         end
+
+         bq_insert :events, row
+       end
+
+       def record_message(account, message)
+         # direct messages are not recorded in BigQuery
+       end
+
+       def status
+         @buffers ? @buffers.map { |table, buf| "#{table}=#{buf.size}" }.join(', ') + " | #{@failing_inserts.size} failures, #{@pending_inserts.size} inserts" : "-"
+       end
+
+       private
+
+       def swap_buffers
+         @lock.synchronize do
+           old_buffers = @buffers
+           @buffers = {tweets: [], messages: [], deletions: [], events: []}
+
+           old_buffers
+         end
+       end
+
+       def worker_loop
+         @last_flush = Time.now
+         retry_interval = 1
+
+         begin
+           flush_pending_inserts
+
+           loop do
+             if @flush_interval <= (Time.now - @last_flush) || @flush_threshold <= @buffers.values.map(&:size).inject(:+)
+               flush_buffer
+             end
+
+             flush_pending_inserts
+
+             sleep 1
+           end
+         rescue Stop
+           @logger.info "Flushing buffer for graceful quit"
+           flush_buffer
+           until @pending_inserts.empty? && @failing_inserts.empty?
+             flush_pending_inserts(true)
+             sleep 10 unless @failing_inserts.empty?
+           end
+         rescue Exception => e
+           @logger.error "#{name} - Encountered error on buffer worker"
+           @logger.error e.inspect
+           @logger.error e.backtrace.join("\n")
+
+           @logger.error "Retrying after #{retry_interval.to_i} seconds"
+           sleep retry_interval.to_i
+           retry_interval *= 1.8
+           retry
+         end
+       end
+
+       def flush_buffer
+         prev_buffers = swap_buffers()
+
+         prev_buffers.each do |table, rows|
+           next if rows.empty?
+
+           insert_id_base = "#{Time.now.to_f}:#{rows.__id__}:#{table}"
+           request = {
+             api_method: @api.tabledata.insert_all,
+             parameters: {
+               'datasetId' => @dataset_id,
+               'projectId' => @project_id,
+               'tableId' => table.to_s,
+             },
+             body_object: {
+               'rows' => rows.map.with_index { |row, index|
+                 {
+                   'insertId'.freeze => "#{insert_id_base}:#{index}",
+                   'json'.freeze => row,
+                 }
+               }
+             }
+           }
+           @pending_inserts_lock.synchronize do
+             @logger.debug "Adding pending inserts for #{table}, #{rows.size} rows"
+             @pending_inserts << {request: request, insert_id: insert_id_base}
+           end
+         end
+
+         @last_flush = Time.now
+       end
+
+       def flush_pending_inserts(do_failures = false)
+         # requeue failed batches whose backoff delay has elapsed
+         not_ready = []
+         while failing_request = @failing_inserts.shift
+           if do_failures || failing_request[:next_try] <= Time.now
+             @logger.info "[#{name}] Retrying #{failing_request[:insert_id]}"
+             @pending_inserts_lock.synchronize { @pending_inserts.push(failing_request) }
+           else
+             not_ready << failing_request
+           end
+         end
+         @failing_inserts.concat(not_ready)
+
+         while request = @pending_inserts_lock.synchronize { @pending_inserts.shift }
+           table = request[:request][:parameters]['tableId']
+           result = @client.execute(request[:request])
+
+           if result.error?
+             if request[:retry]
+               request[:retry] *= 1.8
+             else
+               request[:retry] = 5
+             end
+
+             request[:next_try] = Time.now + request[:retry]
+
+             @logger.error "[#{name}] Failed to insert into #{table}: #{result.error_message} (#{request[:insert_id]}); retrying in #{request[:retry]} seconds"
+             @failing_inserts << request
+           else
+             @logger.debug "[#{name}] Inserted records in #{table}"
+           end
+         end
+       end
+
+     end
+   end
+ end
+
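
One detail worth noting in the storage engine above: failed insert batches back off geometrically, waiting 5 seconds before the first retry and multiplying the wait by 1.8 on each further failure (the worker loop's crash handler uses the same 1.8 factor, starting from 1 second). A small, purely illustrative sketch of the resulting schedule:

```ruby
# First seven retry delays for a persistently failing insert batch,
# following the request[:retry] logic above (5s initial, x1.8 growth).
delay = 5.0
schedule = 7.times.map { current = delay.round(1); delay *= 1.8; current }
p schedule # => [5.0, 9.0, 16.2, 29.2, 52.5, 94.5, 170.1]
```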
metadata ADDED
@@ -0,0 +1,169 @@
+ --- !ruby/object:Gem::Specification
+ name: akane-bigquery
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Shota Fukumori (sora_h)
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-07-26 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: akane
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.2.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.2.0
+ - !ruby/object:Gem::Dependency
+   name: google-api-client
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.7.1
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.7.1
+ - !ruby/object:Gem::Dependency
+   name: thor
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.19.1
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.19.1
+ - !ruby/object:Gem::Dependency
+   name: oj
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 3.0.0
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 3.0.0
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.17.3
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.17.3
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Google Bigquery storage adapter for akane.gem
+ email:
+ - her@sorah.jp
+ executables:
+ - akane-bigquery
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - akane-bigquery.gemspec
+ - bin/akane-bigquery
+ - lib/akane-bigquery.rb
+ - lib/akane-bigquery/cli.rb
+ - lib/akane-bigquery/schema.rb
+ - lib/akane-bigquery/version.rb
+ - lib/akane/storages/bigquery.rb
+ homepage: https://github.com/sorah/akane-bigquery
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: akane.gem Google Bigquery storage adapter
+ test_files: []