akane 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,104 @@
1
+ require 'thread'
2
+
3
module Akane
  # Buffers recording requests on a queue and replays each one against every
  # configured storage backend.
  class Recorder
    # @param storages [Array] objects responding to record_tweet,
    #   mark_as_deleted, record_message and record_event
    # @param logger [Logger] receives errors raised by storages
    def initialize(storages, logger: Logger.new(nil))
      @storages = storages
      @logger = logger
      @queue = Queue.new
      # Remembers the most recent 1000 tweet ids so a tweet is stored only once.
      @recently_performed = RoundrobinFlags.new(1000)
    end

    # @return [Integer] number of actions waiting to be performed
    def queue_length
      @queue.size
    end

    # Enqueues a tweet for recording.
    # @return [Recorder] self, for chaining
    def record_tweet(account, tweet)
      @queue << [:record_tweet, account, tweet]
      self
    end

    # Enqueues a deletion notice for a tweet.
    # @return [Recorder] self, for chaining
    def mark_as_deleted(account, user_id, tweet_id)
      @queue << [:mark_as_deleted, account, user_id, tweet_id]
      self
    end

    # Enqueues a direct message for recording.
    # @return [Recorder] self, for chaining
    def record_message(account, message)
      @queue << [:record_message, account, message]
      self
    end

    # Enqueues an event for recording.
    # @return [Recorder] self, for chaining
    def record_event(account, event)
      @queue << [:record_event, account, event]
      self
    end

    # Pops one action off the queue (blocking while it is empty) and performs it.
    def dequeue(raise_errors = false)
      perform(*@queue.pop, raise_errors: raise_errors)
    end

    # Sends +action+ to every storage.
    #
    # For :record_tweet the tweet is skipped when its id was performed recently,
    # and an embedded "retweeted_status" is recorded first.
    #
    # @param raise_errors [Boolean] re-raise storage errors instead of logging them
    # @raise [Interrupt] always propagated, regardless of +raise_errors+
    def perform(action, account, *payload, raise_errors: false)
      if action == :record_tweet
        tweet = payload.last
        return if @recently_performed[tweet["id"]]
        @recently_performed.flag!(tweet["id"])

        retweeted = tweet["retweeted_status"]
        perform(:record_tweet, account, retweeted, raise_errors: raise_errors) if retweeted
      end

      @storages.each do |storage|
        begin
          storage.__send__(action, account, *payload)
        rescue Exception => e
          # Exception (not StandardError) is rescued on purpose so one broken
          # storage cannot stop the others; Interrupt must still get through.
          # The class has to be tested with is_a?: `e === Interrupt` compares
          # an instance with a class and is never true.
          raise if e.is_a?(Interrupt) || raise_errors
          @logger.error "Error while recorder performing to #{storage.inspect}: #{e.inspect}"
          @logger.error e.backtrace
        end
      end
    end

    # Performs queued actions forever.
    #
    # @param raise_errors [Boolean] stop the loop by re-raising any error
    # @raise [Interrupt] always propagated so the loop can be stopped
    def run(raise_errors = false)
      loop do
        begin
          dequeue(raise_errors)
        rescue Exception => e
          raise if e.is_a?(Interrupt) || raise_errors
          @logger.error "Error while recorder dequing: #{e.inspect}"
          @logger.error e.backtrace
        end
      end
    end

    # A bounded set of flags: once more than +size+ keys are flagged, the
    # earliest-inserted keys are dropped.
    class RoundrobinFlags
      def initialize(size)
        @hash = {}
        @limit_size = size
      end

      # @return [true, nil] whether +k+ is currently flagged
      def [](k)
        @hash[k]
      end

      # Flags +k+ and evicts the oldest keys beyond the size limit.
      # @return [nil]
      def flag!(k)
        @hash[k] = true
        overflow = @hash.size - @limit_size
        # Hash#keys is in insertion order, so the first keys are the oldest.
        @hash.keys.first(overflow).each { |overflowed_key| @hash.delete(overflowed_key) } if overflow > 0
        nil
      end

      # Removes the flag for +k+.
      def unflag!(k)
        @hash.delete k
      end

      # @return [Array] all currently flagged keys, oldest first
      def flags
        @hash.keys
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module Akane
  module Storages
    # Base class for storage backends. It keeps the configuration and logger
    # and declares the four recording methods a concrete storage must override.
    class AbstractStorage
      # @param config [Object] storage configuration (required)
      # @param logger [Logger]
      # @raise [ArgumentError] when +config+ is not given
      def initialize(config: raise(ArgumentError, 'missing config'), logger: Logger.new($stdout))
        @config = config
        @logger = logger
      end

      # @raise [NotImplementedError] unless overridden
      def record_tweet(account, tweet)
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # @raise [NotImplementedError] unless overridden
      def mark_as_deleted(account, user_id, tweet_id)
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # @raise [NotImplementedError] unless overridden
      def record_event(account, event)
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end

      # @raise [NotImplementedError] unless overridden
      def record_message(account, message)
        raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
      end
    end
  end
end
@@ -0,0 +1,242 @@
1
+ require 'akane/storages/abstract_storage'
2
+ require 'elasticsearch'
3
+
4
+ module Akane
5
+ module Storages
6
+ class Elasticsearch < AbstractStorage
7
# Builds the Elasticsearch client from the storage config and makes sure the
# target index exists.
#
# Config keys read here: "host", "enable_es_log" and "index" (the index name
# falls back to 'akane').
def initialize(*)
  super

  # The client only gets our logger when ES logging is switched on.
  client_logger = @logger if @config["enable_es_log"]
  @es = ::Elasticsearch::Client.new(hosts: [@config["host"]], logger: client_logger)

  @index_name = @config["index"] || 'akane'

  set_elasticsearch_up
end
17
+
18
# Indexes a tweet as a 'tweet' document, keyed by its id_str, with
# deleted: false added.
#
# The flag is merged into a copy: tweet.attrs is the tweet's own hash, and
# writing to it in place would leak the :deleted key to every other consumer
# of the same tweet object.
def record_tweet(account, tweet)
  document = tweet.attrs.merge(deleted: false)
  @es.index(index: @index_name, type: 'tweet', id: document[:id_str], body: document)
end
23
+
24
# Flags a stored tweet as deleted and writes a trimmed copy of it as a
# 'deleted_tweet' document.
#
# When the tweet is not in the index (404) the deletion is skipped with a
# debug log line.
def mark_as_deleted(account, user_id, tweet_id)
  document_id = tweet_id.to_s

  tweet = @es.get(index: @index_name, type: 'tweet', id: document_id)['_source']
  tweet['deleted'] = true
  @es.index(index: @index_name, type: 'tweet', id: document_id, body: tweet)

  # A stored document without a user must not abort the deletion half-way
  # (the tweet above has already been rewritten at this point).
  user = tweet['user'] || {}
  minimum_tweet = {
    id: tweet['id'],
    id_str: tweet['id_str'],
    text: tweet['text'],
    user: {
      id: user['id'],
      id_str: user['id_str'],
      screen_name: user['screen_name'],
    }
  }

  # NOTE(review): the deleted_at mapping declares no date format; confirm that
  # Elasticsearch accepts this space-separated timestamp for that field.
  deleted_at = Time.now.strftime('%Y-%m-%d %H:%M:%S %z')
  @es.index(index: @index_name, type: 'deleted_tweet', id: document_id,
            body: {tweet: minimum_tweet, deleted_at: deleted_at})
rescue ::Elasticsearch::Transport::Transport::Errors::NotFound
  @logger.debug "Due to 404, skipping Deletion for #{tweet_id}"
  # do nothing
end
43
+
44
# Accepts an event but stores nothing: every known event kind is matched and
# deliberately ignored, so the method always returns nil.
def record_event(account, event)
  kind = event["event"]

  case kind
  when 'favorite', 'unfavorite'
    # not recorded
  when 'block', 'unblock', 'follow', 'unfollow'
    # not recorded
  end
end
54
+
55
# Indexes a direct message as a 'message' document, keyed by the message's
# id_str, with the message attributes as the body.
def record_message(account, message)
  document_id = message[:id_str]
  document = message.attrs

  @es.index(index: @index_name, type: 'message', id: document_id, body: document)
end
58
+
59
+ private
60
+
61
# Makes sure the index exists: asks for its mapping and, when Elasticsearch
# answers 404 with an IndexMissingException, creates the index with mappings
# for tweets, deleted tweets, messages and events.
#
# Any other NotFound error is re-raised.
def set_elasticsearch_up
  @es.indices.get_mapping(index: @index_name)
rescue ::Elasticsearch::Transport::Transport::Errors::NotFound => e
  raise unless e.message =~ /IndexMissingException/

  @logger.info 'elasticsearch.setup: creating index'

  date_format = "EE MMM d HH:mm:ss Z yyyy"
  # Extra option for free-text fields when the "kuromoji" config is enabled.
  text_analyzer = @config["kuromoji"] ? {analyzer: 'kuromoji'} : {}

  # Reduces a field mapping to an unstored, unindexed field of the same type.
  # :format is only carried over when the field has one, so non-date fields
  # are not sent with a null format.
  minimize = lambda do |mapping|
    minimal = {type: mapping[:type]}
    minimal[:format] = mapping[:format] if mapping[:format]
    minimal.merge(store: 'no', index: 'no')
  end

  user_properties = {
    notifications: {type: 'boolean', store: 'no', index: 'no'},
    follow_request_sent: {type: 'boolean', store: 'no', index: 'no'},
    following: {type: 'boolean', store: 'no', index: 'no'},
    default_profile_image: {type: 'boolean', store: 'no', index: 'no'},
    default_profile: {type: 'boolean', store: 'no', index: 'no'},
    geo_enabled: {type: 'boolean', store: 'no', index: 'no'},
    time_zone: {type: 'string', index: 'not_analyzed'},
    utc_offset: {type: 'integer', store: 'yes', index: 'no'},
    favourites_count: {type: 'integer', store: 'no', index: 'no'},
    created_at: {type: 'date', format: date_format, store: 'yes', index: 'no'},
    listed_count: {type: 'integer', store: 'no', index: 'no'},
    friends_count: {type: 'integer', store: 'no', index: 'no'},
    followers_count: {type: 'integer', store: 'no', index: 'no'},
    id: {type: 'long'},
    id_str: {type: 'string', index: 'not_analyzed'},
    name: {type: 'string'}.merge(text_analyzer),
    screen_name: {type: 'string', index: 'not_analyzed'},
    location: {type: 'string', index: 'no'},
    url: {type: 'string', index: 'no'},
    description: {type: 'string'}.merge(text_analyzer),
    protected: {type: 'boolean'},
    verified: {type: 'boolean'},
    statuses_count: {type: 'long', store: 'yes', index: 'no'},
    lang: {type: 'string', index: 'not_analyzed'},
    contributors_enabled: {type: 'boolean', index: 'no'},
    is_translator: {type: 'boolean', index: 'no'},
    profile_background_color: {type: 'string', store: 'no', index: 'no'},
    profile_background_image_url: {type: 'string', store: 'no', index: 'no'},
    profile_background_image_url_https: {type: 'string', store: 'no', index: 'no'},
    profile_background_tile: {type: 'boolean', store: 'no', index: 'no'},
    profile_image_url: {type: 'string', index: 'no'},
    profile_image_url_https: {type: 'string', index: 'no'},
    profile_link_color: {type: 'string', store: 'no', index: 'no'},
    profile_sidebar_border_color: {type: 'string', store: 'no', index: 'no'},
    profile_sidebar_fill_color: {type: 'string', store: 'no', index: 'no'},
    profile_use_background_image: {type: 'boolean', store: 'no', index: 'no'},
  }

  # Embedded users keep only their identifying fields searchable.
  minimum_user_properties = Hash[
    user_properties.map { |k, v|
      [k, %i(id id_str screen_name).include?(k) ? v : minimize.call(v)]
    }
  ]

  tweet_properties = {
    lang: {type: 'string', index: 'not_analyzed'},
    deleted: {type: 'boolean', null_value: false},
    filter_level: {type: 'string', index: 'no'},
    retweeted: {type: 'boolean', store: 'no', index: 'no'},
    favorited: {type: 'boolean', store: 'no', index: 'no'},
    # NOTE(review): entities is declared as a boolean here; confirm this is
    # intended, the name suggests an object-valued field.
    entities: {type: 'boolean', store: 'no', index: 'no'},
    favorite_count: {type: 'integer', store: 'no', index: 'no'},
    retweet_count: {type: 'integer', store: 'no', index: 'no'},
    in_reply_to_status_id_str: {type: 'string', index: 'not_analyzed'},
    in_reply_to_status_id: {type: 'long'},
    truncated: {type: 'boolean', store: 'no', index: 'no'},
    source: {type: 'string'},
    text: {type: 'string', boost: 2.0}.merge(text_analyzer),
    id_str: {type: 'string', index: 'not_analyzed'},
    id: {type: 'long'},
    created_at: {type: 'date', format: date_format},
    in_reply_to_user_id_str: {type: 'string', index: 'not_analyzed'},
    in_reply_to_user_id: {type: 'long'},
    user: {
      type: 'object',
      properties: user_properties,
    },
    coordinates: {
      type: 'object',
      properties: {
        coordinates: {type: 'geo_point'},
        type: {type: 'string', index: 'no'},
      },
    },
    place: {
      type: 'object',
      properties: {
        attributes: {type: 'object', store: 'no', index: 'no'},
        bounding_box: {type: 'object', index: 'no'},
        country: {type: 'string', index: 'no'},
        country_code: {type: 'string', index: 'not_analyzed'},
        id: {type: 'string', index: 'not_analyzed'},
        name: {type: 'string'},
        place_type: {type: 'string', index: 'no'},
        url: {type: 'string', index: 'no', store: 'yes'},
      },
    },
    contributors: {type: 'object', store: 'no', index: 'no'},
  }

  # Embedded tweets keep id, id_str and text; their user is the minimal user.
  minimum_tweet_properties = Hash[
    tweet_properties.map { |k, v|
      if k == :user
        [k, {type: 'object', properties: minimum_user_properties}]
      else
        [k, %i(id id_str text).include?(k) ? v : minimize.call(v)]
      end
    }
  ]
  # Added after the minimal copy was derived, so a full tweet embeds a minimal
  # retweet and a minimal tweet embeds an opaque one.
  tweet_properties[:retweeted_status] = {type: 'object', properties: minimum_tweet_properties}
  minimum_tweet_properties[:retweeted_status] = {type: 'object', store: 'no', index: 'no'}

  # NOTE(review): analysis is sent as a top-level key next to settings rather
  # than inside it; confirm Elasticsearch honours it in this position.
  analysis = {
    standard: {
      type: 'standard'
    },
  }.merge(
    @config["kuromoji"] ?
      {kuromoji: {
        type: "kuromoji_tokenizer",
        mode: "search",
      }} : {}
  )

  @es.indices.create(index: @index_name, body: {
    settings: {
    },
    analysis: analysis,
    mappings: {
      tweet: {
        _source: {enabled: true},
        properties: tweet_properties,
      },
      deleted_tweet: {
        _source: {enabled: true},
        properties: {
          tweet: {type: 'object', properties: minimum_tweet_properties},
          deleted_at: {type: 'date', index: 'no'},
        },
      },

      message: {
        _source: {enabled: true},
        properties: {
          created_at: {type: 'date', format: date_format, store: 'yes', index: 'no'},
          text: {type: 'string', boost: 2.0, store: 'yes'}.merge(text_analyzer),
          sender_id_str: {type: 'string', store: 'yes', index: 'not_analyzed'},
          sender_screen_name: {type: 'string', store: 'yes', index: 'not_analyzed'},
          sender_id: {type: 'long', store: 'yes'},
          recipient_id_str: {type: 'string', store: 'yes', index: 'not_analyzed'},
          recipient_id: {type: 'long', store: 'yes'},
          recipient_screen_name: {type: 'string', store: 'yes', index: 'not_analyzed'},
          sender: {type: 'object', store: 'yes', properties: minimum_user_properties},
          recipient: {type: 'object', store: 'yes', properties: minimum_user_properties},
        },
      },
      event_favorite: {
        _source: {enabled: true},
        properties: {
          created_at: {type: 'date', format: date_format, store: 'yes', index: 'no'},
          event: {type: 'string', store: 'yes', index: 'not_analyzed'},
          source: {type: 'object', store: 'yes', properties: minimum_user_properties},
          target: {type: 'object', store: 'yes', properties: minimum_user_properties},
          target_object: {type: 'object', store: 'yes', properties: minimum_tweet_properties},
        },
      },
      event_user_interaction: {
        _source: {enabled: true},
        properties: {
          created_at: {type: 'date', format: date_format, store: 'yes', index: 'no'},
          event: {type: 'string', store: 'yes', index: 'not_analyzed'},
          source: {type: 'object', store: 'yes', properties: minimum_user_properties},
          target: {type: 'object', store: 'yes', properties: minimum_user_properties},
        },
      },
    },
  })
end
240
+ end
241
+ end
242
+ end
@@ -0,0 +1,142 @@
1
+ require 'akane/storages/abstract_storage'
2
+ require 'date'
3
+ require 'json'
4
+ require 'time'
5
+ require 'pathname'
6
+
7
+ module Akane
8
+ module Storages
9
+ class File < AbstractStorage
10
# Prepares the on-disk layout under the configured "dir": the root directory
# plus its names/, users/, event/ and timeline/ subdirectories.
def initialize(*)
  super
  @screen_name_to_id_cache = {}
  @dir = Pathname.new(@config["dir"])

  # mkpath also creates missing parent directories and is a no-op for
  # directories that already exist, so a nested "dir" works on first start.
  subdirs = %w(names users event timeline).map { |name| @dir.join(name) }
  [@dir, *subdirs].each(&:mkpath)
end
18
+
19
# Appends a one-line summary of the tweet to today's timeline file and the
# tweet's attributes, as JSON, to the author's monthly tweets file.
def record_tweet(account, tweet)
  timeline = timeline_io

  posted_at = tweet["created_at"].xmlschema
  author_name = tweet["user"]["screen_name"]
  # Line breaks are flattened so each tweet stays on a single line.
  single_line_text = tweet["text"].gsub(/\r?\n/, ' ')
  trailer = "(#{tweet["user"]["id"]},#{tweet["id"]})"

  timeline.puts "[#{posted_at}][#{account}] #{author_name}: #{single_line_text} #{trailer}"

  tweets_io_for_user(tweet["user"]["id"], tweet["user"]["screen_name"]) do |user_file|
    user_file.puts tweet.attrs.to_json
  end
end
27
+
28
# Appends a "time,user_id,tweet_id" line to today's timeline deletion file and
# to the user's monthly deleted-tweets file.
def mark_as_deleted(account, user_id, tweet_id)
  # Built once so both files carry exactly the same timestamp.
  entry = "#{Time.now.xmlschema},#{user_id},#{tweet_id}"

  timeline_deletion_io.puts entry
  tweets_deletion_io_for_user(user_id) do |io|
    io.puts entry
  end
end
34
+
35
# Appends the event, tagged with the account it happened on, as one JSON line
# to today's event file.
def record_event(account, event)
  sink = event_io
  tagged_event = event.merge("happened_on" => account)
  sink.puts tagged_event.to_json
end
38
+
39
# Stores a direct message under its sender: the raw attributes as JSON in the
# messages-raw file, and a readable one-line summary in the messages file.
def record_message(account, message)
  messages_raw_io_for_user(message["sender"]["id"], message["sender"]["screen_name"]) do |raw_file|
    raw_file.puts message.attrs.to_json
  end

  messages_io_for_user(message["sender"]["id"], message["sender"]["screen_name"]) do |summary_file|
    sent_at = message["created_at"].xmlschema
    route = "#{message["sender"]["screen_name"]} -> #{message["recipient"]["screen_name"]}"
    ids = "#{message["sender"]["id"]} -> #{message["recipient"]["id"]},#{message["id"]}"

    summary_file.puts "[#{sent_at}] #{route}: #{message["text"]} (#{ids})"
  end
end
48
+
49
+ private
50
+
51
# Returns the append handle for today's timeline file
# (timeline/YYYY-MM-DD.txt), reopening it when the date has changed.
#
# The handle is synced on every write unless the "sync_io" config is present
# and falsy.
def timeline_io
  today = Date.today
  return @timeline_io if @timeline_io && @timeline_io_date == today

  # Release the previous day's descriptor before switching files.
  @timeline_io.close if @timeline_io && !@timeline_io.closed?

  @timeline_io_date = today
  @timeline_io = ::File.open(@dir.join('timeline', today.strftime('%Y-%m-%d.txt')), 'a')
  @timeline_io.sync = !@config.key?("sync_io") || @config["sync_io"]
  @timeline_io
end
61
+
62
# Returns the append handle for today's deletion log
# (timeline/YYYY-MM-DD.deleted.txt), reopening it when the date has changed.
#
# The handle is synced on every write unless the "sync_io" config is present
# and falsy.
def timeline_deletion_io
  today = Date.today
  return @timeline_deletion_io if @timeline_deletion_io && @timeline_deletion_io_date == today

  # Release the previous day's descriptor before switching files.
  @timeline_deletion_io.close if @timeline_deletion_io && !@timeline_deletion_io.closed?

  @timeline_deletion_io_date = today
  @timeline_deletion_io = ::File.open(@dir.join('timeline', today.strftime('%Y-%m-%d.deleted.txt')), 'a')
  @timeline_deletion_io.sync = !@config.key?("sync_io") || @config["sync_io"]
  @timeline_deletion_io
end
72
+
73
# Returns the append handle for today's event file (event/YYYY-MM-DD.txt),
# reopening it when the date has changed.
#
# The handle is synced on every write unless the "sync_io" config is present
# and falsy.
def event_io
  today = Date.today
  return @event_io if @event_io && @event_io_date == today

  # Release the previous day's descriptor before switching files.
  @event_io.close if @event_io && !@event_io.closed?

  @event_io_date = today
  @event_io = ::File.open(@dir.join('event', today.strftime('%Y-%m-%d.txt')), 'a')
  @event_io.sync = !@config.key?("sync_io") || @config["sync_io"]
  @event_io
end
83
+
84
# Opens the user's tweets file for the current month
# (users/<id>/tweets.YYYY-MM.txt) in append mode and hands it to the block.
# The user directory and screen-name link are set up first.
def tweets_io_for_user(user_id, screen_name=nil, &block)
  symlink_user_dir(user_id, screen_name)

  monthly_name = Date.today.strftime('tweets.%Y-%m.txt')
  target = @dir.join('users', user_id.to_s, monthly_name)
  ::File.open(target, 'a', &block)
end
89
+
90
# Opens the user's deleted-tweets file for the current month
# (users/<id>/deleted-tweets.YYYY-MM.txt) in append mode and hands it to the
# block. The user directory and screen-name link are set up first.
def tweets_deletion_io_for_user(user_id, screen_name=nil, &block)
  symlink_user_dir(user_id, screen_name)

  monthly_name = Date.today.strftime('deleted-tweets.%Y-%m.txt')
  target = @dir.join('users', user_id.to_s, monthly_name)
  ::File.open(target, 'a', &block)
end
95
+
96
# Opens the user's readable messages file for the current month
# (users/<id>/messages.YYYY-MM.txt) in append mode and hands it to the block.
# The user directory and screen-name link are set up first.
def messages_io_for_user(user_id, screen_name=nil, &block)
  symlink_user_dir(user_id, screen_name)

  monthly_name = Date.today.strftime('messages.%Y-%m.txt')
  target = @dir.join('users', user_id.to_s, monthly_name)
  ::File.open(target, 'a', &block)
end
101
+
102
# Opens the user's raw (JSON) messages file for the current month
# (users/<id>/messages-raw.YYYY-MM.txt) in append mode and hands it to the
# block. The user directory and screen-name link are set up first.
def messages_raw_io_for_user(user_id, screen_name=nil, &block)
  symlink_user_dir(user_id, screen_name)

  monthly_name = Date.today.strftime('messages-raw.%Y-%m.txt')
  target = @dir.join('users', user_id.to_s, monthly_name)
  ::File.open(target, 'a', &block)
end
107
+
108
+
109
# Makes sure users/<user_id> exists and, when a screen name is given, that
# names/<screen_name> is a symlink to it.
#
# If the screen name currently points at a different user id, the old link is
# kept as names/<old_id>-<screen_name> before the new one is created. Known
# name-to-id pairs are cached so the filesystem is only consulted once per
# screen name.
def symlink_user_dir(user_id, screen_name=nil)
  user_id_dir = @dir.join('users', user_id.to_s)
  user_id_dir.mkdir unless user_id_dir.exist?

  return unless screen_name
  screen_name_dir = @dir.join('names', screen_name)

  unless @screen_name_to_id_cache.has_key?(screen_name)
    # readlink reads the link itself, so a link whose target directory has
    # gone missing still yields its id instead of raising Errno::ENOENT.
    @screen_name_to_id_cache[screen_name] =
      screen_name_dir.symlink? ? screen_name_dir.readlink.basename.to_s : nil
    @logger.debug "Caching dir for #{screen_name} : #{@screen_name_to_id_cache[screen_name].inspect}"
  end

  prev_id = @screen_name_to_id_cache[screen_name]
  current_id = user_id.to_s
  return if prev_id == current_id

  if prev_id
    # The screen name now belongs to another account: park the old link.
    @logger.info "Renaming #{screen_name}(#{prev_id}) dir: #{screen_name} -> #{prev_id}-#{screen_name}"
    screen_name_dir.rename(@dir.join('names', "#{prev_id}-#{screen_name}"))
  else
    @logger.info "Linking #{screen_name}->#{user_id} dir"
  end

  screen_name_dir.make_symlink("../users/#{user_id_dir.basename}")
  @screen_name_to_id_cache[screen_name] = current_id
end
140
+ end
141
+ end
142
+ end