akane 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
1
+ require 'thread'
2
+
3
+ module Akane
4
# Funnels recording requests for tweets, deletions, direct messages and
# events through a single Queue and replays each request against every
# configured storage.
#
# NOTE(review): the default logger needs the `logger` library, which this
# file does not require itself -- presumably it is loaded by the caller.
class Recorder
  # storages - collection of objects responding to the recorded actions
  #            (record_tweet, mark_as_deleted, record_message, record_event).
  # logger   - object responding to #error; defaults to a Logger that
  #            discards its output.
  def initialize(storages, logger: Logger.new(nil))
    @storages = storages
    @logger = logger
    @queue = Queue.new
    @recently_performed = RoundrobinFlags.new(1000)
  end

  # Number of requests waiting in the queue.
  def queue_length
    @queue.size
  end

  # Enqueues a tweet for recording. Returns self so calls can be chained.
  def record_tweet(account, tweet)
    @queue << [:record_tweet, account, tweet]
    self
  end

  # Enqueues a deletion notice. Returns self.
  def mark_as_deleted(account, user_id, tweet_id)
    @queue << [:mark_as_deleted, account, user_id, tweet_id]
    self
  end

  # Enqueues a direct message for recording. Returns self.
  def record_message(account, message)
    @queue << [:record_message, account, message]
    self
  end

  # Enqueues an event for recording. Returns self.
  def record_event(account, event)
    @queue << [:record_event, account, event]
    self
  end

  # Pops one request (blocking while the queue is empty) and performs it.
  def dequeue(raise_errors = false)
    perform(*@queue.pop, raise_errors: raise_errors)
  end

  # Sends +action+ with +account+ and +payload+ to every storage.
  #
  # For :record_tweet the tweet id is remembered; a tweet whose id was seen
  # recently is skipped, and an embedded "retweeted_status" is recorded
  # before the tweet that carries it.
  #
  # A failing storage is logged and the remaining storages are still
  # called, unless +raise_errors+ is true. Interrupt is always re-raised.
  def perform(action, account, *payload, raise_errors: false)
    if action == :record_tweet
      tweet = payload.last
      return if @recently_performed[tweet["id"]]
      @recently_performed.flag!(tweet["id"])

      if tweet["retweeted_status"]
        perform(:record_tweet, account, tweet["retweeted_status"], raise_errors: raise_errors)
      end
    end

    @storages.each do |storage|
      begin
        storage.__send__(action, account, *payload)
      rescue Exception => e
        # Exception (not StandardError) is rescued on purpose so that e.g.
        # NotImplementedError from a storage is logged instead of killing
        # the recorder. The class must be on the left of the check:
        # `e === Interrupt` compared the instance with the class and was
        # therefore never true.
        raise if e.is_a?(Interrupt) || raise_errors
        @logger.error "Error while recorder performing to #{storage.inspect}: #{e.inspect}"
        @logger.error e.backtrace
      end
    end
  end

  # Processes queued requests forever. Errors are logged and the loop
  # continues, unless +raise_errors+ is true. Interrupt always propagates
  # so the loop can be stopped.
  def run(raise_errors = false)
    loop do
      begin
        dequeue(raise_errors)
      rescue Exception => e
        raise if e.is_a?(Interrupt) || raise_errors
        @logger.error "Error while recorder dequing: #{e.inspect}"
        @logger.error e.backtrace
      end
    end
  end

  # Bounded set of flags: once more than +size+ keys are flagged, the
  # earliest inserted keys are dropped.
  class RoundrobinFlags
    def initialize(size)
      @hash = {}
      @limit_size = size
    end

    # true when +k+ is currently flagged, nil otherwise.
    def [](k)
      @hash[k]
    end

    # Flags +k+ and evicts the oldest keys beyond the limit. Returns nil.
    def flag!(k)
      @hash[k] = true
      overflow = @hash.size - @limit_size
      # Hash keeps insertion order, so the first keys are the oldest.
      @hash.keys.first(overflow).each { |stale_key| @hash.delete(stale_key) } if overflow > 0
      nil
    end

    # Removes the flag for +k+; returns the removed value (true) or nil.
    def unflag!(k)
      @hash.delete k
    end

    # All flagged keys, oldest first.
    def flags
      @hash.keys
    end
  end
end
104
+ end
@@ -0,0 +1,26 @@
1
+ module Akane
2
+ module Storages
3
# Base class for storage backends. It keeps the configuration and logger
# and declares the four recording hooks; each hook raises
# NotImplementedError here.
class AbstractStorage
  # config - required keyword; omitting it raises ArgumentError with the
  #          message 'missing config'.
  # logger - defaults to a Logger writing to $stdout.
  def initialize(config: raise(ArgumentError, 'missing config'), logger: Logger.new($stdout))
    @config, @logger = config, logger
  end

  # Hook: store a tweet seen by +account+.
  def record_tweet(account, tweet)
    raise NotImplementedError
  end

  # Hook: note that +tweet_id+ by +user_id+ was deleted.
  def mark_as_deleted(account, user_id, tweet_id)
    raise NotImplementedError
  end

  # Hook: store an event seen by +account+.
  def record_event(account, event)
    raise NotImplementedError
  end

  # Hook: store a direct message seen by +account+.
  def record_message(account, message)
    raise NotImplementedError
  end
end
25
+ end
26
+ end
@@ -0,0 +1,242 @@
1
+ require 'akane/storages/abstract_storage'
2
+ require 'elasticsearch'
3
+
4
+ module Akane
5
+ module Storages
6
+ class Elasticsearch < AbstractStorage
7
# Forwards all arguments to the superclass, builds the Elasticsearch
# client from the "host" setting, picks the index name ("index" setting,
# falling back to 'akane') and makes sure that index exists.
def initialize(*)
  super

  # The client only gets a logger when "enable_es_log" is set.
  client_logger = @config["enable_es_log"] ? @logger : nil
  @es = ::Elasticsearch::Client.new(hosts: [@config["host"]], logger: client_logger)
  @index_name = @config["index"] || 'akane'
  set_elasticsearch_up
end
17
+
18
# Indexes +tweet+ as a 'tweet' document whose id is the tweet's :id_str,
# with a `deleted: false` marker added to the body.
#
# The marker is added to a copy: the hash returned by `tweet.attrs` is
# left untouched so that whoever else holds the same tweet does not see
# the extra key.
#
# Returns the result of the index call.
def record_tweet(account, tweet)
  tweet_hash = tweet.attrs.merge(deleted: false)
  @es.index(index: @index_name, type: 'tweet', id: tweet_hash[:id_str], body: tweet_hash)
end
23
+
24
# Flags the stored tweet as deleted and writes a trimmed copy of it as a
# 'deleted_tweet' document stamped with the current time.
#
# When Elasticsearch answers NotFound the deletion is skipped and only a
# debug line is logged.
def mark_as_deleted(account, user_id, tweet_id)
  doc_id = tweet_id.to_s

  tweet = @es.get(index: @index_name, type: 'tweet', id: doc_id)['_source']
  tweet['deleted'] = true
  @es.index(index: @index_name, type: 'tweet', id: doc_id, body: tweet)

  author = tweet['user']
  minimum_tweet = {
    id: tweet['id'],
    id_str: tweet['id_str'],
    text: tweet['text'],
    user: {
      id: author['id'],
      id_str: author['id_str'],
      screen_name: author['screen_name'],
    }
  }
  deletion = {tweet: minimum_tweet, deleted_at: Time.now.strftime('%Y-%m-%d %H:%M:%S %z')}
  @es.index(index: @index_name, type: 'deleted_tweet', id: doc_id, body: deletion)
rescue ::Elasticsearch::Transport::Transport::Errors::NotFound
  @logger.debug "Due to 404, skipping Deletion for #{tweet_id}"
end
43
+
44
# Placeholder: the event name is looked up and matched against the known
# event kinds, but no branch does anything yet, so nothing is stored and
# the method returns nil.
def record_event(account, event)
  case event["event"]
  when 'favorite', 'unfavorite'
    # nothing recorded yet
  when 'block', 'unblock', 'follow', 'unfollow'
    # nothing recorded yet
  end
end
54
+
55
# Indexes a direct message as a 'message' document: the id is read with
# message[:id_str] and the body is message.attrs.
# Returns the result of the index call.
def record_message(account, message)
  doc_id = message[:id_str]
  document = message.attrs
  @es.index(index: @index_name, type: 'message', id: doc_id, body: document)
end
58
+
59
+ private
60
+
61
# Makes sure the configured index exists.
#
# Asks Elasticsearch for the index mapping; when that fails with a
# NotFound error mentioning IndexMissingException, the index is created
# with the definition built by #index_definition. Any other NotFound
# error is re-raised.
#
# Returns the result of `indices.get_mapping` or of `indices.create`.
def set_elasticsearch_up
  @es.indices.get_mapping(index: @index_name)
rescue ::Elasticsearch::Transport::Transport::Errors::NotFound => e
  raise e unless /IndexMissingException/ === e.message

  @logger.info 'elasticsearch.setup: creating index'
  @es.indices.create(index: @index_name, body: index_definition)
end

# Date pattern used by every formatted date field in the mappings.
def mapping_date_format
  "EE MMM d HH:mm:ss Z yyyy"
end

# Extra mapping options for free-text fields: selects the kuromoji
# analyzer when the "kuromoji" setting is on, nothing otherwise.
def text_analyzer_options
  @config["kuromoji"] ? {analyzer: 'kuromoji'} : {}
end

# Reduces a field spec to its type and format, neither stored nor indexed.
def minimal_property(spec)
  {type: spec[:type], format: spec[:format], store: 'no', index: 'no'}
end

# Field mappings for an embedded user object.
def user_mapping_properties
  date_format = mapping_date_format
  {
    notifications: {type: 'boolean', store: 'no', index: 'no'},
    follow_request_sent: {type: 'boolean', store: 'no', index: 'no'},
    following: {type: 'boolean', store: 'no', index: 'no'},
    default_profile_image: {type: 'boolean', store: 'no', index: 'no'},
    default_profile: {type: 'boolean', store: 'no', index: 'no'},
    geo_enabled: {type: 'boolean', store: 'no', index: 'no'},
    time_zone: {type: 'string', index: 'not_analyzed'},
    utc_offset: {type: 'integer', store: 'yes', index: 'no'},
    favourites_count: {type: 'integer', store: 'no', index: 'no'},
    created_at: {type: 'date', format: date_format, store: 'yes', index: 'no'},
    listed_count: {type: 'integer', store: 'no', index: 'no'},
    friends_count: {type: 'integer', store: 'no', index: 'no'},
    followers_count: {type: 'integer', store: 'no', index: 'no'},
    id: {type: 'long'},
    id_str: {type: 'string', index: 'not_analyzed'},
    name: {type: 'string'}.merge(text_analyzer_options),
    screen_name: {type: 'string', index: 'not_analyzed'},
    location: {type: 'string', index: 'no'},
    url: {type: 'string', index: 'no'},
    description: {type: 'string'}.merge(text_analyzer_options),
    protected: {type: 'boolean'},
    verified: {type: 'boolean'},
    statuses_count: {type: 'long', store: 'yes', index: 'no'},
    lang: {type: 'string', index: 'not_analyzed'},
    contributors_enabled: {type: 'boolean', index: 'no'},
    is_translator: {type: 'boolean', index: 'no'},
    profile_background_color: {type: 'string', store: 'no', index: 'no'},
    profile_background_image_url: {type: 'string', store: 'no', index: 'no'},
    profile_background_image_url_https: {type: 'string', store: 'no', index: 'no'},
    profile_background_tile: {type: 'boolean', store: 'no', index: 'no'},
    # The `type` key used to be written twice in this literal; the
    # resulting hash is the same with the duplicate removed.
    profile_image_url: {type: 'string', index: 'no'},
    profile_image_url_https: {type: 'string', index: 'no'},
    profile_link_color: {type: 'string', store: 'no', index: 'no'},
    profile_sidebar_border_color: {type: 'string', store: 'no', index: 'no'},
    profile_sidebar_fill_color: {type: 'string', store: 'no', index: 'no'},
    profile_use_background_image: {type: 'boolean', store: 'no', index: 'no'},
  }
end

# Field mappings for a tweet; +user_properties+ is used for the embedded
# author. The :retweeted_status entry is added by #index_definition.
def tweet_mapping_properties(user_properties)
  date_format = mapping_date_format
  {
    lang: {type: 'string', index: 'not_analyzed'},
    deleted: {type: 'boolean', null_value: false},
    filter_level: {type: 'string', index: 'no'},
    retweeted: {type: 'boolean', store: 'no', index: 'no'},
    favorited: {type: 'boolean', store: 'no', index: 'no'},
    # NOTE(review): `entities` is mapped as a boolean; presumably the
    # payload carries an object here -- confirm before changing.
    entities: {type: 'boolean', store: 'no', index: 'no'},
    favorite_count: {type: 'integer', store: 'no', index: 'no'},
    retweet_count: {type: 'integer', store: 'no', index: 'no'},
    in_reply_to_status_id_str: {type: 'string', index: 'not_analyzed'},
    in_reply_to_status_id: {type: 'long'},
    truncated: {type: 'boolean', store: 'no', index: 'no'},
    source: {type: 'string'},
    text: {type: 'string', boost: 2.0, }.merge(text_analyzer_options),
    id_str: {type: 'string', index: 'not_analyzed'},
    id: {type: 'long'},
    created_at: {type: 'date', format: date_format},
    in_reply_to_user_id_str: {type: 'string', index: 'not_analyzed'},
    in_reply_to_user_id: {type: 'long'},
    user: {
      type: 'object',
      properties: user_properties,
    },
    coordinates: {
      type: 'object',
      properties: {
        coordinates: {type: 'geo_point'},
        type: {type: 'string', index: 'no'},
      },
    },
    place: {
      type: 'object',
      properties: {
        attributes: {type: 'object', store: 'no', index: 'no'},
        bounding_box: {type: 'object', index: 'no'},
        country: {type: 'string', index: 'no'},
        country_code: {type: 'string', index: 'not_analyzed'},
        id: {type: 'string', index: 'not_analyzed'},
        name: {type: 'string'},
        place_type: {type: 'string', index: 'no'},
        url: {type: 'string', index: 'no', store: 'yes'},
      },
    },
    contributors: {type: 'object', store: 'no', index: 'no'},
  }
end

# Builds the body passed to `indices.create`: settings, analysis and the
# mappings for tweet, deleted_tweet, message and the two event types.
def index_definition
  date_format = mapping_date_format

  user_properties = user_mapping_properties
  # "Minimum" variants keep the identifying fields as they are and reduce
  # every other field with #minimal_property.
  minimum_user_properties = Hash[
    user_properties.map { |k, v|
      [k, %i(id id_str screen_name).include?(k) ? v : minimal_property(v)]
    }
  ]

  tweet_properties = tweet_mapping_properties(user_properties)
  minimum_tweet_properties = Hash[
    tweet_properties.map { |k, v|
      if k == :user
        [k, {type: 'object', properties: minimum_user_properties}]
      else
        [k, %i(id id_str text).include?(k) ? v : minimal_property(v)]
      end
    }
  ]
  # A full tweet embeds its retweeted status in minimum form; a minimum
  # tweet keeps the retweeted status as an opaque object.
  tweet_properties[:retweeted_status] = {type: 'object', properties: minimum_tweet_properties}
  minimum_tweet_properties[:retweeted_status] = {type: 'object', store: 'no', index: 'no'}

  {
    settings: {
    },
    # NOTE(review): `analysis` sits next to `settings` rather than inside
    # it -- confirm this is the layout the index-creation API expects.
    analysis: {
      standard: {
        type: 'standard'
      },
    }.merge( @config["kuromoji"] ?
      {kuromoji: {
        type: "kuromoji_tokenizer",
        mode: "search",
      }} : {}
    ),
    mappings: {
      tweet: {
        _source: {enabled: true},
        properties: tweet_properties,
      },
      deleted_tweet: {
        _source: {enabled: true},
        properties: {
          tweet: {type: 'object', properties: minimum_tweet_properties},
          deleted_at: {type: 'date', index: 'no'},
        },
      },

      message: {
        _source: {enabled: true},
        properties: {
          created_at: {type: 'date', format: date_format, store: 'yes', index: 'no'},
          text: {type: 'string', boost: 2.0, store: 'yes', }.merge(text_analyzer_options),
          sender_id_str: {type: 'string', store: 'yes', index: 'not_analyzed'},
          sender_screen_name: {type: 'string', store: 'yes', index: 'not_analyzed'},
          sender_id: {type: 'long', store: 'yes', },
          recipient_id_str: {type: 'string', store: 'yes', index: 'not_analyzed'},
          recipient_id: {type: 'long', store: 'yes', },
          recipient_screen_name: {type: 'string', store: 'yes', index: 'not_analyzed'},
          sender: {type: 'object', store: 'yes', properties: minimum_user_properties},
          recipient: {type: 'object', store: 'yes', properties: minimum_user_properties},
        },
      },
      event_favorite: {
        _source: {enabled: true},
        properties: {
          created_at: {type: 'date', format: date_format, store: 'yes', index: 'no'},
          event: {type: 'string', store: 'yes', index: 'not_analyzed'},
          source: {type: 'object', store: 'yes', properties: minimum_user_properties},
          target: {type: 'object', store: 'yes', properties: minimum_user_properties},
          target_object: {type: 'object', store: 'yes', properties: minimum_tweet_properties},
        },
      },
      event_user_interaction: {
        _source: {enabled: true},
        properties: {
          created_at: {type: 'date', format: date_format, store: 'yes', index: 'no'},
          event: {type: 'string', store: 'yes', index: 'not_analyzed'},
          source: {type: 'object', store: 'yes', properties: minimum_user_properties},
          target: {type: 'object', store: 'yes', properties: minimum_user_properties},
        },
      },
    },
  }
end
240
+ end
241
+ end
242
+ end
@@ -0,0 +1,142 @@
1
+ require 'akane/storages/abstract_storage'
2
+ require 'date'
3
+ require 'json'
4
+ require 'time'
5
+ require 'pathname'
6
+
7
+ module Akane
8
+ module Storages
9
+ class File < AbstractStorage
10
# Forwards all arguments to the superclass, then prepares the output
# tree: the directory named by the "dir" setting plus its names/, users/,
# event/ and timeline/ subdirectories, creating whichever are missing.
def initialize(*)
  super
  @screen_name_to_id_cache = {}
  @dir = Pathname.new(@config["dir"])

  subdirectories = %w(names users event timeline).map { |name| @dir.join(name) }
  [@dir, *subdirectories].each do |directory|
    directory.mkdir unless directory.exist?
  end
end
18
+
19
# Appends a one-line summary of +tweet+ to today's timeline file and the
# tweet's attributes, as one JSON line, to the author's monthly file.
def record_tweet(account, tweet)
  timeline = timeline_io
  author = tweet["user"]
  # Line breaks in the text are replaced so the summary stays on one line.
  flattened_text = tweet["text"].gsub(/\r?\n/,' ')
  timeline.puts "[#{tweet["created_at"].xmlschema}][#{account}] #{author["screen_name"]}: " \
                "#{flattened_text} (#{author["id"]},#{tweet["id"]})"

  tweets_io_for_user(author["id"], author["screen_name"]) { |io| io.puts tweet.attrs.to_json }
end
27
+
28
# Appends a "<time>,<user_id>,<tweet_id>" record to today's timeline
# deletion file and to the user's monthly deletion file.
#
# The record is built once so that both files receive the same timestamp
# for the same deletion.
def mark_as_deleted(account, user_id, tweet_id)
  record = "#{Time.now.xmlschema},#{user_id},#{tweet_id}"

  timeline_deletion_io.puts record
  tweets_deletion_io_for_user(user_id) do |io|
    io.puts record
  end
end
34
+
35
# Appends +event+ to today's event file as one JSON line, with a
# "happened_on" entry naming the account added.
def record_event(account, event)
  output = event_io
  annotated_event = event.merge("happened_on" => account)
  output.puts annotated_event.to_json
end
38
+
39
# Writes a direct message to the sender's monthly files: the attributes
# as one JSON line in the raw file, and a readable one-line summary in
# the plain file.
def record_message(account, message)
  sender = message["sender"]

  messages_raw_io_for_user(sender["id"], sender["screen_name"]) { |io| io.puts message.attrs.to_json }

  messages_io_for_user(sender["id"], sender["screen_name"]) do |io|
    recipient = message["recipient"]
    io.puts "[#{message["created_at"].xmlschema}] #{sender["screen_name"]} -> #{recipient["screen_name"]}:" \
            " #{message["text"]} (#{sender["id"]} -> #{recipient["id"]},#{message["id"]})"
  end
end
48
+
49
+ private
50
+
51
# Returns the append-mode handle for today's timeline file
# (timeline/YYYY-MM-DD.txt), reusing the cached handle while the date is
# unchanged.
#
# When the date has changed the new file is opened first and the previous
# handle is then closed, so handles do not pile up day after day.
# The handle is synchronous unless the "sync_io" setting says otherwise.
def timeline_io
  today = Date.today
  return @timeline_io if @timeline_io && @timeline_io_date == today

  fresh_io = ::File.open(@dir.join('timeline', today.strftime('%Y-%m-%d.txt')), 'a')
  fresh_io.sync = @config.fetch("sync_io", true)

  stale_io = @timeline_io
  @timeline_io_date = today
  @timeline_io = fresh_io
  stale_io.close if stale_io && !stale_io.closed?
  @timeline_io
end
61
+
62
# Returns the append-mode handle for today's timeline deletion file
# (timeline/YYYY-MM-DD.deleted.txt), reusing the cached handle while the
# date is unchanged.
#
# When the date has changed the new file is opened first and the previous
# handle is then closed, so handles do not pile up day after day.
# The handle is synchronous unless the "sync_io" setting says otherwise.
def timeline_deletion_io
  today = Date.today
  return @timeline_deletion_io if @timeline_deletion_io && @timeline_deletion_io_date == today

  fresh_io = ::File.open(@dir.join('timeline', today.strftime('%Y-%m-%d.deleted.txt')), 'a')
  fresh_io.sync = @config.fetch("sync_io", true)

  stale_io = @timeline_deletion_io
  @timeline_deletion_io_date = today
  @timeline_deletion_io = fresh_io
  stale_io.close if stale_io && !stale_io.closed?
  @timeline_deletion_io
end
72
+
73
# Returns the append-mode handle for today's event file
# (event/YYYY-MM-DD.txt), reusing the cached handle while the date is
# unchanged.
#
# When the date has changed the new file is opened first and the previous
# handle is then closed, so handles do not pile up day after day.
# The handle is synchronous unless the "sync_io" setting says otherwise.
def event_io
  today = Date.today
  return @event_io if @event_io && @event_io_date == today

  fresh_io = ::File.open(@dir.join('event', today.strftime('%Y-%m-%d.txt')), 'a')
  fresh_io.sync = @config.fetch("sync_io", true)

  stale_io = @event_io
  @event_io_date = today
  @event_io = fresh_io
  stale_io.close if stale_io && !stale_io.closed?
  @event_io
end
83
+
84
# Opens users/<user_id>/tweets.YYYY-MM.txt for appending and passes the
# handle to +block+, after letting #symlink_user_dir prepare the user's
# directory. Returns what ::File.open returns for the given block.
def tweets_io_for_user(user_id, screen_name=nil, &block)
  symlink_user_dir(user_id, screen_name)
  monthly_name = Date.today.strftime('tweets.%Y-%m.txt')
  target = @dir.join('users', user_id.to_s, monthly_name)
  ::File.open(target, 'a', &block)
end
89
+
90
# Opens users/<user_id>/deleted-tweets.YYYY-MM.txt for appending and
# passes the handle to +block+, after letting #symlink_user_dir prepare
# the user's directory. Returns what ::File.open returns for the block.
def tweets_deletion_io_for_user(user_id, screen_name=nil, &block)
  symlink_user_dir(user_id, screen_name)
  monthly_name = Date.today.strftime('deleted-tweets.%Y-%m.txt')
  target = @dir.join('users', user_id.to_s, monthly_name)
  ::File.open(target, 'a', &block)
end
95
+
96
# Opens users/<user_id>/messages.YYYY-MM.txt for appending and passes the
# handle to +block+, after letting #symlink_user_dir prepare the user's
# directory. Returns what ::File.open returns for the block.
def messages_io_for_user(user_id, screen_name=nil, &block)
  symlink_user_dir(user_id, screen_name)
  monthly_name = Date.today.strftime('messages.%Y-%m.txt')
  target = @dir.join('users', user_id.to_s, monthly_name)
  ::File.open(target, 'a', &block)
end
101
+
102
# Opens users/<user_id>/messages-raw.YYYY-MM.txt for appending and passes
# the handle to +block+, after letting #symlink_user_dir prepare the
# user's directory. Returns what ::File.open returns for the block.
def messages_raw_io_for_user(user_id, screen_name=nil, &block)
  symlink_user_dir(user_id, screen_name)
  monthly_name = Date.today.strftime('messages-raw.%Y-%m.txt')
  target = @dir.join('users', user_id.to_s, monthly_name)
  ::File.open(target, 'a', &block)
end
107
+
108
+
109
# Makes sure users/<user_id> exists and, when a screen name is given,
# that names/<screen_name> is a symlink pointing at it.
#
# The id a screen name currently links to is looked up once and cached.
# If the name is already linked to a different id, the old link is
# renamed to names/<old_id>-<screen_name> before the new link is made.
#
# Returns the user id as a string when a link was created, nil otherwise.
def symlink_user_dir(user_id, screen_name=nil)
  id = user_id.to_s
  user_id_dir = @dir.join('users', id)
  user_id_dir.mkdir unless user_id_dir.exist?

  return unless screen_name
  screen_name_dir = @dir.join('names', screen_name)

  unless @screen_name_to_id_cache.has_key?(screen_name)
    # The basename of the link target is the id the name belongs to.
    linked_id = screen_name_dir.symlink? ? screen_name_dir.realpath.basename.to_s : nil
    @screen_name_to_id_cache[screen_name] = linked_id
    @logger.debug "Caching dir for #{screen_name} : #{linked_id.inspect}"
  end

  cached_id = @screen_name_to_id_cache[screen_name]
  # Already linked to this user: nothing to do.
  return if cached_id == id

  if cached_id
    prev_id = screen_name_dir.realpath.basename
    @logger.info "Renaming #{screen_name}(#{prev_id}) dir: #{screen_name} -> #{prev_id}-#{screen_name}"
    screen_name_dir.rename(@dir.join('names',"#{prev_id}-#{screen_name}"))
  else
    @logger.info "Linking #{screen_name}->#{user_id} dir"
  end

  screen_name_dir.make_symlink("../users/#{user_id_dir.basename}")
  @screen_name_to_id_cache[screen_name] = id
end
140
+ end
141
+ end
142
+ end