twitter_with_auto_pagination 0.8.11 → 0.8.12
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1a5ffffb7a3903265ee83d5340491193424cf6b2
|
4
|
+
data.tar.gz: df27d9bc220d90ad97c28911ed67960233ba63dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 978c61c49f0fc373584cc3284483a3c775a0035499e2ba49b6271a5e78b8ea163ecc1a6438b682d540ee1e691f4461a78cbc813a2235869b619655777af74496
|
7
|
+
data.tar.gz: f6956150c2825aeae989067263e1eebf5d51df0dece18767239fedfa2b3970b899cfc51569f99f4e2f0fe671df7712081445636510633dd14d903a6c519db9e0
|
@@ -6,7 +6,12 @@ module TwitterWithAutoPagination
|
|
6
6
|
module Clusters
|
7
7
|
include TwitterWithAutoPagination::REST::Utils
|
8
8
|
|
9
|
-
|
9
|
+
# Words passed as `special_words:` when counting words in profile descriptions.
# Frozen so the shared list cannot be mutated by accident.
PROFILE_SPECIAL_WORDS = %w(20↑ 成人済 腐女子).freeze

# Passed as `special_regexp:` when counting profile words; currently unset.
PROFILE_SPECIAL_REGEXP = nil

# Words passed as `exclude_words:` when counting profile words (dropped from the counts).
PROFILE_EXCLUDE_WORDS = %w(in at of my to no er by is RT DM the and for you inc Inc com from info next gmail 好き こと 最近 紹介 連載 発売 依頼 情報 さん ちゃん くん 発言 関係 もの 活動 見解 所属 組織 代表 連絡 大好き サイト ブログ つぶやき 株式会社 最新 こちら 届け お仕事 ツイ 返信 プロ 今年 リプ ヘッダー アイコン アカ アカウント ツイート たま ブロック 無言 時間 お願い お願いします お願いいたします イベント フォロー フォロワー フォロバ スタッフ 自動 手動 迷言 名言 非公式 リリース 問い合わせ ツイッター).freeze

# Pattern passed as `exclude_regexp:`; matches are stripped from descriptions before counting.
# Alternatives, in order: e-mail addresses (.com / .co.jp), full dates (年月日 or slashes),
# short m/d dates, ordinals such as "20th", and URIs.
# URI::DEFAULT_PARSER.make_regexp replaces the obsolete URI.regexp.
PROFILE_EXCLUDE_REGEXP = Regexp.union(
  /\w+@\w+\.(com|co\.jp)/,
  %r[\d{2,4}(年|/)\d{1,2}(月|/)\d{1,2}日],
  %r[\d{1,2}/\d{1,2}],
  /\d{2}th/,
  URI::DEFAULT_PARSER.make_regexp
).freeze
|
13
|
+
|
14
|
+
def tweet_clusters(tweets, limit: 10, debug: false)
|
10
15
|
return {} if tweets.blank?
|
11
16
|
text = tweets.map(&:text).join(' ')
|
12
17
|
|
@@ -18,7 +23,7 @@ module TwitterWithAutoPagination
|
|
18
23
|
special_words = JSON.parse(File.read('./cluster_good_words.json'))
|
19
24
|
end
|
20
25
|
|
21
|
-
%w(べたら それとも たしかに さそう そんなに ったことある してるの しそうな おやくま ってますか これをやってるよ のせいか).each { |w| exclude_words << w }
|
26
|
+
%w(べたら むっちゃ それとも たしかに さそう そんなに ったことある してるの しそうな おやくま ってますか これをやってるよ のせいか 面白い 可愛い).each { |w| exclude_words << w }
|
22
27
|
%w(面白い 可愛い 食べ物 宇多田ヒカル ご飯 面倒 体調悪くなる 空腹 頑張ってない 眼鏡 台風 沖縄 らんま1/2 女の子 怪我 足のむくみ 彼女欲しい 彼氏欲しい 吐き気 注射 海鮮チヂミ 出勤 価格ドットコム 幹事 雑談 パズドラ ビオフェルミン 餃子 お金 まんだらけ 結婚 焼肉 タッチペン).each { |w| special_words << w }
|
23
28
|
|
24
29
|
# クラスタ用の単語の出現回数を記録
|
@@ -40,54 +45,90 @@ module TwitterWithAutoPagination
|
|
40
45
|
each { |w| frequency[w] += 1 }
|
41
46
|
|
42
47
|
# 複数個以上見付かった単語のみを残し、出現頻度順にソート
|
43
|
-
frequency.select { |_, v| 2 < v }.sort_by { |k, v| [-v, -k.size] }.
|
48
|
+
frequency.select { |_, v| 2 < v }.sort_by { |k, v| [-v, -k.size] }.take(limit).to_h
|
44
49
|
end
|
45
50
|
|
46
|
-
def
|
51
|
+
# Counts how often each hashtag occurs across the given tweets.
#
# @param tweets [Array, nil] tweet-like objects; with use_regexp they must respond to
#   #text, otherwise they are handed to include_hashtags? / extract_hashtags
# @param with_prefix [Boolean] prepend "#" to every returned hashtag
# @param use_regexp [Boolean] extract hashtags from the tweet text with a regexp
#   instead of reading the tweet's hashtag entities
# @param debug [Boolean] print progress information to stdout
# @return [Hash{String => Integer}] hashtag => occurrences, ordered by count (desc),
#   then by hashtag length (desc); {} when tweets is nil or empty
def count_freq_hashtags(tweets, with_prefix: true, use_regexp: false, debug: false)
  puts "tweets: #{tweets.to_a.size}" if debug
  return {} if tweets.nil? || tweets.empty?

  # Both the ASCII and the full-width number sign introduce a hashtag.
  prefix = %w(# ＃)
  # ASCII and full-width alphanumerics, underscore, kanji, hiragana/katakana,
  # half-width katakana and the long-vowel mark.
  regexp = /[#＃]([A-Za-zＡ-Ｚａ-ｚ_一-鿆0-9０-９ぁ-ヶｦ-ﾟー]+)/

  tweets =
    if use_regexp
      tweets.select { |t| t.text && prefix.any? { |char| t.text.include?(char) } }
    else
      tweets.select { |t| include_hashtags?(t) }
    end
  puts "tweets with hashtag: #{tweets.size}" if debug

  hashtags =
    if use_regexp
      tweets.flat_map { |t| t.text.scan(regexp).flatten.map(&:strip) }
    else
      tweets.flat_map { |t| extract_hashtags(t) }
    end
  hashtags = hashtags.map { |h| "#{prefix[0]}#{h}" } if with_prefix

  hashtags.each_with_object(Hash.new(0)) { |h, memo| memo[h] += 1 }.sort_by { |k, v| [-v, -k.size] }.to_h
end
|
58
76
|
|
59
|
-
def
|
60
|
-
|
61
|
-
require 'mecab'
|
62
|
-
rescue => e
|
63
|
-
puts "Add gem 'mecab' to your Gemfile."
|
64
|
-
return nil
|
65
|
-
end
|
77
|
+
# Derives profile keywords for the community around the given hashtags.
#
# Among the first three hashtags, picks the one whose search results carry the most
# other hashtags, searches pairs of its top co-occurring hashtags (falling back to the
# hashtag itself when those searches return nothing) and counts the words found in the
# profile descriptions of the tweet authors.
#
# @param hashtags [Array<String>, nil] hashtags to start from; they are compared against
#   the keys returned by count_freq_hashtags
# @param limit [Integer] maximum number of entries returned
# @param debug [Boolean] print progress information to stdout
# @return [Array] at most limit entries of whatever count_freq_words returns;
#   [] when hashtags is nil or empty
def hashtag_clusters(hashtags, limit: 10, debug: false)
  puts "hashtags: #{hashtags.to_a.take(10)}" if debug
  # Without this guard max_by below yields nil and search(nil) would be issued.
  return [] if hashtags.nil? || hashtags.empty?

  # Each single-hashtag search is needed up to three times; run it only once.
  searched = Hash.new { |cache, query| cache[query] = search(query) }

  related_counts = hashtags.take(3).each_with_object(Hash.new(0)) do |tag, memo|
    tweets = searched[tag]
    puts "tweets #{tag}: #{tweets.size}" if debug
    memo[tag] = count_freq_hashtags(tweets).reject { |t, _| t == tag }.values.sum
  end
  hashtag, = related_counts.max_by { |_, c| c }

  related = count_freq_hashtags(searched[hashtag]).reject { |t, _| t == hashtag }.keys
  queries = related.take(3).combination(2).map { |ary| ary.join(' AND ') }
  puts "selected #{hashtag}: #{queries.inspect}" if debug

  tweets = queries.flat_map { |q| search(q) }
  puts "tweets #{queries.inspect}: #{tweets.size}" if debug

  if tweets.empty?
    tweets = searched[hashtag]
    puts "tweets #{hashtag}: #{tweets.size}" if debug
  end

  members = tweets.map { |t| t.user }
  puts "members count: #{members.size}" if debug

  # to_s guards against users without a description.
  descriptions = members.map { |m| m.description.to_s }
  count_freq_words(descriptions, special_words: PROFILE_SPECIAL_WORDS, exclude_words: PROFILE_EXCLUDE_WORDS, special_regexp: PROFILE_SPECIAL_REGEXP, exclude_regexp: PROFILE_EXCLUDE_REGEXP, debug: debug).take(limit)
end
|
103
|
+
|
104
|
+
# Fetches the lists the user is a member of, smallest list first.
#
# @param user the user handed through to memberships
# @param debug [Boolean] print the error to stdout when the service is unavailable
# @return [Array] lists sorted by member_count (ascending);
#   [] when Twitter::Error::ServiceUnavailable is raised
def fetch_lists(user, debug: false)
  begin
    lists = memberships(user, count: 500, call_limit: 2)
    lists.sort_by(&:member_count)
  rescue Twitter::Error::ServiceUnavailable => error
    puts "#{__method__}: #{error.class} #{error.message} #{user.inspect}" if debug
    []
  end
end
|
110
|
+
|
111
|
+
def list_clusters(lists, shrink: false, shrink_limit: 100, list_member: 300, total_member: 3000, total_list: 50, rate: 0.3, limit: 10, debug: false)
|
112
|
+
lists = lists.sort_by { |li| li.member_count }
|
73
113
|
puts "lists: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
|
74
114
|
return {} if lists.empty?
|
75
115
|
|
76
116
|
open('lists.txt', 'w') {|f| f.write lists.map(&:full_name).join("\n") } if debug
|
77
117
|
|
78
118
|
list_special_words = %w()
|
79
|
-
|
119
|
+
list_exclude_regexp = %r(list[0-9]*|people-ive-faved|twizard-magic-list|my-favstar-fm-list|timeline-list|conversationlist|who-i-met)
|
80
120
|
list_exclude_words = %w(it list people who met)
|
81
121
|
|
82
122
|
# リスト名を - で分割 -> 1文字の単語を除去 -> 出現頻度の降順でソート
|
83
123
|
words = lists.map { |li| li.full_name.split('/')[1] }.
|
84
|
-
select { |n| !n.match(
|
124
|
+
select { |n| !n.match(list_exclude_regexp) }.
|
85
125
|
map { |n| n.split('-') }.flatten.
|
86
126
|
delete_if { |w| w.size < 2 || list_exclude_words.include?(w) }.
|
127
|
+
map { |w| SYNONYM_WORDS.has_key?(w) ? SYNONYM_WORDS[w] : w }.
|
87
128
|
each_with_object(Hash.new(0)) { |w, memo| memo[w] += 1 }.
|
88
129
|
sort_by { |k, v| [-v, -k.size] }
|
89
130
|
|
90
|
-
puts "words: #{words.
|
131
|
+
puts "words: #{words.take(10)}" if debug
|
91
132
|
return {} if words.empty?
|
92
133
|
|
93
134
|
# 出現頻度の高い単語を名前に含むリストを抽出
|
@@ -112,11 +153,12 @@ module TwitterWithAutoPagination
|
|
112
153
|
# メンバー数がしきい値より少ないリストを抽出
|
113
154
|
_list_member = 0
|
114
155
|
_min_list_member = 10 < lists.size ? 10 : 0
|
115
|
-
|
156
|
+
_lists =
|
116
157
|
filter(lists, min: 2) do |li, i|
|
117
158
|
_list_member = list_member * (1.0 + 0.25 * i)
|
118
159
|
_min_list_member < li.member_count && li.member_count < _list_member
|
119
160
|
end
|
161
|
+
lists = _lists.empty? ? [lists[0]] : _lists
|
120
162
|
puts "lists limited by list member #{_min_list_member}..#{_list_member.round}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
|
121
163
|
return {} if lists.empty?
|
122
164
|
|
@@ -166,25 +208,7 @@ module TwitterWithAutoPagination
|
|
166
208
|
end
|
167
209
|
puts "members included multi lists #{rate.round(3)}: #{members.size}" if debug
|
168
210
|
|
169
|
-
|
170
|
-
profile_special_words = %w()
|
171
|
-
profile_exclude_words = %w(in at of my no er the and for inc Inc com info gmail 好き こと 最近 連載 発売 依頼 情報 さん ちゃん くん 発言 関係 もの 活動 見解 所属 組織 代表 連絡 大好き サイト ブログ つぶやき 株式会社 こちら 届け お仕事 アカ アカウント ツイート たま ブロック 時間 お願い お願いします お願いいたします イベント フォロー)
|
172
|
-
|
173
|
-
descriptions = members.map { |m| m.description.remove(URI.regexp) }
|
174
|
-
|
175
|
-
candidates, remains = descriptions.partition { |desc| desc.scan('/').size > 2 }
|
176
|
-
slash_freq = count_by_word(candidates, delim: '/')
|
177
|
-
puts "words splitted by /: #{slash_freq.to_a.slice(0, 10)}" if debug
|
178
|
-
|
179
|
-
candidates, remains = remains.partition { |desc| desc.scan('|').size > 2 }
|
180
|
-
pipe_freq = count_by_word(candidates, delim: '|')
|
181
|
-
puts "words splitted by |: #{pipe_freq.to_a.slice(0, 10)}" if debug
|
182
|
-
|
183
|
-
tagger = MeCab::Tagger.new("-d #{`mecab-config --dicdir`.chomp}/mecab-ipadic-neologd/")
|
184
|
-
noun_freq = count_by_word(remains, tagger: tagger, exclude_words: profile_exclude_words)
|
185
|
-
puts "words tagged as noun: #{noun_freq.to_a.slice(0, 10)}" if debug
|
186
|
-
|
187
|
-
slash_freq.merge(pipe_freq) { |_, old, neww| old + neww }.merge(noun_freq) { |_, old, neww| old + neww }.sort_by { |k, v| [-v, -k.size] }.slice(0, limit)
|
211
|
+
count_freq_words(members.map { |m| m.description }, special_words: PROFILE_SPECIAL_WORDS, exclude_words: PROFILE_EXCLUDE_WORDS, special_regexp: PROFILE_SPECIAL_REGEXP, exclude_regexp: PROFILE_EXCLUDE_REGEXP, debug: debug).take(limit)
|
188
212
|
end
|
189
213
|
|
190
214
|
private
|
@@ -199,23 +223,93 @@ module TwitterWithAutoPagination
|
|
199
223
|
_lists
|
200
224
|
end
|
201
225
|
|
202
|
-
def count_by_word(texts, delim: nil, tagger: nil, exclude_words: [])
|
226
|
+
# Counts word occurrences in the given texts.
#
# Steps, in order:
# 1. every occurrence of each special word in the raw texts is counted;
# 2. matches of exclude_regexp are removed from the texts;
# 3. texts are split on delim (when given) and stripped;
# 4. texts are run through tagger (when given) and only the surface form of output
#    lines containing '名詞' (noun) is kept;
# 5. remaining entries are counted unless they are empty, shorter than min_length,
#    longer than max_length, listed in exclude_words or contain two consecutive digits.
#
# A special word that also survives steps 2-5 is counted by both step 1 and step 5.
# The input array is not modified.
#
# @param texts [Array<String>] texts to analyse
# @param delim [String, Regexp, nil] delimiter used to split each text
# @param tagger [#parse, nil] object whose #parse(text) returns newline-separated lines
#   of the form "surface\tfeatures"
# @param min_length [Integer] minimum word length (characters)
# @param max_length [Integer] maximum word length (characters)
# @param special_words [Array<String>] words counted directly in the raw texts
# @param exclude_words [Array<String>] words dropped from the result
# @param special_regexp [Regexp, nil] accepted for interface compatibility; not used here
# @param exclude_regexp [Regexp, nil] pattern removed from every text before splitting
# @return [Hash{String => Integer}] word => count, ordered by count (desc), then length (desc)
def count_by_word(texts, delim: nil, tagger: nil, min_length: 2, max_length: 5, special_words: [], exclude_words: [], special_regexp: nil, exclude_regexp: nil)
  frequency = Hash.new(0)

  texts.each do |text|
    special_words.each do |special|
      hits = text.scan(special).size
      frequency[special] += hits if hits > 0
    end
  end

  # Plain gsub keeps this independent of ActiveSupport's String#remove.
  texts = texts.map { |t| t.gsub(exclude_regexp, '') } if exclude_regexp
  texts = texts.flat_map { |t| t.split(delim) }.map(&:strip) if delim

  if tagger
    texts = texts.flat_map { |t| tagger.parse(t).split("\n") }.
      select { |line| line.include?('名詞') }.
      map { |line| line.split("\t")[0] }
  end

  words = texts.reject do |w|
    w.empty? || w.size < min_length || max_length < w.size || exclude_words.include?(w) || w =~ /\d{2}/
  end
  words.each { |word| frequency[word] += 1 }

  frequency.sort_by { |k, v| [-v, -k.size] }.to_h
end
|
257
|
+
|
258
|
+
# Counts words across free-form texts (e.g. profile descriptions).
#
# Texts containing more than two '/' are split on '/', texts containing more than two '|'
# are split on '|', and everything else is tokenised with the tagger from build_tagger.
# The three frequency tables are summed per word.
#
# nil texts are ignored, and the tagger is only built when there is text left for it,
# so inputs made up solely of delimiter-separated texts work without the tagger.
#
# @param texts [Array<String, nil>] texts to analyse
# @param special_words [Array<String>] forwarded to count_by_word for the tagged texts
# @param exclude_words [Array<String>] forwarded to count_by_word for the tagged texts
# @param special_regexp [Regexp, nil] forwarded to count_by_word for the tagged texts
# @param exclude_regexp [Regexp, nil] forwarded to count_by_word for every group
# @param debug [Boolean] print the top entries of each table to stdout
# @return [Array<Array(String, Integer)>] [word, count] pairs ordered by count (desc),
#   then word length (desc)
def count_freq_words(texts, special_words: [], exclude_words: [], special_regexp: nil, exclude_regexp: nil, debug: false)
  texts = texts.compact

  slash_texts, rest = texts.partition { |desc| desc.count('/') > 2 }
  slash_freq = count_by_word(slash_texts, delim: '/', exclude_regexp: exclude_regexp)
  puts "words splitted by /: #{slash_freq.take(10)}" if debug

  pipe_texts, rest = rest.partition { |desc| desc.count('|') > 2 }
  pipe_freq = count_by_word(pipe_texts, delim: '|', exclude_regexp: exclude_regexp)
  puts "words splitted by |: #{pipe_freq.take(10)}" if debug

  noun_freq =
    if rest.empty?
      # Nothing to tag: skip loading the tagger altogether.
      {}
    else
      count_by_word(rest, tagger: build_tagger, special_words: special_words, exclude_words: exclude_words, special_regexp: special_regexp, exclude_regexp: exclude_regexp)
    end
  puts "words tagged as noun: #{noun_freq.take(10)}" if debug

  [slash_freq, pipe_freq, noun_freq].
    reduce { |total, freq| total.merge(freq) { |_, old, neww| old + neww } }.
    sort_by { |k, v| [-v, -k.size] }
end
|
273
|
+
|
274
|
+
# Lazily loads the 'mecab' gem and builds a tagger that uses the mecab-ipadic-neologd
# dictionary found under the directory reported by `mecab-config --dicdir`.
#
# LoadError is rescued explicitly: it is not a StandardError, so a bare `rescue` would
# let a missing gem propagate without ever printing the hint.
#
# @return [MeCab::Tagger]
# @raise [LoadError] when the 'mecab' gem cannot be loaded
# @raise [StandardError] any other error raised while building the tagger
def build_tagger
  require 'mecab'
  MeCab::Tagger.new("-d #{`mecab-config --dicdir`.chomp}/mecab-ipadic-neologd/")
rescue LoadError, StandardError
  puts "Add gem 'mecab' to your Gemfile."
  raise
end
|
281
|
+
|
282
|
+
# Tells whether the tweet carries at least one hashtag entity.
#
# @param tweet object responding to #entities
# @return [Boolean, nil] truthy when entities.hashtags has an element;
#   nil when entities or hashtags is nil
def include_hashtags?(tweet)
  tags = tweet.entities&.hashtags
  tags&.any?
end
|
285
|
+
|
286
|
+
# Returns the text of every hashtag entity attached to the tweet.
#
# Tolerates tweets whose entities or hashtags are nil, matching the nil handling
# of include_hashtags?.
#
# @param tweet object responding to #entities
# @return [Array] the #text of each hashtag entity; [] when there are none
def extract_hashtags(tweet)
  Array(tweet.entities&.hashtags).map { |h| h.text }
end
|
289
|
+
|
290
|
+
# Lookup table mapping each known spelling/variant of a word to its canonical form.
# Written as canonical => variants for readability and inverted into variant => canonical.
# The canonical form for the cosplay group is 'cosplay' (it was misspelled 'coplay').
SYNONYM_WORDS = {
  'cosplay'     => %w(cosplay cosplayer cosplayers coser cos こすぷれ コスプレ レイヤ レイヤー コスプレイヤー レイヤーさん),
  'tsukuba'     => %w(tsukuba tkb),
  'waseda'      => %w(waseda 早稲田 早稲田大学),
  'keio'        => %w(keio 慶應 慶應義塾),
  'gakusai'     => %w(gakusai gakuensai 学祭 学園祭),
  'kosen'       => %w(kosen kousen),
  'anime'       => %w(anime アニメ),
  'photo'       => %w(photo photos),
  'creator'     => %w(creator creater クリエイター),
  'illustrator' => %w(illustrator illustrater 絵師),
  'artist'      => %w(artist art artists アート 芸術),
  'design'      => %w(design デザイン),
  'kawaii'      => %w(kawaii かわいい),
  'idol'        => %w(idol あいどる アイドル 美人),
  'music'       => %w(music musician musicians dj netlabel label レーベル おんがく 音楽家 音楽),
  'engineer'    => %w(engineer engineers engineering えんじにあ tech 技術 技術系 hacker coder programming programer programmer geek rubyist ruby scala java lisp),
  'internet'    => %w(internet インターネット)
}.flat_map { |canonical, variants| variants.map { |w| [w, canonical] } }.to_h.freeze
|
309
|
+
|
310
|
+
# Replaces every word that has an entry in SYNONYM_WORDS with its canonical form;
# words without an entry are returned unchanged.
#
# @param words [Array<String>] words to normalise
# @return [Array<String>] a new array of the same length and order
def normalize_synonym(words)
  words.map do |word|
    SYNONYM_WORDS.fetch(word, word)
  end
end
|
219
313
|
end
|
220
314
|
end
|
221
315
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_with_auto_pagination
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.11
|
4
|
+
version: 0.8.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shinohara Teruki
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: twitter
|