twitter_friendly 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/twitter_friendly/rest/api.rb +0 -5
- data/lib/twitter_friendly/version.rb +1 -1
- metadata +1 -2
- data/lib/twitter_friendly/rest/extension/clusters.rb +0 -313
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 687792a2d0065ebefe3b95ad38f1df4558a644f0c07819619961dd85f4b7fa7c
+  data.tar.gz: ec86328550855e0f864daea0f3ab8bbfadf25cc71e217cda7d794c1d3eb52a2b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b3029c832c6da171bedcd7a9a82e2c211e2be912913deed15d6a32ff4d2884f18d4b990ec8f902f4e97dcd684815a59c13a7f0f5e95e6134dc2043e343d43430
+  data.tar.gz: 36365cf299fab85c22859303652963185700c967023a558b7f16f1b289b6b563ccc32ed6f014d48d8383bf0beb3438c102fe9d43e0b34e42c964cec57f650db5
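The SHA256 and SHA512 values above cover the metadata.gz and data.tar.gz entries packed inside the published .gem archive. As a hedged illustration, not part of the diff itself: after fetching and unpacking twitter_friendly-1.2.0.gem (a plain tar archive), a few lines of Ruby can recompute the SHA256 digests and compare them against the + values shown here.

# Hedged verification sketch. Assumes twitter_friendly-1.2.0.gem was fetched
# (e.g. with `gem fetch twitter_friendly -v 1.2.0`) and unpacked so that
# metadata.gz and data.tar.gz sit in the current directory.
require 'digest'

expected_sha256 = {
  'metadata.gz' => '687792a2d0065ebefe3b95ad38f1df4558a644f0c07819619961dd85f4b7fa7c',
  'data.tar.gz' => 'ec86328550855e0f864daea0f3ab8bbfadf25cc71e217cda7d794c1d3eb52a2b'
}

expected_sha256.each do |name, expected|
  actual = Digest::SHA256.file(name).hexdigest
  status = actual == expected ? 'OK' : "mismatch (got #{actual})"
  puts "#{name}: #{status}"
end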
data/lib/twitter_friendly/rest/api.rb
CHANGED
@@ -9,9 +9,6 @@ require 'twitter_friendly/rest/favorites'
 require 'twitter_friendly/rest/lists'
 require 'twitter_friendly/rest/tweets'
 
-# 後方互換性のために残した
-require 'twitter_friendly/rest/extension/clusters'
-
 module TwitterFriendly
   module REST
     module API
@@ -25,8 +22,6 @@ module TwitterFriendly
       include TwitterFriendly::REST::Favorites
       include TwitterFriendly::REST::Lists
       include TwitterFriendly::REST::Tweets
-
-      include TwitterFriendly::REST::Extension::Clusters
     end
   end
 end
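Taken together, these hunks drop the backward-compatibility require and the include of TwitterFriendly::REST::Extension::Clusters from TwitterFriendly::REST::API, so 1.2.0 no longer exposes the cluster helpers. The sketch below is not part of the gem: it is one hedged way an application upgrading to 1.2.0 could keep using tweet_clusters and friends by vendoring the deleted clusters.rb (shown at the end of this diff) and mixing it back into a client instance. The vendored path and the TwitterFriendly::Client construction are assumptions based on the gem's README.

# Hypothetical workaround for an application upgrading to 1.2.0 that still calls
# the removed cluster helpers. Assumes the deleted clusters.rb has been copied
# into the app (path below is an assumption, not something the gem provides).
require 'twitter_friendly'
require_relative 'lib/twitter_friendly/rest/extension/clusters'

client = TwitterFriendly::Client.new(
  consumer_key:        ENV['CONSUMER_KEY'],
  consumer_secret:     ENV['CONSUMER_SECRET'],
  access_token:        ENV['ACCESS_TOKEN'],
  access_token_secret: ENV['ACCESS_TOKEN_SECRET']
)

# Mix the old extension back into this client instance only; its hashtag/list
# helpers still call the client's own search/list_members methods internally.
client.extend(TwitterFriendly::REST::Extension::Clusters)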
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: twitter_friendly
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.2.0
 platform: ruby
 authors:
 - ts-3156
@@ -188,7 +188,6 @@ files:
 - lib/twitter_friendly/rate_limit.rb
 - lib/twitter_friendly/rest/api.rb
 - lib/twitter_friendly/rest/collector.rb
-- lib/twitter_friendly/rest/extension/clusters.rb
 - lib/twitter_friendly/rest/favorites.rb
 - lib/twitter_friendly/rest/friends_and_followers.rb
 - lib/twitter_friendly/rest/lists.rb
data/lib/twitter_friendly/rest/extension/clusters.rb
DELETED
@@ -1,313 +0,0 @@
-module TwitterFriendly
-  module REST
-    module Extension
-      module Clusters
-
-        PROFILE_SPECIAL_WORDS = %w(20↑ 成人済 腐女子)
-        PROFILE_SPECIAL_REGEXP = nil
-        PROFILE_EXCLUDE_WORDS = %w(in at of my to no er by is RT DM the and for you inc Inc com from info next gmail 好き こと 最近 紹介 連載 発売 依頼 情報 さん ちゃん くん 発言 関係 もの 活動 見解 所属 組織 代表 連絡 大好き サイト ブログ つぶやき 株式会社 最新 こちら 届け お仕事 ツイ 返信 プロ 今年 リプ ヘッダー アイコン アカ アカウント ツイート たま ブロック 無言 時間 お願い お願いします お願いいたします イベント フォロー フォロワー フォロバ スタッフ 自動 手動 迷言 名言 非公式 リリース 問い合わせ ツイッター)
-        PROFILE_EXCLUDE_REGEXP = Regexp.union(/\w+@\w+\.(com|co\.jp)/, %r[\d{2,4}(年|/)\d{1,2}(月|/)\d{1,2}日], %r[\d{1,2}/\d{1,2}], /\d{2}th/, URI.regexp)
-
-        def tweet_clusters(tweets, limit: 10, debug: false)
-          return {} if tweets.blank?
-          text = tweets.map(&:text).join(' ')
-
-          if defined?(Rails)
-            exclude_words = JSON.parse(File.read(Rails.configuration.x.constants['cluster_bad_words_path']))
-            special_words = JSON.parse(File.read(Rails.configuration.x.constants['cluster_good_words_path']))
-          else
-            exclude_words = JSON.parse(File.read('./cluster_bad_words.json'))
-            special_words = JSON.parse(File.read('./cluster_good_words.json'))
-          end
-
-          %w(べたら むっちゃ それとも たしかに さそう そんなに ったことある してるの しそうな おやくま ってますか これをやってるよ のせいか 面白い 可愛い).each { |w| exclude_words << w }
-          %w(面白い 可愛い 食べ物 宇多田ヒカル ご飯 面倒 体調悪くなる 空腹 頑張ってない 眼鏡 台風 沖縄 らんま1/2 女の子 怪我 足のむくみ 彼女欲しい 彼氏欲しい 吐き気 注射 海鮮チヂミ 出勤 価格ドットコム 幹事 雑談 パズドラ ビオフェルミン 餃子 お金 まんだらけ 結婚 焼肉 タッチペン).each { |w| special_words << w }
-
-          # クラスタ用の単語の出現回数を記録
-          frequency =
-            special_words.map { |sw| [sw, text.scan(sw)] }
-              .delete_if { |_, matched| matched.empty? }
-              .each_with_object(Hash.new(0)) { |(word, matched), memo| memo[word] = matched.size }
-
-          # 同一文字種の繰り返しを見付ける。漢字の繰り返し、ひらがなの繰り返し、カタカナの繰り返し、など
-          text.scan(/[一-龠〆ヵヶ々]+|[ぁ-んー~]+|[ァ-ヴー~]+|[a-zA-ZA-Z0-9]+|[、。!!??]+/).
-
-            # 複数回繰り返される文字を除去
-            map { |w| w.remove /[?!?!。、w]|(ー{2,})/ }.
-
-            # 文字数の少なすぎる単語、除外単語を除去する
-            delete_if { |w| w.length <= 2 || exclude_words.include?(w) }.
-
-            # 出現回数を記録
-            each { |w| frequency[w] += 1 }
-
-          # 複数個以上見付かった単語のみを残し、出現頻度順にソート
-          frequency.select { |_, v| 2 < v }.sort_by { |k, v| [-v, -k.size] }.take(limit).to_h
-        end
-
-        def count_freq_hashtags(tweets, with_prefix: true, use_regexp: false, debug: false)
-          puts "tweets: #{tweets.size}" if debug
-          return {} if tweets.blank?
-
-          prefix = %w(# #)
-          regexp = /[##]([A-Za-zA-Za-z_一-鿆0-90-9ぁ-ヶヲ-゚ー]+)/
-
-          tweets =
-            if use_regexp
-              tweets.select { |t| t.text && prefix.any? { |char| t.text.include?(char)} }
-            else
-              tweets.select { |t| include_hashtags?(t) }
-            end
-          puts "tweets with hashtag: #{tweets.size}" if debug
-
-          hashtags =
-            if use_regexp
-              tweets.map { |t| t.text.scan(regexp).flatten.map(&:strip) }
-            else
-              tweets.map { |t| extract_hashtags(t) }
-            end.flatten
-          hashtags = hashtags.map { |h| "#{prefix[0]}#{h}" } if with_prefix
-
-          hashtags.each_with_object(Hash.new(0)) { |h, memo| memo[h] += 1 }.sort_by { |k, v| [-v, -k.size] }.to_h
-        end
-
-        def hashtag_clusters(hashtags, limit: 10, debug: false)
-          puts "hashtags: #{hashtags.take(10)}" if debug
-
-          hashtag, count = hashtags.take(3).each_with_object(Hash.new(0)) do |tag, memo|
-            tweets = search(tag)
-            puts "tweets #{tag}: #{tweets.size}" if debug
-            memo[tag] = count_freq_hashtags(tweets).reject { |t, c| t == tag }.values.sum
-          end.max_by { |_, c| c }
-
-          hashtags = count_freq_hashtags(search(hashtag)).reject { |t, c| t == hashtag }.keys
-          queries = hashtags.take(3).combination(2).map { |ary| ary.join(' AND ') }
-          puts "selected #{hashtag}: #{queries.inspect}" if debug
-
-          tweets = queries.map { |q| search(q) }.flatten
-          puts "tweets #{queries.inspect}: #{tweets.size}" if debug
-
-          if tweets.empty?
-            tweets = search(hashtag)
-            puts "tweets #{hashtag}: #{tweets.size}" if debug
-          end
-
-          members = tweets.map { |t| t.user }
-          puts "members count: #{members.size}" if debug
-
-          count_freq_words(members.map { |m| m.description }, special_words: PROFILE_SPECIAL_WORDS, exclude_words: PROFILE_EXCLUDE_WORDS, special_regexp: PROFILE_SPECIAL_REGEXP, exclude_regexp: PROFILE_EXCLUDE_REGEXP, debug: debug).take(limit)
-        end
-
-        def fetch_lists(user, debug: false)
-          memberships(user, count: 500, call_limit: 2).sort_by { |li| li.member_count }
-        rescue Twitter::Error::ServiceUnavailable => e
-          puts "#{__method__}: #{e.class} #{e.message} #{user.inspect}" if debug
-          []
-        end
-
-        def list_clusters(lists, shrink: false, shrink_limit: 100, list_member: 300, total_member: 3000, total_list: 50, rate: 0.3, limit: 10, debug: false)
-          lists = lists.sort_by { |li| li.member_count }
-          puts "lists: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
-          return {} if lists.empty?
-
-          open('lists.txt', 'w') {|f| f.write lists.map(&:full_name).join("\n") } if debug
-
-          list_special_words = %w()
-          list_exclude_regexp = %r(list[0-9]*|people-ive-faved|twizard-magic-list|my-favstar-fm-list|timeline-list|conversationlist|who-i-met)
-          list_exclude_words = %w(it list people who met)
-
-          # リスト名を - で分割 -> 1文字の単語を除去 -> 出現頻度の降順でソート
-          words = lists.map { |li| li.full_name.split('/')[1] }.
-            select { |n| !n.match(list_exclude_regexp) }.
-            map { |n| n.split('-') }.flatten.
-            delete_if { |w| w.size < 2 || list_exclude_words.include?(w) }.
-            map { |w| SYNONYM_WORDS.has_key?(w) ? SYNONYM_WORDS[w] : w }.
-            each_with_object(Hash.new(0)) { |w, memo| memo[w] += 1 }.
-            sort_by { |k, v| [-v, -k.size] }
-
-          puts "words: #{words.take(10)}" if debug
-          return {} if words.empty?
-
-          # 出現頻度の高い単語を名前に含むリストを抽出
-          _words = []
-          lists =
-            filter(lists, min: 2) do |li, i|
-              _words = words[0..i].map(&:first)
-              name = li.full_name.split('/')[1]
-              _words.any? { |w| name.include?(w) }
-            end
-          puts "lists include #{_words.inspect}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
-          return {} if lists.empty?
-
-          # 中間の 25-75% のリストを抽出
-          while lists.size > shrink_limit
-            percentile25 = ((lists.length * 0.25).ceil) - 1
-            percentile75 = ((lists.length * 0.75).ceil) - 1
-            lists = lists[percentile25..percentile75]
-            puts "lists sliced by 25-75 percentile: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
-          end if shrink || lists.size > shrink_limit
-
-          # メンバー数がしきい値より少ないリストを抽出
-          _list_member = 0
-          _min_list_member = 10 < lists.size ? 10 : 0
-          _lists =
-            filter(lists, min: 2) do |li, i|
-              _list_member = list_member * (1.0 + 0.25 * i)
-              _min_list_member < li.member_count && li.member_count < _list_member
-            end
-          lists = _lists.empty? ? [lists[0]] : _lists
-          puts "lists limited by list member #{_min_list_member}..#{_list_member.round}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
-          return {} if lists.empty?
-
-          # トータルメンバー数がしきい値より少なくなるリストを抽出
-          _lists = []
-          lists.size.times do |i|
-            _lists = lists[0..(-1 - i)]
-            if _lists.map { |li| li.member_count }.sum < total_member
-              break
-            else
-              _lists = []
-            end
-          end
-          lists = _lists.empty? ? [lists[0]] : _lists
-          puts "lists limited by total members #{total_member}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
-          return {} if lists.empty?
-
-          # リスト数がしきい値より少なくなるリストを抽出
-          if lists.size > total_list
-            lists = lists[0..(total_list - 1)]
-          end
-          puts "lists limited by total lists #{total_list}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
-          return {} if lists.empty?
-
-          members = lists.map do |li|
-            begin
-              list_members(li.id)
-            rescue Twitter::Error::NotFound => e
-              puts "#{__method__}: #{e.class} #{e.message} #{li.id} #{li.full_name} #{li.mode}" if debug
-              nil
-            end
-          end.compact.flatten
-          puts "candidate members: #{members.size}" if debug
-          return {} if members.empty?
-
-          open('members.txt', 'w') {|f| f.write members.map{ |m| m.description.gsub(/\R/, ' ') }.join("\n") } if debug
-
-          3.times do
-            _members = members.each_with_object(Hash.new(0)) { |member, memo| memo[member] += 1 }.
-              select { |_, v| lists.size * rate < v }.keys
-            if _members.size > 100
-              members = _members
-              break
-            else
-              rate -= 0.05
-            end
-          end
-          puts "members included multi lists #{rate.round(3)}: #{members.size}" if debug
-
-          count_freq_words(members.map { |m| m.description }, special_words: PROFILE_SPECIAL_WORDS, exclude_words: PROFILE_EXCLUDE_WORDS, special_regexp: PROFILE_SPECIAL_REGEXP, exclude_regexp: PROFILE_EXCLUDE_REGEXP, debug: debug).take(limit)
-        end
-
-        private
-
-        def filter(lists, min:)
-          min = [min, lists.size].min
-          _lists = []
-          3.times do |i|
-            _lists = lists.select { |li| yield(li, i) }
-            break if _lists.size >= min
-          end
-          _lists
-        end
-
-        def count_by_word(texts, delim: nil, tagger: nil, min_length: 2, max_length: 5, special_words: [], exclude_words: [], special_regexp: nil, exclude_regexp: nil)
-          texts = texts.dup
-
-          frequency = Hash.new(0)
-          if special_words.any?
-            texts.each do |text|
-              special_words.map { |sw| [sw, text.scan(sw)] }
-                .delete_if { |_, matched| matched.empty? }
-                .each_with_object(frequency) { |(word, matched), memo| memo[word] += matched.size }
-
-            end
-          end
-
-          if exclude_regexp
-            texts = texts.map { |t| t.remove(exclude_regexp) }
-          end
-
-          if delim
-            texts = texts.map { |t| t.split(delim) }.flatten.map(&:strip)
-          end
-
-          if tagger
-            texts = texts.map { |t| tagger.parse(t).split("\n") }.flatten.
-              select { |line| line.include?('名詞') }.
-              map { |line| line.split("\t")[0] }
-          end
-
-          texts.delete_if { |w| w.empty? || w.size < min_length || max_length < w.size || exclude_words.include?(w) || w.match(/\d{2}/) }.
-            each_with_object(frequency) { |word, memo| memo[word] += 1 }.
-            sort_by { |k, v| [-v, -k.size] }.to_h
-        end
-
-        def count_freq_words(texts, special_words: [], exclude_words: [], special_regexp: nil, exclude_regexp: nil, debug: false)
-          candidates, remains = texts.partition { |desc| desc.scan('/').size > 2 }
-          slash_freq = count_by_word(candidates, delim: '/', exclude_regexp: exclude_regexp)
-          puts "words splitted by /: #{slash_freq.take(10)}" if debug
-
-          candidates, remains = remains.partition { |desc| desc.scan('|').size > 2 }
-          pipe_freq = count_by_word(candidates, delim: '|', exclude_regexp: exclude_regexp)
-          puts "words splitted by |: #{pipe_freq.take(10)}" if debug
-
-          noun_freq = count_by_word(remains, tagger: build_tagger, special_words: special_words, exclude_words: exclude_words, special_regexp: special_regexp, exclude_regexp: exclude_regexp)
-          puts "words tagged as noun: #{noun_freq.take(10)}" if debug
-
-          slash_freq.merge(pipe_freq) { |_, old, neww| old + neww }.
-            merge(noun_freq) { |_, old, neww| old + neww }.sort_by { |k, v| [-v, -k.size] }
-        end
-
-        def build_tagger
-          require 'mecab'
-          MeCab::Tagger.new("-d #{`mecab-config --dicdir`.chomp}/mecab-ipadic-neologd/")
-        rescue => e
-          puts "Add gem 'mecab' to your Gemfile."
-          raise e
-        end
-
-        def include_hashtags?(tweet)
-          tweet.entities&.hashtags&.any?
-        end
-
-        def extract_hashtags(tweet)
-          tweet.entities.hashtags.map { |h| h.text }
-        end
-
-        SYNONYM_WORDS = (
-          %w(cosplay cosplayer cosplayers coser cos こすぷれ コスプレ レイヤ レイヤー コスプレイヤー レイヤーさん).map { |w| [w, 'coplay'] } +
-          %w(tsukuba tkb).map { |w| [w, 'tsukuba'] } +
-          %w(waseda 早稲田 早稲田大学).map { |w| [w, 'waseda'] } +
-          %w(keio 慶應 慶應義塾).map { |w| [w, 'keio'] } +
-          %w(gakusai gakuensai 学祭 学園祭).map { |w| [w, 'gakusai'] } +
-          %w(kosen kousen).map { |w| [w, 'kosen'] } +
-          %w(anime アニメ).map { |w| [w, 'anime'] } +
-          %w(photo photos).map { |w| [w, 'photo'] } +
-          %w(creator creater クリエイター).map { |w| [w, 'creator'] } +
-          %w(illustrator illustrater 絵師).map { |w| [w, 'illustrator'] } +
-          %w(artist art artists アート 芸術).map { |w| [w, 'artist'] } +
-          %w(design デザイン).map { |w| [w, 'design'] } +
-          %w(kawaii かわいい).map { |w| [w, 'kawaii'] } +
-          %w(idol あいどる アイドル 美人).map { |w| [w, 'idol'] } +
-          %w(music musician musicians dj netlabel label レーベル おんがく 音楽家 音楽).map { |w| [w, 'music'] } +
-          %w(engineer engineers engineering えんじにあ tech 技術 技術系 hacker coder programming programer programmer geek rubyist ruby scala java lisp).map { |w| [w, 'engineer'] } +
-          %w(internet インターネット).map { |w| [w, 'internet'] }
-        ).to_h
-
-        def normalize_synonym(words)
-          words.map { |w| SYNONYM_WORDS.has_key?(w) ? SYNONYM_WORDS[w] : w }
-        end
-      end
-    end
-  end
-end
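For readers skimming the removed file: nearly every helper above reduces to the same idiom of counting occurrences into Hash.new(0) via each_with_object and then sorting by [-count, -length], so the most frequent (and, among ties, longest) terms come first. A minimal standalone sketch of that pattern, with invented sample data:

# Minimal illustration of the counting-and-ranking idiom used by tweet_clusters,
# count_freq_hashtags and count_by_word in the deleted clusters.rb.
words = %w(ruby rails ruby gem ruby rails rspec)

frequency = words.each_with_object(Hash.new(0)) { |w, memo| memo[w] += 1 }
ranked = frequency.sort_by { |word, count| [-count, -word.size] }.to_h

p ranked # ranked by frequency, then by word length: ruby, rails, rspec, gem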