twitter_friendly 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5611cd772b52ea39a00bb73218c52495d4934d3994fb5f471e37f8f9dc81cccc
4
- data.tar.gz: 9e6a9094fe7f541d30de5281fa812fb273956645044acd69b755963f4235fe95
3
+ metadata.gz: 687792a2d0065ebefe3b95ad38f1df4558a644f0c07819619961dd85f4b7fa7c
4
+ data.tar.gz: ec86328550855e0f864daea0f3ab8bbfadf25cc71e217cda7d794c1d3eb52a2b
5
5
  SHA512:
6
- metadata.gz: aa33d8279f915b94f5e43e96f2ce09ec32d0b019e18fbe214f207f403bbfcde773b45627d546f6177b7735a6f234954eb616f45f79b808f7322e0de775dad196
7
- data.tar.gz: 848f0e69305824b7cf7d3fbe219fbd278957c9968ad619ec7e648ed641d60174f3475294a20b185a877de35de80dcfee6cf3c186bae6ff8e839e9f54d9f0ec2c
6
+ metadata.gz: b3029c832c6da171bedcd7a9a82e2c211e2be912913deed15d6a32ff4d2884f18d4b990ec8f902f4e97dcd684815a59c13a7f0f5e95e6134dc2043e343d43430
7
+ data.tar.gz: 36365cf299fab85c22859303652963185700c967023a558b7f16f1b289b6b563ccc32ed6f014d48d8383bf0beb3438c102fe9d43e0b34e42c964cec57f650db5
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitter_friendly (1.1.0)
4
+ twitter_friendly (1.2.0)
5
5
  activesupport (>= 4.2, < 6.0)
6
6
  oj (~> 3.7.6)
7
7
  parallel (~> 1.12.1)
@@ -9,9 +9,6 @@ require 'twitter_friendly/rest/favorites'
9
9
  require 'twitter_friendly/rest/lists'
10
10
  require 'twitter_friendly/rest/tweets'
11
11
 
12
- # 後方互換性のために残した
13
- require 'twitter_friendly/rest/extension/clusters'
14
-
15
12
  module TwitterFriendly
16
13
  module REST
17
14
  module API
@@ -25,8 +22,6 @@ module TwitterFriendly
25
22
  include TwitterFriendly::REST::Favorites
26
23
  include TwitterFriendly::REST::Lists
27
24
  include TwitterFriendly::REST::Tweets
28
-
29
- include TwitterFriendly::REST::Extension::Clusters
30
25
  end
31
26
  end
32
27
  end
@@ -1,3 +1,3 @@
1
1
  module TwitterFriendly
2
- VERSION = "1.1.0"
2
+ VERSION = "1.2.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter_friendly
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156
@@ -188,7 +188,6 @@ files:
188
188
  - lib/twitter_friendly/rate_limit.rb
189
189
  - lib/twitter_friendly/rest/api.rb
190
190
  - lib/twitter_friendly/rest/collector.rb
191
- - lib/twitter_friendly/rest/extension/clusters.rb
192
191
  - lib/twitter_friendly/rest/favorites.rb
193
192
  - lib/twitter_friendly/rest/friends_and_followers.rb
194
193
  - lib/twitter_friendly/rest/lists.rb
@@ -1,313 +0,0 @@
1
- module TwitterFriendly
2
- module REST
3
- module Extension
4
- module Clusters
5
-
6
- PROFILE_SPECIAL_WORDS = %w(20↑ 成人済 腐女子)
7
- PROFILE_SPECIAL_REGEXP = nil
8
- PROFILE_EXCLUDE_WORDS = %w(in at of my to no er by is RT DM the and for you inc Inc com from info next gmail 好き こと 最近 紹介 連載 発売 依頼 情報 さん ちゃん くん 発言 関係 もの 活動 見解 所属 組織 代表 連絡 大好き サイト ブログ つぶやき 株式会社 最新 こちら 届け お仕事 ツイ 返信 プロ 今年 リプ ヘッダー アイコン アカ アカウント ツイート たま ブロック 無言 時間 お願い お願いします お願いいたします イベント フォロー フォロワー フォロバ スタッフ 自動 手動 迷言 名言 非公式 リリース 問い合わせ ツイッター)
9
- PROFILE_EXCLUDE_REGEXP = Regexp.union(/\w+@\w+\.(com|co\.jp)/, %r[\d{2,4}(年|/)\d{1,2}(月|/)\d{1,2}日], %r[\d{1,2}/\d{1,2}], /\d{2}th/, URI.regexp)
10
-
11
- def tweet_clusters(tweets, limit: 10, debug: false)
12
- return {} if tweets.blank?
13
- text = tweets.map(&:text).join(' ')
14
-
15
- if defined?(Rails)
16
- exclude_words = JSON.parse(File.read(Rails.configuration.x.constants['cluster_bad_words_path']))
17
- special_words = JSON.parse(File.read(Rails.configuration.x.constants['cluster_good_words_path']))
18
- else
19
- exclude_words = JSON.parse(File.read('./cluster_bad_words.json'))
20
- special_words = JSON.parse(File.read('./cluster_good_words.json'))
21
- end
22
-
23
- %w(べたら むっちゃ それとも たしかに さそう そんなに ったことある してるの しそうな おやくま ってますか これをやってるよ のせいか 面白い 可愛い).each { |w| exclude_words << w }
24
- %w(面白い 可愛い 食べ物 宇多田ヒカル ご飯 面倒 体調悪くなる 空腹 頑張ってない 眼鏡 台風 沖縄 らんま1/2 女の子 怪我 足のむくみ 彼女欲しい 彼氏欲しい 吐き気 注射 海鮮チヂミ 出勤 価格ドットコム 幹事 雑談 パズドラ ビオフェルミン 餃子 お金 まんだらけ 結婚 焼肉 タッチペン).each { |w| special_words << w }
25
-
26
- # クラスタ用の単語の出現回数を記録
27
- frequency =
28
- special_words.map { |sw| [sw, text.scan(sw)] }
29
- .delete_if { |_, matched| matched.empty? }
30
- .each_with_object(Hash.new(0)) { |(word, matched), memo| memo[word] = matched.size }
31
-
32
- # 同一文字種の繰り返しを見付ける。漢字の繰り返し、ひらがなの繰り返し、カタカナの繰り返し、など
33
- text.scan(/[一-龠〆ヵヶ々]+|[ぁ-んー~]+|[ァ-ヴー~]+|[a-zA-ZA-Z0-9]+|[、。!!??]+/).
34
-
35
- # 複数回繰り返される文字を除去
36
- map { |w| w.remove /[?!?!。、w]|(ー{2,})/ }.
37
-
38
- # 文字数の少なすぎる単語、除外単語を除去する
39
- delete_if { |w| w.length <= 2 || exclude_words.include?(w) }.
40
-
41
- # 出現回数を記録
42
- each { |w| frequency[w] += 1 }
43
-
44
- # 複数個以上見付かった単語のみを残し、出現頻度順にソート
45
- frequency.select { |_, v| 2 < v }.sort_by { |k, v| [-v, -k.size] }.take(limit).to_h
46
- end
47
-
48
- def count_freq_hashtags(tweets, with_prefix: true, use_regexp: false, debug: false)
49
- puts "tweets: #{tweets.size}" if debug
50
- return {} if tweets.blank?
51
-
52
- prefix = %w(# #)
53
- regexp = /[##]([A-Za-zA-Za-z_一-鿆0-90-9ぁ-ヶヲ-゚ー]+)/
54
-
55
- tweets =
56
- if use_regexp
57
- tweets.select { |t| t.text && prefix.any? { |char| t.text.include?(char)} }
58
- else
59
- tweets.select { |t| include_hashtags?(t) }
60
- end
61
- puts "tweets with hashtag: #{tweets.size}" if debug
62
-
63
- hashtags =
64
- if use_regexp
65
- tweets.map { |t| t.text.scan(regexp).flatten.map(&:strip) }
66
- else
67
- tweets.map { |t| extract_hashtags(t) }
68
- end.flatten
69
- hashtags = hashtags.map { |h| "#{prefix[0]}#{h}" } if with_prefix
70
-
71
- hashtags.each_with_object(Hash.new(0)) { |h, memo| memo[h] += 1 }.sort_by { |k, v| [-v, -k.size] }.to_h
72
- end
73
-
74
- def hashtag_clusters(hashtags, limit: 10, debug: false)
75
- puts "hashtags: #{hashtags.take(10)}" if debug
76
-
77
- hashtag, count = hashtags.take(3).each_with_object(Hash.new(0)) do |tag, memo|
78
- tweets = search(tag)
79
- puts "tweets #{tag}: #{tweets.size}" if debug
80
- memo[tag] = count_freq_hashtags(tweets).reject { |t, c| t == tag }.values.sum
81
- end.max_by { |_, c| c }
82
-
83
- hashtags = count_freq_hashtags(search(hashtag)).reject { |t, c| t == hashtag }.keys
84
- queries = hashtags.take(3).combination(2).map { |ary| ary.join(' AND ') }
85
- puts "selected #{hashtag}: #{queries.inspect}" if debug
86
-
87
- tweets = queries.map { |q| search(q) }.flatten
88
- puts "tweets #{queries.inspect}: #{tweets.size}" if debug
89
-
90
- if tweets.empty?
91
- tweets = search(hashtag)
92
- puts "tweets #{hashtag}: #{tweets.size}" if debug
93
- end
94
-
95
- members = tweets.map { |t| t.user }
96
- puts "members count: #{members.size}" if debug
97
-
98
- count_freq_words(members.map { |m| m.description }, special_words: PROFILE_SPECIAL_WORDS, exclude_words: PROFILE_EXCLUDE_WORDS, special_regexp: PROFILE_SPECIAL_REGEXP, exclude_regexp: PROFILE_EXCLUDE_REGEXP, debug: debug).take(limit)
99
- end
100
-
101
- def fetch_lists(user, debug: false)
102
- memberships(user, count: 500, call_limit: 2).sort_by { |li| li.member_count }
103
- rescue Twitter::Error::ServiceUnavailable => e
104
- puts "#{__method__}: #{e.class} #{e.message} #{user.inspect}" if debug
105
- []
106
- end
107
-
108
- def list_clusters(lists, shrink: false, shrink_limit: 100, list_member: 300, total_member: 3000, total_list: 50, rate: 0.3, limit: 10, debug: false)
109
- lists = lists.sort_by { |li| li.member_count }
110
- puts "lists: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
111
- return {} if lists.empty?
112
-
113
- open('lists.txt', 'w') {|f| f.write lists.map(&:full_name).join("\n") } if debug
114
-
115
- list_special_words = %w()
116
- list_exclude_regexp = %r(list[0-9]*|people-ive-faved|twizard-magic-list|my-favstar-fm-list|timeline-list|conversationlist|who-i-met)
117
- list_exclude_words = %w(it list people who met)
118
-
119
- # リスト名を - で分割 -> 1文字の単語を除去 -> 出現頻度の降順でソート
120
- words = lists.map { |li| li.full_name.split('/')[1] }.
121
- select { |n| !n.match(list_exclude_regexp) }.
122
- map { |n| n.split('-') }.flatten.
123
- delete_if { |w| w.size < 2 || list_exclude_words.include?(w) }.
124
- map { |w| SYNONYM_WORDS.has_key?(w) ? SYNONYM_WORDS[w] : w }.
125
- each_with_object(Hash.new(0)) { |w, memo| memo[w] += 1 }.
126
- sort_by { |k, v| [-v, -k.size] }
127
-
128
- puts "words: #{words.take(10)}" if debug
129
- return {} if words.empty?
130
-
131
- # 出現頻度の高い単語を名前に含むリストを抽出
132
- _words = []
133
- lists =
134
- filter(lists, min: 2) do |li, i|
135
- _words = words[0..i].map(&:first)
136
- name = li.full_name.split('/')[1]
137
- _words.any? { |w| name.include?(w) }
138
- end
139
- puts "lists include #{_words.inspect}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
140
- return {} if lists.empty?
141
-
142
- # 中間の 25-75% のリストを抽出
143
- while lists.size > shrink_limit
144
- percentile25 = ((lists.length * 0.25).ceil) - 1
145
- percentile75 = ((lists.length * 0.75).ceil) - 1
146
- lists = lists[percentile25..percentile75]
147
- puts "lists sliced by 25-75 percentile: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
148
- end if shrink || lists.size > shrink_limit
149
-
150
- # メンバー数がしきい値より少ないリストを抽出
151
- _list_member = 0
152
- _min_list_member = 10 < lists.size ? 10 : 0
153
- _lists =
154
- filter(lists, min: 2) do |li, i|
155
- _list_member = list_member * (1.0 + 0.25 * i)
156
- _min_list_member < li.member_count && li.member_count < _list_member
157
- end
158
- lists = _lists.empty? ? [lists[0]] : _lists
159
- puts "lists limited by list member #{_min_list_member}..#{_list_member.round}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
160
- return {} if lists.empty?
161
-
162
- # トータルメンバー数がしきい値より少なくなるリストを抽出
163
- _lists = []
164
- lists.size.times do |i|
165
- _lists = lists[0..(-1 - i)]
166
- if _lists.map { |li| li.member_count }.sum < total_member
167
- break
168
- else
169
- _lists = []
170
- end
171
- end
172
- lists = _lists.empty? ? [lists[0]] : _lists
173
- puts "lists limited by total members #{total_member}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
174
- return {} if lists.empty?
175
-
176
- # リスト数がしきい値より少なくなるリストを抽出
177
- if lists.size > total_list
178
- lists = lists[0..(total_list - 1)]
179
- end
180
- puts "lists limited by total lists #{total_list}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
181
- return {} if lists.empty?
182
-
183
- members = lists.map do |li|
184
- begin
185
- list_members(li.id)
186
- rescue Twitter::Error::NotFound => e
187
- puts "#{__method__}: #{e.class} #{e.message} #{li.id} #{li.full_name} #{li.mode}" if debug
188
- nil
189
- end
190
- end.compact.flatten
191
- puts "candidate members: #{members.size}" if debug
192
- return {} if members.empty?
193
-
194
- open('members.txt', 'w') {|f| f.write members.map{ |m| m.description.gsub(/\R/, ' ') }.join("\n") } if debug
195
-
196
- 3.times do
197
- _members = members.each_with_object(Hash.new(0)) { |member, memo| memo[member] += 1 }.
198
- select { |_, v| lists.size * rate < v }.keys
199
- if _members.size > 100
200
- members = _members
201
- break
202
- else
203
- rate -= 0.05
204
- end
205
- end
206
- puts "members included multi lists #{rate.round(3)}: #{members.size}" if debug
207
-
208
- count_freq_words(members.map { |m| m.description }, special_words: PROFILE_SPECIAL_WORDS, exclude_words: PROFILE_EXCLUDE_WORDS, special_regexp: PROFILE_SPECIAL_REGEXP, exclude_regexp: PROFILE_EXCLUDE_REGEXP, debug: debug).take(limit)
209
- end
210
-
211
- private
212
-
213
- def filter(lists, min:)
214
- min = [min, lists.size].min
215
- _lists = []
216
- 3.times do |i|
217
- _lists = lists.select { |li| yield(li, i) }
218
- break if _lists.size >= min
219
- end
220
- _lists
221
- end
222
-
223
- def count_by_word(texts, delim: nil, tagger: nil, min_length: 2, max_length: 5, special_words: [], exclude_words: [], special_regexp: nil, exclude_regexp: nil)
224
- texts = texts.dup
225
-
226
- frequency = Hash.new(0)
227
- if special_words.any?
228
- texts.each do |text|
229
- special_words.map { |sw| [sw, text.scan(sw)] }
230
- .delete_if { |_, matched| matched.empty? }
231
- .each_with_object(frequency) { |(word, matched), memo| memo[word] += matched.size }
232
-
233
- end
234
- end
235
-
236
- if exclude_regexp
237
- texts = texts.map { |t| t.remove(exclude_regexp) }
238
- end
239
-
240
- if delim
241
- texts = texts.map { |t| t.split(delim) }.flatten.map(&:strip)
242
- end
243
-
244
- if tagger
245
- texts = texts.map { |t| tagger.parse(t).split("\n") }.flatten.
246
- select { |line| line.include?('名詞') }.
247
- map { |line| line.split("\t")[0] }
248
- end
249
-
250
- texts.delete_if { |w| w.empty? || w.size < min_length || max_length < w.size || exclude_words.include?(w) || w.match(/\d{2}/) }.
251
- each_with_object(frequency) { |word, memo| memo[word] += 1 }.
252
- sort_by { |k, v| [-v, -k.size] }.to_h
253
- end
254
-
255
- def count_freq_words(texts, special_words: [], exclude_words: [], special_regexp: nil, exclude_regexp: nil, debug: false)
256
- candidates, remains = texts.partition { |desc| desc.scan('/').size > 2 }
257
- slash_freq = count_by_word(candidates, delim: '/', exclude_regexp: exclude_regexp)
258
- puts "words splitted by /: #{slash_freq.take(10)}" if debug
259
-
260
- candidates, remains = remains.partition { |desc| desc.scan('|').size > 2 }
261
- pipe_freq = count_by_word(candidates, delim: '|', exclude_regexp: exclude_regexp)
262
- puts "words splitted by |: #{pipe_freq.take(10)}" if debug
263
-
264
- noun_freq = count_by_word(remains, tagger: build_tagger, special_words: special_words, exclude_words: exclude_words, special_regexp: special_regexp, exclude_regexp: exclude_regexp)
265
- puts "words tagged as noun: #{noun_freq.take(10)}" if debug
266
-
267
- slash_freq.merge(pipe_freq) { |_, old, neww| old + neww }.
268
- merge(noun_freq) { |_, old, neww| old + neww }.sort_by { |k, v| [-v, -k.size] }
269
- end
270
-
271
- def build_tagger
272
- require 'mecab'
273
- MeCab::Tagger.new("-d #{`mecab-config --dicdir`.chomp}/mecab-ipadic-neologd/")
274
- rescue => e
275
- puts "Add gem 'mecab' to your Gemfile."
276
- raise e
277
- end
278
-
279
- def include_hashtags?(tweet)
280
- tweet.entities&.hashtags&.any?
281
- end
282
-
283
- def extract_hashtags(tweet)
284
- tweet.entities.hashtags.map { |h| h.text }
285
- end
286
-
287
- SYNONYM_WORDS = (
288
- %w(cosplay cosplayer cosplayers coser cos こすぷれ コスプレ レイヤ レイヤー コスプレイヤー レイヤーさん).map { |w| [w, 'coplay'] } +
289
- %w(tsukuba tkb).map { |w| [w, 'tsukuba'] } +
290
- %w(waseda 早稲田 早稲田大学).map { |w| [w, 'waseda'] } +
291
- %w(keio 慶應 慶應義塾).map { |w| [w, 'keio'] } +
292
- %w(gakusai gakuensai 学祭 学園祭).map { |w| [w, 'gakusai'] } +
293
- %w(kosen kousen).map { |w| [w, 'kosen'] } +
294
- %w(anime アニメ).map { |w| [w, 'anime'] } +
295
- %w(photo photos).map { |w| [w, 'photo'] } +
296
- %w(creator creater クリエイター).map { |w| [w, 'creator'] } +
297
- %w(illustrator illustrater 絵師).map { |w| [w, 'illustrator'] } +
298
- %w(artist art artists アート 芸術).map { |w| [w, 'artist'] } +
299
- %w(design デザイン).map { |w| [w, 'design'] } +
300
- %w(kawaii かわいい).map { |w| [w, 'kawaii'] } +
301
- %w(idol あいどる アイドル 美人).map { |w| [w, 'idol'] } +
302
- %w(music musician musicians dj netlabel label レーベル おんがく 音楽家 音楽).map { |w| [w, 'music'] } +
303
- %w(engineer engineers engineering えんじにあ tech 技術 技術系 hacker coder programming programer programmer geek rubyist ruby scala java lisp).map { |w| [w, 'engineer'] } +
304
- %w(internet インターネット).map { |w| [w, 'internet'] }
305
- ).to_h
306
-
307
- def normalize_synonym(words)
308
- words.map { |w| SYNONYM_WORDS.has_key?(w) ? SYNONYM_WORDS[w] : w }
309
- end
310
- end
311
- end
312
- end
313
- end