twitter_with_auto_pagination 0.8.11 → 0.8.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 94ba1fa595a712ad5d4fb6fa2239286878c79b6e
4
- data.tar.gz: 36248bb1d1ee3fc6d305fba03619291a9adbead5
3
+ metadata.gz: 1a5ffffb7a3903265ee83d5340491193424cf6b2
4
+ data.tar.gz: df27d9bc220d90ad97c28911ed67960233ba63dd
5
5
  SHA512:
6
- metadata.gz: b6c23ac68807dbb281b8ee499476fb8167b06c5e5c2ae538fa0209e75e596fa65c939c48d56d1642d5058c5f84315cc568d770274b81f8dd40a45849ef746497
7
- data.tar.gz: 132365a7584511b9772967d5bd01bb877a906b96d1ff169bedcf5e0e5c3ec75cc879d580f2bcd10189bc6bdd9fd631ac0763c5309f54046c9c92d6b4e12a26cd
6
+ metadata.gz: 978c61c49f0fc373584cc3284483a3c775a0035499e2ba49b6271a5e78b8ea163ecc1a6438b682d540ee1e691f4461a78cbc813a2235869b619655777af74496
7
+ data.tar.gz: f6956150c2825aeae989067263e1eebf5d51df0dece18767239fedfa2b3970b899cfc51569f99f4e2f0fe671df7712081445636510633dd14d903a6c519db9e0
@@ -6,7 +6,12 @@ module TwitterWithAutoPagination
6
6
  module Clusters
7
7
  include TwitterWithAutoPagination::REST::Utils
8
8
 
9
- def tweet_clusters(tweets, limit: 10)
9
+ PROFILE_SPECIAL_WORDS = %w(20↑ 成人済 腐女子)
10
+ PROFILE_SPECIAL_REGEXP = nil
11
+ PROFILE_EXCLUDE_WORDS = %w(in at of my to no er by is RT DM the and for you inc Inc com from info next gmail 好き こと 最近 紹介 連載 発売 依頼 情報 さん ちゃん くん 発言 関係 もの 活動 見解 所属 組織 代表 連絡 大好き サイト ブログ つぶやき 株式会社 最新 こちら 届け お仕事 ツイ 返信 プロ 今年 リプ ヘッダー アイコン アカ アカウント ツイート たま ブロック 無言 時間 お願い お願いします お願いいたします イベント フォロー フォロワー フォロバ スタッフ 自動 手動 迷言 名言 非公式 リリース 問い合わせ ツイッター)
12
+ PROFILE_EXCLUDE_REGEXP = Regexp.union(/\w+@\w+\.(com|co\.jp)/, %r[\d{2,4}(年|/)\d{1,2}(月|/)\d{1,2}日], %r[\d{1,2}/\d{1,2}], /\d{2}th/, URI.regexp)
13
+
14
+ def tweet_clusters(tweets, limit: 10, debug: false)
10
15
  return {} if tweets.blank?
11
16
  text = tweets.map(&:text).join(' ')
12
17
 
@@ -18,7 +23,7 @@ module TwitterWithAutoPagination
18
23
  special_words = JSON.parse(File.read('./cluster_good_words.json'))
19
24
  end
20
25
 
21
- %w(べたら それとも たしかに さそう そんなに ったことある してるの しそうな おやくま ってますか これをやってるよ のせいか).each { |w| exclude_words << w }
26
+ %w(べたら むっちゃ それとも たしかに さそう そんなに ったことある してるの しそうな おやくま ってますか これをやってるよ のせいか 面白い 可愛い).each { |w| exclude_words << w }
22
27
  %w(面白い 可愛い 食べ物 宇多田ヒカル ご飯 面倒 体調悪くなる 空腹 頑張ってない 眼鏡 台風 沖縄 らんま1/2 女の子 怪我 足のむくみ 彼女欲しい 彼氏欲しい 吐き気 注射 海鮮チヂミ 出勤 価格ドットコム 幹事 雑談 パズドラ ビオフェルミン 餃子 お金 まんだらけ 結婚 焼肉 タッチペン).each { |w| special_words << w }
23
28
 
24
29
  # クラスタ用の単語の出現回数を記録
@@ -40,54 +45,90 @@ module TwitterWithAutoPagination
40
45
  each { |w| frequency[w] += 1 }
41
46
 
42
47
  # 複数個以上見付かった単語のみを残し、出現頻度順にソート
43
- frequency.select { |_, v| 2 < v }.sort_by { |k, v| [-v, -k.size] }.slice(0, limit).to_h
48
+ frequency.select { |_, v| 2 < v }.sort_by { |k, v| [-v, -k.size] }.take(limit).to_h
44
49
  end
45
50
 
46
- def hashtag_clusters(tweets, limit: 10, debug: false)
51
+ def count_freq_hashtags(tweets, with_prefix: true, use_regexp: false, debug: false)
47
52
  puts "tweets: #{tweets.size}" if debug
48
53
  return {} if tweets.blank?
49
54
 
50
- tweets = tweets.select { |t| t.text && t.text.include?('#') }
55
+ prefix = %w(#)
56
+ regexp = /[##]([A-Za-zA-Za-z_一-鿆0-90-9ぁ-ヶヲ-゚ー]+)/
57
+
58
+ tweets =
59
+ if use_regexp
60
+ tweets.select { |t| t.text && prefix.any? { |char| t.text.include?(char)} }
61
+ else
62
+ tweets.select { |t| include_hashtags?(t) }
63
+ end
51
64
  puts "tweets with hashtag: #{tweets.size}" if debug
52
65
 
53
- hashtags = tweets.map { |t| t.text.scan(/[##][A-Za-zA-Za-z_一-鿆0-90-9ぁ-ヶヲ-゚ー]+/).map(&:strip) }.flatten
54
- puts "hashtags: #{hashtags.size}" if debug
66
+ hashtags =
67
+ if use_regexp
68
+ tweets.map { |t| t.text.scan(regexp).flatten.map(&:strip) }
69
+ else
70
+ tweets.map { |t| extract_hashtags(t) }
71
+ end.flatten
72
+ hashtags = hashtags.map { |h| "#{prefix[0]}#{h}" } if with_prefix
55
73
 
56
- hashtags.each_with_object(Hash.new(0)) { |h, memo| memo[h] += 1 }.sort_by { |k, v| [-v, -k.size] }.slice(0, limit).to_h
74
+ hashtags.each_with_object(Hash.new(0)) { |h, memo| memo[h] += 1 }.sort_by { |k, v| [-v, -k.size] }.to_h
57
75
  end
58
76
 
59
- def list_clusters(user, shrink: false, shrink_limit: 100, list_member: 300, total_member: 3000, total_list: 50, rate: 0.3, limit: 10, debug: false)
60
- begin
61
- require 'mecab'
62
- rescue => e
63
- puts "Add gem 'mecab' to your Gemfile."
64
- return nil
65
- end
77
+ def hashtag_clusters(hashtags, limit: 10, debug: false)
78
+ puts "hashtags: #{hashtags.take(10)}" if debug
66
79
 
67
- begin
68
- lists = memberships(user, count: 500, call_limit: 2).sort_by { |li| li.member_count }
69
- rescue Twitter::Error::ServiceUnavailable => e
70
- puts "#{__method__}: #{e.class} #{e.message} #{user.inspect}" if debug
71
- lists = []
80
+ hashtag, count = hashtags.take(3).each_with_object(Hash.new(0)) do |tag, memo|
81
+ tweets = search(tag)
82
+ puts "tweets #{tag}: #{tweets.size}" if debug
83
+ memo[tag] = count_freq_hashtags(tweets).reject { |t, c| t == tag }.values.sum
84
+ end.max_by { |_, c| c }
85
+
86
+ hashtags = count_freq_hashtags(search(hashtag)).reject { |t, c| t == hashtag }.keys
87
+ queries = hashtags.take(3).combination(2).map { |ary| ary.join(' AND ') }
88
+ puts "selected #{hashtag}: #{queries.inspect}" if debug
89
+
90
+ tweets = queries.map { |q| search(q) }.flatten
91
+ puts "tweets #{queries.inspect}: #{tweets.size}" if debug
92
+
93
+ if tweets.empty?
94
+ tweets = search(hashtag)
95
+ puts "tweets #{hashtag}: #{tweets.size}" if debug
72
96
  end
97
+
98
+ members = tweets.map { |t| t.user }
99
+ puts "members count: #{members.size}" if debug
100
+
101
+ count_freq_words(members.map { |m| m.description }, special_words: PROFILE_SPECIAL_WORDS, exclude_words: PROFILE_EXCLUDE_WORDS, special_regexp: PROFILE_SPECIAL_REGEXP, exclude_regexp: PROFILE_EXCLUDE_REGEXP, debug: debug).take(limit)
102
+ end
103
+
104
+ def fetch_lists(user, debug: false)
105
+ memberships(user, count: 500, call_limit: 2).sort_by { |li| li.member_count }
106
+ rescue Twitter::Error::ServiceUnavailable => e
107
+ puts "#{__method__}: #{e.class} #{e.message} #{user.inspect}" if debug
108
+ []
109
+ end
110
+
111
+ def list_clusters(lists, shrink: false, shrink_limit: 100, list_member: 300, total_member: 3000, total_list: 50, rate: 0.3, limit: 10, debug: false)
112
+ lists = lists.sort_by { |li| li.member_count }
73
113
  puts "lists: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
74
114
  return {} if lists.empty?
75
115
 
76
116
  open('lists.txt', 'w') {|f| f.write lists.map(&:full_name).join("\n") } if debug
77
117
 
78
118
  list_special_words = %w()
79
- list_exclude_names = %r(list[0-9]*|people-ive-faved|twizard-magic-list|my-favstar-fm-list|timeline-list|conversationlist|who-i-met)
119
+ list_exclude_regexp = %r(list[0-9]*|people-ive-faved|twizard-magic-list|my-favstar-fm-list|timeline-list|conversationlist|who-i-met)
80
120
  list_exclude_words = %w(it list people who met)
81
121
 
82
122
  # リスト名を - で分割 -> 1文字の単語を除去 -> 出現頻度の降順でソート
83
123
  words = lists.map { |li| li.full_name.split('/')[1] }.
84
- select { |n| !n.match(list_exclude_names) }.
124
+ select { |n| !n.match(list_exclude_regexp) }.
85
125
  map { |n| n.split('-') }.flatten.
86
126
  delete_if { |w| w.size < 2 || list_exclude_words.include?(w) }.
127
+ map { |w| SYNONYM_WORDS.has_key?(w) ? SYNONYM_WORDS[w] : w }.
87
128
  each_with_object(Hash.new(0)) { |w, memo| memo[w] += 1 }.
88
129
  sort_by { |k, v| [-v, -k.size] }
89
130
 
90
- puts "words: #{words.slice(0, 10)}" if debug
131
+ puts "words: #{words.take(10)}" if debug
91
132
  return {} if words.empty?
92
133
 
93
134
  # 出現頻度の高い単語を名前に含むリストを抽出
@@ -112,11 +153,12 @@ module TwitterWithAutoPagination
112
153
  # メンバー数がしきい値より少ないリストを抽出
113
154
  _list_member = 0
114
155
  _min_list_member = 10 < lists.size ? 10 : 0
115
- lists =
156
+ _lists =
116
157
  filter(lists, min: 2) do |li, i|
117
158
  _list_member = list_member * (1.0 + 0.25 * i)
118
159
  _min_list_member < li.member_count && li.member_count < _list_member
119
160
  end
161
+ lists = _lists.empty? ? [lists[0]] : _lists
120
162
  puts "lists limited by list member #{_min_list_member}..#{_list_member.round}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
121
163
  return {} if lists.empty?
122
164
 
@@ -166,25 +208,7 @@ module TwitterWithAutoPagination
166
208
  end
167
209
  puts "members included multi lists #{rate.round(3)}: #{members.size}" if debug
168
210
 
169
-
170
- profile_special_words = %w()
171
- profile_exclude_words = %w(in at of my no er the and for inc Inc com info gmail 好き こと 最近 連載 発売 依頼 情報 さん ちゃん くん 発言 関係 もの 活動 見解 所属 組織 代表 連絡 大好き サイト ブログ つぶやき 株式会社 こちら 届け お仕事 アカ アカウント ツイート たま ブロック 時間 お願い お願いします お願いいたします イベント フォロー)
172
-
173
- descriptions = members.map { |m| m.description.remove(URI.regexp) }
174
-
175
- candidates, remains = descriptions.partition { |desc| desc.scan('/').size > 2 }
176
- slash_freq = count_by_word(candidates, delim: '/')
177
- puts "words splitted by /: #{slash_freq.to_a.slice(0, 10)}" if debug
178
-
179
- candidates, remains = remains.partition { |desc| desc.scan('|').size > 2 }
180
- pipe_freq = count_by_word(candidates, delim: '|')
181
- puts "words splitted by |: #{pipe_freq.to_a.slice(0, 10)}" if debug
182
-
183
- tagger = MeCab::Tagger.new("-d #{`mecab-config --dicdir`.chomp}/mecab-ipadic-neologd/")
184
- noun_freq = count_by_word(remains, tagger: tagger, exclude_words: profile_exclude_words)
185
- puts "words tagged as noun: #{noun_freq.to_a.slice(0, 10)}" if debug
186
-
187
- slash_freq.merge(pipe_freq) { |_, old, neww| old + neww }.merge(noun_freq) { |_, old, neww| old + neww }.sort_by { |k, v| [-v, -k.size] }.slice(0, limit)
211
+ count_freq_words(members.map { |m| m.description }, special_words: PROFILE_SPECIAL_WORDS, exclude_words: PROFILE_EXCLUDE_WORDS, special_regexp: PROFILE_SPECIAL_REGEXP, exclude_regexp: PROFILE_EXCLUDE_REGEXP, debug: debug).take(limit)
188
212
  end
189
213
 
190
214
  private
@@ -199,23 +223,93 @@ module TwitterWithAutoPagination
199
223
  _lists
200
224
  end
201
225
 
202
- def count_by_word(texts, delim: nil, tagger: nil, exclude_words: [])
226
+ def count_by_word(texts, delim: nil, tagger: nil, min_length: 2, max_length: 5, special_words: [], exclude_words: [], special_regexp: nil, exclude_regexp: nil)
203
227
  texts = texts.dup
204
228
 
229
+ frequency = Hash.new(0)
230
+ if special_words.any?
231
+ texts.each do |text|
232
+ special_words.map { |sw| [sw, text.scan(sw)] }
233
+ .delete_if { |_, matched| matched.empty? }
234
+ .each_with_object(frequency) { |(word, matched), memo| memo[word] += matched.size }
235
+
236
+ end
237
+ end
238
+
239
+ if exclude_regexp
240
+ texts = texts.map { |t| t.remove(exclude_regexp) }
241
+ end
242
+
205
243
  if delim
206
244
  texts = texts.map { |t| t.split(delim) }.flatten.map(&:strip)
207
245
  end
208
246
 
209
247
  if tagger
210
- texts = tagger.parse(texts.join(' ')).split("\n").
248
+ texts = texts.map { |t| tagger.parse(t).split("\n") }.flatten.
211
249
  select { |line| line.include?('名詞') }.
212
250
  map { |line| line.split("\t")[0] }
213
251
  end
214
252
 
215
- texts.delete_if { |w| w.empty? || w.size < 2 || 5 < w.size || exclude_words.include?(w) }.
216
- each_with_object(Hash.new(0)) { |word, memo| memo[word] += 1 }.
253
+ texts.delete_if { |w| w.empty? || w.size < min_length || max_length < w.size || exclude_words.include?(w) || w.match(/\d{2}/) }.
254
+ each_with_object(frequency) { |word, memo| memo[word] += 1 }.
217
255
  sort_by { |k, v| [-v, -k.size] }.to_h
218
256
  end
257
+
258
+ def count_freq_words(texts, special_words: [], exclude_words: [], special_regexp: nil, exclude_regexp: nil, debug: false)
259
+ candidates, remains = texts.partition { |desc| desc.scan('/').size > 2 }
260
+ slash_freq = count_by_word(candidates, delim: '/', exclude_regexp: exclude_regexp)
261
+ puts "words splitted by /: #{slash_freq.take(10)}" if debug
262
+
263
+ candidates, remains = remains.partition { |desc| desc.scan('|').size > 2 }
264
+ pipe_freq = count_by_word(candidates, delim: '|', exclude_regexp: exclude_regexp)
265
+ puts "words splitted by |: #{pipe_freq.take(10)}" if debug
266
+
267
+ noun_freq = count_by_word(remains, tagger: build_tagger, special_words: special_words, exclude_words: exclude_words, special_regexp: special_regexp, exclude_regexp: exclude_regexp)
268
+ puts "words tagged as noun: #{noun_freq.take(10)}" if debug
269
+
270
+ slash_freq.merge(pipe_freq) { |_, old, neww| old + neww }.
271
+ merge(noun_freq) { |_, old, neww| old + neww }.sort_by { |k, v| [-v, -k.size] }
272
+ end
273
+
274
+ def build_tagger
275
+ require 'mecab'
276
+ MeCab::Tagger.new("-d #{`mecab-config --dicdir`.chomp}/mecab-ipadic-neologd/")
277
+ rescue => e
278
+ puts "Add gem 'mecab' to your Gemfile."
279
+ raise e
280
+ end
281
+
282
+ def include_hashtags?(tweet)
283
+ tweet.entities&.hashtags&.any?
284
+ end
285
+
286
+ def extract_hashtags(tweet)
287
+ tweet.entities.hashtags.map { |h| h.text }
288
+ end
289
+
290
+ SYNONYM_WORDS = (
291
+ %w(cosplay cosplayer cosplayers coser cos こすぷれ コスプレ レイヤ レイヤー コスプレイヤー レイヤーさん).map { |w| [w, 'coplay'] } +
292
+ %w(tsukuba tkb).map { |w| [w, 'tsukuba'] } +
293
+ %w(waseda 早稲田 早稲田大学).map { |w| [w, 'waseda'] } +
294
+ %w(keio 慶應 慶應義塾).map { |w| [w, 'keio'] } +
295
+ %w(gakusai gakuensai 学祭 学園祭).map { |w| [w, 'gakusai'] } +
296
+ %w(kosen kousen).map { |w| [w, 'kosen'] } +
297
+ %w(anime アニメ).map { |w| [w, 'anime'] } +
298
+ %w(photo photos).map { |w| [w, 'photo'] } +
299
+ %w(creator creater クリエイター).map { |w| [w, 'creator'] } +
300
+ %w(illustrator illustrater 絵師).map { |w| [w, 'illustrator'] } +
301
+ %w(artist art artists アート 芸術).map { |w| [w, 'artist'] } +
302
+ %w(design デザイン).map { |w| [w, 'design'] } +
303
+ %w(kawaii かわいい).map { |w| [w, 'kawaii'] } +
304
+ %w(idol あいどる アイドル 美人).map { |w| [w, 'idol'] } +
305
+ %w(music musician musicians dj netlabel label レーベル おんがく 音楽家 音楽).map { |w| [w, 'music'] } +
306
+ %w(engineer engineers engineering えんじにあ tech 技術 技術系 hacker coder programming programer programmer geek rubyist ruby scala java lisp).map { |w| [w, 'engineer'] } +
307
+ %w(internet インターネット).map { |w| [w, 'internet'] }
308
+ ).to_h
309
+
310
+ def normalize_synonym(words)
311
+ words.map { |w| SYNONYM_WORDS.has_key?(w) ? SYNONYM_WORDS[w] : w }
312
+ end
219
313
  end
220
314
  end
221
315
  end
@@ -22,5 +22,5 @@ Gem::Specification.new do |spec|
22
22
  spec.required_ruby_version = '>= 2.3'
23
23
  spec.summary = spec.description
24
24
  spec.test_files = Dir.glob('spec/**/*')
25
- spec.version = '0.8.11'
25
+ spec.version = '0.8.12'
26
26
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter_with_auto_pagination
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.11
4
+ version: 0.8.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shinohara Teruki
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-10-21 00:00:00.000000000 Z
11
+ date: 2016-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: twitter