twitter_with_auto_pagination 0.8.3 → 0.8.4
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7c6e0231b2860c7fca4ae5d2b5b8c8a7ab397c8b
|
4
|
+
data.tar.gz: 9f8698b8e86923e4855d75be50440ac803a96d3e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf30ac3e1930e548e99941977260e5bf9ac70d48b51e5a978638cd4b7b71a998cda96f7dc1b9ea12e2587a4a70992bcb5df99cddd96086f84b1824bfdb07edaf
|
7
|
+
data.tar.gz: a2c5ca9ec3c096d8a892f616139211ee76eafc4e7a27a5c1e93a2259e38387a5fdf5e16d5f1d39e02e4ac199e64c8c14cb6861b0685f4ba5eb74e769396cacfe
|
@@ -45,8 +45,13 @@ module TwitterWithAutoPagination
|
|
45
45
|
|
46
46
|
alias tweet_clusters clusters_belong_to
|
47
47
|
|
48
|
-
def list_clusters(user, each_member: 300, total_member: 1000, rate: 0.3, limit: 10, debug: false)
|
49
|
-
|
48
|
+
def list_clusters(user, shrink: false, each_member: 300, total_member: 1000, rate: 0.3, limit: 10, debug: false)
|
49
|
+
begin
|
50
|
+
lists = memberships(user).sort_by { |li| li.member_count }
|
51
|
+
rescue => e
|
52
|
+
puts "#{e.class}: #{e.message} #{user.inspect}" if debug
|
53
|
+
lists = []
|
54
|
+
end
|
50
55
|
puts "lists: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
|
51
56
|
return {} if lists.empty?
|
52
57
|
|
@@ -55,47 +60,48 @@ module TwitterWithAutoPagination
|
|
55
60
|
percentile75 = ((lists.length * 0.75).ceil) - 1
|
56
61
|
lists = lists[percentile25..percentile75]
|
57
62
|
puts "lists sliced by 25-75 percentile: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
|
58
|
-
end
|
63
|
+
end if shrink
|
59
64
|
|
60
65
|
list_special_words = %w()
|
61
|
-
|
66
|
+
list_exclude_words1 = %r(list[0-9]*|people-ive-faved|twizard-magic-list|my-favstar-fm-list)
|
67
|
+
list_exclude_words2 = %w(it list people met)
|
62
68
|
|
63
|
-
words = lists.map { |li| li.full_name.split('/')[1]
|
64
|
-
|
69
|
+
words = lists.map { |li| li.full_name.split('/')[1] }.
|
70
|
+
select { |n| !n.match(list_exclude_words1) }.
|
71
|
+
map { |n| n.split('-') }.flatten.
|
72
|
+
delete_if { |w| w.size < 2 || list_exclude_words2.include?(w) }.
|
73
|
+
each_with_object(Hash.new(0)) { |w, memo| memo[w] += 1 }.
|
74
|
+
sort_by { |k, v| [-v, -k.size] }
|
65
75
|
|
66
76
|
puts "words: #{words.slice(0, 10)}" if debug
|
67
77
|
return {} if words.empty?
|
68
78
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
79
|
+
_words = []
|
80
|
+
lists =
|
81
|
+
filter(lists, min: 2) do |li, i|
|
82
|
+
_words = words[0..i].map(&:first)
|
83
|
+
name = li.full_name.split('/')[1]
|
84
|
+
_words.any? { |w| name.include?(w) }
|
85
|
+
end
|
86
|
+
puts "lists include #{_words.inspect}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
|
75
87
|
return {} if lists.empty?
|
76
88
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
else
|
83
|
-
each_member *= 1.25
|
89
|
+
_each_member = 0
|
90
|
+
lists =
|
91
|
+
filter(lists, min: 2) do |li, i|
|
92
|
+
_each_member = each_member * (1.0 + 0.25 * i)
|
93
|
+
(10 < lists.size ? 10 : 0) < li.member_count && li.member_count < _each_member
|
84
94
|
end
|
85
|
-
|
86
|
-
puts "lists limited by each member #{each_member}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
|
95
|
+
puts "lists limited by each member #{_each_member}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
|
87
96
|
return {} if lists.empty?
|
88
97
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
else
|
95
|
-
total_member *= 1.25
|
98
|
+
_total_member = 0
|
99
|
+
lists =
|
100
|
+
filter(lists, min: 1) do |_, i|
|
101
|
+
_total_member = total_member * (1.0 + 0.25 * i)
|
102
|
+
lists[0..i].map { |li| li.member_count }.sum < _total_member
|
96
103
|
end
|
97
|
-
|
98
|
-
puts "lists limited by total members #{total_member}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
|
104
|
+
puts "lists limited by total members #{_total_member}: #{lists.size} (#{lists.map { |li| li.member_count }.join(', ')})" if debug
|
99
105
|
return {} if lists.empty?
|
100
106
|
|
101
107
|
members = lists.map do |li|
|
@@ -110,7 +116,8 @@ module TwitterWithAutoPagination
|
|
110
116
|
return {} if members.empty?
|
111
117
|
|
112
118
|
3.times do
|
113
|
-
_members = members.each_with_object(Hash.new(0)) { |member, memo| memo[member] += 1 }.
|
119
|
+
_members = members.each_with_object(Hash.new(0)) { |member, memo| memo[member] += 1 }.
|
120
|
+
select { |_, v| lists.size * rate < v }.keys
|
114
121
|
if _members.size > 100
|
115
122
|
members = _members
|
116
123
|
break
|
@@ -120,7 +127,6 @@ module TwitterWithAutoPagination
|
|
120
127
|
end
|
121
128
|
puts "members included multi lists #{rate}: #{members.size}" if debug
|
122
129
|
|
123
|
-
require 'mecab'
|
124
130
|
|
125
131
|
profile_special_words = %w()
|
126
132
|
profile_exclude_words = %w(in at of my no er the and for inc Inc com gmail 好き こと 最近 情報 さん ちゃん くん 発言 関係 もの 活動 見解 所属 組織 連絡 大好き サイト ブログ つぶやき こちら アカ アカウント イベント フォロー)
|
@@ -128,37 +134,49 @@ module TwitterWithAutoPagination
|
|
128
134
|
descriptions = members.map { |m| m.description.remove(URI.regexp) }
|
129
135
|
|
130
136
|
candidates, remains = descriptions.partition { |desc| desc.scan('/').size > 2 }
|
131
|
-
slash_freq =
|
137
|
+
slash_freq = count_by_word(candidates, delim: '/')
|
132
138
|
puts "words splitted by /: #{slash_freq.to_a.slice(0, 10)}" if debug
|
133
139
|
|
134
140
|
candidates, remains = remains.partition { |desc| desc.scan('|').size > 2 }
|
135
|
-
pipe_freq =
|
141
|
+
pipe_freq = count_by_word(candidates, delim: '|')
|
136
142
|
puts "words splitted by |: #{pipe_freq.to_a.slice(0, 10)}" if debug
|
137
143
|
|
138
|
-
|
139
|
-
|
144
|
+
require 'mecab'
|
145
|
+
tagger = MeCab::Tagger.new("-d #{`mecab-config --dicdir`.chomp}/mecab-ipadic-neologd/")
|
146
|
+
|
147
|
+
noun_freq = count_by_word(remains, tagger: tagger, exclude_words: profile_exclude_words)
|
148
|
+
puts "words tagged as noun: #{noun_freq.to_a.slice(0, 10)}" if debug
|
140
149
|
|
141
150
|
slash_freq.merge(pipe_freq) { |_, old, neww| old + neww }.merge(noun_freq) { |_, old, neww| old + neww }.sort_by { |k, v| [-v, -k.size] }.slice(0, limit)
|
142
151
|
end
|
143
152
|
|
144
153
|
private
|
145
154
|
|
146
|
-
def
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
155
|
+
def filter(lists, min:)
|
156
|
+
min = [min, lists.size].min
|
157
|
+
_lists = []
|
158
|
+
3.times do |i|
|
159
|
+
_lists = lists.select { |li| yield(li, i) }
|
160
|
+
break if _lists.size >= min
|
161
|
+
end
|
162
|
+
_lists
|
152
163
|
end
|
153
164
|
|
154
|
-
def
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
map { |
|
159
|
-
|
165
|
+
def count_by_word(texts, delim: nil, tagger: nil, exclude_words: [])
|
166
|
+
texts = texts.dup
|
167
|
+
|
168
|
+
if delim
|
169
|
+
texts = texts.map { |t| t.split(delim) }.flatten.map(&:strip)
|
170
|
+
end
|
171
|
+
|
172
|
+
if tagger
|
173
|
+
texts = tagger.parse(texts.join(' ')).split("\n").
|
174
|
+
select { |line| line.include?('名詞') }.
|
175
|
+
map { |line| line.split("\t")[0] }
|
176
|
+
end
|
160
177
|
|
161
|
-
|
178
|
+
texts.delete_if { |w| w.empty? || w.size < 2 || 5 < w.size || exclude_words.include?(w) }.
|
179
|
+
each_with_object(Hash.new(0)) { |word, memo| memo[word] += 1 }.
|
162
180
|
sort_by { |k, v| [-v, -k.size] }.to_h
|
163
181
|
end
|
164
182
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_with_auto_pagination
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.3
|
4
|
+
version: 0.8.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shinohara Teruki
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: twitter
|