wuclan 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
@@ -0,0 +1,443 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'date'
|
3
|
+
# KLUDGE Executes only if run from command line
|
4
|
+
if __FILE__ == $0 then $: << File.dirname(__FILE__)+'/../..'; require 'wukong'; end
|
5
|
+
|
6
|
+
#
|
7
|
+
# Sampling Multiplier
|
8
|
+
#
|
9
|
+
# NOTE(review): despite the "FRACTION" name the value is greater than 1 and is
# applied as a multiplier in get_tw_day_recent -- confirm how 2.62 was derived.
TWEETS_SAMPLED_FRACTION = 2.62
|
10
|
+
|
11
|
+
|
12
|
+
require 'wukong/datatypes/enum'; include Wukong::Datatypes
|
13
|
+
# Bins for follower counts; the listed values are handed to Binned.enumerates.
# NOTE(review): the leading 0 appears twice -- presumably so that "exactly
# zero" gets its own bucket; confirm against Binned.
class FoBin < Binned
  enumerates 0, 0, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10_000, 20_000, Infinity
end

# Bin type for friend counts (subclass of FoBin).
class FrBin < FoBin; end

# Bin type for neighborhood size (subclass of FoBin).
class NbhdSizeBin < FoBin; end
|
16
|
+
|
17
|
+
# Rounds +f+ (anything responding to #to_f) to one decimal place.
def round1(f)
  (10 * f.to_f).round / 10.0
end

# Returns 10 raised to (i / 3), with the exponent first rounded to one
# decimal place via round1.
def octave3(i)
  10.0**round1(i.to_f / 3.0)
end
|
23
|
+
|
24
|
+
# Bins for tweets-per-day rates. The first edges are expressed per month and
# per week; the remaining finite edges come from octave3(i + 0.5) for
# i in -2..9.
class TwDayBin < Binned
  MO = 30.4368499 # days per month, used for the "/mo" edges
  WK = 7.0        # days per week, used for the "/wk" edge
  enumerates(*(
    [-Infinity, 1.0 / MO, 2.0 / MO, 1 / WK] +
    (-2..9).map { |i| octave3(i + 0.5) } +
    [Infinity]))
  # One display name per bin (16 names for the 17 edges above).
  self.write_inheritable_attribute :names, (
    ['< 1/mo', ' 1-2/mo', ' 2/mo-1/wk', ' 1-2/wk', '~ 4/wk'] +
    [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000].map { |i| "~ %4d/day" % i } +
    ['> 1500/day'])
end

# Bin type for the recent tweets-per-day rate (subclass of TwDayBin).
class TwDayRecentBin < TwDayBin; end
|
37
|
+
|
38
|
+
# TwDayBin labels with the corresponding finite bin edge (tweets/day):
#   < 1/mo        0.032854911177914
#   1-2/mo        0.065709822355828
#   2/mo-1/wk     0.142857142857143
#   1-2/wk        0.316227766016838
#   ~ 4/wk        0.630957344480193
#   ~ 1/day       1.58489319246111
#   ~ 2/day       3.16227766016838
#   ~ 5/day       6.30957344480193
#   ~ 10/day      15.8489319246111
#   ~ 20/day      31.6227766016838
#   ~ 50/day      63.0957344480193
#   ~ 100/day     158.489319246111
#   ~ 200/day     316.227766016838
#   ~ 500/day     630.957344480193
#   ~ 1000/day    1584.89319246111
#   > 1500/day    Infinity
|
54
|
+
|
55
|
+
|
56
|
+
# Integer subclass used as the declared type of the :active member of
# UserMetrics.
class TinyInt < Integer; end

# Bins for the neighborhood-balance value: sixteen finite edges running from
# 0.125 in steps of 0.05, bracketed by -Infinity and Infinity.
class NbhdBalBin < Binned
  enumerates(*([-Infinity] + (0..15).map { |n| 0.125 + (n.to_f / 20) } + [Infinity]))
  # Names: the two extremes are labelled in words, the fifteen bins between
  # them are labelled 15, 20, ... 85.
  self.write_inheritable_attribute :names,
    (['Few followers'] + (0..14).map { |n| 15 + (5 * n) } + ['Mostly Followers'])
  # Index 8 is the bin that would otherwise be labelled 50.
  self.names[8] = 'Balanced'
end
|
63
|
+
|
64
|
+
#
|
65
|
+
#
|
66
|
+
#
|
67
|
+
module Wuclan
|
68
|
+
module Models
|
69
|
+
# Per-user tweet tallies: user id, number of sampled items, timestamp of the
# last tweet, and the recent-tweet count.
#
# NOTE(review): Bignum was folded into Integer in Ruby 2.4 and the constant
# was removed in Ruby 3.2. It is kept here because TypedStruct may key
# behavior on the declared type -- confirm before replacing it with Integer.
class BaseUserTweetMetrics < TypedStruct.new(
    [:id,         Integer],
    [:tw_sampled, Integer],
    [:last_tw_at, Bignum],
    [:tw_recent,  Integer]
  )
end

# The three concrete record types share the same layout; adopt_tweet_metrics
# dispatches on the class to decide which UserMetrics field to fill.
class UserTweetMetrics    < BaseUserTweetMetrics; end
class UserTweetUrlMetrics < BaseUserTweetMetrics; end
class UserHashtagMetrics  < BaseUserTweetMetrics; end
|
79
|
+
|
80
|
+
# Numbered, abbreviated column names. NOTE(review): presumably an older flat
# record layout -- the numbering does not match the current UserMetrics
# member order (e.g. age and duration are commented out there).
#
# 1 rsrc 2 id 3 screen_name 4protect 5 age 6duratio 7age_use 8age_las 9 fo
# 10fr 11 tw 12 fv 13 fr_fo 14 fo_week 15 fr_week 16 tw_day 17 fv_mo 18fo_sampled
# 19fr_sampled 20tw_sampled 21tw_rece 22tw_day_ 23at_in_s 24at_out_ 25rt_in_s
# 26rt_out_ 27fv_in_s 28fv_out_ 29any_wit 30any_out 31any_in_ 32at_in_w
# 33at_out_ 34rt_in_w 35rt_out_ 36fv_in_w 37fv_out_ 38at_tw_o 39rt_tw_o
# 40rt_at_o 41at_in_t 42rt_in_t 43rt_at_i 44 reach 45fo_cove 46fr_cove
# 47tw_cove 48fv_cove 49scrape_ 50scrape_ 51scrape_ 52 scraped_at
# 53part_scraped_at 54 created_at 55 last_tw_at
|
88
|
+
|
89
|
+
# Mixin for struct classes that expose +members+, +mtypes+ and +mnames+
# (NamedTypedStruct.new extends the generated class with it).
module StructToSQL
  # Prints one SQL column definition per member to stdout, in the form
  # "`member`  TYPE,  -- description", and returns a one-element Array holding
  # the comma-separated, back-quoted column list.
  #
  # Type mapping: String (or a subclass) becomes an ASCII VARCHAR(255); Enum
  # subclasses supply their own to_sql_str; anything else is the upper-cased
  # class name.
  def to_sql_str
    members.zip(mtypes, mnames).each do |attr, type, name|
      type_str =
        if    type <= String then 'VARCHAR(255) CHARACTER SET ASCII'
        elsif type <= Enum   then type.to_sql_str
        else  type.to_s.upcase
        end
      puts " %-23s\t%-23s\t-- %s" % ["`#{attr}`", type_str + ',', name]
    end
    # Use the receiver's own members rather than a hard-coded UserMetrics so
    # the mixin works for any struct class it extends.
    [members.map { |attr| "`#{attr}`" }.join(", ")]
  end
end
|
102
|
+
|
103
|
+
# A TypedStruct whose member specs may carry a third element: a
# human-readable description of the member.
class NamedTypedStruct < TypedStruct
  # Accepts [member, type, description] triples. The first two columns are
  # passed on to TypedStruct.new; when descriptions are present they are
  # stored on the generated class as +mnames+ and the class is extended with
  # StructToSQL.
  #
  # Returns the generated struct class.
  def self.new(*args)
    members, mtypes, mnames = args.transpose
    thing = super(*[members, mtypes].transpose)
    if mnames
      thing.class_eval do
        cattr_accessor :mnames
        self.mnames = mnames
        extend StructToSQL
      end
    end
    thing
  end
end
|
118
|
+
|
119
|
+
|
120
|
+
class UserMetrics < NamedTypedStruct.new(
|
121
|
+
[:id, Integer , "User ID" ],
|
122
|
+
[:screen_name, String , "Twitter Name" ],
|
123
|
+
# 4
|
124
|
+
[:created_on, Date , "Created On Date" ],
|
125
|
+
[:created_at, DateTime , "Created At Date-Time" ],
|
126
|
+
[:protected, Integer , "Protected?" ],
|
127
|
+
[:active, TinyInt , "Active?" ],
|
128
|
+
# 8
|
129
|
+
[:fo, Integer , "# Followers" ],
|
130
|
+
[:fr, Integer , "# Friends" ],
|
131
|
+
[:tw, Integer , "# Tweets Sent" ],
|
132
|
+
[:fv, Integer , "# Favorites Out" ],
|
133
|
+
# 12
|
134
|
+
[:fo_week, Float , "Followers accumulated / week", ], # x
|
135
|
+
[:fr_week, Float , "Friends accumulated / week", ], # x
|
136
|
+
[:tw_day, Float , "Tweets sent / day", ], # x
|
137
|
+
[:fv_mo, Float , "Favorites accumulated / month", ], # x
|
138
|
+
# 16
|
139
|
+
[:tw_recent, Integer , "Tweets sampled 2008 Dec 1", ], # t
|
140
|
+
[:tw_day_recent, Float , "Recent Tweets / day (*)", ], # t
|
141
|
+
[:last_tw_at, Date , "Last Tweet Date (*)", ], # t
|
142
|
+
[:last_tw_age, Integer , "Age of most recent tweet (*)", ], # x
|
143
|
+
# 20
|
144
|
+
[:at_in_sampled, Integer , "# @atsigns to (*)", ], # g
|
145
|
+
[:at_out_sampled, Integer , "# @atsigns from (*)" ], # g
|
146
|
+
[:rt_in_sampled, Integer , "# ReTweets of (*)", ], # g
|
147
|
+
[:rt_out_sampled, Integer , "# ReTweets by (*)" ], # g
|
148
|
+
[:fv_in_sampled, Integer , "# favorites of (*)", ], # g
|
149
|
+
[:fv_out_sampled, Integer , "# favorites by (*)", ], # c
|
150
|
+
# 26
|
151
|
+
[:any_with, Integer , "Users linked to or from", ], # A
|
152
|
+
[:any_in_with, Integer , "Distinct At+RT+Fv users in", ], # A
|
153
|
+
[:any_out_with, Integer , "Distinct At+RT+Fv users out", ], # A
|
154
|
+
# 29
|
155
|
+
[:at_in_with, Integer , "Users @atsigned in (*)", ], # g
|
156
|
+
[:at_out_with, Integer , "Users @atsigned out (*)" ], # g",
|
157
|
+
[:rt_in_with, Integer , "Users Retweeted in (*)", ], # g
|
158
|
+
[:rt_out_with, Integer , "Users Retweeted out (*)" ], # g
|
159
|
+
[:fv_in_with, Integer , "Users Favorited in (*)", ], # g
|
160
|
+
[:fv_out_with, Integer , "Users Favorited out (*)", ], # c
|
161
|
+
# 35
|
162
|
+
[:tw_sampled, Integer , "How many tweets have we sampled", ], # t
|
163
|
+
[:hashtag_sampled, Integer, "How many hashtags have we seen"],
|
164
|
+
[:tweet_url_sampled, Integer, "How many tweet_url's have we seen"],
|
165
|
+
[:at_tw_out, Float , "Atsigns out per tweet out (*)", ], # g
|
166
|
+
[:rt_tw_out, Float , "Retweets out per tweet out (*)", ], # g
|
167
|
+
[:rt_at_out, Float , "Retweets out per atsign out (*)", ], # g
|
168
|
+
[:at_in_tw_out, Float , "Atsigns in per tweet out (*)", ], # g
|
169
|
+
[:rt_in_tw_out, Float , "Retweets in per tweet out (*)", ], # g
|
170
|
+
[:rt_at_in, Float , "Retweets in per atsign in (*)", ], # g
|
171
|
+
# 40
|
172
|
+
[:nbhd_bal, Float , "Neighborhood Balance", ], #
|
173
|
+
[:nbhd_size, Integer , "Neighborhood Size", ], #
|
174
|
+
[:reach, Integer , "Reach: (tweets/day) * |followers|", ], # x
|
175
|
+
# 43
|
176
|
+
[:fo_bin, FoBin , "# Followers grp"],
|
177
|
+
[:fr_bin, FrBin , "# Friends grp"],
|
178
|
+
[:nbhd_size_bin, NbhdSizeBin , "Neighborhood Size grp"],
|
179
|
+
[:nbhd_bal_bin, NbhdBalBin , "Neighborhood Balance grp"],
|
180
|
+
[:tw_day_bin, TwDayBin , "Tweets / day grp (*)"],
|
181
|
+
[:tw_day_recent_bin, TwDayRecentBin , "Recent Tweets / day grp (*)"],
|
182
|
+
#
|
183
|
+
[:part_scraped_at, Date , "User partial Scrape Date", ], #
|
184
|
+
[:scraped_at, Date , "User Scrape Date" ]
|
185
|
+
# #
|
186
|
+
# [:fo_sampled, Integer , "How many followers have we sampled", ], # c
|
187
|
+
# [:fr_sampled, Integer , "How many friend have we sampled", ], # c
|
188
|
+
# [:fo_coverage, Float , "Friends sampled / known to exist", ], # c
|
189
|
+
# [:fr_coverage, Float , "Followers sampled / known to exist", ], # c
|
190
|
+
# [:tw_coverage, Float , "Tweets sampled / known to exist", ], # c
|
191
|
+
# [:fv_coverage, Float , "Favorites sampled / known to exist", ], # c
|
192
|
+
# #
|
193
|
+
# [:scrape_age_fo, Integer , "How long since your followers graph record was scraped", ], # c
|
194
|
+
# [:scrape_age_fr, Integer , "How long since your friends graph record was scraped", ], # c
|
195
|
+
# [:scrape_age_fv, Integer , "How long since your favorites graph record was scraped", ], # c
|
196
|
+
#
|
197
|
+
# [:age, Integer , "Days between creation and now", ], # x
|
198
|
+
# [:duration, Integer , "Days between last scrape and creation", ], # x
|
199
|
+
# [:age_user_scrape, Integer , "How long since your user record was scraped", ], # x
|
200
|
+
)
|
201
|
+
# Start of scraping as a YYYYMMDDhhmmss stamp. Despite the _STR suffix this is
# an Integer: adopt_scraping_metrics compares it numerically.
SCRAPING_DAY_ZERO_STR = 20081201000000
# The same instant parsed into a DateTime.
SCRAPING_DAY_ZERO = DateTime.parse_safely(SCRAPING_DAY_ZERO_STR.to_s)
# Time elapsed since day zero, evaluated once when the file is loaded.
SINCE_DAY_ZERO = DateTime.now.utc - SCRAPING_DAY_ZERO
|
204
|
+
|
205
|
+
# Recomputes every bin field and the active flag from the values currently
# stored on the record. Returns whatever get_active returns.
def fix!
  get_tw_day_bin
  get_tw_day_recent_bin
  get_fo_bin
  get_fr_bin
  get_nbhd_size_bin
  get_nbhd_bal_bin
  get_active
end
|
214
|
+
|
215
|
+
#
|
216
|
+
# Fix formatting as we flatten
|
217
|
+
#
|
218
|
+
# Flattens the record into an Array, formatting each value by its declared
# member type:
#   :id       -> zero-padded to 10 digits
#   Float     -> "%f"
#   Integer   -> right-aligned in 7 columns
#   DateTime  -> "YYYY/MM/DD HH:MM:SS"
#   Date      -> "YYYY/MM/DD"
# Any other type is passed through unchanged; nil values stay nil.
def to_a
  members.zip(mtypes).map do |member, type|
    val = self[member]
    if    val.nil?             then nil
    elsif member.to_sym == :id then "%010d" % val.to_i
    elsif type == Float        then "%f" % val.to_f
    elsif type == Integer      then "%7d" % val.to_i
    elsif type == DateTime     then val.strftime("%Y/%m/%d %H:%M:%S")
    elsif type == Date         then val.strftime("%Y/%m/%d")
    else  val
    end
  end
end
|
232
|
+
# True when the stored +protected+ flag, coerced with to_i, equals 1.
def protected?
  self.protected.to_i == 1
end
|
235
|
+
|
236
|
+
#
|
237
|
+
#
|
238
|
+
#
|
239
|
+
# Merges a full user record into this one, marks the record as having a user,
# then re-parses created_at and scraped_at with DateTime.parse_safely.
def adopt_user(user)
  @user_adopted = true
  self.merge! user
  self.created_at = DateTime.parse_safely(created_at)
  self.scraped_at = DateTime.parse_safely(scraped_at)
end

# Truthy once adopt_user has been called on this record.
def user_adopted?
  @user_adopted
end
|
248
|
+
|
249
|
+
#
|
250
|
+
#
|
251
|
+
#
|
252
|
+
# Merges a partial user record into this one. The partial's scrape time is
# stored in part_scraped_at, and this record's own scraped_at is put back to
# the value it had before the merge.
def adopt_user_partial(user_partial)
  full_scrape_time = self.scraped_at
  self.merge! user_partial
  # restore scraped dates
  self.part_scraped_at = DateTime.parse_safely(user_partial.scraped_at)
  self.scraped_at      = full_scrape_time
end
|
259
|
+
|
260
|
+
#
|
261
|
+
#
|
262
|
+
#
|
263
|
+
# For each scrape context (friends_ids, followers_ids, favorites) stores the
# time elapsed between that context's scraped_at and +now+ in the matching
# scrape_age_* field. Contexts with no successes or no scrape date are
# skipped.
#
# NOTE(review): scrape_age_fr / scrape_age_fo / scrape_age_fv are commented
# out of the UserMetrics member list, so the self[attr]= assignment
# presumably fails for the current struct -- confirm before relying on this.
def adopt_scraping_metrics(usm)
  [
    [:friends_ids,   :scrape_age_fr],
    [:followers_ids, :scrape_age_fo],
    [:favorites,     :scrape_age_fv],
  ].each do |context, attr|
    dt = usm.get context, :scraped_at
    next if (usm.get(context, :successes).to_i < 1) || dt.blank?
    # fudge bogus date records: anything stamped before day zero is clamped
    # to day zero
    dt = (dt.to_i < SCRAPING_DAY_ZERO_STR) ? SCRAPING_DAY_ZERO : DateTime.parse_safely(dt)
    next if dt.blank?
    self[attr] = now - dt
  end
end
|
277
|
+
|
278
|
+
#
|
279
|
+
# From simple graph metrics
|
280
|
+
#
|
281
|
+
# Merges a graph-metrics record into this one; returns the result of merge!.
def adopt_graph_metrics(user_graph_metrics)
  self.merge! user_graph_metrics
end
|
284
|
+
|
285
|
+
#
|
286
|
+
# User's Tweets -- from UserTweetMetrics
|
287
|
+
#
|
288
|
+
# Copies tallies from a per-user tweet metrics record, choosing the target
# fields by the record's class:
#   UserTweetMetrics    -> tw_sampled, last_tw_at (parsed), tw_recent
#   UserHashtagMetrics  -> hashtag_sampled   (from metrics.tw_sampled)
#   UserTweetUrlMetrics -> tweet_url_sampled (from metrics.tw_sampled)
# Raises a RuntimeError for any other kind of object.
def adopt_tweet_metrics(metrics)
  case metrics
  when UserTweetMetrics
    self.tw_sampled = metrics.tw_sampled
    self.last_tw_at = DateTime.parse_safely(metrics.last_tw_at)
    self.tw_recent  = metrics.tw_recent
  when UserHashtagMetrics
    self.hashtag_sampled = metrics.tw_sampled
  when UserTweetUrlMetrics
    self.tweet_url_sampled = metrics.tw_sampled
  else
    raise "Can't adopt #{metrics}"
  end
end
|
300
|
+
|
301
|
+
# Long-named accessors mapped onto the short member names. The writers coerce
# their argument with to_i.
def followers_count=(fo)
  self.fo = fo.to_i
end

def friends_count=(fr)
  self.fr = fr.to_i
end

def statuses_count=(tw)
  self.tw = tw.to_i
end

def favourites_count=(fv)
  self.fv = fv.to_i
end

def followers_count
  self.fo
end

def friends_count
  self.fr
end

def statuses_count
  self.tw
end

def favourites_count
  self.fv
end
|
309
|
+
#
|
310
|
+
# Larger of fr and fo
|
311
|
+
#
|
312
|
+
# Sets nbhd_size to the larger of fr and fo, ignoring whichever is nil
# (nil when both are nil).
def get_nbhd_size
  self.nbhd_size = [fr, fo].compact.max
end
|
315
|
+
#
|
316
|
+
# Sets nbhd_bal to a value between 0 and 1 describing the friends/followers
# mix: 0.5 * fo/fr when there are more friends than followers, otherwise
# 1 - 0.5 * fr/fo. Equal counts give 0.5.
#
# Does nothing (returns nil) when either count is missing or both are zero,
# which also rules out division by zero.
def get_nbhd_bal
  return unless fr && fo && ((fr.to_i > 0) || (fo.to_i > 0))
  self.nbhd_bal =
    if fr > fo
      0.5 * fo.to_f / fr.to_f
    else
      1.0 - (0.5 * fr.to_f / fo.to_f)
    end
end
|
322
|
+
# Copies created_at into created_on.
def get_created_on
  self.created_on = self.created_at
end
|
325
|
+
|
326
|
+
# Memoized short-hands for the record's timestamps. Each caches the first
# truthy value it sees, so a later change to the underlying member is not
# picked up.

# created_at
def crat
  @crat ||= created_at
end

# scraped_at
def scat
  @scat ||= scraped_at
end

# part_scraped_at
def part_scat
  @part_scat ||= part_scraped_at
end

# last_tw_at
def twat
  @twat ||= last_tw_at
end

# The current time, fixed at the first call so every derived age on this
# record uses the same reference instant.
def now
  @now ||= DateTime.now
end
|
341
|
+
|
342
|
+
|
343
|
+
#
|
344
|
+
# duration --
|
345
|
+
#
|
346
|
+
# Truncated difference between +now+ and the creation time (whole days when
# both are Date/DateTime values). Memoized; nil when the creation time is
# unknown.
def age
  return @age if @age
  return unless crat
  @age = (now - crat).to_i
end
|
351
|
+
# Truncated difference between the most recent scrape (full or partial,
# whichever is later) and the creation time. Memoized; nil when the creation
# time or both scrape times are missing.
def duration
  return @duration if @duration
  return unless crat && (scat || part_scat)
  scat_latest = [scat, part_scat].compact.max
  @duration = (scat_latest - crat).to_i
end
|
357
|
+
# Sets last_tw_age to the truncated time elapsed between the most recent
# tweet and +now+ (whole days for Date/DateTime values).
#
# The original computed created_at - last_tw_at, which is never positive for
# a tweet sent after the account was created; the other age helpers (age,
# get_age_user_scrape) measure back from +now+, and so does this one.
def get_last_tw_age
  return unless twat
  self.last_tw_age = (now - twat).to_i
end
|
361
|
+
# Sets age_user_scrape to the truncated time elapsed between the user scrape
# and +now+. Does nothing when scraped_at is unknown.
#
# NOTE(review): age_user_scrape is commented out of the UserMetrics member
# list, so the setter presumably does not exist on the current struct.
def get_age_user_scrape
  return unless scat
  self.age_user_scrape = (now - scat).to_i
end
|
365
|
+
|
366
|
+
#
|
367
|
+
# Per-day metrics
|
368
|
+
#
|
369
|
+
# Should possibly use duration but need to worry about which one.
|
370
|
+
#
|
371
|
+
# Followers accumulated per week, over +duration+. Skipped when duration is
# zero/unknown or fo is missing.
def get_fo_week
  return if duration.to_i == 0 || !fo
  self.fo_week = 7 * (fo.to_f / duration)
end

# Friends accumulated per week, over +age+. Skipped when age is zero/unknown
# or fr is missing.
def get_fr_week
  return if age.to_i == 0 || !fr
  self.fr_week = 7 * (fr.to_f / age)
end

# Tweets sent per day, over +duration+. Skipped when duration is zero/unknown
# or tw is missing.
def get_tw_day
  return if duration.to_i == 0 || !tw
  self.tw_day = tw.to_f / duration
end

# Favorites accumulated per month (30.4368499 days), over +age+. Skipped when
# age is zero/unknown or fv is missing.
def get_fv_mo
  return if age.to_i == 0 || !fv
  self.fv_mo = 30.4368499 * (fv.to_f / age)
end
|
375
|
+
# def get_fr_fo() self.fr_fo = (fr.to_f / fo) unless (fo.to_i == 0) end
|
376
|
+
# Estimated recent tweets per day: the tweets sampled since scraping day zero
# (tw_recent), scaled by TWEETS_SAMPLED_FRACTION and divided by the time
# elapsed since day zero.
#
# The original divided the all-time tweet count (tw) by the time since day
# zero, which overstates the recent rate for any account with tweets older
# than day zero; tw_recent is the count that belongs to that window.
def get_tw_day_recent
  return unless tw_recent
  self.tw_day_recent = TWEETS_SAMPLED_FRACTION * (tw_recent.to_f / SINCE_DAY_ZERO)
end
|
379
|
+
|
380
|
+
#
|
381
|
+
# Coverage: how many sampled vs. how many known to exist.
|
382
|
+
#
|
383
|
+
# Coverage ratios: items sampled divided by the count known to exist. Each
# getter is skipped when the known count is zero or missing.
#
# NOTE(review): fo_sampled, fr_sampled and the *_coverage members are
# commented out of the UserMetrics member list, so these presumably cannot
# run against the current struct.

def get_fo_coverage
  return if fo.to_i == 0
  self.fo_coverage = fo_sampled.to_f / fo
end

def get_fr_coverage
  return if fr.to_i == 0
  self.fr_coverage = fr_sampled.to_f / fr
end

def get_tw_coverage
  return if tw.to_i == 0
  self.tw_coverage = tw_sampled.to_f / tw
end

def get_fv_coverage
  return if fv.to_i == 0
  self.fv_coverage = fv_out_sampled.to_f / fv
end
|
387
|
+
|
388
|
+
#
|
389
|
+
# Conversational metrics:
|
390
|
+
# favorites, @atsigns and RT's per tweet, in and out
|
391
|
+
# RT per @atsign, in and out
|
392
|
+
#
|
393
|
+
# Conversational ratios. Each getter divides one sampled count by another and
# is skipped when the denominator is zero/missing or the numerator is missing.

# @atsigns sent per sampled tweet.
def get_at_tw_out
  return if tw_sampled.to_i == 0 || !at_out_sampled
  self.at_tw_out = at_out_sampled.to_f / tw_sampled.to_f
end

# Retweets sent per sampled tweet.
def get_rt_tw_out
  return if tw_sampled.to_i == 0 || !rt_out_sampled
  self.rt_tw_out = rt_out_sampled.to_f / tw_sampled.to_f
end

# @atsigns received per sampled tweet sent.
def get_at_in_tw_out
  return if tw_sampled.to_i == 0 || !at_in_sampled
  self.at_in_tw_out = at_in_sampled.to_f / tw_sampled.to_f
end

# Retweets received per sampled tweet sent.
def get_rt_in_tw_out
  return if tw_sampled.to_i == 0 || !rt_in_sampled
  self.rt_in_tw_out = rt_in_sampled.to_f / tw_sampled.to_f
end

# Retweets sent per @atsign sent.
def get_rt_at_out
  return if at_out_sampled.to_i == 0 || !rt_out_sampled
  self.rt_at_out = rt_out_sampled.to_f / at_out_sampled.to_f
end

# Retweets received per @atsign received.
def get_rt_at_in
  return if at_in_sampled.to_i == 0 || !rt_in_sampled
  self.rt_at_in = rt_in_sampled.to_f / at_in_sampled.to_f
end
|
401
|
+
|
402
|
+
#
|
403
|
+
# Reach:
|
404
|
+
#
|
405
|
+
# (your msgs/day) * |n1|
|
406
|
+
#
|
407
|
+
# How many of your messages/day might get read. Audience Share (tw_out_share)
|
408
|
+
# is a better measure of your impact.
|
409
|
+
#
|
410
|
+
# Sets reach to (tweets per day) * (follower count). tw_day is refreshed
# first via get_tw_day; nothing is stored when that yields nothing.
#
# A missing follower count is also skipped: the original multiplied a Float
# by nil in that case and raised TypeError.
def get_reach
  return unless get_tw_day && fo
  self.reach = tw_day.to_f * fo
end
|
414
|
+
|
415
|
+
|
416
|
+
#
|
417
|
+
# Bins
|
418
|
+
#
|
419
|
+
# Bin getters: each looks its raw value up in the matching bin class (via the
# class-level [] method) and stores the result in the *_bin member.

def get_fo_bin
  self.fo_bin = FoBin[fo]
end

def get_fr_bin
  self.fr_bin = FrBin[fr]
end

def get_nbhd_size_bin
  self.nbhd_size_bin = NbhdSizeBin[nbhd_size]
end

def get_nbhd_bal_bin
  self.nbhd_bal_bin = NbhdBalBin[nbhd_bal]
end

def get_tw_day_bin
  self.tw_day_bin = TwDayBin[tw_day]
end

def get_tw_day_recent_bin
  self.tw_day_recent_bin = TwDayRecentBin[tw_day_recent]
end
|
425
|
+
# Sets the active flag (1 or 0). A user counts as active with at least 3
# tweets and either more than 15 followers or a small neighborhood (at least
# 3 followers and at least 2 friends).
#
# Does nothing (returns nil) when fo, fr or tw is missing.
def get_active
  return unless fo && fr && tw
  tweets        = tw >= 3
  has_followers = fo > 15
  has_nbhd      = (fo >= 3) && (fr >= 2)
  self.active = (tweets && (has_followers || has_nbhd)) ? 1 : 0
end
|
432
|
+
end
|
433
|
+
end
|
434
|
+
end
|
435
|
+
|
436
|
+
|
437
|
+
|
438
|
+
#
|
439
|
+
# Executes only if run from command line
|
440
|
+
#
|
441
|
+
# When run directly, print a tab-separated header row: "rsrc" followed by the
# UserMetrics member names.
if __FILE__ == $0
  puts "rsrc\t" + Wuclan::Models::UserMetrics.members.join("\t")
end
|