wuclan 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
require 'date'
|
|
3
|
+
# KLUDGE Executes only if run from command line
|
|
4
|
+
# When this file is the entry script, put the directory two levels up on the
# load path and pull in wukong before anything below needs it.
if $PROGRAM_NAME == __FILE__
  $LOAD_PATH << "#{File.dirname(__FILE__)}/../.."
  require 'wukong'
end
|
|
5
|
+
|
|
6
|
+
#
|
|
7
|
+
# Sampling Multiplier
|
|
8
|
+
#
|
|
9
|
+
TWEETS_SAMPLED_FRACTION = 2.62
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
require 'wukong/datatypes/enum'; include Wukong::Datatypes
|
|
13
|
+
class FoBin < Binned ; enumerates 0, 0, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10_000, 20_000, Infinity ;end
|
|
14
|
+
class FrBin < FoBin ; end
|
|
15
|
+
class NbhdSizeBin < FoBin ; end
|
|
16
|
+
|
|
17
|
+
# Rounds +f+ (anything responding to #to_f) to one decimal place.
#
# @param f [#to_f] value to round
# @return [Float] +f+ rounded to the nearest tenth
def round1(f)
  tenths = (f.to_f * 10).round
  tenths / 10.0
end
|
|
20
|
+
# Returns ten raised to (+i+ / 3), with the exponent first rounded to one
# decimal place by round1.
#
# @param i [#to_f] step index, in thirds of a decade
# @return [Float]
def octave3(i)
  exponent = round1(i.to_f / 3.0)
  10.0**exponent
end
|
|
23
|
+
|
|
24
|
+
# Bins for tweets-per-day rates. The edges run from -Infinity through a few
# per-month / per-week rates, then octave3 steps, up to Infinity.
class TwDayBin < Binned
  MO = 30.4368499 # presumably mean days per month; used to express per-month rates per day
  WK = 7.0

  sub_daily_edges = [-Infinity, 1.0 / MO, 2.0 / MO, 1 / WK]
  octave_edges    = (-2..9).map { |step| octave3(step + 0.5) }
  enumerates(*(sub_daily_edges + octave_edges + [Infinity]))

  sub_daily_names = ['< 1/mo', ' 1-2/mo', ' 2/mo-1/wk', ' 1-2/wk', '~ 4/wk']
  per_day_names   = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000].map { |rate| "~ %4d/day" % rate }
  self.write_inheritable_attribute :names, (sub_daily_names + per_day_names + ['> 1500/day'])
end
|
|
36
|
+
class TwDayRecentBin < TwDayBin ; end
|
|
37
|
+
|
|
38
|
+
# < 1/mo 0.032854911177914
|
|
39
|
+
# 1-2/mo 0.065709822355828
|
|
40
|
+
# 2/mo-1/wk 0.142857142857143
|
|
41
|
+
# 1-2/wk 0.316227766016838
|
|
42
|
+
# ~ 4/wk 0.630957344480193
|
|
43
|
+
# ~ 1/day 1.58489319246111
|
|
44
|
+
# ~ 2/day 3.16227766016838
|
|
45
|
+
# ~ 5/day 6.30957344480193
|
|
46
|
+
# ~ 10/day 15.8489319246111
|
|
47
|
+
# ~ 20/day 31.6227766016838
|
|
48
|
+
# ~ 50/day 63.0957344480193
|
|
49
|
+
# ~ 100/day 158.489319246111
|
|
50
|
+
# ~ 200/day 316.227766016838
|
|
51
|
+
# ~ 500/day 630.957344480193
|
|
52
|
+
# ~ 1000/day 1584.89319246111
|
|
53
|
+
# > 1500/day Infinity
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class TinyInt < Integer ; end
|
|
57
|
+
# Bins for the neighborhood balance. Inner edges start at 0.125 and advance
# in steps of 1/20; the outer edges are -Infinity and Infinity.
class NbhdBalBin < Binned
  inner_edges = Array.new(16) { |n| 0.125 + (n.to_f / 20) }
  enumerates(*[-Infinity, *inner_edges, Infinity])

  # Labels 15, 20, ... 85 between the two named extremes.
  percent_labels = Array.new(15) { |n| 15 + (5 * n) }
  self.write_inheritable_attribute :names, ['Few followers', *percent_labels, 'Mostly Followers']
  # Index 8 is the slot that would otherwise read 50.
  self.names[8] = 'Balanced'
end
|
|
63
|
+
|
|
64
|
+
#
|
|
65
|
+
#
|
|
66
|
+
#
|
|
67
|
+
module Wuclan
|
|
68
|
+
module Models
|
|
69
|
+
class BaseUserTweetMetrics < TypedStruct.new(
|
|
70
|
+
[:id, Integer],
|
|
71
|
+
[:tw_sampled, Integer],
|
|
72
|
+
[:last_tw_at, Bignum],
|
|
73
|
+
[:tw_recent, Integer]
|
|
74
|
+
)
|
|
75
|
+
end
|
|
76
|
+
class UserTweetMetrics < BaseUserTweetMetrics ; end
|
|
77
|
+
class UserTweetUrlMetrics < BaseUserTweetMetrics ; end
|
|
78
|
+
class UserHashtagMetrics < BaseUserTweetMetrics ; end
|
|
79
|
+
|
|
80
|
+
# 1 rsrc 2 id 3 screen_name 4protect 5 age 6duratio 7age_use 8age_las 9 fo
|
|
81
|
+
# 10fr 11 tw 12 fv 13 fr_fo 14 fo_week 15 fr_week 16 tw_day 17 fv_mo 18fo_sampled
|
|
82
|
+
# 19fr_sampled 20tw_sampled 21tw_rece 22tw_day_ 23at_in_s 24at_out_ 25rt_in_s
|
|
83
|
+
# 26rt_out_ 27fv_in_s 28fv_out_ 29any_wit 30any_out 31any_in_ 32at_in_w
|
|
84
|
+
# 33at_out_ 34rt_in_w 35rt_out_ 36fv_in_w 37fv_out_ 38at_tw_o 39rt_tw_o
|
|
85
|
+
# 40rt_at_o 41at_in_t 42rt_in_t 43rt_at_i 44 reach 45fo_cove 46fr_cove
|
|
86
|
+
# 47tw_cove 48fv_cove 49scrape_ 50scrape_ 51scrape_ 52 scraped_at
|
|
87
|
+
# 53part_scraped_at 54 created_at 55 last_tw_at
|
|
88
|
+
|
|
89
|
+
# Mixin that prints a SQL-ish column listing for a struct class exposing
# +members+, +mtypes+ and +mnames+ (NamedTypedStruct extends its result with it).
module StructToSQL
  # Prints one line per member -- backticked name, SQL type, descriptive
  # name -- to standard output.
  #
  # String-like types become 'VARCHAR(255) CHARACTER SET ASCII', Enum types
  # are asked for their own +to_sql_str+, and anything else is the upcased
  # class name.
  #
  # @return [Array<String>] a one-element array holding the comma-separated,
  #   backticked column names of the receiver
  def to_sql_str
    members.zip(mtypes, mnames).each do |attr, type, name|
      type_str = case
                 when type <= String then 'VARCHAR(255) CHARACTER SET ASCII'
                 when type <= Enum   then type.to_sql_str
                 else                     type.to_s.upcase
                 end
      puts " %-23s\t%-23s\t-- %s" % ["`#{attr}`", type_str + ',', name]
    end
    # Use the receiver's own members; this was hard-wired to UserMetrics,
    # which gave the wrong column list for any other struct.
    [members.map { |attr| "`#{attr}`" }.join(", ")]
  end
end
|
|
102
|
+
|
|
103
|
+
# A TypedStruct whose member rows may carry a third, descriptive-name column.
class NamedTypedStruct < TypedStruct
  # Builds the struct class from rows of [member, type] or [member, type, name].
  #
  # The first two columns are handed to TypedStruct.new. When a name column
  # is present, the new class gains an +mnames+ class accessor holding it and
  # is extended with StructToSQL.
  #
  # @param args [Array<Array>] one row per member
  # @return [Class] the generated struct class
  def self.new(*args)
    members, mtypes, mnames = args.transpose
    klass = super(*[members, mtypes].transpose)
    return klass unless mnames

    klass.class_eval do
      cattr_accessor :mnames
      self.mnames = mnames
      extend StructToSQL
    end
    klass
  end
end
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class UserMetrics < NamedTypedStruct.new(
|
|
121
|
+
[:id, Integer , "User ID" ],
|
|
122
|
+
[:screen_name, String , "Twitter Name" ],
|
|
123
|
+
# 4
|
|
124
|
+
[:created_on, Date , "Created On Date" ],
|
|
125
|
+
[:created_at, DateTime , "Created At Date-Time" ],
|
|
126
|
+
[:protected, Integer , "Protected?" ],
|
|
127
|
+
[:active, TinyInt , "Active?" ],
|
|
128
|
+
# 8
|
|
129
|
+
[:fo, Integer , "# Followers" ],
|
|
130
|
+
[:fr, Integer , "# Friends" ],
|
|
131
|
+
[:tw, Integer , "# Tweets Sent" ],
|
|
132
|
+
[:fv, Integer , "# Favorites Out" ],
|
|
133
|
+
# 12
|
|
134
|
+
[:fo_week, Float , "Followers accumulated / week", ], # x
|
|
135
|
+
[:fr_week, Float , "Friends accumulated / week", ], # x
|
|
136
|
+
[:tw_day, Float , "Tweets sent / day", ], # x
|
|
137
|
+
[:fv_mo, Float , "Favorites accumulated / month", ], # x
|
|
138
|
+
# 16
|
|
139
|
+
[:tw_recent, Integer , "Tweets sampled 2008 Dec 1", ], # t
|
|
140
|
+
[:tw_day_recent, Float , "Recent Tweets / day (*)", ], # t
|
|
141
|
+
[:last_tw_at, Date , "Last Tweet Date (*)", ], # t
|
|
142
|
+
[:last_tw_age, Integer , "Age of most recent tweet (*)", ], # x
|
|
143
|
+
# 20
|
|
144
|
+
[:at_in_sampled, Integer , "# @atsigns to (*)", ], # g
|
|
145
|
+
[:at_out_sampled, Integer , "# @atsigns from (*)" ], # g
|
|
146
|
+
[:rt_in_sampled, Integer , "# ReTweets of (*)", ], # g
|
|
147
|
+
[:rt_out_sampled, Integer , "# ReTweets by (*)" ], # g
|
|
148
|
+
[:fv_in_sampled, Integer , "# favorites of (*)", ], # g
|
|
149
|
+
[:fv_out_sampled, Integer , "# favorites by (*)", ], # c
|
|
150
|
+
# 26
|
|
151
|
+
[:any_with, Integer , "Users linked to or from", ], # A
|
|
152
|
+
[:any_in_with, Integer , "Distinct At+RT+Fv users in", ], # A
|
|
153
|
+
[:any_out_with, Integer , "Distinct At+RT+Fv users out", ], # A
|
|
154
|
+
# 29
|
|
155
|
+
[:at_in_with, Integer , "Users @atsigned in (*)", ], # g
|
|
156
|
+
[:at_out_with, Integer , "Users @atsigned out (*)" ], # g",
|
|
157
|
+
[:rt_in_with, Integer , "Users Retweeted in (*)", ], # g
|
|
158
|
+
[:rt_out_with, Integer , "Users Retweeted out (*)" ], # g
|
|
159
|
+
[:fv_in_with, Integer , "Users Favorited in (*)", ], # g
|
|
160
|
+
[:fv_out_with, Integer , "Users Favorited out (*)", ], # c
|
|
161
|
+
# 35
|
|
162
|
+
[:tw_sampled, Integer , "How many tweets have we sampled", ], # t
|
|
163
|
+
[:hashtag_sampled, Integer, "How many hashtags have we seen"],
|
|
164
|
+
[:tweet_url_sampled, Integer, "How many tweet_url's have we seen"],
|
|
165
|
+
[:at_tw_out, Float , "Atsigns out per tweet out (*)", ], # g
|
|
166
|
+
[:rt_tw_out, Float , "Retweets out per tweet out (*)", ], # g
|
|
167
|
+
[:rt_at_out, Float , "Retweets out per atsign out (*)", ], # g
|
|
168
|
+
[:at_in_tw_out, Float , "Atsigns in per tweet out (*)", ], # g
|
|
169
|
+
[:rt_in_tw_out, Float , "Retweets in per tweet out (*)", ], # g
|
|
170
|
+
[:rt_at_in, Float , "Retweets in per atsign in (*)", ], # g
|
|
171
|
+
# 40
|
|
172
|
+
[:nbhd_bal, Float , "Neighborhood Balance", ], #
|
|
173
|
+
[:nbhd_size, Integer , "Neighborhood Size", ], #
|
|
174
|
+
[:reach, Integer , "Reach: (tweets/day) * |followers|", ], # x
|
|
175
|
+
# 43
|
|
176
|
+
[:fo_bin, FoBin , "# Followers grp"],
|
|
177
|
+
[:fr_bin, FrBin , "# Friends grp"],
|
|
178
|
+
[:nbhd_size_bin, NbhdSizeBin , "Neighborhood Size grp"],
|
|
179
|
+
[:nbhd_bal_bin, NbhdBalBin , "Neighborhood Balance grp"],
|
|
180
|
+
[:tw_day_bin, TwDayBin , "Tweets / day grp (*)"],
|
|
181
|
+
[:tw_day_recent_bin, TwDayRecentBin , "Recent Tweets / day grp (*)"],
|
|
182
|
+
#
|
|
183
|
+
[:part_scraped_at, Date , "User partial Scrape Date", ], #
|
|
184
|
+
[:scraped_at, Date , "User Scrape Date" ]
|
|
185
|
+
# #
|
|
186
|
+
# [:fo_sampled, Integer , "How many followers have we sampled", ], # c
|
|
187
|
+
# [:fr_sampled, Integer , "How many friend have we sampled", ], # c
|
|
188
|
+
# [:fo_coverage, Float , "Friends sampled / known to exist", ], # c
|
|
189
|
+
# [:fr_coverage, Float , "Followers sampled / known to exist", ], # c
|
|
190
|
+
# [:tw_coverage, Float , "Tweets sampled / known to exist", ], # c
|
|
191
|
+
# [:fv_coverage, Float , "Favorites sampled / known to exist", ], # c
|
|
192
|
+
# #
|
|
193
|
+
# [:scrape_age_fo, Integer , "How long since your followers graph record was scraped", ], # c
|
|
194
|
+
# [:scrape_age_fr, Integer , "How long since your friends graph record was scraped", ], # c
|
|
195
|
+
# [:scrape_age_fv, Integer , "How long since your favorites graph record was scraped", ], # c
|
|
196
|
+
#
|
|
197
|
+
# [:age, Integer , "Days between creation and now", ], # x
|
|
198
|
+
# [:duration, Integer , "Days between last scrape and creation", ], # x
|
|
199
|
+
# [:age_user_scrape, Integer , "How long since your user record was scraped", ], # x
|
|
200
|
+
)
|
|
201
|
+
SCRAPING_DAY_ZERO_STR = 20081201000000
|
|
202
|
+
SCRAPING_DAY_ZERO = DateTime.parse_safely(SCRAPING_DAY_ZERO_STR.to_s)
|
|
203
|
+
SINCE_DAY_ZERO = DateTime.now.utc - SCRAPING_DAY_ZERO
|
|
204
|
+
|
|
205
|
+
# Runs the bin getters and then get_active, in a fixed order.
#
# @return the value returned by get_active
def fix!
  [
    :get_tw_day_bin,
    :get_tw_day_recent_bin,
    :get_fo_bin,
    :get_fr_bin,
    :get_nbhd_size_bin,
    :get_nbhd_bal_bin,
    :get_active,
  ].map { |step| send(step) }.last
end
|
|
214
|
+
|
|
215
|
+
#
|
|
216
|
+
# Fix formatting as we flatten
|
|
217
|
+
#
|
|
218
|
+
# Flattens the record into an array, formatting each value by its declared type.
#
# nil values stay nil. The :id member is zero-padded to ten digits; Float,
# Integer, DateTime and Date members get fixed formats; anything else is
# passed through untouched.
#
# @return [Array] one entry per member, in member order
def to_a
  members.zip(mtypes).map do |field, klass|
    raw = self[field]
    if    raw.nil?            then nil
    elsif field.to_sym == :id then "%010d" % raw.to_i
    elsif klass == Float      then "%f" % raw.to_f
    elsif klass == Integer    then "%7d" % raw.to_i
    elsif klass == DateTime   then raw.strftime("%Y/%m/%d %H:%M:%S")
    elsif klass == Date       then raw.strftime("%Y/%m/%d")
    else                           raw
    end
  end
end
|
|
232
|
+
def protected?
|
|
233
|
+
protected.to_i == 1
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
#
|
|
237
|
+
#
|
|
238
|
+
#
|
|
239
|
+
def adopt_user user
|
|
240
|
+
@user_adopted = true
|
|
241
|
+
self.merge! user
|
|
242
|
+
self.created_at = DateTime.parse_safely(created_at)
|
|
243
|
+
self.scraped_at = DateTime.parse_safely(scraped_at)
|
|
244
|
+
end
|
|
245
|
+
def user_adopted?
|
|
246
|
+
@user_adopted
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
#
|
|
250
|
+
#
|
|
251
|
+
#
|
|
252
|
+
def adopt_user_partial user_partial
|
|
253
|
+
user_scraped_at = self.scraped_at
|
|
254
|
+
self.merge! user_partial
|
|
255
|
+
# restore scraped dates
|
|
256
|
+
self.part_scraped_at = DateTime.parse_safely(user_partial.scraped_at)
|
|
257
|
+
self.scraped_at = user_scraped_at
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
#
|
|
261
|
+
#
|
|
262
|
+
#
|
|
263
|
+
def adopt_scraping_metrics usm
|
|
264
|
+
[
|
|
265
|
+
[:friends_ids, :scrape_age_fr, ],
|
|
266
|
+
[:followers_ids, :scrape_age_fo, ],
|
|
267
|
+
[:favorites, :scrape_age_fv, ],
|
|
268
|
+
].each do |context, attr|
|
|
269
|
+
dt = usm.get context, :scraped_at
|
|
270
|
+
next if (usm.get(context, :successes).to_i < 1) || (dt.blank?)
|
|
271
|
+
# fudge bogus date records
|
|
272
|
+
dt = (dt.to_i < SCRAPING_DAY_ZERO_STR) ? SCRAPING_DAY_ZERO : DateTime.parse_safely(dt)
|
|
273
|
+
next if dt.blank?
|
|
274
|
+
self[attr] = now - dt
|
|
275
|
+
end
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
#
|
|
279
|
+
# From simple graph metrics
|
|
280
|
+
#
|
|
281
|
+
def adopt_graph_metrics user_graph_metrics
|
|
282
|
+
self.merge! user_graph_metrics
|
|
283
|
+
end
|
|
284
|
+
|
|
285
|
+
#
|
|
286
|
+
# User's Tweets -- from UserTweetMetrics
|
|
287
|
+
#
|
|
288
|
+
def adopt_tweet_metrics metrics
|
|
289
|
+
case metrics
|
|
290
|
+
when UserTweetMetrics
|
|
291
|
+
self.tw_sampled = metrics.tw_sampled
|
|
292
|
+
self.last_tw_at = DateTime.parse_safely(metrics.last_tw_at)
|
|
293
|
+
self.tw_recent = metrics.tw_recent
|
|
294
|
+
when UserHashtagMetrics
|
|
295
|
+
self.hashtag_sampled = metrics.tw_sampled
|
|
296
|
+
when UserTweetUrlMetrics
|
|
297
|
+
self.tweet_url_sampled = metrics.tw_sampled
|
|
298
|
+
else raise "Can't adopt #{metrics}" end
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
def followers_count=( fo) self.fo = fo.to_i end
|
|
302
|
+
def friends_count=( fr) self.fr = fr.to_i end
|
|
303
|
+
def statuses_count=( tw) self.tw = tw.to_i end
|
|
304
|
+
def favourites_count=(fv) self.fv = fv.to_i end
|
|
305
|
+
def followers_count() self.fo end
|
|
306
|
+
def friends_count() self.fr end
|
|
307
|
+
def statuses_count() self.tw end
|
|
308
|
+
def favourites_count() self.fv end
|
|
309
|
+
#
|
|
310
|
+
# Larger of fr and fo
|
|
311
|
+
#
|
|
312
|
+
# Sets nbhd_size to the larger of fr and fo, ignoring whichever is nil.
#
# @return the assigned value (nil when both are nil)
def get_nbhd_size
  known_counts = [fr, fo].reject(&:nil?)
  self.nbhd_size = known_counts.max
end
|
|
315
|
+
#
|
|
316
|
+
# Sets nbhd_bal, a balance score built from fr and fo.
#
# When fr exceeds fo the score is 0.5 * fo / fr; otherwise it is
# 1.0 - 0.5 * fr / fo. Nothing is set when either count is missing or
# neither is positive.
#
# @return [Float, nil] the assigned score, or nil when skipped
def get_nbhd_bal
  return unless fr && fo
  return unless fr.to_i > 0 || fo.to_i > 0

  self.nbhd_bal =
    if fr > fo
      0.5 * fo.to_f / fr.to_f
    else
      1.0 - (0.5 * fr.to_f / fo.to_f)
    end
end
|
|
322
|
+
def get_created_on
|
|
323
|
+
self.created_on = self.created_at
|
|
324
|
+
end
|
|
325
|
+
|
|
326
|
+
def crat
|
|
327
|
+
@crat ||= created_at
|
|
328
|
+
end
|
|
329
|
+
def scat
|
|
330
|
+
@scat ||= scraped_at
|
|
331
|
+
end
|
|
332
|
+
def part_scat
|
|
333
|
+
@part_scat ||= part_scraped_at
|
|
334
|
+
end
|
|
335
|
+
def twat
|
|
336
|
+
@twat ||= last_tw_at
|
|
337
|
+
end
|
|
338
|
+
def now
|
|
339
|
+
@now ||= DateTime.now
|
|
340
|
+
end
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
#
|
|
344
|
+
# duration --
|
|
345
|
+
#
|
|
346
|
+
# Whole-number difference between now and crat, memoized in @age.
#
# @return [Integer, nil] nil when crat is not set
def age
  if @age
    @age
  elsif crat
    @age = (now - crat).to_i
  end
end
|
|
351
|
+
# Whole-number difference between the later of scat / part_scat and crat,
# memoized in @duration.
#
# @return [Integer, nil] nil when crat is missing or neither scrape time is set
def duration
  unless @duration
    return unless crat

    latest_scrape = [scat, part_scat].compact.max
    return unless latest_scrape

    @duration = (latest_scrape - crat).to_i
  end
  @duration
end
|
|
357
|
+
# Sets last_tw_age ("Age of most recent tweet") to the whole-number
# difference between now and twat.
#
# This used to compute crat - twat, i.e. creation time minus last-tweet
# time. That is zero or negative for any real account and is not an age;
# measuring from now matches how age and get_age_user_scrape work.
#
# @return [Integer, nil] the assigned age, or nil when twat is not set
def get_last_tw_age
  return unless twat

  self.last_tw_age = (now - twat).to_i
end
|
|
361
|
+
def get_age_user_scrape()
|
|
362
|
+
return unless scat
|
|
363
|
+
self.age_user_scrape = ( now - scat ).to_i
|
|
364
|
+
end
|
|
365
|
+
|
|
366
|
+
#
|
|
367
|
+
# Per-day metrics
|
|
368
|
+
#
|
|
369
|
+
# Should possibly use duration but need to worry about which one.
|
|
370
|
+
#
|
|
371
|
+
# Sets fo_week to seven times fo / duration. Skipped when duration is
# zero or unknown, or when fo is missing.
def get_fo_week
  return if duration.to_i.zero?
  return unless fo

  self.fo_week = 7 * (fo.to_f / duration)
end
|
|
372
|
+
# Sets fr_week to seven times fr / age. Skipped when age is zero or
# unknown, or when fr is missing.
def get_fr_week
  return if age.to_i.zero?
  return unless fr

  self.fr_week = 7 * (fr.to_f / age)
end
|
|
373
|
+
# Sets tw_day to tw / duration. Skipped when duration is zero or unknown,
# or when tw is missing.
def get_tw_day
  return if duration.to_i.zero?
  return unless tw

  self.tw_day = tw.to_f / duration
end
|
|
374
|
+
# Sets fv_mo to 30.4368499 times fv / age. Skipped when age is zero or
# unknown, or when fv is missing.
def get_fv_mo
  return if age.to_i.zero?
  return unless fv

  self.fv_mo = 30.4368499 * (fv.to_f / age)
end
|
|
375
|
+
# def get_fr_fo() self.fr_fo = (fr.to_f / fo) unless (fo.to_i == 0) end
|
|
376
|
+
# Sets tw_day_recent: the sampled recent tweet count, scaled up by
# TWEETS_SAMPLED_FRACTION and divided by SINCE_DAY_ZERO.
#
# This used to scale tw, the lifetime tweet count, which is neither
# sampled nor recent. tw_recent is the count the multiplier applies to.
#
# @return [Float, nil] the assigned rate, or nil when tw_recent is not set
def get_tw_day_recent
  return unless tw_recent

  self.tw_day_recent = TWEETS_SAMPLED_FRACTION * (tw_recent.to_f / SINCE_DAY_ZERO)
end
|
|
379
|
+
|
|
380
|
+
#
|
|
381
|
+
# Coverage: how many sampled vs. how many known to exist.
|
|
382
|
+
#
|
|
383
|
+
def get_fo_coverage() self.fo_coverage = (fo_sampled.to_f / fo) unless (fo.to_i == 0) end
|
|
384
|
+
def get_fr_coverage() self.fr_coverage = (fr_sampled.to_f / fr) unless (fr.to_i == 0) end
|
|
385
|
+
def get_tw_coverage() self.tw_coverage = (tw_sampled.to_f / tw) unless (tw.to_i == 0) end
|
|
386
|
+
def get_fv_coverage() self.fv_coverage = (fv_out_sampled.to_f / fv) unless (fv.to_i == 0) end
|
|
387
|
+
|
|
388
|
+
#
|
|
389
|
+
# Conversational metrics:
|
|
390
|
+
# favorites, @atsigns and RT's per tweet, in and out
|
|
391
|
+
# RT per @atsign, in and out
|
|
392
|
+
#
|
|
393
|
+
# Sets at_tw_out to at_out_sampled / tw_sampled. Skipped when tw_sampled
# is zero or unknown, or when at_out_sampled is missing.
def get_at_tw_out
  return if tw_sampled.to_i.zero?
  return unless at_out_sampled

  self.at_tw_out = at_out_sampled.to_f / tw_sampled.to_f
end
|
|
394
|
+
# Sets rt_tw_out to rt_out_sampled / tw_sampled. Skipped when tw_sampled
# is zero or unknown, or when rt_out_sampled is missing.
def get_rt_tw_out
  return if tw_sampled.to_i.zero?
  return unless rt_out_sampled

  self.rt_tw_out = rt_out_sampled.to_f / tw_sampled.to_f
end
|
|
395
|
+
#
|
|
396
|
+
# Sets at_in_tw_out to at_in_sampled / tw_sampled. Skipped when tw_sampled
# is zero or unknown, or when at_in_sampled is missing.
def get_at_in_tw_out
  return if tw_sampled.to_i.zero?
  return unless at_in_sampled

  self.at_in_tw_out = at_in_sampled.to_f / tw_sampled.to_f
end
|
|
397
|
+
# Sets rt_in_tw_out to rt_in_sampled / tw_sampled. Skipped when tw_sampled
# is zero or unknown, or when rt_in_sampled is missing.
def get_rt_in_tw_out
  return if tw_sampled.to_i.zero?
  return unless rt_in_sampled

  self.rt_in_tw_out = rt_in_sampled.to_f / tw_sampled.to_f
end
|
|
398
|
+
#
|
|
399
|
+
# Sets rt_at_out to rt_out_sampled / at_out_sampled. Skipped when
# at_out_sampled is zero or unknown, or when rt_out_sampled is missing.
def get_rt_at_out
  return if at_out_sampled.to_i.zero?
  return unless rt_out_sampled

  self.rt_at_out = rt_out_sampled.to_f / at_out_sampled.to_f
end
|
|
400
|
+
# Sets rt_at_in to rt_in_sampled / at_in_sampled. Skipped when
# at_in_sampled is zero or unknown, or when rt_in_sampled is missing.
def get_rt_at_in
  return if at_in_sampled.to_i.zero?
  return unless rt_in_sampled

  self.rt_at_in = rt_in_sampled.to_f / at_in_sampled.to_f
end
|
|
401
|
+
|
|
402
|
+
#
|
|
403
|
+
# Reach:
|
|
404
|
+
#
|
|
405
|
+
# (your msgs/day) * |n1|
|
|
406
|
+
#
|
|
407
|
+
# How many of your messages/day might get read. Audience Share (tw_out_share)
|
|
408
|
+
# is a better measure of your impact.
|
|
409
|
+
#
|
|
410
|
+
# Sets reach to tw_day * fo (tweets per day times follower count).
#
# get_tw_day is run first so tw_day is current. Nothing is set when no rate
# could be computed or when fo is missing; the latter used to raise
# TypeError from multiplying a Float by nil.
#
# @return [Float, nil] the assigned reach, or nil when skipped
def get_reach
  return unless get_tw_day
  return unless fo

  self.reach = tw_day.to_f * fo
end
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
#
|
|
417
|
+
# Bins
|
|
418
|
+
#
|
|
419
|
+
def get_fo_bin() self.fo_bin = FoBin[fo] end
|
|
420
|
+
def get_fr_bin() self.fr_bin = FrBin[fr] end
|
|
421
|
+
def get_nbhd_size_bin() self.nbhd_size_bin = NbhdSizeBin[nbhd_size] end
|
|
422
|
+
def get_nbhd_bal_bin() self.nbhd_bal_bin = NbhdBalBin[nbhd_bal] end
|
|
423
|
+
def get_tw_day_bin() self.tw_day_bin = TwDayBin[tw_day] end
|
|
424
|
+
def get_tw_day_recent_bin() self.tw_day_recent_bin = TwDayRecentBin[tw_day_recent] end
|
|
425
|
+
# Sets active to 1 or 0.
#
# A user counts as active with at least 3 tweets and either more than 15
# followers, or at least 3 followers together with at least 2 friends.
# Nothing is set when fo, fr or tw is missing.
#
# @return [Integer, nil] 1 or 0, or nil when skipped
def get_active
  return unless fo && fr && tw

  enough_tweets   = tw >= 3
  enough_audience = fo > 15 || (fo >= 3 && fr >= 2)
  self.active = (enough_tweets && enough_audience) ? 1 : 0
end
|
|
432
|
+
end
|
|
433
|
+
end
|
|
434
|
+
end
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
#
|
|
439
|
+
# Executes only if run from command line
|
|
440
|
+
#
|
|
441
|
+
# Run as a script: print a tab-separated header row, "rsrc" followed by
# the UserMetrics member names.
if $PROGRAM_NAME == __FILE__
  header_fields = Wuclan::Models::UserMetrics.members.join("\t")
  puts "rsrc\t#{header_fields}"
end
|