wuclan 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
require 'date'
|
|
3
|
+
# Executes only if run from command line
|
|
4
|
+
if __FILE__ == $0 then $: << File.dirname(__FILE__)+'/../..'; require 'wukong'; end
|
|
5
|
+
|
|
6
|
+
#
|
|
7
|
+
#
|
|
8
|
+
#
|
|
9
|
+
module Wuclan
|
|
10
|
+
module Models
|
|
11
|
+
#
# Tweet-derived counters for a single user id.
#
# The three non-id members (tw_seen, last_tw_at, tw_recent) are exactly the
# values that UserMetrics#adopt_tweet_metrics copies onto a UserMetrics record.
#
class UserTweetMetrics < TypedStruct.new(
    [:id,         Integer],
    [:tw_seen,    Integer],
    [:last_tw_at, Bignum],
    [:tw_recent,  Integer]
  )
end
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# 1 rsrc 2 id 3 screen_name 4protect 5 age 6duratio 7age_use 8age_las 9 fo
|
|
21
|
+
# 10fr 11 tw 12 fv 13 fr_fo 14 fo_day 15 fr_day 16 tw_day 17 fv_day 18fo_seen
|
|
22
|
+
# 19fr_seen 20tw_seen 21tw_rece 22tw_day_ 23at_in_s 24at_out_ 25rt_in_s
|
|
23
|
+
# 26rt_out_ 27fv_in_s 28fv_out_ 29any_wit 30any_out 31any_in_ 32at_in_w
|
|
24
|
+
# 33at_out_ 34rt_in_w 35rt_out_ 36fv_in_w 37fv_out_ 38at_tw_o 39rt_tw_o
|
|
25
|
+
# 40rt_at_o 41at_in_t 42rt_in_t 43rt_at_i 44 reach 45fo_cove 46fr_cove
|
|
26
|
+
# 47tw_cove 48fv_cove 49scrape_ 50scrape_ 51scrape_ 52 scraped_at
|
|
27
|
+
# 53part_scraped_at 54 created_at 55 last_tw_at
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# [
|
|
32
|
+
# #
|
|
33
|
+
# [:fo_day, Float], # x Followers accumulated / day
|
|
34
|
+
# [:fr_day, Float], # x Friends accumulated / day
|
|
35
|
+
# [:tw_day, Float], # x Tweets sent / day
|
|
36
|
+
# [:fv_day, Float], # x Favorites accumulated / day
|
|
37
|
+
# #
|
|
38
|
+
# [:at_in_with, Integer], # g Users Atsigning you
|
|
39
|
+
# [:at_out_with, Integer], # g
|
|
40
|
+
# [:rt_in_with, Integer], # g Users Retweeting you
|
|
41
|
+
# [:rt_out_with, Integer], # g
|
|
42
|
+
# [:fv_in_with, Integer], # g Favorites by others of your tweets, with
|
|
43
|
+
# [:fv_out_with, Integer], # c Number of users who Favorited you
|
|
44
|
+
# #
|
|
45
|
+
# [:at_tw_out, Float], # g Atsigns out per tweet out
|
|
46
|
+
# [:rt_tw_out, Float], # g Retweets out per tweet out
|
|
47
|
+
# [:rt_at_out, Float], # g Retweets out per atsign out seen
|
|
48
|
+
# [:at_in_tw_out, Float], # g Atsigns in per tweet out
|
|
49
|
+
# [:rt_in_tw_out, Float], # g Retweets in per tweet out
|
|
50
|
+
# [:rt_at_in, Float], # g Retweets in seen per atsign in seen
|
|
51
|
+
# #
|
|
52
|
+
# [:reach, Integer], # x Reach: (your msgs/day) * |n1|
|
|
53
|
+
# ]
|
|
54
|
+
# [
|
|
55
|
+
# [:scraped_at, Bignum],
|
|
56
|
+
# [:part_scraped_at, Bignum], # Date of last user partial update
|
|
57
|
+
# #
|
|
58
|
+
# [:fo_coverage, Float], # c Followers seen / known to exist
|
|
59
|
+
# [:fr_coverage, Float], # c Friends seen / known to exist
|
|
60
|
+
# [:tw_coverage, Float], # c Tweets seen / known to exist
|
|
61
|
+
# [:fv_coverage, Float], # c Favorites seen / known to exist
|
|
62
|
+
# #
|
|
63
|
+
# [:fo_scraped_at, Integer], # c How long since your followers graph record was scraped
|
|
64
|
+
# [:fr_scraped_at, Integer], # c How long since your friends graph record was scraped
|
|
65
|
+
# [:fv_scraped_at, Integer], # c How long since your favorites graph record was scraped
|
|
66
|
+
# #
|
|
67
|
+
# ]
|
|
68
|
+
#
|
|
69
|
+
|
|
70
|
+
#
# Flat, per-user metrics record.
#
# Only the members listed in the TypedStruct below are declared here. Many of
# the methods also read or assign attributes (age, scraped_at, fo_seen,
# tw_day, reach, ...) that are not declared in this block.
# NOTE(review): presumably those are supplied elsewhere (merge!, or a reopened
# class) -- confirm before relying on them.
#
class UserMetrics < TypedStruct.new(
    [:id,         Integer],
    # dates
    [:created_at, Bignum],
    [:last_tw_at, Bignum],    # date of the last seen tweet
    # raw counts (see the *_count readers/writers further down)
    [:fo,         Integer],
    [:fr,         Integer],
    [:tw,         Integer],
    [:fv,         Integer],
    # untyped members
    :n_followers_cat,
    :n_friends_cat,
    :n_tweets_cat,
    :active,
    [:any_with,   Integer],
    :neighborhood_size,
    :friend_follower_balance
  )

  # Despite the _STR suffix this is an Integer (14 digits); it is compared
  # numerically against scraped-at values in adopt_scraping_metrics.
  SCRAPING_DAY_ZERO_STR = 20081201000000
  SCRAPING_DAY_ZERO     = DateTime.parse_safely(SCRAPING_DAY_ZERO_STR.to_s)
  # Span between day zero and the moment this file was loaded.
  SINCE_DAY_ZERO        = DateTime.now - SCRAPING_DAY_ZERO

  #
  # Formatted values, one per member and in member order. A nil member yields
  # nil. The :id member is zero-padded to 10 digits; otherwise the format is
  # chosen by the member's declared type, and untyped members pass through.
  #
  def to_a
    members.zip(mtypes).map do |name, klass|
      value = self[name]
      if value.nil?
        nil
      elsif name.to_sym == :id
        '%010d' % value.to_i
      elsif klass == Float
        '%f' % value.to_f
      elsif klass == Integer
        '%7d' % value.to_i
      elsif klass == Bignum
        '%15s' % value
      else
        value
      end
    end
  end

  # True when the +protected+ attribute, read as an integer, equals 1.
  def protected?
    flag = protected.to_i
    flag == 1
  end

  #
  # Whether adopt_user has been called on this record.
  #
  def user_adopted?
    @user_adopted
  end

  #
  # Mark the record as adopted, then merge the full user record into it.
  #
  def adopt_user user
    @user_adopted = true
    merge!(user)
  end

  #
  # Merge a partial user record while keeping this record's own scraped_at.
  # The partial's scraped_at is stored in part_scraped_at instead.
  #
  def adopt_user_partial user_partial
    own_scraped_at = scraped_at
    merge!(user_partial)
    # restore scraped dates
    self.part_scraped_at = user_partial.scraped_at
    self.scraped_at      = own_scraped_at
  end

  #
  # For each graph context, store how long ago it was scraped
  # (now - scrape date) in the matching scrape_age_* attribute.
  #
  # A context is skipped when it has no successes or no scrape date.
  #
  def adopt_scraping_metrics usm
    age_attr_by_context = [
      [:friends_ids,   :scrape_age_fr],
      [:followers_ids, :scrape_age_fo],
      [:favorites,     :scrape_age_fv],
    ]
    age_attr_by_context.each do |context, age_attr|
      raw_date = usm.get(context, :scraped_at)
      next if usm.get(context, :successes).to_i < 1
      next if raw_date.blank?
      # fudge bogus date records: anything before day zero becomes day zero
      scraped_on =
        if raw_date.to_i < SCRAPING_DAY_ZERO_STR
          SCRAPING_DAY_ZERO
        else
          DateTime.parse_safely(raw_date)
        end
      next if scraped_on.blank?
      self[age_attr] = now - scraped_on
    end
  end

  #
  # From simple graph metrics: merged in wholesale.
  #
  def adopt_graph_metrics user_graph_metrics
    merge!(user_graph_metrics)
  end

  #
  # User's tweets -- copies the three counters carried by a UserTweetMetrics.
  #
  def adopt_tweet_metrics user_tweet_metrics
    self.tw_seen    = user_tweet_metrics.tw_seen
    self.last_tw_at = user_tweet_metrics.last_tw_at
    self.tw_recent  = user_tweet_metrics.tw_recent
  end

  # Long-named writers for the short count members; values are coerced
  # with to_i.
  def followers_count=(fo)
    self.fo = fo.to_i
  end

  def friends_count=(fr)
    self.fr = fr.to_i
  end

  def statuses_count=(tw)
    self.tw = tw.to_i
  end

  def favourites_count=(fv)
    self.fv = fv.to_i
  end

  # Long-named readers for the same members.
  def followers_count
    fo
  end

  def friends_count
    fr
  end

  def statuses_count
    tw
  end

  def favourites_count
    fv
  end

  # Memoized DateTime views of the raw date attributes. Once a non-nil value
  # is cached it is not re-parsed, even if the raw attribute changes later.
  def crat
    @crat ||= DateTime.parse_safely(created_at)
  end

  def scat
    @scat ||= DateTime.parse_safely(scraped_at)
  end

  def part_scat
    @part_scat ||= DateTime.parse_safely(part_scraped_at)
  end

  def twat
    @twat ||= DateTime.parse_safely(last_tw_at)
  end

  # A single "now" per record, fixed on first use so that all ages computed
  # on this record share the same reference time.
  def now
    @now ||= DateTime.now
  end

  #
  # Ages and duration: DateTime differences truncated with to_i. Each method
  # returns nil, assigning nothing, when a date it needs is missing.
  #
  def get_age
    self.age = (now - crat).to_i if crat
  end

  def get_duration
    self.duration = (scat - crat).to_i if crat && scat
  end

  def get_age_last_tw
    self.age_last_tw = (now - twat).to_i if twat
  end

  def get_age_user_scrape
    self.age_user_scrape = (now - scat).to_i if scat
  end

  #
  # Per-day metrics: count / age, skipped when age is zero.
  #
  # Should possibly use duration but need to worry about which one.
  #
  def get_fo_day
    return if age.to_i == 0
    self.fo_day = fo.to_f / age
  end

  def get_fr_day
    return if age.to_i == 0
    self.fr_day = fr.to_f / age
  end

  def get_tw_day
    return if age.to_i == 0
    self.tw_day = tw.to_f / age
  end

  def get_fv_day
    return if age.to_i == 0
    self.fv_day = fv.to_f / age
  end

  # Friends per follower; skipped when there are no followers.
  def get_fr_fo
    return if fo.to_i == 0
    self.fr_fo = fr.to_f / fo
  end

  # NOTE(review): this divides the total tweet count (tw), not tw_recent, by
  # the time since day zero -- confirm that is intended.
  def get_tw_day_recent
    self.tw_day_recent = tw.to_f / SINCE_DAY_ZERO
  end

  #
  # Coverage: how many seen vs. how many known to exist. Each is skipped
  # when the known count is zero.
  #
  def get_fo_coverage
    return if fo.to_i == 0
    self.fo_coverage = fo_seen.to_f / fo
  end

  def get_fr_coverage
    return if fr.to_i == 0
    self.fr_coverage = fr_seen.to_f / fr
  end

  def get_tw_coverage
    return if tw.to_i == 0
    self.tw_coverage = tw_seen.to_f / tw
  end

  def get_fv_coverage
    return if fv.to_i == 0
    self.fv_coverage = fv_out_seen.to_f / fv
  end

  #
  # Conversational metrics:
  #   @atsigns and RTs per tweet seen, in and out
  #   RTs per @atsign, in and out
  # Each is skipped when its denominator is zero.
  #
  def get_at_tw_out
    return if tw_seen.to_i == 0
    self.at_tw_out = at_out_seen.to_f / tw_seen.to_f
  end

  def get_rt_tw_out
    return if tw_seen.to_i == 0
    self.rt_tw_out = rt_out_seen.to_f / tw_seen.to_f
  end

  def get_at_in_tw_out
    return if tw_seen.to_i == 0
    self.at_in_tw_out = at_in_seen.to_f / tw_seen.to_f
  end

  def get_rt_in_tw_out
    return if tw_seen.to_i == 0
    self.rt_in_tw_out = rt_in_seen.to_f / tw_seen.to_f
  end

  def get_rt_at_out
    return if at_out_seen.to_i == 0
    self.rt_at_out = rt_out_seen.to_f / at_out_seen.to_f
  end

  def get_rt_at_in
    return if at_in_seen.to_i == 0
    self.rt_at_in = rt_in_seen.to_f / at_in_seen.to_f
  end

  #
  # Reach:
  #
  #   (your msgs/day) * |n1|
  #
  # How many of your messages/day might get read. Audience Share
  # (tw_out_share) is a better measure of your impact.
  #
  # Recomputes tw_day first and gives up (returning nil) when that yields
  # nothing, i.e. when age is zero.
  #
  def get_reach
    return unless get_tw_day
    self.reach = tw_day.to_f * fo
  end

end
|
|
267
|
+
end
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
#
|
|
273
|
+
# Executes only if run from command line
|
|
274
|
+
#
|
|
275
|
+
# When this file is run directly, print a tab-separated header row: "rsrc"
# followed by every UserMetrics member name.
if $0 == __FILE__
  member_names = Wuclan::Models::UserMetrics.members
  puts "rsrc\t" + member_names.join("\t")
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
require 'date'
|
|
2
|
+
module Wuclan
|
|
3
|
+
module Models
|
|
4
|
+
# Scrape contexts tracked per user. :friends, :followers and :user_timeline
# are currently commented out of the list.
USER_SCRAPING_CONTEXTS = [
  :user,
  :friends_ids,
  :followers_ids,
  :favorites,
]
|
|
5
|
+
# USER_SCRAPING_METRICS_MEMBERS = []
|
|
6
|
+
# USER_SCRAPING_CONTEXTS.map do |context|
|
|
7
|
+
# USER_SCRAPING_METRICS_MEMBERS += [
|
|
8
|
+
# ["#{context}_scraped_at", Bignum],
|
|
9
|
+
# ["#{context}_attempts", Integer],
|
|
10
|
+
# ["#{context}_successes", Integer],
|
|
11
|
+
# ["#{context}_failures", String],
|
|
12
|
+
# ]
|
|
13
|
+
# end
|
|
14
|
+
# p USER_SCRAPING_METRICS_MEMBERS
|
|
15
|
+
|
|
16
|
+
#
|
|
17
|
+
# Workable kludge to denormalize records
|
|
18
|
+
#
|
|
19
|
+
#
|
|
20
|
+
#
# One denormalized row per user id: for every scrape context there are four
# members, <context>_scraped_at, <context>_attempts, <context>_successes and
# <context>_failures.
#
class UserScrapingMetrics < TypedStruct.new(
    [:id, Integer],
    # *USER_SCRAPING_METRICS_MEMBERS
    ["user_scraped_at",          Bignum], ["user_attempts",          Integer], ["user_successes",          Integer], ["user_failures",          String],
    ["friends_ids_scraped_at",   Bignum], ["friends_ids_attempts",   Integer], ["friends_ids_successes",   Integer], ["friends_ids_failures",   String],
    ["followers_ids_scraped_at", Bignum], ["followers_ids_attempts", Integer], ["followers_ids_successes", Integer], ["followers_ids_failures", String],
    ["favorites_scraped_at",     Bignum], ["favorites_attempts",     Integer], ["favorites_successes",     Integer], ["favorites_failures",     String]
  )

  # Assign the member named "<context>_<attr>".
  def set context, attr, val
    self.send("#{context}_#{attr}=", val)
  end

  # Read the member named "<context>_<attr>".
  def get context, attr
    self.send("#{context}_#{attr}")
  end

  #
  # Instantiate from pig grouped output
  #
  # The bag is text of the form
  #   {(followers,20081228023148,2,2,0,0,0,0,0),(friends_ids,20090205064439,1,1,0,0,0,0,0),...}
  #
  # Each tuple is split into at most five pieces: the context name, then
  # scraped_at, attempts, successes, and everything that remains (commas
  # included) as failures. Tuples whose context is not listed in
  # USER_SCRAPING_CONTEXTS are ignored, and so are empty tuples: an empty bag
  # such as "{}" leaves no context at all, which used to raise NoMethodError
  # on nil.to_sym.
  #
  # Returns self.
  #
  def fill_from_bag scrapings_bag
    scrapings_bag.split(/\),\(/).each do |scraping|
      # strip the leading "{(" and trailing ")}" left on the outermost tuples
      scraping = scraping.gsub(/^[\{\(]+/,'').gsub(/[)}]+$/,'')
      context, *vals = scraping.split(",", 5)
      # nothing to record for a blank tuple (e.g. an empty bag)
      next if context.nil? || context.empty?
      next unless USER_SCRAPING_CONTEXTS.include?(context.to_sym)
      [:scraped_at, :attempts, :successes, :failures].zip(vals).each do |attr, val|
        set context, attr, val
      end
    end
    self
  end

  #
  # With exactly one argument after the id, that argument is treated as a pig
  # bag and parsed with fill_from_bag; otherwise all arguments go to the
  # struct constructor positionally.
  #
  def self.new id, *args
    if args.length == 1
      usm = super(id)
      usm.fill_from_bag(*args)
      usm
    else
      super id, *args
    end
  end
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
File without changes
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'monkeyshines/scrape_request/raw_json_contents'
|
|
2
|
+
module Wuclan
  #
  # Namespace for MySpace support; it has no contents yet.
  #
  # == REST ==
  #
  #   http://wiki.developer.myspace.com/index.php?title=MySpace_REST_Resources
  #
  # == OAuth ==
  #
  #   http://wiki.developer.myspace.com/index.php?title=OAuth_REST_API_Usage_-_Authentication_Process
  #
  # To get a developer API key:
  # * If you do not already have one, set up a MySpace Developer account.
  # * Go to the [App] page under the MySpace Open Platform's Build tab.
  # * Select the button labeled "Create MySpaceID App".
  #
  module Myspace
  end
end
|
|
21
|
+
|
|
File without changes
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
require 'monkeyshines/scrape_request/raw_json_contents'
|
|
2
|
+
module Wuclan
|
|
3
|
+
module Lastfm
|
|
4
|
+
module Scrape
|
|
5
|
+
|
|
6
|
+
#
|
|
7
|
+
# Base class for Lastfm API requests
|
|
8
|
+
#
|
|
9
|
+
#
# Base class for Lastfm API requests.
#
# Subclasses only set resource_path, the API method name that is placed in
# the request URL.
#
class Base < TypedStruct.new(
    [:identifier,       Integer],
    [:page,             Integer],
    [:moreinfo,         String],
    [:url,              String],
    [:scraped_at,       Bignum],
    [:response_code,    Integer],
    [:response_message, String],
    [:contents,         String]
  )
  # Basic ScrapeRequest functionality
  include Monkeyshines::ScrapeRequestCore
  # Contents are JSON
  include Monkeyshines::RawJsonContents
  # Pagination settings
  class_inheritable_accessor :resource_path, :page_limit, :max_items
  # API key, interpolated into every URL built by make_url
  cattr_accessor :api_key

  #
  # Builds the struct, forces page to be at least 1, and fills in the url
  # when none was supplied.
  #
  def initialize *args
    super(*args)
    requested_page = page.to_i
    self.page = (requested_page < 1 ? 1 : requested_page)
    # make_url! is not defined in this class
    # NOTE(review): presumably provided by one of the included modules -- confirm
    make_url! unless url
  end

  #
  # Generate the request URL from the other attributes.
  #
  # The identifier is sent as the +artist+ parameter and is interpolated
  # as-is, with no URL escaping; the page size is fixed at 100.
  #
  def make_url
    "http://ws.audioscrobbler.com/2.0/?method=#{resource_path}&artist=#{identifier}&limit=100&page=#{page}&api_key=#{api_key}&format=json"
  end

  #
  # For parsing, note the different form for empty responses.
  #
end
|
|
45
|
+
|
|
46
|
+
# One request class per Last.fm API method; each one only sets the
# resource_path that Base#make_url places in the URL's method= parameter.

# Album
class AlbumInfoRequest               < Base ; self.resource_path = 'Album.getInfo'              ; end
class AlbumTagsRequest               < Base ; self.resource_path = 'Album.getTags'              ; end

# Artist
class ArtistEventsRequest            < Base ; self.resource_path = 'Artist.getEvents'           ; end
class ArtistImagesRequest            < Base ; self.resource_path = 'Artist.getImages'           ; end
class ArtistInfoRequest              < Base ; self.resource_path = 'Artist.getInfo'             ; end
class ArtistPodcastRequest           < Base ; self.resource_path = 'Artist.getPodcast'          ; end
# This class used to be defined twice; the second definition overrode the
# first, so the lowercase spelling below is the value that was in effect.
class ArtistShoutsRequest            < Base ; self.resource_path = 'artist.getShouts'           ; end
class ArtistSimilarRequest           < Base ; self.resource_path = 'Artist.getSimilar'          ; end
class ArtistTagsRequest              < Base ; self.resource_path = 'Artist.getTags'             ; end
class ArtistTopAlbumsRequest         < Base ; self.resource_path = 'Artist.getTopAlbums'        ; end
class ArtistTopFansRequest           < Base ; self.resource_path = 'Artist.getTopFans'          ; end
class ArtistTopTagsRequest           < Base ; self.resource_path = 'Artist.getTopTags'          ; end
class ArtistTopTracksRequest         < Base ; self.resource_path = 'Artist.getTopTracks'        ; end

# Event
class EventAttendeesRequest          < Base ; self.resource_path = 'Event.getAttendees'         ; end
class EventInfoRequest               < Base ; self.resource_path = 'Event.getInfo'              ; end
class EventShoutsRequest             < Base ; self.resource_path = 'Event.getShouts'            ; end

# Geo
class GeoEventsRequest               < Base ; self.resource_path = 'Geo.getEvents'              ; end
class GeoTopArtistsRequest           < Base ; self.resource_path = 'Geo.getTopArtists'          ; end
class GeoTopTracksRequest            < Base ; self.resource_path = 'Geo.getTopTracks'           ; end

# Group
class GroupMembersRequest            < Base ; self.resource_path = 'Group.getMembers'           ; end
class GroupWeeklyAlbumChartRequest   < Base ; self.resource_path = 'Group.getWeeklyAlbumChart'  ; end
class GroupWeeklyArtistChartRequest  < Base ; self.resource_path = 'Group.getWeeklyArtistChart' ; end
class GroupWeeklyChartListRequest    < Base ; self.resource_path = 'Group.getWeeklyChartList'   ; end
class GroupWeeklyTrackChartRequest   < Base ; self.resource_path = 'Group.getWeeklyTrackChart'  ; end

# Playlist
class PlaylistfetchRequest           < Base ; self.resource_path = 'Playlist.fetch'             ; end

# Tag
class TagSimilarRequest              < Base ; self.resource_path = 'Tag.getSimilar'             ; end
class TagTopAlbumsRequest            < Base ; self.resource_path = 'Tag.getTopAlbums'           ; end
class TagTopArtistsRequest           < Base ; self.resource_path = 'Tag.getTopArtists'          ; end
class TagTopTagsRequest              < Base ; self.resource_path = 'Tag.getTopTags'             ; end
class TagTopTracksRequest            < Base ; self.resource_path = 'Tag.getTopTracks'           ; end
class TagWeeklyArtistChartRequest    < Base ; self.resource_path = 'Tag.getWeeklyArtistChart'   ; end
class TagWeeklyChartListRequest      < Base ; self.resource_path = 'Tag.getWeeklyChartList'     ; end

# Tasteometer
class TasteometercompareRequest      < Base ; self.resource_path = 'Tasteometer.compare'        ; end

# Track
class TrackInfoRequest               < Base ; self.resource_path = 'Track.getInfo'              ; end
class TrackSimilarRequest            < Base ; self.resource_path = 'Track.getSimilar'           ; end
class TrackTagsRequest               < Base ; self.resource_path = 'Track.getTags'              ; end
class TrackTopFansRequest            < Base ; self.resource_path = 'Track.getTopFans'           ; end
class TrackTopTagsRequest            < Base ; self.resource_path = 'Track.getTopTags'           ; end

# User
class UserEventsRequest              < Base ; self.resource_path = 'User.getEvents'             ; end
class UserFriendsRequest             < Base ; self.resource_path = 'User.getFriends'            ; end
class UserInfoRequest                < Base ; self.resource_path = 'User.getInfo'               ; end
class UserLovedTracksRequest         < Base ; self.resource_path = 'User.getLovedTracks'        ; end
class UserNeighboursRequest          < Base ; self.resource_path = 'User.getNeighbours'         ; end
class UserPastEventsRequest          < Base ; self.resource_path = 'User.getPastEvents'         ; end
class UserPlaylistsRequest           < Base ; self.resource_path = 'User.getPlaylists'          ; end
class UserRecentStationsRequest      < Base ; self.resource_path = 'User.getRecentStations'     ; end
class UserRecentTracksRequest        < Base ; self.resource_path = 'User.getRecentTracks'       ; end
class UserRecommendedArtistsRequest  < Base ; self.resource_path = 'User.getRecommendedArtists' ; end
class UserRecommendedEventsRequest   < Base ; self.resource_path = 'User.getRecommendedEvents'  ; end
class UserShoutsRequest              < Base ; self.resource_path = 'User.getShouts'             ; end
class UserTopAlbumsRequest           < Base ; self.resource_path = 'User.getTopAlbums'          ; end
class UserTopArtistsRequest          < Base ; self.resource_path = 'User.getTopArtists'         ; end
class UserTopTagsRequest             < Base ; self.resource_path = 'User.getTopTags'            ; end
class UserTopTracksRequest           < Base ; self.resource_path = 'User.getTopTracks'          ; end
class UserWeeklyAlbumChartRequest    < Base ; self.resource_path = 'User.getWeeklyAlbumChart'   ; end
class UserWeeklyArtistChartRequest   < Base ; self.resource_path = 'User.getWeeklyArtistChart'  ; end
class UserWeeklyChartListRequest     < Base ; self.resource_path = 'User.getWeeklyChartList'    ; end
class UserWeeklyTrackChartRequest    < Base ; self.resource_path = 'User.getWeeklyTrackChart'   ; end

# Venue
class VenueEventsRequest             < Base ; self.resource_path = 'Venue.getEvents'            ; end
class VenuePastEventsRequest         < Base ; self.resource_path = 'Venue.getPastEvents'        ; end
|
|
107
|
+
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
|
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
#
# RDF output for the relationship models: each class gains a to_rdf3_tuples
# method returning an array of tuples built with rdf_component and rdf_pred.
#
module TwitterFriends::StructModel

  module RelationshipCommon
    # The RDF component for user_a, computed once and cached.
    def rdf_resource
      @rdf_resource ||= rdf_component(user_a_id, :user)
    end
  end

  # user_a ==follows=> user_b
  AFollowsB.class_eval do
    include TwitterFriends::TwitterRdf

    def to_rdf3_tuples
      follows = [rdf_component(user_a_id, :user), rdf_pred(:follows), rdf_component(user_b_id, :user)]
      [follows]
    end
  end

  # user_a ==favorited_tweet_by=> user_b  (with the tweet as a fourth element)
  # user_a ==favorited_tweet=>    tweet
  AFavoritesB.class_eval do
    include TwitterFriends::TwitterRdf

    def to_rdf3_tuples
      by_author = [rdf_component(user_a_id, :user), rdf_pred(:favorited_tweet_by), rdf_component(user_b_id, :user), rdf_component(status_id, :tweet)]
      by_tweet  = [rdf_component(user_a_id, :user), rdf_pred(:favorited_tweet), rdf_component(status_id, :tweet)]
      [by_author, by_tweet]
    end
  end

  # user_a ==replied_to=>       user_b  (with the tweet as a fourth element)
  # tweet  ==continues_thread=> the tweet being replied to
  ARepliesB.class_eval do
    include TwitterFriends::TwitterRdf

    def to_rdf3_tuples
      reply  = [rdf_component(user_a_id, :user), rdf_pred(:replied_to), rdf_component(user_b_id, :user), rdf_component(status_id, :tweet)]
      thread = [rdf_component(status_id, :tweet), rdf_pred(:continues_thread), rdf_component(in_reply_to_status_id, :tweet)]
      [reply, thread]
    end
  end

  # user_a ==atsigns=> user_b, where user_b is identified by name
  # (user_b_name) rather than by id.
  AAtsignsB.class_eval do
    include TwitterFriends::TwitterRdf

    def to_rdf3_tuples
      atsign = [rdf_component(user_a_id, :user), rdf_pred(:atsigns), rdf_component(user_b_name, :user)]
      [atsign]
    end
  end

end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#
# RDF output for the text-element models (hashtags, URLs, retweets). Each
# class gains a to_rdf3_tuples method returning an array of tuples built with
# rdf_component and rdf_pred.
#
module TwitterFriends::StructModel

  #
  # I don't know what graph arcs are interesting;
  # for RDF we're dumping
  #   user  ==tweeted_hashtag=> hashtag
  # and
  #   tweet ==has_hashtag=>     hashtag
  #
  Hashtag.class_eval do
    include TwitterFriends::TwitterRdf

    #
    # Ugh. We *really* wish we could reify in nt. Instead, each tuple carries
    # a fourth element: the tweet on the user arc, the user on the tweet arc.
    #
    def to_rdf3_tuples
      by_user  = [rdf_component(twitter_user_id, :user), rdf_pred(:tweeted_hashtag), rdf_component(text, :str), rdf_component(status_id, :tweet)]
      by_tweet = [rdf_component(status_id, :tweet), rdf_pred(:has_hashtag), rdf_component(text, :str), rdf_component(twitter_user_id, :user)]
      [by_user, by_tweet]
    end
  end

  #
  # For RDF we're dumping
  #   user  ==tweeted_url=>   url
  # and
  #   tweet ==has_tweet_url=> url
  #
  TweetUrl.class_eval do
    include TwitterFriends::TwitterRdf

    def to_rdf3_tuples
      by_user  = [rdf_component(twitter_user_id, :user), rdf_pred(:tweeted_url), rdf_component(text, :str), rdf_component(status_id, :tweet)]
      by_tweet = [rdf_component(status_id, :tweet), rdf_pred(:has_tweet_url), rdf_component(text, :str), rdf_component(twitter_user_id, :user)]
      [by_user, by_tweet]
    end
  end

  #
  # For RDF we're dumping
  #   user ==retweets=>    user    (when is_retweet? is true)
  # or
  #   user ==rtwhored_in=> status  (otherwise)
  #
  ARetweetsB.class_eval do
    include TwitterFriends::TwitterRdf

    def to_rdf3_tuples
      unless is_retweet?
        return [
          [rdf_component(user_a_id, :user), rdf_pred(:rtwhored_in), rdf_component(status_id, :tweet)],
        ]
      end
      # user_b is identified by name here, not by id
      [
        [rdf_component(user_a_id, :user), rdf_pred(:retweets), rdf_component(user_b_name, :user), rdf_component(status_id, :tweet)],
      ]
    end
  end

end
|