wuclan 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
@@ -0,0 +1,154 @@
|
|
1
|
+
require 'monkeyshines/scrape_request/raw_json_contents'
|
2
|
+
module Wuclan
|
3
|
+
module Lastfm
|
4
|
+
module Scrape
|
5
|
+
|
6
|
+
#
|
7
|
+
# Simple requestables
|
8
|
+
#
|
9
|
+
|
10
|
+
# Request classes registered as the `requestables` of an artist-info request.
# NOTE(review): presumably these are the follow-up requests issued for each
# artist; the consumer of `requestables` lives outside this file -- confirm.
class LastfmArtistInfoRequest
  self.requestables = [
    LastfmArtistSimilarRequest,
    LastfmArtistTopAlbumsRequest,
    LastfmArtistTopTracksRequest,
    LastfmArtistShoutsRequest,
    LastfmArtistEventsRequest,
    LastfmArtistTopFansRequest,
    # LastfmArtistTopTagsRequest, LastfmArtistImagesRequest, LastfmArtistPodcastRequest,
  ]
end
|
21
|
+
|
22
|
+
# Request classes registered as the `requestables` of a track-info request.
class LastfmTrackInfoRequest
  self.requestables = [
    LastfmTrackSimilarRequest,
    LastfmTrackTopFansRequest,
    LastfmTrackTopTagsRequest,
  ]
end
|
25
|
+
# Request classes registered as the `requestables` of an event-info request.
class LastfmEventInfoRequest
  self.requestables = [
    LastfmEventAttendeesRequest,
    LastfmEventShoutsRequest,
  ]
end
|
28
|
+
# Request classes registered as the `requestables` of a user request.
# The list hangs off LastfmUserTopTagsRequest rather than LastfmUserInfoRequest;
# the notes below list LastfmUserInfoRequest among the requests that need auth.
class LastfmUserTopTagsRequest # LastfmUserInfoRequest
  self.requestables = [
    # LastfmUserTopTagsRequest,
    LastfmUserEventsRequest,
    LastfmUserPastEventsRequest,
    LastfmUserFriendsRequest,      # recenttracks
    LastfmUserNeighboursRequest,
    LastfmUserLovedTracksRequest,
    LastfmUserRecentTracksRequest,
    LastfmUserShoutsRequest,
    LastfmUserTopAlbumsRequest,    # period (Optional) : overall | 7day | 3month | 6month | 12month
    LastfmUserTopArtistsRequest,   # period (Optional) : overall | 7day | 3month | 6month | 12month
    LastfmUserTopTracksRequest,    # period (Optional) : overall | 7day | 3month | 6month | 12month
    # uninteresting(?): LastfmUserPlaylistsRequest, LastfmUserWeeklyAlbumChartRequest, LastfmUserWeeklyArtistChartRequest, LastfmUserWeeklyChartListRequest, LastfmUserWeeklyTrackChartRequest,
    # needs auth: LastfmUserInfoRequest, LastfmUserRecentStationsRequest, LastfmUserRecommendedArtistsRequest, LastfmUserRecommendedEventsRequest,
  ]
end
|
45
|
+
|
46
|
+
#
|
47
|
+
# Recursive requests based on contents
|
48
|
+
#
|
49
|
+
|
50
|
+
# Mixin for requests whose identifier may carry a `&period=` parameter.
#
# After delegating to the inherited #recursive_requests, it yields one extra
# request of the same class per entry in PERIODS -- unless this request's
# identifier already contains `&period=`, in which case nothing more is yielded.
# Spawned requests keep the current generation (no +1).
module LastfmTimeWindowed
  # Time windows requested in addition to the un-windowed request.
  PERIODS = %w[7day 3month 6month].freeze

  # @param args forwarded untouched to the inherited implementation
  # @yield [request] each follow-up request
  def recursive_requests(*args, &block)
    super(*args, &block)
    # An identifier that is already windowed must not fan out again.
    return if identifier.to_s.include?('&period=')

    PERIODS.each do |period|
      req = self.class.new("#{identifier}&period=#{period}")
      req.generation = generation.to_i
      yield req
    end
  end
end
|
62
|
+
# Mix LastfmTimeWindowed into each of the listed request classes.
[
  LastfmUserTopArtistsRequest,
  LastfmUserTopAlbumsRequest,
  LastfmUserTopTracksRequest,
].each do |klass|
  klass.send(:include, LastfmTimeWindowed)
end
|
66
|
+
|
67
|
+
# Mixin for requests whose `items` are artists.
#
# After delegating to the inherited #recursive_requests, it yields one
# LastfmArtistInfoRequest per item, built from the url-encoded item 'name' and
# placed one generation deeper than this request.
module LastfmContainsArtists
  # @param args forwarded untouched to the inherited implementation
  # @yield [request] each follow-up LastfmArtistInfoRequest
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |artist|
      req = LastfmArtistInfoRequest.new(url_encode(artist['name']))
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
|
77
|
+
# Mix LastfmContainsArtists into each of the listed request classes.
[
  LastfmArtistSimilarRequest,
  LastfmGeoTopArtistsRequest,
  LastfmTagTopArtistsRequest,
  LastfmUserRecommendedArtistsRequest,
  LastfmUserTopArtistsRequest,
].each do |klass|
  klass.send(:include, LastfmContainsArtists)
end
|
82
|
+
|
83
|
+
# Mixin for requests whose `items` are albums.
#
# After delegating to the inherited #recursive_requests, it yields one
# LastfmAlbumInfoRequest per item (built via .from_identifier_hash from the
# item's 'name', artist and 'mbid'), one generation deeper than this request.
module LastfmContainsAlbums
  # @param args forwarded untouched to the inherited implementation
  # @yield [request] each follow-up LastfmAlbumInfoRequest
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |item|
      # The artist name is taken only from a Hash-shaped 'artist' entry
      # ('name', falling back to '#text'). Any other shape yields nil; indexing
      # a String with 'name' would perform a substring lookup and return junk.
      artist = item['artist']
      obj_artist = artist.is_a?(Hash) ? (artist['name'] || artist['#text']) : nil
      req = LastfmAlbumInfoRequest.from_identifier_hash(
        item['name'], :artist => obj_artist, :mbid => item['mbid'])
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
|
95
|
+
# Mix LastfmContainsAlbums into each of the listed request classes.
[
  LastfmArtistTopAlbumsRequest,
  LastfmTagTopAlbumsRequest,
  LastfmUserTopAlbumsRequest,
].each do |klass|
  klass.send(:include, LastfmContainsAlbums)
end
|
99
|
+
|
100
|
+
# Mixin for requests whose `items` are tracks.
#
# After delegating to the inherited #recursive_requests, it yields one
# LastfmTrackInfoRequest per item (built via .from_identifier_hash from the
# item's 'name', artist and 'mbid'), one generation deeper than this request.
module LastfmContainsTracks
  # @param args forwarded untouched to the inherited implementation
  # @yield [request] each follow-up LastfmTrackInfoRequest
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |track|
      # The artist name is taken only from a Hash-shaped 'artist' entry
      # ('name', falling back to '#text'). Any other shape yields nil; indexing
      # a String with 'name' would perform a substring lookup and return junk.
      artist = track['artist']
      obj_artist = artist.is_a?(Hash) ? (artist['name'] || artist['#text']) : nil
      req = LastfmTrackInfoRequest.from_identifier_hash(
        track['name'], :artist => obj_artist, :mbid => track['mbid'])
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
|
112
|
+
# Mix LastfmContainsTracks into each of the listed request classes.
[
  LastfmArtistTopTracksRequest,
  LastfmGeoTopTracksRequest,
  LastfmTagTopTracksRequest,
  LastfmTrackSimilarRequest,
  LastfmUserLovedTracksRequest,
  LastfmUserRecentTracksRequest,
  LastfmUserTopTracksRequest,
].each do |klass|
  klass.send(:include, LastfmContainsTracks)
end
|
118
|
+
|
119
|
+
# Mixin for requests whose `items` are events.
#
# After delegating to the inherited #recursive_requests, it yields one
# LastfmEventInfoRequest per item, built from the item's 'id' and placed one
# generation deeper than this request.
module LastfmContainsEvents
  # @param args forwarded untouched to the inherited implementation
  # @yield [request] each follow-up LastfmEventInfoRequest
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |event|
      req = LastfmEventInfoRequest.new(event['id'])
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
|
129
|
+
# Mix LastfmContainsEvents into each of the listed request classes.
[
  LastfmArtistEventsRequest,
  LastfmGeoEventsRequest,
  LastfmUserEventsRequest,
  LastfmUserPastEventsRequest,
  LastfmUserRecommendedEventsRequest,
  LastfmVenueEventsRequest,
  LastfmVenuePastEventsRequest,
].each do |klass|
  klass.send(:include, LastfmContainsEvents)
end
|
135
|
+
|
136
|
+
# Mixin for requests whose `items` are users.
#
# After delegating to the inherited #recursive_requests, it yields one
# LastfmUserTopTagsRequest per item, built from the url-encoded item 'name'
# and placed one generation deeper than this request.
module LastfmContainsUsers
  # @param args forwarded untouched to the inherited implementation
  # @yield [request] each follow-up LastfmUserTopTagsRequest
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |user|
      req = LastfmUserTopTagsRequest.new(url_encode(user['name']))
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
|
146
|
+
# Mix LastfmContainsUsers into each of the listed request classes.
[
  LastfmArtistTopFansRequest,
  LastfmEventAttendeesRequest,
  LastfmGroupMembersRequest,
  LastfmTrackTopFansRequest,
  LastfmUserFriendsRequest,
  LastfmUserNeighboursRequest,
].each do |klass|
  klass.send(:include, LastfmContainsUsers)
end
|
151
|
+
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'monkeyshines/scrape_request/raw_json_contents'
|
2
|
+
require 'wuclan/lastfm/scrape/base.rb'
|
3
|
+
require 'wuclan/lastfm/scrape/concrete.rb'
|
4
|
+
|
5
|
+
module Wuclan
  module Lastfm
    # Scraping namespace for Last.fm. The two constants below are registered
    # for autoload, so their files are only loaded on first reference.
    module Scrape
      autoload :LastfmJob,           'wuclan/lastfm/scrape/lastfm_job.rb'
      autoload :LastfmRequestStream, 'wuclan/lastfm/scrape/lastfm_request_stream.rb'
    end
  end
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
module Wuclan
  module Models
    #
    # Flat record of per-user graph counters plus helpers that copy derived
    # graph measures off a twitter_user object.
    #
    # NOTE(review): the member prefixes presumably stand for followers/friends
    # (fo/fr), replies (re), @-mentions (at), retweets (rt) and favorites (fv)
    # -- confirm against the code that populates this struct.
    #
    class UserGraphMetrics < Struct.new(
        :id,
        :any_with,
        :any_out_with,
        :any_in_with,
        #
        :fo_sampled,
        :fr_sampled,
        #
        :re_out_sampled,
        :re_in_sampled,
        #
        :at_out_sampled,
        :at_in_sampled,
        :at_out_with,
        :at_in_with,
        #
        :rt_out_sampled,
        :rt_in_sampled,
        :rt_out_with,
        :rt_in_with,
        #
        :fv_out_sampled,
        :fv_in_sampled,
        :fv_out_with,
        :fv_in_with
        )

      # Storage for the measures written by the get_* methods below. These are
      # plain accessors, not Struct members, so #members / #to_a and the
      # positional constructor are unchanged.
      # NOTE(review): promote to Struct members if these values are meant to be
      # part of the flattened record.
      attr_accessor :tw_out_share, :n1i_fv_share, :n1o_strong, :n1i_strong

      # ===========================================================================
      #
      # Graph Measures
      #

      #
      # Influx:
      #
      #   (messages/day) from all your n1
      #
      # This says how many messages you see go by in a day.
      #
      # A person with a massive influx is either not reading any tweets (uses
      # twitter as a podium), is dipping into twitter as a news river (we should
      # discount follow links), or is using a tool such as TweetDeck to fake
      # follow (we should more aggressively segregate their strong links)
      #
      def get_influx
        # TODO: not implemented; returns nil.
      end

      #
      # tw_out_share -- Audience Share:
      #
      #   (your msgs out/day) / (Sum over n1o of msgs in / day)
      #
      # This says how much of your followers' attention is occupied by your tweet
      # stream
      #
      # Copies twitter_user.tw_out_share onto this record and returns it.
      # The parameter defaults to nil only so the zero-argument call still
      # parses; the value is required.
      #
      # @raise [ArgumentError] if no twitter_user is given
      #
      def get_tw_out_share(twitter_user = nil)
        raise ArgumentError, 'twitter_user is required' if twitter_user.nil?
        self.tw_out_share = twitter_user.tw_out_share
      end

      #
      # n1i_fv_share -- Sum, for all that favorite you, of
      #
      #   (favs to you) / max(20, number faved)
      #
      # if I have 12 faves and four are to you you get (4/20)favshare ; if I have
      # twenty-four, and four are to you, that makes a (1/6)favshare contribution.
      #
      # Copies twitter_user.n1i_fv_share onto this record and returns it.
      #
      def get_n1i_fv_share(twitter_user)
        self.n1i_fv_share = twitter_user.n1i_fv_share
      end

      #
      # n1o_strong -- Strong links out
      #
      # Copies twitter_user.n1o_strong onto this record and returns it.
      #
      def get_n1o_strong(twitter_user)
        self.n1o_strong = twitter_user.n1o_strong
      end

      #
      # n1i_strong -- Strong links in
      #
      # Copies twitter_user.n1i_strong onto this record and returns it.
      #
      def get_n1i_strong(twitter_user)
        self.n1i_strong = twitter_user.n1i_strong
      end

      #
      # cluster_coeff -- Strong links between members of n1 over number of possible
      # links between members of n1
      #
      def cluster_coeff(twitter_user, multi_edge)
        # TODO: not implemented; returns nil.
      end
    end

  end
end
|