wuclan 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
require 'monkeyshines/scrape_request/raw_json_contents'
|
|
2
|
+
module Wuclan
|
|
3
|
+
module Lastfm
|
|
4
|
+
module Scrape
|
|
5
|
+
|
|
6
|
+
#
|
|
7
|
+
# Simple requestables
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
# Reopens LastfmArtistInfoRequest to declare its `requestables`: the request
# classes associated with an artist-info request.
# NOTE(review): how `requestables` is consumed is defined outside this file.
class LastfmArtistInfoRequest
  self.requestables = [
    LastfmArtistSimilarRequest, LastfmArtistTopAlbumsRequest, LastfmArtistTopTracksRequest,
    LastfmArtistShoutsRequest,  LastfmArtistEventsRequest,    LastfmArtistTopFansRequest,
    # Not enabled:
    # LastfmArtistTopTagsRequest, LastfmArtistImagesRequest, LastfmArtistPodcastRequest,
  ]
end
|
|
21
|
+
|
|
22
|
+
# Reopens LastfmTrackInfoRequest to declare the request classes associated
# with a track-info request.
class LastfmTrackInfoRequest
  self.requestables = [
    LastfmTrackSimilarRequest,
    LastfmTrackTopFansRequest,
    LastfmTrackTopTagsRequest
  ]
end
|
|
25
|
+
# Reopens LastfmEventInfoRequest to declare the request classes associated
# with an event-info request.
class LastfmEventInfoRequest
  self.requestables = [
    LastfmEventAttendeesRequest,
    LastfmEventShoutsRequest
  ]
end
|
|
28
|
+
# Reopens LastfmUserTopTagsRequest to declare the request classes associated
# with a user.
# NOTE(review): presumably this class stands in for LastfmUserInfoRequest, which
# is listed under "needs auth" below -- confirm.
class LastfmUserTopTagsRequest # LastfmUserInfoRequest
  self.requestables = [
    # LastfmUserTopTagsRequest,
    LastfmUserEventsRequest, LastfmUserPastEventsRequest,
    LastfmUserFriendsRequest,       # recenttracks
    LastfmUserNeighboursRequest, LastfmUserLovedTracksRequest,
    LastfmUserRecentTracksRequest, LastfmUserShoutsRequest,
    # The next three take: period (Optional) : overall | 7day | 3month | 6month | 12month
    LastfmUserTopAlbumsRequest,
    LastfmUserTopArtistsRequest,
    LastfmUserTopTracksRequest,
    # uninteresting(?): LastfmUserPlaylistsRequest, LastfmUserWeeklyAlbumChartRequest, LastfmUserWeeklyArtistChartRequest, LastfmUserWeeklyChartListRequest, LastfmUserWeeklyTrackChartRequest,
    # needs auth: LastfmUserInfoRequest, LastfmUserRecentStationsRequest, LastfmUserRecommendedArtistsRequest, LastfmUserRecommendedEventsRequest,
  ]
end
|
|
45
|
+
|
|
46
|
+
#
|
|
47
|
+
# Recursive requests based on contents
|
|
48
|
+
#
|
|
49
|
+
|
|
50
|
+
# Mixin for requests whose identifier may carry a "&period=" parameter.
module LastfmTimeWindowed
  # Runs the inherited recursive_requests, then -- unless this request's
  # identifier already contains "&period=" -- yields one more request of the
  # same class for each of the '7day', '3month' and '6month' periods.
  #
  # The yielded requests keep this request's generation (coerced with to_i);
  # it is not incremented.
  def recursive_requests(*args, &block)
    super(*args, &block)
    return if identifier =~ /&period=/
    ['7day', '3month', '6month'].each do |window|
      windowed = self.class.new(identifier + "&period=#{window}")
      windowed.generation = generation.to_i
      yield windowed
    end
  end
end
|
|
62
|
+
# Mix LastfmTimeWindowed into the user "top X" request classes.
[
  LastfmUserTopArtistsRequest,
  LastfmUserTopAlbumsRequest,
  LastfmUserTopTracksRequest,
].each { |request_class| request_class.send(:include, LastfmTimeWindowed) }
|
|
66
|
+
|
|
67
|
+
# Mixin for requests whose `items` are artist records.
module LastfmContainsArtists
  # Runs the inherited recursive_requests, then yields a
  # LastfmArtistInfoRequest for every entry in `items`, built from the
  # url-encoded 'name' of the entry and tagged with the next generation.
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |entry|
      artist_req = LastfmArtistInfoRequest.new(url_encode(entry['name']))
      artist_req.generation = generation.to_i + 1
      yield artist_req
    end
  end
end
|
|
77
|
+
# Mix LastfmContainsArtists into every request class whose items are artists.
[
  LastfmArtistSimilarRequest,
  LastfmGeoTopArtistsRequest,
  LastfmTagTopArtistsRequest,
  LastfmUserRecommendedArtistsRequest,
  LastfmUserTopArtistsRequest,
].each { |request_class| request_class.send(:include, LastfmContainsArtists) }
|
|
82
|
+
|
|
83
|
+
# Mixin for requests whose `items` are album records.
module LastfmContainsAlbums
  # Runs the inherited recursive_requests, then yields a LastfmAlbumInfoRequest
  # for every entry in `items`, tagged with the next generation.
  #
  # Each request is built with from_identifier_hash from the item's 'name',
  # its artist (the artist hash's 'name', else its '#text') and its 'mbid'.
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |item|
      # Only index into `artist` when it is a Hash. The previous
      # `... rescue nil` form swallowed every error, and for a String artist
      # String#[] did a substring lookup (so "Rename Me" produced "name").
      # NOTE(review): a plain-String artist is still dropped (nil), as it
      # effectively was before -- confirm whether it should be passed through.
      artist = item['artist']
      obj_artist = artist.is_a?(Hash) ? (artist['name'] || artist['#text']) : nil
      req = LastfmAlbumInfoRequest.from_identifier_hash(
        item['name'], :artist => obj_artist, :mbid => item['mbid'])
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
|
|
95
|
+
# Mix LastfmContainsAlbums into every request class whose items are albums.
[
  LastfmArtistTopAlbumsRequest,
  LastfmTagTopAlbumsRequest,
  LastfmUserTopAlbumsRequest,
].each { |request_class| request_class.send(:include, LastfmContainsAlbums) }
|
|
99
|
+
|
|
100
|
+
# Mixin for requests whose `items` are track records.
module LastfmContainsTracks
  # Runs the inherited recursive_requests, then yields a LastfmTrackInfoRequest
  # for every entry in `items`, tagged with the next generation.
  #
  # Each request is built with from_identifier_hash from the track's 'name',
  # its artist (the artist hash's 'name', else its '#text') and its 'mbid'.
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |track|
      # Only index into `artist` when it is a Hash. The previous
      # `... rescue nil` form swallowed every error, and for a String artist
      # String#[] did a substring lookup (so "Rename Me" produced "name").
      # NOTE(review): a plain-String artist is still dropped (nil), as it
      # effectively was before -- confirm whether it should be passed through.
      artist = track['artist']
      obj_artist = artist.is_a?(Hash) ? (artist['name'] || artist['#text']) : nil
      req = LastfmTrackInfoRequest.from_identifier_hash(
        track['name'], :artist => obj_artist, :mbid => track['mbid'])
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
|
|
112
|
+
# Mix LastfmContainsTracks into every request class whose items are tracks.
[
  LastfmArtistTopTracksRequest,
  LastfmGeoTopTracksRequest,
  LastfmTagTopTracksRequest,
  LastfmTrackSimilarRequest,
  LastfmUserLovedTracksRequest,
  LastfmUserRecentTracksRequest,
  LastfmUserTopTracksRequest,
].each { |request_class| request_class.send(:include, LastfmContainsTracks) }
|
|
118
|
+
|
|
119
|
+
# Mixin for requests whose `items` are event records.
module LastfmContainsEvents
  # Runs the inherited recursive_requests, then yields a LastfmEventInfoRequest
  # for every entry in `items`, built from the entry's 'id' and tagged with the
  # next generation.
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |entry|
      event_req = LastfmEventInfoRequest.new(entry['id'])
      event_req.generation = generation.to_i + 1
      yield event_req
    end
  end
end
|
|
129
|
+
# Mix LastfmContainsEvents into every request class whose items are events.
[
  LastfmArtistEventsRequest,
  LastfmGeoEventsRequest,
  LastfmUserEventsRequest,
  LastfmUserPastEventsRequest,
  LastfmUserRecommendedEventsRequest,
  LastfmVenueEventsRequest,
  LastfmVenuePastEventsRequest,
].each { |request_class| request_class.send(:include, LastfmContainsEvents) }
|
|
135
|
+
|
|
136
|
+
# Mixin for requests whose `items` are user records.
module LastfmContainsUsers
  # Runs the inherited recursive_requests, then yields a
  # LastfmUserTopTagsRequest for every entry in `items`, built from the
  # url-encoded 'name' of the entry and tagged with the next generation.
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |entry|
      user_req = LastfmUserTopTagsRequest.new(url_encode(entry['name']))
      user_req.generation = generation.to_i + 1
      yield user_req
    end
  end
end
|
|
146
|
+
# Mix LastfmContainsUsers into every request class whose items are users.
[
  LastfmArtistTopFansRequest,
  LastfmEventAttendeesRequest,
  LastfmGroupMembersRequest,
  LastfmTrackTopFansRequest,
  LastfmUserFriendsRequest,
  LastfmUserNeighboursRequest,
].each { |request_class| request_class.send(:include, LastfmContainsUsers) }
|
|
151
|
+
|
|
152
|
+
end
|
|
153
|
+
end
|
|
154
|
+
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
require 'monkeyshines/scrape_request/raw_json_contents'
|
|
2
|
+
require 'wuclan/lastfm/scrape/base.rb'
|
|
3
|
+
require 'wuclan/lastfm/scrape/concrete.rb'
|
|
4
|
+
|
|
5
|
+
module Wuclan
  module Lastfm
    # Namespace for the Last.fm scraping classes. The constants below are
    # registered for autoload, so their files are only read on first reference.
    module Scrape
      [
        [:LastfmJob,           'wuclan/lastfm/scrape/lastfm_job.rb'],
        [:LastfmRequestStream, 'wuclan/lastfm/scrape/lastfm_request_stream.rb'],
      ].each do |const_name, path|
        autoload const_name, path
      end
    end
  end
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
module Wuclan
  module Models
    #
    # Per-user graph metrics record: an id followed by a series of counters.
    # NOTE(review): the prefixes (fo/fr/re/at/rt/fv) presumably stand for
    # follower / friend / reply / at-mention / retweet / favorite -- confirm.
    #
    class UserGraphMetrics < Struct.new(
        :id,
        :any_with,
        :any_out_with,
        :any_in_with,
        #
        :fo_sampled,
        :fr_sampled,
        #
        :re_out_sampled,
        :re_in_sampled,
        #
        :at_out_sampled,
        :at_in_sampled,
        :at_out_with,
        :at_in_with,
        #
        :rt_out_sampled,
        :rt_in_sampled,
        :rt_out_with,
        :rt_in_with,
        #
        :fv_out_sampled,
        :fv_in_sampled,
        :fv_out_with,
        :fv_in_with
        )

      # Derived measures written by the get_* methods below. They were assigned
      # to but never declared, so every get_* call raised NoMethodError. They
      # are plain accessors rather than Struct members so that the record's
      # positional layout (members / to_a) is unchanged.
      attr_accessor :tw_out_share, :n1i_fv_share, :n1o_strong, :n1i_strong

      # ===========================================================================
      #
      # Graph Measures
      #

      #
      # Influx:
      #
      #   (messages/day) from all your n1
      #
      # This says how many messages you see go by in a day.
      #
      # A person with a massive influx is either not reading any tweets (uses
      # twitter as a podium), is dipping into twitter as a news river (we should
      # discount follow links), or is using a tool such as TweetDeck to fake
      # follow (we should more aggressively segregate their strong links).
      #
      # Not implemented yet: always returns nil.
      #
      def get_influx()
        #
      end

      #
      # tw_out_share -- Audience Share:
      #
      #   (your msgs out/day) / (Sum over n1o of msgs in / day)
      #
      # This says how much of your followers' attention is occupied by your tweet
      # stream.
      #
      # Copies +tw_out_share+ from +twitter_user+ onto this record and returns it.
      #
      # The method used to take no argument while still reading +twitter_user+.
      # It now accepts the user like its sibling getters; when the argument is
      # omitted it falls back to a +twitter_user+ method on the receiver, which
      # is what the bare reference resolved to before.
      #
      def get_tw_out_share(twitter_user = self.twitter_user)
        self.tw_out_share = twitter_user.tw_out_share
      end

      #
      # n1i_fv_share -- Sum, for all that favorite you, of
      #
      #   (favs to you) / (max[20, number faved])
      #
      # If I have 12 faves and four are to you, you get a (4/20) favshare; if I
      # have twenty-four, and four are to you, that makes a (1/6) favshare
      # contribution.
      #
      # Copies +n1i_fv_share+ from +twitter_user+ onto this record and returns it.
      #
      def get_n1i_fv_share(twitter_user)
        self.n1i_fv_share = twitter_user.n1i_fv_share
      end

      #
      # n1o_strong -- Strong links out
      #
      # Copies +n1o_strong+ from +twitter_user+ onto this record and returns it.
      #
      def get_n1o_strong(twitter_user)
        self.n1o_strong = twitter_user.n1o_strong
      end

      #
      # n1i_strong -- Strong links in
      #
      # Copies +n1i_strong+ from +twitter_user+ onto this record and returns it.
      #
      def get_n1i_strong(twitter_user)
        self.n1i_strong = twitter_user.n1i_strong
      end

      #
      # cluster_coeff -- Strong links between members of n1 over number of possible
      # links between members of n1
      #
      # Not implemented yet: always returns nil.
      #
      def cluster_coeff(twitter_user, multi_edge)
      end
    end

  end
end
|