wuclan 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/LICENSE.textile +20 -0
  2. data/README.textile +28 -0
  3. data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
  4. data/examples/analyze/strong_links/main.rb +51 -0
  5. data/examples/analyze/word_count/dump_schema.rb +13 -0
  6. data/examples/analyze/word_count/freq_user.rb +31 -0
  7. data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
  8. data/examples/analyze/word_count/word_count.pig +43 -0
  9. data/examples/analyze/word_count/word_count.rb +34 -0
  10. data/examples/lastfm/scrape/load_lastfm.rb +31 -0
  11. data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
  12. data/examples/lastfm/scrape/seed.tsv +147 -0
  13. data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
  14. data/examples/twitter/old/scrape_twitter_api.rb +104 -0
  15. data/examples/twitter/old/scrape_twitter_search.rb +57 -0
  16. data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
  17. data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
  18. data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
  19. data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
  20. data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
  21. data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
  22. data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
  23. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
  24. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
  25. data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
  26. data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
  27. data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
  28. data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
  29. data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
  30. data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
  31. data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
  32. data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
  33. data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
  34. data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
  35. data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
  36. data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
  37. data/lib/old/twitter_api.rb +88 -0
  38. data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
  39. data/lib/wuclan/delicious/delicious_models.rb +26 -0
  40. data/lib/wuclan/delicious/delicious_request.rb +65 -0
  41. data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
  42. data/lib/wuclan/friendster.rb +7 -0
  43. data/lib/wuclan/lastfm/model/base.rb +49 -0
  44. data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
  45. data/lib/wuclan/lastfm/scrape/base.rb +195 -0
  46. data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
  47. data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
  48. data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
  49. data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
  50. data/lib/wuclan/lastfm/scrape.rb +12 -0
  51. data/lib/wuclan/lastfm.rb +7 -0
  52. data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
  53. data/lib/wuclan/metrics/user_metrics.rb +443 -0
  54. data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
  55. data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
  56. data/lib/wuclan/metrics.rb +0 -0
  57. data/lib/wuclan/myspace.rb +21 -0
  58. data/lib/wuclan/open_social/model/base.rb +0 -0
  59. data/lib/wuclan/open_social/scrape/base.rb +111 -0
  60. data/lib/wuclan/open_social/scrape_request.rb +6 -0
  61. data/lib/wuclan/open_social.rb +0 -0
  62. data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
  63. data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
  64. data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
  65. data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
  66. data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
  67. data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
  68. data/lib/wuclan/twitter/api_response_examples.textile +300 -0
  69. data/lib/wuclan/twitter/model/base.rb +72 -0
  70. data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
  71. data/lib/wuclan/twitter/model/relationship.rb +176 -0
  72. data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
  73. data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
  74. data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
  75. data/lib/wuclan/twitter/model/text_element.rb +38 -0
  76. data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
  77. data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
  78. data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
  79. data/lib/wuclan/twitter/model/tweet.rb +74 -0
  80. data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
  81. data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
  82. data/lib/wuclan/twitter/model.rb +21 -0
  83. data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
  84. data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
  85. data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
  86. data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
  87. data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
  88. data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
  89. data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
  90. data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
  91. data/lib/wuclan/twitter/scrape/base.rb +97 -0
  92. data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
  93. data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
  94. data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
  95. data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
  96. data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
  97. data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
  98. data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
  99. data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
  100. data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
  101. data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
  102. data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
  103. data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
  104. data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
  105. data/lib/wuclan/twitter/scrape.rb +27 -0
  106. data/lib/wuclan/twitter.rb +7 -0
  107. data/lib/wuclan.rb +1 -0
  108. data/spec/spec_helper.rb +9 -0
  109. data/spec/wuclan_spec.rb +7 -0
  110. data/wuclan.gemspec +184 -0
  111. metadata +219 -0
@@ -0,0 +1,154 @@
1
+ require 'monkeyshines/scrape_request/raw_json_contents'
2
+ module Wuclan
3
+ module Lastfm
4
+ module Scrape
5
+
6
+ #
7
+ # Simple requestables
8
+ #
9
+
10
# Reopens LastfmArtistInfoRequest to assign its +requestables+ list.
# Presumably these are the request classes spawned as follow-ups for an
# artist -- TODO confirm against the scrape base class.
class LastfmArtistInfoRequest
  self.requestables = [
    LastfmArtistSimilarRequest, LastfmArtistTopAlbumsRequest, LastfmArtistTopTracksRequest,
    LastfmArtistShoutsRequest,  LastfmArtistEventsRequest,    LastfmArtistTopFansRequest,
    # left out: LastfmArtistTopTagsRequest, LastfmArtistImagesRequest, LastfmArtistPodcastRequest
  ]
end
21
+
22
# Reopens LastfmTrackInfoRequest to assign its +requestables+ list
# (similar tracks, top fans and top tags request classes).
class LastfmTrackInfoRequest
  self.requestables = [
    LastfmTrackSimilarRequest,
    LastfmTrackTopFansRequest,
    LastfmTrackTopTagsRequest,
  ]
end
25
# Reopens LastfmEventInfoRequest to assign its +requestables+ list
# (attendees and shouts request classes).
class LastfmEventInfoRequest
  self.requestables = [
    LastfmEventAttendeesRequest,
    LastfmEventShoutsRequest,
  ]
end
28
# Reopens LastfmUserTopTagsRequest to assign its +requestables+ list.
# The original author tagged this class "LastfmUserInfoRequest"; the list
# below notes that LastfmUserInfoRequest itself needs auth, so this class
# presumably stands in as the per-user entry point -- TODO confirm.
class LastfmUserTopTagsRequest # LastfmUserInfoRequest
  self.requestables = [
    # LastfmUserTopTagsRequest,
    LastfmUserEventsRequest,
    LastfmUserPastEventsRequest,
    LastfmUserFriendsRequest,        # recenttracks
    LastfmUserNeighboursRequest,
    LastfmUserLovedTracksRequest,
    LastfmUserRecentTracksRequest,
    LastfmUserShoutsRequest,
    # The three "top" requests accept an optional period:
    #   overall | 7day | 3month | 6month | 12month
    LastfmUserTopAlbumsRequest,
    LastfmUserTopArtistsRequest,
    LastfmUserTopTracksRequest,
    # uninteresting(?): LastfmUserPlaylistsRequest, LastfmUserWeeklyAlbumChartRequest,
    #   LastfmUserWeeklyArtistChartRequest, LastfmUserWeeklyChartListRequest,
    #   LastfmUserWeeklyTrackChartRequest,
    # needs auth: LastfmUserInfoRequest, LastfmUserRecentStationsRequest,
    #   LastfmUserRecommendedArtistsRequest, LastfmUserRecommendedEventsRequest,
  ]
end
45
+
46
+ #
47
+ # Recursive requests based on contents
48
+ #
49
+
50
# Mixin for requests that accept a "&period=" window.
#
# After delegating to +super+, a request whose identifier carries no
# "&period=" yields one sibling request (same class, same generation) for
# each of the 7day, 3month and 6month windows. A request that already has a
# period yields nothing extra.
module LastfmTimeWindowed
  def recursive_requests(*args, &block)
    super(*args, &block)
    # Already windowed: do not fan out again.
    return if identifier =~ /&period=/
    %w[7day 3month 6month].each do |window|
      windowed = self.class.new(identifier + "&period=#{window}")
      # Same entity, different window, so the generation is not incremented.
      windowed.generation = generation.to_i
      yield windowed
    end
  end
end
62
# Mix LastfmTimeWindowed into each of the user "top ..." request classes.
[
  LastfmUserTopArtistsRequest,
  LastfmUserTopAlbumsRequest,
  LastfmUserTopTracksRequest,
].each { |request_class| request_class.send(:include, LastfmTimeWindowed) }
66
+
67
# Mixin for requests whose +items+ are artists.
#
# After delegating to +super+, yields one LastfmArtistInfoRequest per item,
# built from the url-encoded 'name' of the item and placed one generation
# below the current request.
module LastfmContainsArtists
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |entry|
      artist_name = url_encode(entry['name'])
      follow_up   = LastfmArtistInfoRequest.new(artist_name)
      follow_up.generation = generation.to_i + 1
      yield follow_up
    end
  end
end
77
# Mix LastfmContainsArtists into every request class that lists artists.
[
  LastfmArtistSimilarRequest,
  LastfmGeoTopArtistsRequest,
  LastfmTagTopArtistsRequest,
  LastfmUserRecommendedArtistsRequest,
  LastfmUserTopArtistsRequest,
].each { |request_class| request_class.send(:include, LastfmContainsArtists) }
82
+
83
# Mixin for requests whose +items+ are albums.
#
# After delegating to +super+, yields one LastfmAlbumInfoRequest per item,
# one generation below the current request. Each item is read as a hash with
# 'name', 'artist' and 'mbid' keys.
module LastfmContainsAlbums
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |item|
      # The artist may be a nested hash (keyed 'name' or '#text') or a bare
      # string. The previous `... rescue nil` one-liner ran String#[] on a
      # bare string, which is a substring search: it produced the literal
      # "name"/"#text" or nil, never the artist. Anything else maps to nil.
      artist = item['artist']
      obj_artist =
        case artist
        when Hash   then artist['name'] || artist['#text']
        when String then artist
        end
      req = LastfmAlbumInfoRequest.from_identifier_hash(
        item['name'], :artist => obj_artist, :mbid => item['mbid'])
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
95
# Mix LastfmContainsAlbums into every request class that lists albums.
[
  LastfmArtistTopAlbumsRequest,
  LastfmTagTopAlbumsRequest,
  LastfmUserTopAlbumsRequest,
].each { |request_class| request_class.send(:include, LastfmContainsAlbums) }
99
+
100
# Mixin for requests whose +items+ are tracks.
#
# After delegating to +super+, yields one LastfmTrackInfoRequest per item,
# one generation below the current request. Each item is read as a hash with
# 'name', 'artist' and 'mbid' keys.
module LastfmContainsTracks
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |track|
      # The artist may be a nested hash (keyed 'name' or '#text') or a bare
      # string. The previous `... rescue nil` one-liner ran String#[] on a
      # bare string, which is a substring search: it produced the literal
      # "name"/"#text" or nil, never the artist. Anything else maps to nil.
      artist = track['artist']
      obj_artist =
        case artist
        when Hash   then artist['name'] || artist['#text']
        when String then artist
        end
      req = LastfmTrackInfoRequest.from_identifier_hash(
        track['name'], :artist => obj_artist, :mbid => track['mbid'])
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
112
# Mix LastfmContainsTracks into every request class that lists tracks.
[
  LastfmArtistTopTracksRequest,
  LastfmGeoTopTracksRequest,
  LastfmTagTopTracksRequest,
  LastfmTrackSimilarRequest,
  LastfmUserLovedTracksRequest,
  LastfmUserRecentTracksRequest,
  LastfmUserTopTracksRequest,
].each { |request_class| request_class.send(:include, LastfmContainsTracks) }
118
+
119
# Mixin for requests whose +items+ are events.
#
# After delegating to +super+, yields one LastfmEventInfoRequest per item,
# built from the 'id' of the item and placed one generation below the
# current request.
module LastfmContainsEvents
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |entry|
      event_id  = entry['id']
      follow_up = LastfmEventInfoRequest.new(event_id)
      follow_up.generation = generation.to_i + 1
      yield follow_up
    end
  end
end
129
# Mix LastfmContainsEvents into every request class that lists events.
[
  LastfmArtistEventsRequest,
  LastfmGeoEventsRequest,
  LastfmUserEventsRequest,
  LastfmUserPastEventsRequest,
  LastfmUserRecommendedEventsRequest,
  LastfmVenueEventsRequest,
  LastfmVenuePastEventsRequest,
].each { |request_class| request_class.send(:include, LastfmContainsEvents) }
135
+
136
# Mixin for requests whose +items+ are users.
#
# After delegating to +super+, yields one LastfmUserTopTagsRequest per item,
# built from the url-encoded 'name' of the item and placed one generation
# below the current request.
module LastfmContainsUsers
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |entry|
      user_name = url_encode(entry['name'])
      follow_up = LastfmUserTopTagsRequest.new(user_name)
      follow_up.generation = generation.to_i + 1
      yield follow_up
    end
  end
end
146
# Mix LastfmContainsUsers into every request class that lists users.
[
  LastfmArtistTopFansRequest,
  LastfmEventAttendeesRequest,
  LastfmGroupMembersRequest,
  LastfmTrackTopFansRequest,
  LastfmUserFriendsRequest,
  LastfmUserNeighboursRequest,
].each { |request_class| request_class.send(:include, LastfmContainsUsers) }
151
+
152
+ end
153
+ end
154
+ end
@@ -0,0 +1,12 @@
1
+ require 'monkeyshines/scrape_request/raw_json_contents'
2
+ require 'wuclan/lastfm/scrape/base.rb'
3
+ require 'wuclan/lastfm/scrape/concrete.rb'
4
+
5
# Last.fm scraping namespace. The job and request-stream classes are only
# registered for autoload here, so their files load on first reference.
module Wuclan
  module Lastfm
    module Scrape
      # [constant name, file that defines it]
      [
        [:LastfmJob,           'wuclan/lastfm/scrape/lastfm_job.rb'],
        [:LastfmRequestStream, 'wuclan/lastfm/scrape/lastfm_request_stream.rb'],
      ].each { |const_name, path| autoload const_name, path }
    end
  end
end
@@ -0,0 +1,7 @@
1
+ require 'monkeyshines/scrape_request/raw_json_contents'
2
# Top-level Last.fm namespace: registers its sub-namespaces for autoload.
module Wuclan
  module Lastfm
    # NOTE(review): no wuclan/lastfm/parse.rb appears to ship with this gem
    # version; if so, the first reference to Lastfm::Parse raises LoadError.
    # Confirm before relying on it.
    {
      :Scrape => 'wuclan/lastfm/scrape.rb',
      :Parse  => 'wuclan/lastfm/parse.rb',
    }.each_pair { |const_name, path| autoload const_name, path }
  end
end
@@ -0,0 +1,99 @@
1
+ module Wuclan
2
+ module Models
3
# Per-user graph measures.
#
# The Struct members are an id plus "sampled"/"with" counters per edge
# family. The fo/fr/re/at/rt/fv prefixes presumably stand for followers,
# friends, replies, @-mentions, retweets and favorites -- TODO confirm
# against the relationship models.
class UserGraphMetrics < Struct.new(
    :id,
    :any_with,
    :any_out_with,
    :any_in_with,
    #
    :fo_sampled,
    :fr_sampled,
    #
    :re_out_sampled,
    :re_in_sampled,
    #
    :at_out_sampled,
    :at_in_sampled,
    :at_out_with,
    :at_in_with,
    #
    :rt_out_sampled,
    :rt_in_sampled,
    :rt_out_with,
    :rt_in_with,
    #
    :fv_out_sampled,
    :fv_in_sampled,
    :fv_out_with,
    :fv_in_with
  )

  # Derived measures written by the get_* methods below. None of them is a
  # Struct member, so without these accessors every get_* call raised
  # NoMethodError on the setter. They are plain accessors (not new members)
  # so the positional layout of the Struct (members, to_a, new) is unchanged.
  attr_accessor :tw_out_share, :n1i_fv_share, :n1o_strong, :n1i_strong

  # ===========================================================================
  #
  # Graph Measures
  #

  #
  # Influx:
  #
  #   (messages/day) from all your n1
  #
  # This says how many messages you see go by in a day.
  #
  # A person with a massive influx is either not reading any tweets (uses
  # twitter as a podium), is dipping into twitter as a news river (we should
  # discount follow links), or is using a tool such as TweetDeck to fake
  # follow (we should more aggressively segregate their strong links)
  #
  # Not implemented yet: always returns nil.
  #
  def get_influx()
    #
  end

  #
  # tw_out_share -- Audience Share:
  #
  #   (your msgs out/day) / (Sum over n1o of msgs in / day)
  #
  # This says how much of your followers' attention is occupied by your tweet
  # stream
  #
  # Copies +tw_out_share+ from +twitter_user+ and returns it. The user is now
  # an explicit parameter, like the sibling get_* methods: the old
  # zero-argument form read an undefined local and could only raise.
  #
  def get_tw_out_share(twitter_user)
    self.tw_out_share = twitter_user.tw_out_share
  end

  #
  # n1i_fv_share -- Sum, for all that favorite you, of
  #
  #   (favs to you) / max(20, number faved)
  #
  # if I have 12 faves and four are to you you get (4/20)favshare ; if I have
  # twenty-four, and four are to you, that makes a (1/6)favshare contribution.
  #
  # Copies +n1i_fv_share+ from +twitter_user+ and returns it.
  #
  def get_n1i_fv_share(twitter_user)
    self.n1i_fv_share = twitter_user.n1i_fv_share
  end

  #
  # n1o_strong -- Strong links out
  #
  # Copies +n1o_strong+ from +twitter_user+ and returns it.
  #
  def get_n1o_strong(twitter_user)
    self.n1o_strong = twitter_user.n1o_strong
  end

  #
  # n1i_strong -- Strong links in
  #
  # Copies +n1i_strong+ from +twitter_user+ and returns it.
  #
  def get_n1i_strong(twitter_user)
    self.n1i_strong = twitter_user.n1i_strong
  end

  #
  # cluster_coeff -- Strong links between members of n1 over number of possible
  # links between members of n1
  #
  # Not implemented yet: always returns nil.
  #
  def cluster_coeff(twitter_user, multi_edge)
  end
end
97
+
98
+ end
99
+ end