wuclan 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. data/LICENSE.textile +20 -0
  2. data/README.textile +28 -0
  3. data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
  4. data/examples/analyze/strong_links/main.rb +51 -0
  5. data/examples/analyze/word_count/dump_schema.rb +13 -0
  6. data/examples/analyze/word_count/freq_user.rb +31 -0
  7. data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
  8. data/examples/analyze/word_count/word_count.pig +43 -0
  9. data/examples/analyze/word_count/word_count.rb +34 -0
  10. data/examples/lastfm/scrape/load_lastfm.rb +31 -0
  11. data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
  12. data/examples/lastfm/scrape/seed.tsv +147 -0
  13. data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
  14. data/examples/twitter/old/scrape_twitter_api.rb +104 -0
  15. data/examples/twitter/old/scrape_twitter_search.rb +57 -0
  16. data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
  17. data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
  18. data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
  19. data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
  20. data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
  21. data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
  22. data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
  23. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
  24. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
  25. data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
  26. data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
  27. data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
  28. data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
  29. data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
  30. data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
  31. data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
  32. data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
  33. data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
  34. data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
  35. data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
  36. data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
  37. data/lib/old/twitter_api.rb +88 -0
  38. data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
  39. data/lib/wuclan/delicious/delicious_models.rb +26 -0
  40. data/lib/wuclan/delicious/delicious_request.rb +65 -0
  41. data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
  42. data/lib/wuclan/friendster.rb +7 -0
  43. data/lib/wuclan/lastfm/model/base.rb +49 -0
  44. data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
  45. data/lib/wuclan/lastfm/scrape/base.rb +195 -0
  46. data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
  47. data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
  48. data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
  49. data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
  50. data/lib/wuclan/lastfm/scrape.rb +12 -0
  51. data/lib/wuclan/lastfm.rb +7 -0
  52. data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
  53. data/lib/wuclan/metrics/user_metrics.rb +443 -0
  54. data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
  55. data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
  56. data/lib/wuclan/metrics.rb +0 -0
  57. data/lib/wuclan/myspace.rb +21 -0
  58. data/lib/wuclan/open_social/model/base.rb +0 -0
  59. data/lib/wuclan/open_social/scrape/base.rb +111 -0
  60. data/lib/wuclan/open_social/scrape_request.rb +6 -0
  61. data/lib/wuclan/open_social.rb +0 -0
  62. data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
  63. data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
  64. data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
  65. data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
  66. data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
  67. data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
  68. data/lib/wuclan/twitter/api_response_examples.textile +300 -0
  69. data/lib/wuclan/twitter/model/base.rb +72 -0
  70. data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
  71. data/lib/wuclan/twitter/model/relationship.rb +176 -0
  72. data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
  73. data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
  74. data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
  75. data/lib/wuclan/twitter/model/text_element.rb +38 -0
  76. data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
  77. data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
  78. data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
  79. data/lib/wuclan/twitter/model/tweet.rb +74 -0
  80. data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
  81. data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
  82. data/lib/wuclan/twitter/model.rb +21 -0
  83. data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
  84. data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
  85. data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
  86. data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
  87. data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
  88. data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
  89. data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
  90. data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
  91. data/lib/wuclan/twitter/scrape/base.rb +97 -0
  92. data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
  93. data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
  94. data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
  95. data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
  96. data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
  97. data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
  98. data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
  99. data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
  100. data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
  101. data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
  102. data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
  103. data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
  104. data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
  105. data/lib/wuclan/twitter/scrape.rb +27 -0
  106. data/lib/wuclan/twitter.rb +7 -0
  107. data/lib/wuclan.rb +1 -0
  108. data/spec/spec_helper.rb +9 -0
  109. data/spec/wuclan_spec.rb +7 -0
  110. data/wuclan.gemspec +184 -0
  111. metadata +219 -0
data/LICENSE.textile ADDED
@@ -0,0 +1,20 @@
1
+ ---
2
+ layout: default
3
+ title: MIT License
4
+ ---
5
+
6
+ h1(gemheader). {{ site.gemname }} %(small):: license%
7
+
8
+ <notextile><div class="toggle"></notextile>
9
+
10
+ h2. MIT License
11
+
12
+ __Copyright (c) 2009 Philip (flip) Kromer__
13
+
14
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19
+
20
+ <notextile></div></notextile>
data/README.textile ADDED
@@ -0,0 +1,28 @@
1
+
2
+ h2. Help!
3
+
4
+ Send Wuclan questions to the "Infinite Monkeywrench mailing list":http://groups.google.com/group/infochimps-code
5
+
6
+ h3. lib/wuclan/models
7
+
8
+ Defines the Wukong objects we'll most often use
9
+
10
+ * The user models:
11
+
12
+ * TwitterUser
13
+ * TwitterUserProfiles
14
+
15
+
16
+
17
+ h3. lib/wuclan/request
18
+
19
+
20
+ * Request -- the basic request metadata
21
+
22
+ * Parse -- dispatches the request contents into wuclan objects
23
+
24
+ * Wuclan::Request::Streamer
25
+ ensures that the request is left alone while recordizing.
26
+
27
+
28
+ h3. lib/wuclan/
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/lib'
3
+ require 'wukong'
4
+ require 'wuclan/models/multi_edge'; include Wukong::Models
5
+
6
+ #
7
+ # Takes any number of flavors of directed edge with the form
8
+ #
9
+ # a_relatesto_b src_id dest_id [optional fields]
10
+ #
11
+ # and prepares a combined adjacency list. You need to supply a model named
12
+ # "MultiEdge" with members for each edge type.
13
+ #
14
+ # For instance, suppose you have a social network with edges like
15
+ #
16
+ # a_follows_b user_a_id user_b_id
17
+ # a_messages_b user_a_id user_b_id message_id date
18
+ # a_favorites_b user_a_id user_b_id message_id date
19
+ #
20
+ # Your MultiEdge class might look like
21
+ #
22
+ # class MultiEdge < Struct.new(
23
+ # :src, :dest,
24
+ # :a_follows_b, :b_follows_a,
25
+ # :a_messages_b, :b_messages_a,
26
+ # :a_favorites_b, :b_favorites_a
27
+ # )
28
+ # end
29
+ #
30
+ # The row for a user pair who follows each other; with user_a #24601 messaging b
31
+ # 57 times and favoriting 5 of user_b's messages; and user_b #8675309 messaging
32
+ # 62 times and favoriting none, will emerge as (tab separated, with [blank]
33
+ # indicating there is no text in that slot):
34
+ #
35
+ # ...
36
+ # 24601 8675309 1 1 57 62 5 [blank]
37
+ # ...
38
+ #
39
+ module GenMultiEdge
40
+ #
41
+ # Emit each relation as
42
+ #
43
+ # src dest rel
44
+ #
45
+ # Canonicalizes the src and dest ids to 10-character, zero-padded strings.
46
+ # (Ten chars fits a 32-bit up-to-4-billion-and-change unsigned integer.)
47
+ # Discards all the ancillary crap except +src+, +dest+ and +rel+
48
+ #
49
+ class Mapper < Wukong::Streamer::BasWukong::Streamer::Base
50
+ def process rsrc, src, dest, *_
51
+ # note that a_retweets_b_id matches here
52
+ m = /^a_([a-z]+)_b.*/.match(rsrc) or return
53
+ rel = m.captures.first
54
+ src = src.to_i ; dest = dest.to_i
55
+ return if ((src == 0) || (dest == 0))
56
+ yield ["%010d"%src, "%010d"%dest, "a_#{rel}_b"]
57
+ yield ["%010d"%dest, "%010d"%src, "b_#{rel}_a"]
58
+ end
59
+ end
60
+
61
+ #
62
+ # Aggregate all sightings of relations for each pair into
63
+ # a single combined MultiEdge record.
64
+ #
65
+ # Note that [a,b] and [b,a] /each/ have a listing, with the a->b and b<-a
66
+ # relations repeated for each. That is, if there is an "a_messages_b"
67
+ # relation, you'll have edges
68
+ #
69
+ # x y ... a_messages_b(x,y) b_messages_a(y,x) ...
70
+ # y x ... a_messages_b(y,x) b_messages_a(x,y) ...
71
+ #
72
+ #
73
+ class Reducer < Wukong::Streamer::AccumulatingReducer
74
+ attr_accessor :multi_edge
75
+ def get_key src, dest, rel
76
+ [src, dest]
77
+ end
78
+ def start! *args
79
+ self.multi_edge = MultiEdge.new
80
+ end
81
+ def accumulate src, dest, rel
82
+ self.multi_edge[rel] ||= 0
83
+ self.multi_edge[rel] += 1
84
+ end
85
+ def finalize
86
+ multi_edge.src, multi_edge.dest = key
87
+ yield self.multi_edge
88
+ end
89
+ end
90
+
91
+ #
92
+ # Sort on the first two keys: each @[src, dest]@ pair winds up at the same
93
+ # reducer.
94
+ #
95
+ class Script < Wukong::Script
96
+ def default_options
97
+ super.merge :sort_fields => 2
98
+ end
99
+ end
100
+
101
+ # Execute the script
102
+ Script.new(Mapper, Reducer).run
103
+ end
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+ require 'wuclan/models'; include Wuclan::Models
5
+ require 'wuclan/models/multi_edge'
6
+
7
+ TwitterUser.class_eval do
8
+ def age
9
+ end
10
+ end
11
+
12
+ # Common to all user-user relationships.
13
+ module RelationshipBase
14
+ def both_edge_rels
15
+ [ ["a_#{self.class.rel_name}_b", user_a_id, user_b_id],
16
+ ["b_#{self.class.rel_name}_a", user_b_id, user_a_id] ]
17
+ end
18
+ end
19
+
20
+
21
+ class Mapper < Wukong::Streamer::StructStreamer
22
+ def process thing, *args
23
+ case thing
24
+ when AFollowsB, AAtsignsBId
25
+ thing.both_edge_rels.each{|edge| yield edge }
26
+ end
27
+ end
28
+ end
29
+
30
+ class Reducer < Wukong::Streamer::AccumulatingReducer
31
+ end
32
+
33
+ #
34
+ # Sort on the first two keys: each @[src, dest]@ pair winds up at the same
35
+ # reducer.
36
+ #
37
+ class Script < Wukong::Script
38
+ def default_options
39
+ super.merge :sort_fields => 2
40
+ end
41
+ end
42
+
43
+ # Execute the script
44
+ Script.new(Mapper, Reducer).run
45
+
46
+
47
+ #
48
+ # Take
49
+ # a < [... followers ...]
50
+ # and
51
+ #
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+
4
+ require 'rubygems'
5
+ # require 'active_support'
6
+ require 'wukong' ; include Wukong
7
+ require 'wuclan' ; include Wuclan::Models
8
+ require 'wuclan/models/tweet/tokenize'
9
+
10
+ load '/home/flip/ics/projects/twitter_friends/lib/twitter_friends/and_pig/init_load.rb'
11
+
12
+
13
+ pig_load_dir 'meta/aorf/word_count', TweetToken
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'rubygems'
4
+ require 'wukong'
5
+ require 'wuclan' ; include Wuclan::Models
6
+ require 'wuclan/models/tweet/tokenize'
7
+ require 'wukong/streamer/count_keys'
8
+ require 'wukong/streamer/count_lines'
9
+
10
+ module FreqUser
11
+ class Mapper < Wukong::Streamer::StructStreamer
12
+ #
13
+ # extract just the word
14
+ #
15
+ def process thing, *args, &block
16
+ next unless thing.is_a? TweetToken
17
+ yield [thing.user_id, thing.word]
18
+ end
19
+ end
20
+
21
+ class Reducer < Wukong::Streamer::CountLines
22
+ end
23
+
24
+ # Execute the script
25
+ Wukong::Script.new(
26
+ Mapper,
27
+ Reducer,
28
+ :partition_fields => 2,
29
+ :sort_fields => 2
30
+ ).run
31
+ end
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'rubygems'
4
+ require 'wukong'
5
+ require 'wuclan' ; include Wuclan::Models
6
+ require 'wuclan/models/tweet/tokenize'
7
+ require 'wukong/streamer/count_keys'
8
+
9
+ module FreqWholeCorpus
10
+ class Mapper < Wukong::Streamer::StructStreamer
11
+ #
12
+ # extract just the word
13
+ #
14
+ def process thing, *args, &block
15
+ next unless thing.is_a? TweetToken
16
+ yield thing.word
17
+ end
18
+ end
19
+
20
+ # Execute the script
21
+ Wukong::Script.new(
22
+ Mapper,
23
+ Wukong::Streamer::CountKeys
24
+ ).run
25
+ end
26
+
27
+
@@ -0,0 +1,43 @@
1
+ aor1hd_ids = LOAD 'meta/aorf/nbhd/aor1hd_ids_1.tsv' AS (id:int) ;
2
+ TwitterUserId = LOAD 'twall/all/twitter_user_id' AS (rsrc: chararray, id: int, screen_name: chararray, full: int, followers_count: int, created_at: long, protected: int, status: chararray) ;
3
+ TwitterUser = LOAD 'twall/all/twitter_user' AS (rsrc: chararray, id: int, scraped_at: long, screen_name: chararray, protected: int, followers_count: int, friends_count: int, statuses_count: int, favourites_count: int, created_at: long) ;
4
+
5
+
6
+ -- aor_users_0 = JOIN aor1hd_ids BY id, TwitterUser BY id;
7
+ -- aor_users_1 = FOREACH aor_users_0 GENERATE
8
+ -- rsrc, TwitterUser::id, scraped_at, screen_name, protected, followers_count, friends_count, statuses_count, favourites_count, created_at ;
9
+ -- ;
10
+ -- rmf meta/aorf/nbhd/aor1hd_users ; STORE aor_users_1 INTO 'meta/aorf/nbhd/aor1hd_users';
11
+
12
+ -- AFollowsB = LOAD 'twall/all/a_follows_b' AS (rsrc: chararray, user_a_id: int, user_b_id: int) ;
13
+ -- aor_a_follows_b_0 = JOIN aor1hd_ids BY id, AFollowsB BY user_a_id;
14
+ -- aor_a_follows_b_1 = FOREACH aor_a_follows_b_0 GENERATE user_a_id, user_b_id ;
15
+ -- rmf meta/aorf/nbhd/aor_2hd_o ; STORE aor_a_follows_b_1 INTO 'meta/aorf/nbhd/aor_2hd_o';
16
+
17
+ -- aor1hd_ids = LOAD 'meta/aorf/nbhd/aor1hd_ids_1.tsv' AS (id:int) ;
18
+ -- toks = LOAD 'meta/aorf/word_count/tokens' AS (word:chararray, user_id:int, tweet_id:int, freq:int);
19
+ -- aor_toks_0 = JOIN aor1hd_ids BY id, toks BY user_id;
20
+ -- aor_toks = FOREACH aor_toks_0 GENERATE word, user_id, tweet_id, freq ;
21
+ -- STORE aor_toks INTO 'meta/aorf/word_count/aor_toks';
22
+
23
+ -- toks_0 = LOAD '/home/flip/ics/projects/twitter_friends/meta/aorf/word_count/aor_toks.tsv' AS (tok_word:chararray, user_id:int, tweet_id:int, freq:int);
24
+ -- toks = FOREACH toks_0 GENERATE tok_word, user_id ;
25
+ -- toks_g_0 = GROUP toks BY (user_id, tok_word) ;
26
+ -- toks_g = FOREACH toks_g_0 GENERATE FLATTEN(group.user_id) AS user_id, FLATTEN(group.tok_word) AS tok_word, COUNT(toks) AS freq;
27
+ -- rmf meta/aorf/word_count/aor_user_toks.tsv
28
+ -- STORE toks_g INTO 'meta/aorf/word_count/aor_user_toks.tsv' ;
29
+
30
+
31
+ user_toks_all = LOAD '/home/flip/ics/projects/twitter_friends/meta/aorf/word_count/aor_user_toks.tsv' AS (user_id:int, word:chararray, freq:int);
32
+ user_toks = FILTER user_toks_all BY freq > 5 ;
33
+ user_toks_g_0 = GROUP user_toks BY user_id ;
34
+ user_toks_g_1 = FOREACH user_toks_g_0 {
35
+ user_toks_sort = ORDER user_toks BY freq DESC;
36
+ GENERATE group AS user_id, user_toks_sort.(word, freq);
37
+ };
38
+
39
+ aor_users = LOAD '/home/flip/ics/projects/twitter_friends/meta/aorf/nbhd/aor1hd_users.tsv' AS (rsrc: chararray, id: int, scraped_at: long, screen_name: chararray, protected: int, followers_count: int, friends_count: int, statuses_count: int, favourites_count: int, created_at: long) ;
40
+ user_toks_id_0 = JOIN user_toks_g_1 BY user_id, aor_users BY id;
41
+ user_toks_id = FOREACH user_toks_id_0 GENERATE id, created_at, screen_name, followers_count, friends_count, statuses_count, user_toks_sort ;
42
+ rmf /home/flip/ics/projects/twitter_friends/meta/aorf/word_count/user_toks_id.tsv ; STORE user_toks_id INTO '/home/flip/ics/projects/twitter_friends/meta/aorf/word_count/user_toks_id.tsv' ;
43
+
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+
4
+ require 'rubygems'
5
+ # require 'active_support'
6
+ require 'wukong' ; include Wukong
7
+ require 'wuclan' ; include Wuclan::Models
8
+ require 'wuclan/models/tweet/tokenize'
9
+
10
+ module WordFreq
11
+ class Mapper < Wukong::Streamer::StructStreamer
12
+ #
13
+ # Extract all the semantic items (smilies, hashtags, etc)
14
+ # and all the remaining words from each tweet
15
+ #
16
+ def process thing, *args, &block
17
+ next unless thing.is_a? Tweet
18
+ # tokenize(true) to extract words as well as semantic tokens
19
+ thing.tokenize(true).each do |token|
20
+ # we call to_flat(false) to get the simple key
21
+ yield token.to_flat(false)
22
+ end
23
+ end
24
+ end
25
+
26
+
27
+ # Execute the script
28
+ Wukong::Script.new(
29
+ Mapper,
30
+ nil, # Reducer
31
+ :reduce_tasks => 0
32
+ ).run
33
+ end
34
+
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'monkeyshines'
4
+ require 'edamame'
5
+ require 'wuclan/lastfm' ; include Wuclan::Lastfm::Scrape
6
+ include Monkeyshines
7
+ # Setup
8
+ WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
9
+ Monkeyshines.load_global_options!
10
+ Monkeyshines.load_cmdline_options!
11
+ # Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:lastfm_api]
12
+ Wuclan::Lastfm::Scrape::Base.api_key = Monkeyshines::CONFIG[:lastfm_api][:api_key]
13
+
14
+ #
15
+ # Create store
16
+ #
17
+ source = Monkeyshines::Store::FlatFileStore.new(CONFIG[:source])
18
+ dest = Monkeyshines::RequestStream::EdamameQueue.new(
19
+ :queue => { :type => 'BeanstalkQueue', :uris => ['localhost:11250'] },
20
+ :store => { :type => 'TyrantStore', :uri => ':11251'}
21
+ )
22
+
23
+ source.each do |klass_name, *raw_req_args|
24
+ # Fetch basic artist info
25
+ klass = FactoryModule.get_class(Wuclan::Lastfm::Scrape, klass_name)
26
+ req = klass.new(Monkeyshines.url_encode(raw_req_args[0]))
27
+ req.req_generation = 0
28
+ dest.put req, nil, 0, 'scheduling' => Edamame::Scheduling::Every.new(4 *60*60)
29
+
30
+ p req.to_flat
31
+ end
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'monkeyshines'
4
+ require 'edamame'
5
+ require 'monkeyshines/recursive_runner'
6
+ require 'wuclan/lastfm' ; include Wuclan::Lastfm::Scrape
7
+ # Setup
8
+ WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
9
+ Monkeyshines.load_global_options!
10
+ Monkeyshines.load_cmdline_options!
11
+ Wuclan::Lastfm::Scrape::Base.api_key = Monkeyshines::CONFIG[:lastfm_api][:api_key]
12
+
13
+ #
14
+ # * jobs stream from an edamame job queue.
15
+ # * Many jobs generate paginated requests, stopping when a response overlaps the
16
+ # prev_max item.
17
+ # * Each request is fetched with the standard HTTP fetcher.
18
+ #
19
+ # * low-generation jobs are rescheduled based on the observed item rate
20
+ # * jobs can spawn recursive requests. These have their request_generation
21
+ # incremented
22
+ # * results are sent to a ChunkedFlatFileStore
23
+ #
24
+
25
+ #
26
+ # Create runner
27
+ #
28
+ scraper = Monkeyshines::RecursiveRunner.new({
29
+ :log => { :iters => 1, :dest => Monkeyshines::CONFIG[:handle] },
30
+ :source => { :type => Monkeyshines::RequestStream::KlassHashRequestStream,
31
+ :store => { :type => Monkeyshines::RequestStream::EdamameQueue,
32
+ :tube => Monkeyshines::CONFIG[:handle],
33
+ :queue => { :uris => ['localhost:11250'], :type => 'BeanstalkQueue', },
34
+ :store => { :uri => ':11251', :type => 'TyrantStore', }, }, },
35
+ :dest => { :type => :conditional_store,
36
+ :cache => { :uri => ':11252', },
37
+ :store => { :rootdir => WORK_DIR },},
38
+ # :fetcher => { :type => :fake_fetcher },
39
+ :force_fetch => false,
40
+ :sleep_time => 0.25, # Last.fm asks for > 0.2 please.
41
+ })
42
+
43
+ # Execute the scrape
44
+ loop do
45
+ puts Time.now
46
+ scraper.run
47
+ end
@@ -0,0 +1,147 @@
1
+ lastfm_artist_info_request Linkin Park
2
+ lastfm_artist_info_request Dead by Sunrise
3
+ lastfm_artist_info_request All American Rejects
4
+ lastfm_artist_info_request Enrique Iglesias
5
+
6
+ lastfm_artist_info_request Amir Derakh
7
+ lastfm_artist_info_request Brad Delson
8
+ lastfm_artist_info_request Brandon Belsky
9
+ lastfm_artist_info_request Chester Bennington
10
+ lastfm_artist_info_request Dave Farrell
11
+ lastfm_artist_info_request Elias Andra
12
+ lastfm_artist_info_request Fu Valcic
13
+ lastfm_artist_info_request Joe Hahn
14
+ lastfm_artist_info_request Julien-K
15
+ lastfm_artist_info_request Mike Shinoda
16
+ lastfm_artist_info_request Orgy
17
+ lastfm_artist_info_request Rob Bourdon
18
+ lastfm_artist_info_request Ryan Shuck
19
+
20
+ lastfm_artist_info_request Linkin Park
21
+ lastfm_artist_info_request Jay-Z and Linkin Park
22
+ lastfm_artist_info_request Linkin Park & Jay-Z
23
+ lastfm_artist_info_request Jay-Z & Linkin Park
24
+ lastfm_artist_info_request Busta Rhymes feat Linkin Park
25
+ lastfm_artist_info_request Busta Rhymes Feat. Linkin Park
26
+ lastfm_artist_info_request Britney Spears vs Linkin Park
27
+ lastfm_artist_info_request Jay-Z/ Linkin Park
28
+ lastfm_artist_info_request Jay-Z/Linkin Park
29
+ lastfm_artist_info_request Busta Rhymes Ft. Linkin Park
30
+ lastfm_artist_info_request Busta Rhymes Ft Linkin Park
31
+ lastfm_artist_info_request Busta Rhymes f. Linkin Park
32
+ lastfm_artist_info_request Linkin' Park
33
+ lastfm_artist_info_request Linkin Park & Adema
34
+ lastfm_artist_info_request Busta Rhymes & Linkin Park
35
+ lastfm_artist_info_request Big Jaz/Jay-Z/Linkin Park
36
+ lastfm_artist_info_request Chester Bennington of Linkin Park
37
+ lastfm_artist_info_request Linkin Park - [EMG]
38
+ lastfm_artist_info_request Linkin Park & Paramore
39
+ lastfm_artist_info_request Linkin Park vs Jay Z
40
+ lastfm_artist_info_request Britney Spears vs. Linkin Park
41
+ lastfm_artist_info_request Linkin Park/JayZ
42
+ lastfm_artist_info_request LinkinPark
43
+ lastfm_artist_info_request String Quartet Tribute to Linkin park
44
+ lastfm_artist_info_request Linkin Park feat. Jay-Z
45
+ lastfm_artist_info_request Jay Z & Linkin Park
46
+ lastfm_artist_info_request Fort Minor (Mike Shinoda Of Linkin Park Group)
47
+ lastfm_artist_info_request Linkin Park ft. Busta Rhymes
48
+ lastfm_artist_info_request Linkin Park Vs. Jay-Z
49
+ lastfm_artist_info_request Linkin Park/Jay Z
50
+ lastfm_artist_info_request Linkin Park And Jay-Z
51
+ lastfm_artist_info_request Linkin Park & Jay Z
52
+ lastfm_artist_info_request Linkin Park feat. Jay Z
53
+ lastfm_artist_info_request Jay Z and Linkin Park
54
+ lastfm_artist_info_request Jay Z ft. Linkin Park
55
+ lastfm_artist_info_request Evanescence & Linkin Park
56
+ lastfm_artist_info_request Linkin_Park
57
+ lastfm_artist_info_request Linkin Park/Jay-Z
58
+ lastfm_artist_info_request Jay Z/Linkin Park
59
+ lastfm_artist_info_request Jay Z ft Linkin Park
60
+ lastfm_artist_info_request Linkin Park f. Godsmack, Disturbed, Pantera, Limp Bizkit, Tool, Staind, Korn
61
+ lastfm_artist_info_request Evanescence, Linkin Park, Godsmack, Disturbed, Pantera, Limp Bizkit, Tool, Staind, Korn
62
+ lastfm_artist_info_request Creed, Nickelback, Incubus, Papa Roach, Staind, Pod, Linkin Park, Fuel, Christian Rock
63
+ lastfm_artist_info_request Linkin Park feat Jay-Z
64
+ lastfm_artist_info_request BSO - Linkin Park
65
+ lastfm_artist_info_request Linkin Park & Britney Spears
66
+ lastfm_artist_info_request Linkin Park vs. Britney Spears
67
+ lastfm_artist_info_request Jay-Z ft. Linkin Park
68
+ lastfm_artist_info_request Jay-Z ft Linkin Park
69
+ lastfm_artist_info_request Papa roach, Limp bizkit, Linkin park, KoRn
70
+ lastfm_artist_info_request Linkin Park ft Jay Z
71
+ lastfm_artist_info_request Linkin Park vs Britney Spears
72
+ lastfm_artist_info_request Jay-Z f. Linkin Park
73
+ lastfm_artist_info_request Linkin Park Jay-Z
74
+ lastfm_artist_info_request Evanescense & LINKIN PARK
75
+ lastfm_artist_info_request Linkin Park vs. Moby
76
+ lastfm_artist_info_request evanescence, linkin park, god
77
+ lastfm_artist_info_request korn & metallica & eminem & limp bizkit & linkin park
78
+ lastfm_artist_info_request Linkin Park (band)
79
+ lastfm_artist_info_request Linkin Park vs Evanescence
80
+ lastfm_artist_info_request Linkin Park vs. Genesis
81
+ lastfm_artist_info_request Busta Ryhmes feat Linkin Park
82
+ lastfm_artist_info_request Jay-Z / Linkin Park
83
+ lastfm_artist_info_request Jay-Z feat. Linkin Park
84
+ lastfm_artist_info_request Linkin Park ft. Jay-Z
85
+ lastfm_artist_info_request Linkin Park feat Jay Z
86
+ lastfm_artist_info_request Jay-Z; Linkin Park
87
+ lastfm_artist_info_request Big Jaz; Jay-Z; Linkin Park
88
+ lastfm_artist_info_request Linkin` Park
89
+ lastfm_artist_info_request Linkin Park DJ Lethal Chester Bennington
90
+ lastfm_artist_info_request Linkin Park & Depeche Mode
91
+ lastfm_artist_info_request Just!ce - Linkin Park
92
+ lastfm_artist_info_request Jay-Z Vs. linkin Park
93
+ lastfm_artist_info_request Linkin_park_feat_jay_z
94
+ lastfm_artist_info_request Linkin Park & Jay
95
+ lastfm_artist_info_request Depeche Mode vs. Linkin Park
96
+ lastfm_artist_info_request Linkin Park feat. Busta Rhymes
97
+ lastfm_artist_info_request Busta Rymes ft Linkin Park
98
+ lastfm_artist_info_request Linkin Park f. staind
99
+ lastfm_artist_info_request LinkinPark/Jay-Z
100
+ lastfm_artist_info_request Jay - Z/Linkin Park
101
+ lastfm_artist_info_request Xero (Linkin Park)
102
+ lastfm_artist_info_request Rihanna Feat. Linkin Park Feat. Ballboa /Rihanna Feat. Linkin Park Feat. Ballboa
103
+ lastfm_artist_info_request Soundtrack - Matrix Reloaded - Linkin Park
104
+ lastfm_artist_info_request Dream Theater + Linkin Park
105
+ lastfm_artist_info_request linkin park vs david banner
106
+ lastfm_artist_info_request Evanscence & Linkin Park -
107
+ lastfm_artist_info_request Linkin Park or Fort minor
108
+ lastfm_artist_info_request Linkin_Park_
109
+ lastfm_artist_info_request Jay-Z feat Linkin Park
110
+ lastfm_artist_info_request Linkin Park -
111
+ lastfm_artist_info_request Limp Bizkit Disturbed Cypress Hill Crazytown Linkin Park.mp
112
+ lastfm_artist_info_request Depeche Mode & Linkin Park
113
+ lastfm_artist_info_request Linkin Park ft. Jay Z
114
+ lastfm_artist_info_request X-Ecutioners Feat. Linkin Park
115
+ lastfm_artist_info_request Jay Z Linkin Park
116
+ lastfm_artist_info_request Linkin Park Tribute
117
+ lastfm_artist_info_request Depeche Mode and Linkin Park
118
+ lastfm_artist_info_request Bjork vs. Linkin Park
119
+ lastfm_artist_info_request Linkin Park Feat. Britney Spears
120
+ lastfm_artist_info_request Chester Bennington (Of Linkin Park)
121
+ lastfm_artist_info_request Juelz Santana & Linkin Park
122
+ lastfm_artist_info_request X-Ecutioners Featuring Mike Shinoda And Mr. Hahn OF Linkin Park
123
+ lastfm_artist_info_request Evanescence ft. Linkin Park
124
+ lastfm_artist_info_request Linkin Park vs. Justin Lassen
125
+ lastfm_artist_info_request Linkin Park & Fort Minor
126
+ lastfm_artist_info_request evanescence, linkin park
127
+ lastfm_artist_info_request Linkin Park / Jay Z / Paul McCartney
128
+ lastfm_artist_info_request Linkin Park [www.musikaki.blogspot.com]
129
+ lastfm_artist_info_request Linkin Park .:: www.PersianOne.com ::.
130
+ lastfm_artist_info_request Linkin Park Vs Jay-Z
131
+ lastfm_artist_info_request akon-feat-Linkin Park
132
+ lastfm_artist_info_request Paramore & Linkin Park
133
+ lastfm_artist_info_request Linkin Park & Jay-Z vs. Will Smith
134
+ lastfm_artist_info_request Jay-Z Linkin Park
135
+ lastfm_artist_info_request Linkin Park vs. Jay Z
136
+ lastfm_artist_info_request Jay-Z, Linkin Park
137
+ lastfm_artist_info_request Jay Z feat. Linkin Park
138
+ lastfm_artist_info_request Linkin Park, Jay-Z
139
+ lastfm_artist_info_request Linkin Park ft. JayZ
140
+
141
+ lastfm_artist_info_request Limp Bizkit
142
+ lastfm_artist_info_request Korn
143
+ lastfm_artist_info_request Katt Williams
144
+ lastfm_artist_info_request Avenged Sevenfold
145
+ lastfm_artist_info_request Eddie Izzard
146
+ lastfm_artist_info_request Slash
147
+ lastfm_artist_info_request Staind