wuclan 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
data/LICENSE.textile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: MIT License
|
4
|
+
---
|
5
|
+
|
6
|
+
h1(gemheader). {{ site.gemname }} %(small):: license%
|
7
|
+
|
8
|
+
<notextile><div class="toggle"></notextile>
|
9
|
+
|
10
|
+
h2. MIT License
|
11
|
+
|
12
|
+
__Copyright (c) 2009 Philip (flip) Kromer__
|
13
|
+
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
15
|
+
|
16
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
17
|
+
|
18
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
19
|
+
|
20
|
+
<notextile></div></notextile>
|
data/README.textile
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
|
2
|
+
h2. Help!
|
3
|
+
|
4
|
+
Send Wuclan questions to the "Infinite Monkeywrench mailing list":http://groups.google.com/group/infochimps-code
|
5
|
+
|
6
|
+
h3. lib/wuclan/models
|
7
|
+
|
8
|
+
Defines the Wukong objects we'll most often use
|
9
|
+
|
10
|
+
* The user models:
|
11
|
+
|
12
|
+
* TwitterUser
|
13
|
+
* TwitterUserProfiles
|
14
|
+
|
15
|
+
|
16
|
+
|
17
|
+
h3. lib/wuclan/request
|
18
|
+
|
19
|
+
|
20
|
+
* Request -- the basic request metadata
|
21
|
+
|
22
|
+
* Parse -- dispatches the request contents into wuclan objects
|
23
|
+
|
24
|
+
* Wuclan::Request::Streamer
|
25
|
+
ensures that the request is left alone while recordizing.
|
26
|
+
|
27
|
+
|
28
|
+
h3. lib/wuclan/
|
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/lib'
|
3
|
+
require 'wukong'
|
4
|
+
require 'wuclan/models/multi_edge'; include Wukong::Models
|
5
|
+
|
6
|
+
#
|
7
|
+
# Takes any number of flavors of directed edge with the form
|
8
|
+
#
|
9
|
+
# a_relatesto_b src_id dest_id [optional fields]
|
10
|
+
#
|
11
|
+
# and prepares a combined adjacency list. You need to supply a model named
|
12
|
+
# "MultiEdge" with members for each edge type.
|
13
|
+
#
|
14
|
+
# For instance, suppose you have a social network with edges like
|
15
|
+
#
|
16
|
+
# a_follows_b user_a_id user_b_id
|
17
|
+
# a_messages_b user_a_id user_b_id message_id date
|
18
|
+
# a_favorites_b user_a_id user_b_id message_id date
|
19
|
+
#
|
20
|
+
# Your MultiEdge class might look like
|
21
|
+
#
|
22
|
+
# class MultiEdge < Struct(
|
23
|
+
# :src, :dest,
|
24
|
+
# :a_follows_b, :b_follows_a,
|
25
|
+
# :a_messages_b, :b_messages_a,
|
26
|
+
# :a_favorites_b, :b_favorites_a
|
27
|
+
# )
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# The row for a user pair who follows each other; with user_a #24601 messaging b
|
31
|
+
# 57 times and favoriting 5 of user_b's messages; and user_b #8675309 messaging
|
32
|
+
# 62 times and favoriting none, will emerge as (tab separated, with [blank]
|
33
|
+
# indicating there is no text in that slot):
|
34
|
+
#
|
35
|
+
# ...
|
36
|
+
# 24601 8675309 1 1 57 62 5 [blank]
|
37
|
+
# ...
|
38
|
+
#
|
39
|
+
module GenMultiEdge
|
40
|
+
#
|
41
|
+
# Emit each relation as
|
42
|
+
#
|
43
|
+
# src dest rel
|
44
|
+
#
|
45
|
+
# Canonicalizes the src and dest ids to 10-character, zero-padded strings.
|
46
|
+
# (Ten chars fits a 32-bit up-to-4-billion-and-change unsigned integer.)
|
47
|
+
# Discards all the ancillary crap except +src+, +dest+ and +rel+
|
48
|
+
#
|
49
|
+
class Mapper < Wukong::Streamer::Base
  #
  # Turn one directed-edge row into two keyed records, one per direction.
  #
  # rsrc:: resource name of the row; only names of the form +a_<rel>_b...+
  #        are handled, everything else is silently skipped.
  # src::  id of the source user (anything responding to +to_i+)
  # dest:: id of the destination user (anything responding to +to_i+)
  # *_::   any trailing fields are ignored
  #
  # Yields <tt>[src, dest, "a_<rel>_b"]</tt> and then
  # <tt>[dest, src, "b_<rel>_a"]</tt>, with both ids zero-padded to 10 digits.
  #
  def process rsrc, src, dest, *_
    # note that a_retweets_b_id matches here
    m = /^a_([a-z]+)_b.*/.match(rsrc) or return
    rel = m.captures.first
    src = src.to_i ; dest = dest.to_i
    # an id that to_i turns into 0 (blank, non-numeric, or literally 0) is dropped
    return if ((src == 0) || (dest == 0))
    yield ["%010d"%src,  "%010d"%dest, "a_#{rel}_b"]
    yield ["%010d"%dest, "%010d"%src,  "b_#{rel}_a"]
  end
end
|
60
|
+
|
61
|
+
#
|
62
|
+
# Aggregate all sightings of relations for each pair into
|
63
|
+
# a single combined MultiEdge record.
|
64
|
+
#
|
65
|
+
# Note that [a,b] and [b,a] /each/ have a listing, with the a->b and b<-a
|
66
|
+
# relations repeated for each. That is, if there is an "a_messages_b"
|
67
|
+
# relation, you'll have edges
|
68
|
+
#
|
69
|
+
# x y ... a_messages_b(x,y) b_messages_a(y,x) ...
|
70
|
+
# y x ... a_messages_b(y,x) b_messages_a(x,y) ...
|
71
|
+
#
|
72
|
+
#
|
73
|
+
class Reducer < Wukong::Streamer::AccumulatingReducer
  # The MultiEdge record being built up for the current [src, dest] key.
  attr_accessor :multi_edge

  # Records are grouped on the [src, dest] pair; the relation is not part
  # of the key.
  def get_key src, dest, rel
    [src, dest]
  end

  # Begin a fresh, empty MultiEdge for a new key.
  def start! *args
    self.multi_edge = MultiEdge.new
  end

  # Count one more sighting of +rel+ for the current pair; a slot that has
  # not been seen yet starts from zero.
  def accumulate src, dest, rel
    self.multi_edge[rel] = (self.multi_edge[rel] || 0) + 1
  end

  # Stamp the accumulated record with its key's src and dest, then emit it.
  def finalize
    pair_src, pair_dest = key
    multi_edge.src  = pair_src
    multi_edge.dest = pair_dest
    yield self.multi_edge
  end
end
|
90
|
+
|
91
|
+
#
|
92
|
+
# Sort on the first two keys: each @[src, dest]@ pair winds up at the same
|
93
|
+
# reducer.
|
94
|
+
#
|
95
|
+
class Script < Wukong::Script
|
96
|
+
def default_options
|
97
|
+
super.merge :sort_fields => 2
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
# Execute the script
|
102
|
+
Script.new(Mapper, Reducer).run
|
103
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'wukong'
|
4
|
+
require 'wuclan/models'; include Wuclan::Models
|
5
|
+
require 'wuclan/models/multi_edge'
|
6
|
+
|
7
|
+
TwitterUser.class_eval do
|
8
|
+
def age
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
# Common to all user-user relationships.
|
13
|
+
module RelationshipBase
  #
  # Both directed views of this relationship, as a two-element array:
  # first <tt>["a_<rel>_b", user_a_id, user_b_id]</tt>, then
  # <tt>["b_<rel>_a", user_b_id, user_a_id]</tt>, where <rel> is the
  # including class's +rel_name+.
  #
  def both_edge_rels
    forward = ["a_#{self.class.rel_name}_b", user_a_id, user_b_id]
    reverse = ["b_#{self.class.rel_name}_a", user_b_id, user_a_id]
    [forward, reverse]
  end
end
|
19
|
+
|
20
|
+
|
21
|
+
class Mapper < Wukong::Streamer::StructStreamer
  #
  # For AFollowsB and AAtsignsBId records, yield each of the record's two
  # directed edge rows (see +both_edge_rels+). Any other record produces
  # no output.
  #
  def process thing, *args
    if AFollowsB === thing || AAtsignsBId === thing
      thing.both_edge_rels.each do |edge_row|
        yield edge_row
      end
    end
  end
end
|
29
|
+
|
30
|
+
class Reducer < Wukong::Streamer::AccumulatingReducer
|
31
|
+
end
|
32
|
+
|
33
|
+
#
|
34
|
+
# Sort on the first two keys: each @[src, dest]@ pair winds up at the same
|
35
|
+
# reducer.
|
36
|
+
#
|
37
|
+
class Script < Wukong::Script
|
38
|
+
def default_options
|
39
|
+
super.merge :sort_fields => 2
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Execute the script
|
44
|
+
Script.new(Mapper, Reducer).run
|
45
|
+
|
46
|
+
|
47
|
+
#
|
48
|
+
# Take
|
49
|
+
# a < [... followers ...]
|
50
|
+
# and
|
51
|
+
#
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
# require 'active_support'
|
6
|
+
require 'wukong' ; include Wukong
|
7
|
+
require 'wuclan' ; include Wuclan::Models
|
8
|
+
require 'wuclan/models/tweet/tokenize'
|
9
|
+
|
10
|
+
load '/home/flip/ics/projects/twitter_friends/lib/twitter_friends/and_pig/init_load.rb'
|
11
|
+
|
12
|
+
|
13
|
+
pig_load_dir 'meta/aorf/word_count', TweetToken
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'wukong'
|
5
|
+
require 'wuclan' ; include Wuclan::Models
|
6
|
+
require 'wuclan/models/tweet/tokenize'
|
7
|
+
require 'wukong/streamer/count_keys'
|
8
|
+
require 'wukong/streamer/count_lines'
|
9
|
+
|
10
|
+
module FreqUser
|
11
|
+
class Mapper < Wukong::Streamer::StructStreamer
|
12
|
+
#
|
13
|
+
# extract just the word
|
14
|
+
#
|
15
|
+
#
# Emit a [user_id, word] pair for each TweetToken; ignore anything else.
#
# thing:: the record handed in by the streamer
# *args:: unused
#
# Uses +return+ (not +next+) to skip non-token records: this is a method
# body, not a block, so +next+ is not valid here.
#
def process thing, *args, &block
  return unless thing.is_a? TweetToken
  yield [thing.user_id, thing.word]
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class Reducer < Wukong::Streamer::CountLines
|
22
|
+
end
|
23
|
+
|
24
|
+
# Execute the script
|
25
|
+
Wukong::Script.new(
|
26
|
+
Mapper,
|
27
|
+
Reducer,
|
28
|
+
:partition_fields => 2,
|
29
|
+
:sort_fields => 2
|
30
|
+
).run
|
31
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'wukong'
|
5
|
+
require 'wuclan' ; include Wuclan::Models
|
6
|
+
require 'wuclan/models/tweet/tokenize'
|
7
|
+
require 'wukong/streamer/count_keys'
|
8
|
+
|
9
|
+
module FreqWholeCorpus
|
10
|
+
class Mapper < Wukong::Streamer::StructStreamer
|
11
|
+
#
|
12
|
+
# extract just the word
|
13
|
+
#
|
14
|
+
#
# Emit just the word of each TweetToken; ignore anything else.
#
# thing:: the record handed in by the streamer
# *args:: unused
#
# Uses +return+ (not +next+) to skip non-token records: this is a method
# body, not a block, so +next+ is not valid here.
#
def process thing, *args, &block
  return unless thing.is_a? TweetToken
  yield thing.word
end
|
18
|
+
end
|
19
|
+
|
20
|
+
# Execute the script
|
21
|
+
Wukong::Script.new(
|
22
|
+
Mapper,
|
23
|
+
Wukong::Streamer::CountKeys
|
24
|
+
).run
|
25
|
+
end
|
26
|
+
|
27
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
aor1hd_ids = LOAD 'meta/aorf/nbhd/aor1hd_ids_1.tsv' AS (id:int) ;
|
2
|
+
TwitterUserId = LOAD 'twall/all/twitter_user_id' AS (rsrc: chararray, id: int, screen_name: chararray, full: int, followers_count: int, created_at: long, protected: int, status: chararray) ;
|
3
|
+
TwitterUser = LOAD 'twall/all/twitter_user' AS (rsrc: chararray, id: int, scraped_at: long, screen_name: chararray, protected: int, followers_count: int, friends_count: int, statuses_count: int, favourites_count: int, created_at: long) ;
|
4
|
+
|
5
|
+
|
6
|
+
-- aor_users_0 = JOIN aor1hd_ids BY id, TwitterUser BY id;
|
7
|
+
-- aor_users_1 = FOREACH aor_users_0 GENERATE
|
8
|
+
-- rsrc, TwitterUser::id, scraped_at, screen_name, protected, followers_count, friends_count, statuses_count, favourites_count, created_at ;
|
9
|
+
-- ;
|
10
|
+
-- rmf meta/aorf/nbhd/aor1hd_users ; STORE aor_users_1 INTO 'meta/aorf/nbhd/aor1hd_users';
|
11
|
+
|
12
|
+
-- AFollowsB = LOAD 'twall/all/a_follows_b' AS (rsrc: chararray, user_a_id: int, user_b_id: int) ;
|
13
|
+
-- aor_a_follows_b_0 = JOIN aor1hd_ids BY id, AFollowsB BY user_a_id;
|
14
|
+
-- aor_a_follows_b_1 = FOREACH aor_a_follows_b_0 GENERATE user_a_id, user_b_id ;
|
15
|
+
-- rmf meta/aorf/nbhd/aor_2hd_o ; STORE aor_a_follows_b_1 INTO 'meta/aorf/nbhd/aor_2hd_o';
|
16
|
+
|
17
|
+
-- aor1hd_ids = LOAD 'meta/aorf/nbhd/aor1hd_ids_1.tsv' AS (id:int) ;
|
18
|
+
-- toks = LOAD 'meta/aorf/word_count/tokens' AS (word:chararray, user_id:int, tweet_id:int, freq:int);
|
19
|
+
-- aor_toks_0 = JOIN aor1hd_ids BY id, toks BY user_id;
|
20
|
+
-- aor_toks = FOREACH aor_toks_0 GENERATE word, user_id, tweet_id, freq ;
|
21
|
+
-- STORE aor_toks INTO 'meta/aorf/word_count/aor_toks';
|
22
|
+
|
23
|
+
-- toks_0 = LOAD '/home/flip/ics/projects/twitter_friends/meta/aorf/word_count/aor_toks.tsv' AS (tok_word:chararray, user_id:int, tweet_id:int, freq:int);
|
24
|
+
-- toks = FOREACH toks_0 GENERATE tok_word, user_id ;
|
25
|
+
-- toks_g_0 = GROUP toks BY (user_id, tok_word) ;
|
26
|
+
-- toks_g = FOREACH toks_g_0 GENERATE FLATTEN(group.user_id) AS user_id, FLATTEN(group.tok_word) AS tok_word, COUNT(toks) AS freq;
|
27
|
+
-- rmf meta/aorf/word_count/aor_user_toks.tsv
|
28
|
+
-- STORE toks_g INTO 'meta/aorf/word_count/aor_user_toks.tsv' ;
|
29
|
+
|
30
|
+
|
31
|
+
user_toks_all = LOAD '/home/flip/ics/projects/twitter_friends/meta/aorf/word_count/aor_user_toks.tsv' AS (user_id:int, word:chararray, freq:int);
|
32
|
+
user_toks = FILTER user_toks_all BY freq > 5 ;
|
33
|
+
user_toks_g_0 = GROUP user_toks BY user_id ;
|
34
|
+
user_toks_g_1 = FOREACH user_toks_g_0 {
|
35
|
+
user_toks_sort = ORDER user_toks BY freq DESC;
|
36
|
+
GENERATE group AS user_id, user_toks_sort.(word, freq);
|
37
|
+
};
|
38
|
+
|
39
|
+
aor_users = LOAD '/home/flip/ics/projects/twitter_friends/meta/aorf/nbhd/aor1hd_users.tsv' AS (rsrc: chararray, id: int, scraped_at: long, screen_name: chararray, protected: int, followers_count: int, friends_count: int, statuses_count: int, favourites_count: int, created_at: long) ;
|
40
|
+
user_toks_id_0 = JOIN user_toks_g_1 BY user_id, aor_users BY id;
|
41
|
+
user_toks_id = FOREACH user_toks_id_0 GENERATE id, created_at, screen_name, followers_count, friends_count, statuses_count, user_toks_sort ;
|
42
|
+
rmf /home/flip/ics/projects/twitter_friends/meta/aorf/word_count/user_toks_id.tsv ; STORE user_toks_id INTO '/home/flip/ics/projects/twitter_friends/meta/aorf/word_count/user_toks_id.tsv' ;
|
43
|
+
|
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
# require 'active_support'
|
6
|
+
require 'wukong' ; include Wukong
|
7
|
+
require 'wuclan' ; include Wuclan::Models
|
8
|
+
require 'wuclan/models/tweet/tokenize'
|
9
|
+
|
10
|
+
module WordFreq
|
11
|
+
class Mapper < Wukong::Streamer::StructStreamer
|
12
|
+
#
|
13
|
+
# Extract all the semantic items (smilies, hashtags, etc)
|
14
|
+
# and all the remaining words from each tweet
|
15
|
+
#
|
16
|
+
#
# For each Tweet, emit the flattened form of every token it contains;
# ignore any record that is not a Tweet.
#
# thing:: the record handed in by the streamer
# *args:: unused
#
# Uses +return+ (not +next+) to skip non-tweet records: this is a method
# body, not a block, so +next+ is not valid here.
#
def process thing, *args, &block
  return unless thing.is_a? Tweet
  # tokenize(true) to extract words as well as semantic tokens
  thing.tokenize(true).each do |token|
    # we call to_flat(false) to get the simple key
    yield token.to_flat(false)
  end
end
|
24
|
+
end
|
25
|
+
|
26
|
+
|
27
|
+
# Execute the script
|
28
|
+
Wukong::Script.new(
|
29
|
+
Mapper,
|
30
|
+
nil, # Reducer
|
31
|
+
:reduce_tasks => 0
|
32
|
+
).run
|
33
|
+
end
|
34
|
+
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'monkeyshines'
|
4
|
+
require 'edamame'
|
5
|
+
require 'wuclan/lastfm' ; include Wuclan::Lastfm::Scrape
|
6
|
+
include Monkeyshines
|
7
|
+
# Setup
|
8
|
+
WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
|
9
|
+
Monkeyshines.load_global_options!
|
10
|
+
Monkeyshines.load_cmdline_options!
|
11
|
+
# Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:lastfm_api]
|
12
|
+
Wuclan::Lastfm::Scrape::Base.api_key = Monkeyshines::CONFIG[:lastfm_api][:api_key]
|
13
|
+
|
14
|
+
#
|
15
|
+
# Create store
|
16
|
+
#
|
17
|
+
source = Monkeyshines::Store::FlatFileStore.new(CONFIG[:source])
|
18
|
+
dest = Monkeyshines::RequestStream::EdamameQueue.new(
|
19
|
+
:queue => { :type => 'BeanstalkQueue', :uris => ['localhost:11250'] },
|
20
|
+
:store => { :type => 'TyrantStore', :uri => ':11251'}
|
21
|
+
)
|
22
|
+
|
23
|
+
source.each do |klass_name, *raw_req_args|
|
24
|
+
# Fetch basic artist info
|
25
|
+
klass = FactoryModule.get_class(Wuclan::Lastfm::Scrape, klass_name)
|
26
|
+
req = klass.new(Monkeyshines.url_encode(raw_req_args[0]))
|
27
|
+
req.req_generation = 0
|
28
|
+
dest.put req, nil, 0, 'scheduling' => Edamame::Scheduling::Every.new(4 *60*60)
|
29
|
+
|
30
|
+
p req.to_flat
|
31
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'monkeyshines'
|
4
|
+
require 'edamame'
|
5
|
+
require 'monkeyshines/recursive_runner'
|
6
|
+
require 'wuclan/lastfm' ; include Wuclan::Lastfm::Scrape
|
7
|
+
# Setup
|
8
|
+
WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
|
9
|
+
Monkeyshines.load_global_options!
|
10
|
+
Monkeyshines.load_cmdline_options!
|
11
|
+
Wuclan::Lastfm::Scrape::Base.api_key = Monkeyshines::CONFIG[:lastfm_api][:api_key]
|
12
|
+
|
13
|
+
#
|
14
|
+
# * jobs stream from an edamame job queue.
|
15
|
+
# * Many jobs generate paginated requests, stopping when a response overlaps the
|
16
|
+
# prev_max item.
|
17
|
+
# * Each request is fetched with the standard HTTP fetcher.
|
18
|
+
#
|
19
|
+
# * low-generation jobs are rescheduled based on the observed item rate
|
20
|
+
# * jobs can spawn recursive requests. These have their request_generation
|
21
|
+
# incremented
|
22
|
+
# * results are sent to a ChunkedFlatFileStore
|
23
|
+
#
|
24
|
+
|
25
|
+
#
|
26
|
+
# Create runner
|
27
|
+
#
|
28
|
+
scraper = Monkeyshines::RecursiveRunner.new({
|
29
|
+
:log => { :iters => 1, :dest => Monkeyshines::CONFIG[:handle] },
|
30
|
+
:source => { :type => Monkeyshines::RequestStream::KlassHashRequestStream,
|
31
|
+
:store => { :type => Monkeyshines::RequestStream::EdamameQueue,
|
32
|
+
:tube => Monkeyshines::CONFIG[:handle],
|
33
|
+
:queue => { :uris => ['localhost:11250'], :type => 'BeanstalkQueue', },
|
34
|
+
:store => { :uri => ':11251', :type => 'TyrantStore', }, }, },
|
35
|
+
:dest => { :type => :conditional_store,
|
36
|
+
:cache => { :uri => ':11252', },
|
37
|
+
:store => { :rootdir => WORK_DIR },},
|
38
|
+
# :fetcher => { :type => :fake_fetcher },
|
39
|
+
:force_fetch => false,
|
40
|
+
:sleep_time => 0.25, # Last.fm asks for > 0.2 please.
|
41
|
+
})
|
42
|
+
|
43
|
+
# Execute the scrape
|
44
|
+
loop do
|
45
|
+
puts Time.now
|
46
|
+
scraper.run
|
47
|
+
end
|
@@ -0,0 +1,147 @@
|
|
1
|
+
lastfm_artist_info_request Linkin Park
|
2
|
+
lastfm_artist_info_request Dead by Sunrise
|
3
|
+
lastfm_artist_info_request All American Rejects
|
4
|
+
lastfm_artist_info_request Enrique Iglesias
|
5
|
+
|
6
|
+
lastfm_artist_info_request Amir Derakh
|
7
|
+
lastfm_artist_info_request Brad Delson
|
8
|
+
lastfm_artist_info_request Brandon Belsky
|
9
|
+
lastfm_artist_info_request Chester Bennington
|
10
|
+
lastfm_artist_info_request Dave Farrell
|
11
|
+
lastfm_artist_info_request Elias Andra
|
12
|
+
lastfm_artist_info_request Fu Valcic
|
13
|
+
lastfm_artist_info_request Joe Hahn
|
14
|
+
lastfm_artist_info_request Julien-K
|
15
|
+
lastfm_artist_info_request Mike Shinoda
|
16
|
+
lastfm_artist_info_request Orgy
|
17
|
+
lastfm_artist_info_request Rob Bourdon
|
18
|
+
lastfm_artist_info_request Ryan Shuck
|
19
|
+
|
20
|
+
lastfm_artist_info_request Linkin Park
|
21
|
+
lastfm_artist_info_request Jay-Z and Linkin Park
|
22
|
+
lastfm_artist_info_request Linkin Park & Jay-Z
|
23
|
+
lastfm_artist_info_request Jay-Z & Linkin Park
|
24
|
+
lastfm_artist_info_request Busta Rhymes feat Linkin Park
|
25
|
+
lastfm_artist_info_request Busta Rhymes Feat. Linkin Park
|
26
|
+
lastfm_artist_info_request Britney Spears vs Linkin Park
|
27
|
+
lastfm_artist_info_request Jay-Z/ Linkin Park
|
28
|
+
lastfm_artist_info_request Jay-Z/Linkin Park
|
29
|
+
lastfm_artist_info_request Busta Rhymes Ft. Linkin Park
|
30
|
+
lastfm_artist_info_request Busta Rhymes Ft Linkin Park
|
31
|
+
lastfm_artist_info_request Busta Rhymes f. Linkin Park
|
32
|
+
lastfm_artist_info_request Linkin' Park
|
33
|
+
lastfm_artist_info_request Linkin Park & Adema
|
34
|
+
lastfm_artist_info_request Busta Rhymes & Linkin Park
|
35
|
+
lastfm_artist_info_request Big Jaz/Jay-Z/Linkin Park
|
36
|
+
lastfm_artist_info_request Chester Bennington of Linkin Park
|
37
|
+
lastfm_artist_info_request Linkin Park - [EMG]
|
38
|
+
lastfm_artist_info_request Linkin Park & Paramore
|
39
|
+
lastfm_artist_info_request Linkin Park vs Jay Z
|
40
|
+
lastfm_artist_info_request Britney Spears vs. Linkin Park
|
41
|
+
lastfm_artist_info_request Linkin Park/JayZ
|
42
|
+
lastfm_artist_info_request LinkinPark
|
43
|
+
lastfm_artist_info_request String Quartet Tribute to Linkin park
|
44
|
+
lastfm_artist_info_request Linkin Park feat. Jay-Z
|
45
|
+
lastfm_artist_info_request Jay Z & Linkin Park
|
46
|
+
lastfm_artist_info_request Fort Minor (Mike Shinoda Of Linkin Park Group)
|
47
|
+
lastfm_artist_info_request Linkin Park ft. Busta Rhymes
|
48
|
+
lastfm_artist_info_request Linkin Park Vs. Jay-Z
|
49
|
+
lastfm_artist_info_request Linkin Park/Jay Z
|
50
|
+
lastfm_artist_info_request Linkin Park And Jay-Z
|
51
|
+
lastfm_artist_info_request Linkin Park & Jay Z
|
52
|
+
lastfm_artist_info_request Linkin Park feat. Jay Z
|
53
|
+
lastfm_artist_info_request Jay Z and Linkin Park
|
54
|
+
lastfm_artist_info_request Jay Z ft. Linkin Park
|
55
|
+
lastfm_artist_info_request Evanescence & Linkin Park
|
56
|
+
lastfm_artist_info_request Linkin_Park
|
57
|
+
lastfm_artist_info_request Linkin Park/Jay-Z
|
58
|
+
lastfm_artist_info_request Jay Z/Linkin Park
|
59
|
+
lastfm_artist_info_request Jay Z ft Linkin Park
|
60
|
+
lastfm_artist_info_request Linkin Park f. Godsmack, Disturbed, Pantera, Limp Bizkit, Tool, Staind, Korn
|
61
|
+
lastfm_artist_info_request Evanescence, Linkin Park, Godsmack, Disturbed, Pantera, Limp Bizkit, Tool, Staind, Korn
|
62
|
+
lastfm_artist_info_request Creed, Nickelback, Incubus, Papa Roach, Staind, Pod, Linkin Park, Fuel, Christian Rock
|
63
|
+
lastfm_artist_info_request Linkin Park feat Jay-Z
|
64
|
+
lastfm_artist_info_request BSO - Linkin Park
|
65
|
+
lastfm_artist_info_request Linkin Park & Britney Spears
|
66
|
+
lastfm_artist_info_request Linkin Park vs. Britney Spears
|
67
|
+
lastfm_artist_info_request Jay-Z ft. Linkin Park
|
68
|
+
lastfm_artist_info_request Jay-Z ft Linkin Park
|
69
|
+
lastfm_artist_info_request Papa roach, Limp bizkit, Linkin park, KoRn
|
70
|
+
lastfm_artist_info_request Linkin Park ft Jay Z
|
71
|
+
lastfm_artist_info_request Linkin Park vs Britney Spears
|
72
|
+
lastfm_artist_info_request Jay-Z f. Linkin Park
|
73
|
+
lastfm_artist_info_request Linkin Park Jay-Z
|
74
|
+
lastfm_artist_info_request Evanescense & LINKIN PARK
|
75
|
+
lastfm_artist_info_request Linkin Park vs. Moby
|
76
|
+
lastfm_artist_info_request evanescence, linkin park, god
|
77
|
+
lastfm_artist_info_request korn & metallica & eminem & limp bizkit & linkin park
|
78
|
+
lastfm_artist_info_request Linkin Park (band)
|
79
|
+
lastfm_artist_info_request Linkin Park vs Evanescence
|
80
|
+
lastfm_artist_info_request Linkin Park vs. Genesis
|
81
|
+
lastfm_artist_info_request Busta Ryhmes feat Linkin Park
|
82
|
+
lastfm_artist_info_request Jay-Z / Linkin Park
|
83
|
+
lastfm_artist_info_request Jay-Z feat. Linkin Park
|
84
|
+
lastfm_artist_info_request Linkin Park ft. Jay-Z
|
85
|
+
lastfm_artist_info_request Linkin Park feat Jay Z
|
86
|
+
lastfm_artist_info_request Jay-Z; Linkin Park
|
87
|
+
lastfm_artist_info_request Big Jaz; Jay-Z; Linkin Park
|
88
|
+
lastfm_artist_info_request Linkin` Park
|
89
|
+
lastfm_artist_info_request Linkin Park DJ Lethal Chester Bennington
|
90
|
+
lastfm_artist_info_request Linkin Park & Depeche Mode
|
91
|
+
lastfm_artist_info_request Just!ce - Linkin Park
|
92
|
+
lastfm_artist_info_request Jay-Z Vs. linkin Park
|
93
|
+
lastfm_artist_info_request Linkin_park_feat_jay_z
|
94
|
+
lastfm_artist_info_request Linkin Park & Jay
|
95
|
+
lastfm_artist_info_request Depeche Mode vs. Linkin Park
|
96
|
+
lastfm_artist_info_request Linkin Park feat. Busta Rhymes
|
97
|
+
lastfm_artist_info_request Busta Rymes ft Linkin Park
|
98
|
+
lastfm_artist_info_request Linkin Park f. staind
|
99
|
+
lastfm_artist_info_request LinkinPark/Jay-Z
|
100
|
+
lastfm_artist_info_request Jay - Z/Linkin Park
|
101
|
+
lastfm_artist_info_request Xero (Linkin Park)
|
102
|
+
lastfm_artist_info_request Rihanna Feat. Linkin Park Feat. Ballboa /Rihanna Feat. Linkin Park Feat. Ballboa
|
103
|
+
lastfm_artist_info_request Soundtrack - Matrix Reloaded - Linkin Park
|
104
|
+
lastfm_artist_info_request Dream Theater + Linkin Park
|
105
|
+
lastfm_artist_info_request linkin park vs david banner
|
106
|
+
lastfm_artist_info_request Evanscence & Linkin Park -
|
107
|
+
lastfm_artist_info_request Linkin Park or Fort minor
|
108
|
+
lastfm_artist_info_request Linkin_Park_
|
109
|
+
lastfm_artist_info_request Jay-Z feat Linkin Park
|
110
|
+
lastfm_artist_info_request Linkin Park -
|
111
|
+
lastfm_artist_info_request Limp Bizkit Disturbed Cypress Hill Crazytown Linkin Park.mp
|
112
|
+
lastfm_artist_info_request Depeche Mode & Linkin Park
|
113
|
+
lastfm_artist_info_request Linkin Park ft. Jay Z
|
114
|
+
lastfm_artist_info_request X-Ecutioners Feat. Linkin Park
|
115
|
+
lastfm_artist_info_request Jay Z Linkin Park
|
116
|
+
lastfm_artist_info_request Linkin Park Tribute
|
117
|
+
lastfm_artist_info_request Depeche Mode and Linkin Park
|
118
|
+
lastfm_artist_info_request Bjork vs. Linkin Park
|
119
|
+
lastfm_artist_info_request Linkin Park Feat. Britney Spears
|
120
|
+
lastfm_artist_info_request Chester Bennington (Of Linkin Park)
|
121
|
+
lastfm_artist_info_request Juelz Santana & Linkin Park
|
122
|
+
lastfm_artist_info_request X-Ecutioners Featuring Mike Shinoda And Mr. Hahn OF Linkin Park
|
123
|
+
lastfm_artist_info_request Evanescence ft. Linkin Park
|
124
|
+
lastfm_artist_info_request Linkin Park vs. Justin Lassen
|
125
|
+
lastfm_artist_info_request Linkin Park & Fort Minor
|
126
|
+
lastfm_artist_info_request evanescence, linkin park
|
127
|
+
lastfm_artist_info_request Linkin Park / Jay Z / Paul McCartney
|
128
|
+
lastfm_artist_info_request Linkin Park [www.musikaki.blogspot.com]
|
129
|
+
lastfm_artist_info_request Linkin Park .:: www.PersianOne.com ::.
|
130
|
+
lastfm_artist_info_request Linkin Park Vs Jay-Z
|
131
|
+
lastfm_artist_info_request akon-feat-Linkin Park
|
132
|
+
lastfm_artist_info_request Paramore & Linkin Park
|
133
|
+
lastfm_artist_info_request Linkin Park & Jay-Z vs. Will Smith
|
134
|
+
lastfm_artist_info_request Jay-Z Linkin Park
|
135
|
+
lastfm_artist_info_request Linkin Park vs. Jay Z
|
136
|
+
lastfm_artist_info_request Jay-Z, Linkin Park
|
137
|
+
lastfm_artist_info_request Jay Z feat. Linkin Park
|
138
|
+
lastfm_artist_info_request Linkin Park, Jay-Z
|
139
|
+
lastfm_artist_info_request Linkin Park ft. JayZ
|
140
|
+
|
141
|
+
lastfm_artist_info_request Limp Bizkit
|
142
|
+
lastfm_artist_info_request Korn
|
143
|
+
lastfm_artist_info_request Katt Williams
|
144
|
+
lastfm_artist_info_request Avenged Sevenfold
|
145
|
+
lastfm_artist_info_request Eddie Izzard
|
146
|
+
lastfm_artist_info_request Slash
|
147
|
+
lastfm_artist_info_request Staind
|