wuclan 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
module Wuclan
|
|
2
|
+
module JsonModel
|
|
3
|
+
|
|
4
|
+
# ===========================================================================
|
|
5
|
+
#
|
|
6
|
+
# Public timeline is an array of tweets => users
|
|
7
|
+
#
|
|
8
|
+
#
|
|
9
|
+
class PublicTimelineParser < GenericJsonParser
|
|
10
|
+
attr_accessor :scraped_at
|
|
11
|
+
def initialize raw, context, scraped_at, *ignore
|
|
12
|
+
super raw
|
|
13
|
+
self.scraped_at = scraped_at
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
# Public timeline is an array of users with one tweet each
|
|
17
|
+
# True when a raw payload is present and it is an Array; otherwise the
# falsy payload itself (nil/false) or false.
def healthy?
  payload = raw
  payload && payload.is_a?(Array)
end
|
|
18
|
+
def each &block
|
|
19
|
+
raw.each do |hsh|
|
|
20
|
+
parsed = JsonTweet.new(hsh, nil)
|
|
21
|
+
next unless parsed && parsed.healthy?
|
|
22
|
+
[
|
|
23
|
+
parsed.generate_user_partial,
|
|
24
|
+
parsed.generate_tweet
|
|
25
|
+
].each do |obj|
|
|
26
|
+
yield obj
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
class SingleStatusParser < GenericJsonParser
|
|
33
|
+
attr_accessor :scraped_at
|
|
34
|
+
def initialize raw, context, scraped_at, *ignore
|
|
35
|
+
super raw
|
|
36
|
+
self.scraped_at = scraped_at
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# A single status is one hash, holding a tweet and its user
|
|
40
|
+
# Healthy when +raw+ is set and is a Hash. A missing payload is returned
# as-is (nil/false) so the result stays falsy.
def healthy?
  if (status = raw)
    status.is_a?(Hash)
  else
    status
  end
end
|
|
41
|
+
#
# Yields the objects extracted from the single status held in +raw+:
# first the partial user, then the tweet.
#
# Yields nothing (and returns nil) when the wrapped JsonTweet is not healthy.
#
def each &block
  parsed = JsonTweet.new(raw, nil)
  # There is no enclosing loop here, so bail out with `return`; the original
  # `next` is rejected by Ruby ("Invalid next") outside a block or loop.
  return unless parsed && parsed.healthy?
  [
    parsed.generate_user_partial,
    parsed.generate_tweet
  ].each do |obj|
    yield obj
  end
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
require 'wukong/encoding'
|
|
2
|
+
module Wuclan
|
|
3
|
+
module Twitter
|
|
4
|
+
module Scrape
|
|
5
|
+
|
|
6
|
+
#
|
|
7
|
+
# ScrapeRequest for the twitter Search API.
|
|
8
|
+
#
|
|
9
|
+
# Examines the parsed contents to describe result
|
|
10
|
+
#
|
|
11
|
+
class TwitterSearchRequest < Monkeyshines::ScrapeRequest
|
|
12
|
+
include Wuclan::Twitter::Model
|
|
13
|
+
|
|
14
|
+
#
|
|
15
|
+
# Parse
|
|
16
|
+
#
|
|
17
|
+
def parse *args, &block
|
|
18
|
+
items.each do |item|
|
|
19
|
+
self.encode_and_sanitize!(item)
|
|
20
|
+
tweet = tweet_from_parse(item)
|
|
21
|
+
from_user_sid = TwitterUserSearchId.new(item['from_user'], item['from_user_id'])
|
|
22
|
+
to_user_sid = TwitterUserSearchId.new(item['to_user'], item['to_user_id'] ) if (! item['to_user_id'].blank?)
|
|
23
|
+
yield tweet
|
|
24
|
+
yield from_user_sid
|
|
25
|
+
if to_user_sid
|
|
26
|
+
yield to_user_sid
|
|
27
|
+
yield ARepliesBName.new(
|
|
28
|
+
from_user_sid.screen_name, to_user_sid.screen_name,
|
|
29
|
+
tweet.id, nil,
|
|
30
|
+
from_user_sid.sid, to_user_sid.sid)
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def tweet_from_parse item
|
|
36
|
+
SearchTweet.new(item['id'], item['created_at'],
|
|
37
|
+
nil, nil, nil, # twitter_user_id, favorited, truncated
|
|
38
|
+
item[''], nil, item['text'],
|
|
39
|
+
item['source'],
|
|
40
|
+
item['to_user'], item['to_user_id'],
|
|
41
|
+
item['from_user'], item['from_user_id'], item['iso_language_code'])
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def encode_and_sanitize! item
|
|
45
|
+
item['from_user'].wukong_encode!(:url)
|
|
46
|
+
item['to_user'].wukong_encode!(:url) unless item['to_user'].blank?
|
|
47
|
+
item['text'].wukong_encode!
|
|
48
|
+
item['created_at'] = Time.parse(item['created_at']).utc.to_flat
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def self.encode_screen_name screen_name
|
|
52
|
+
screen_name.wukong_encode!(:url) if (screen_name =~ /\W/)
|
|
53
|
+
screen_name
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
end # TwitterSearchRequest (parse)
|
|
57
|
+
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
module Wuclan
|
|
2
|
+
module JsonModel
|
|
3
|
+
|
|
4
|
+
class UserParser < GenericJsonParser
|
|
5
|
+
attr_accessor :scraped_at
|
|
6
|
+
def initialize raw, context, scraped_at, *ignore
|
|
7
|
+
super raw
|
|
8
|
+
self.scraped_at = scraped_at
|
|
9
|
+
end
|
|
10
|
+
# A user payload is usable only when it is present and is a Hash.
def healthy?
  (payload = raw) ? payload.is_a?(Hash) : payload
end
|
|
11
|
+
|
|
12
|
+
def user
|
|
13
|
+
@user ||= JsonTwitterUser.new(raw, scraped_at)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def generate_twitter_user
|
|
17
|
+
user.generate_user_classes(TwitterUser).first
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def each &block
|
|
21
|
+
user.generate_user_profile_and_style.each do |obj|
|
|
22
|
+
yield obj
|
|
23
|
+
end
|
|
24
|
+
tweet = user.generate_tweet
|
|
25
|
+
yield tweet if tweet
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
require 'monkeyshines/scrape_request/raw_json_contents'
|
|
2
|
+
module Wuclan
|
|
3
|
+
module Twitter
|
|
4
|
+
module Scrape
|
|
5
|
+
# Effectively unlimited request maximum
|
|
6
|
+
# NO_LIMIT = 2**31
|
|
7
|
+
NO_LIMIT = (50_000_000 / 100) # controlled insanity
|
|
8
|
+
|
|
9
|
+
#
|
|
10
|
+
# Base class for twitter API requests
|
|
11
|
+
#
|
|
12
|
+
class Base < TypedStruct.new(
|
|
13
|
+
[:twitter_user_id, Integer],
|
|
14
|
+
[:page, Integer],
|
|
15
|
+
[:moreinfo, String],
|
|
16
|
+
[:url, String],
|
|
17
|
+
[:scraped_at, Bignum],
|
|
18
|
+
[:response_code, Integer],
|
|
19
|
+
[:response_message, String],
|
|
20
|
+
[:contents, String]
|
|
21
|
+
)
|
|
22
|
+
class_inheritable_accessor :resource_path, :page_limit, :max_items
|
|
23
|
+
include Monkeyshines::ScrapeRequestCore
|
|
24
|
+
# Let us be peers with AFollowsB and TwitterUser and etc.
|
|
25
|
+
include Wuclan::Twitter::Model
|
|
26
|
+
# Contents are JSON
|
|
27
|
+
include Monkeyshines::RawJsonContents
|
|
28
|
+
# Requests are paginated
|
|
29
|
+
include Monkeyshines::ScrapeRequest::Paginated
|
|
30
|
+
|
|
31
|
+
#
|
|
32
|
+
#
# A request is healthy when it has a URL and either has not been scraped
# yet, or has been scraped and carries a response code or contents.
#
# Returns true or false.
#
def healthy?
  # `blank?` (with the question mark) is the predicate used by every other
  # test in this expression; the original called the non-existent `blank`.
  (! url.blank?) && (               # has a URL and either:
    scraped_at.blank?        ||     # hasn't been scraped,
    (! response_code.blank?) ||     # or has, with response code
    (! contents.blank?) )           # or has, with response
end
|
|
38
|
+
|
|
39
|
+
# Generate request URL from other attributes
|
|
40
|
+
def make_url
|
|
41
|
+
# This works for most of the twitter calls
|
|
42
|
+
"http://twitter.com/#{resource_path}/#{twitter_user_id}.json?page=#{page||1}"
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def key
|
|
46
|
+
[twitter_user_id, page||1].join('-')
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Characters to scrub from contents.
|
|
50
|
+
# !! FIXME !! -- destructive.
|
|
51
|
+
BAD_CHARS = { "\r" => " ", "\n" => " ", "\t" => "	" }
|
|
52
|
+
#
|
|
53
|
+
# Set the contents from the fetch payload
|
|
54
|
+
#
|
|
55
|
+
def response= response
|
|
56
|
+
self.contents = response.body.gsub(/[\r\n\t]/){|c| BAD_CHARS[c]}
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
#
|
|
60
|
+
# Pagination
|
|
61
|
+
#
|
|
62
|
+
|
|
63
|
+
# creates the paginated request
|
|
64
|
+
def request_for_page page, pageinfo=nil
|
|
65
|
+
(page.to_i > 1) ? self.class.new(twitter_user_id, page) : self
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
# Number of items
|
|
69
|
+
def num_items
|
|
70
|
+
parsed_contents.length rescue 0
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# if from_result has something to say about the max_total_items, fix the
|
|
74
|
+
# value appropriately. (For example, a twitter_user's :statuses_count
|
|
75
|
+
# sets the max_total_items for a TwitterUserTimelineRequest)
|
|
76
|
+
def set_total_items from_result
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
# language: http://en.wikipedia.org/wiki/ISO_639-1
|
|
84
|
+
#
|
|
85
|
+
# * Find tweets containing a word: http://search.twitter.com/search.atom?q=twitter
|
|
86
|
+
# * Find tweets from a user: http://search.twitter.com/search.atom?q=from%3Aalexiskold
|
|
87
|
+
# * Find tweets to a user: http://search.twitter.com/search.atom?q=to%3Atechcrunch
|
|
88
|
+
# * Find tweets referencing a user: http://search.twitter.com/search.atom?q=%40mashable
|
|
89
|
+
# * Find tweets containing a hashtag: http://search.twitter.com/search.atom?q=%23haiku
|
|
90
|
+
# * Combine any of the operators together: http://search.twitter.com/search.atom?q=movie+%3A%29
|
|
91
|
+
#
|
|
92
|
+
# * lang: restricts tweets to the given language, given by an ISO 639-1 code. Ex: http://search.twitter.com/search.atom?lang=en&q=devo
|
|
93
|
+
# * rpp: the number of tweets to return per page, up to a max of 100. Ex: http://search.twitter.com/search.atom?lang=en&q=devo&rpp=15
|
|
94
|
+
# * page: the page number (starting at 1) to return, up to a max of roughly 1500 results (based on rpp * page)
|
|
95
|
+
# * since_id: returns tweets with status ids greater than the given id.
|
|
96
|
+
# * geocode: returns tweets by users located within a given radius of the given latitude/longitude, where the user's location is taken from their Twitter profile. The parameter value is specified by "latitude,longitude,radius", where radius units must be specified as either "mi" (miles) or "km" (kilometers). Ex: http://search.twitter.com/search.atom?geocode=40.757929%2C-73.985506%2C25km. Note that you cannot use the near operator via the API to geocode arbitrary locations; however you can use this geocode parameter to search near geocodes directly.
|
|
97
|
+
# * show_user: when "true", adds "<user>:" to the beginning of the tweet. This is useful for readers that do not display Atom's author field. The default is "false".
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
module Wuclan::Twitter::Scrape
|
|
2
|
+
#
|
|
3
|
+
# Older versions of wuclan had a slightly different request naming scheme, and
|
|
4
|
+
# had additional specialized fields. This module adapts old to new; you're only
|
|
5
|
+
# likely to need this if you're me, @mrflip.
|
|
6
|
+
#
|
|
7
|
+
module OldSkoolRequest
|
|
8
|
+
def initialize(priority, twitter_user_id, page, screen_name, url, *args)
|
|
9
|
+
self.twitter_user_id = twitter_user_id
|
|
10
|
+
super(twitter_user_id, page, screen_name, url, *args)
|
|
11
|
+
self.url = make_url
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def parse *args, &block
|
|
15
|
+
handle_special_cases!(*args, &block) or return
|
|
16
|
+
# super *args
|
|
17
|
+
yield self
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def handle_special_cases! *args, &block
|
|
21
|
+
if scraped_at.to_s !~ /\d{14}/
|
|
22
|
+
yield BadRecord.new({:bad_date => scraped_at}.to_json, self.to_flat)
|
|
23
|
+
return nil
|
|
24
|
+
end
|
|
25
|
+
true
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
class Followers < TwitterFollowersRequest ; include OldSkoolRequest ; end
|
|
30
|
+
class Friends < TwitterFriendsRequest ; include OldSkoolRequest ; end
|
|
31
|
+
class Favorites < TwitterFavoritesRequest ; include OldSkoolRequest ; end
|
|
32
|
+
class UserTimeline < TwitterUserTimelineRequest ; include OldSkoolRequest ; end
|
|
33
|
+
class Bogus < BadRecord ;
|
|
34
|
+
def parse suffix=nil, *args
|
|
35
|
+
errors = suffix.split('-')
|
|
36
|
+
klass_name = errors.pop
|
|
37
|
+
yield BadRecord.new("%-23s"%errors.to_json, klass_name, record)
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
|
|
2
|
+
#
|
|
3
|
+
# returns json for a hash / array as appropriate
|
|
4
|
+
#
|
|
5
|
+
class TwitterFakeFetcher < Monkeyshines::Fetcher::FakeFetcher
|
|
6
|
+
|
|
7
|
+
def fake_contents req
|
|
8
|
+
case req
|
|
9
|
+
when TwitterFollowersIdsRequest, TwitterFriendsIdsRequest
|
|
10
|
+
(1 .. rand(50)).map{ rand(1e6) }.to_json
|
|
11
|
+
when TwitterUserRequest
|
|
12
|
+
{ :id => (req.twitter_user_id.to_i == 0 ? rand(1e6) : req.twitter_user_id),
|
|
13
|
+
:created_at => Time.parse('08-08-2008 12:45'), :name => req.url, :protected => false,
|
|
14
|
+
:followers_count => rand(2001), :friends_count => rand(401), :statuses_count => rand(120), :favourites_count => rand(50) }.to_json
|
|
15
|
+
when TwitterFollowersRequest, TwitterFriendsRequest, TwitterFavoritesRequest
|
|
16
|
+
(0..req.max_items).map{|i| { :fetched => req.url, :id => i } }.to_json
|
|
17
|
+
else
|
|
18
|
+
if (req[:page].to_i > 1) && (rand(8) == 0)
|
|
19
|
+
[].to_json
|
|
20
|
+
else
|
|
21
|
+
(0..req.max_items).map{|i| { :fetched => req.url, :id => i } }.to_json
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def get req
|
|
27
|
+
super req
|
|
28
|
+
req.contents = fake_contents(req)
|
|
29
|
+
req
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
module Wuclan
|
|
2
|
+
module Twitter
|
|
3
|
+
module Scrape
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
# API request for the full list of a user's followers (IDs only).
|
|
7
|
+
# One call gives the whole list.
|
|
8
|
+
#
|
|
9
|
+
# Produces a (possibly very large) number of AFollowsB.
|
|
10
|
+
#
|
|
11
|
+
class TwitterFollowersIdsRequest < Wuclan::Twitter::Scrape::Base
|
|
12
|
+
self.resource_path = 'followers/ids'
|
|
13
|
+
self.hard_request_limit = 1
|
|
14
|
+
self.max_items = NO_LIMIT
|
|
15
|
+
# Returns 0 when +thing+ reports zero followers and 1 otherwise: a single
# call returns the whole id list.
def items_count(thing)
  return 0 if thing.followers_count == 0
  1
end
|
|
16
|
+
# Builds the request URL from the class's resource path and the user id.
# This endpoint takes no page parameter.
def make_url
  path    = resource_path
  user_id = twitter_user_id
  "http://twitter.com/#{path}/#{user_id}.json"
end
|
|
17
|
+
def key
|
|
18
|
+
twitter_user_id
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
# followers_ids should be an array of user_ids
|
|
22
|
+
def healthy?()
|
|
23
|
+
parsed_contents && parsed_contents.is_a?(Array)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
#
|
|
27
|
+
# unpacks the raw API response, yielding all the relationships.
|
|
28
|
+
#
|
|
29
|
+
#
# Unpacks the raw API response, yielding one AFollowsB per follower id.
#
# Each id in +parsed_contents+ is zero-padded to ten digits. Yields nothing
# when the response is not healthy (i.e. not an Array of ids).
#
def parse *args, &block
  return unless healthy?
  parsed_contents.each do |user_b_id|
    user_b_id = "%010d"%user_b_id.to_i
    # B is a follower: B follows the requested user. The requested user is
    # this request's +twitter_user_id+ (the original referenced +user_a_id+,
    # which is not a member of the request struct).
    yield AFollowsB.new(user_b_id, twitter_user_id)
  end
end
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
#
|
|
39
|
+
# API request for the full list of a user's friends (IDs only).
|
|
40
|
+
# One call gives the whole list.
|
|
41
|
+
#
|
|
42
|
+
# Produces a (possibly very large) number of AFollowsB.
|
|
43
|
+
#
|
|
44
|
+
class TwitterFriendsIdsRequest < Wuclan::Twitter::Scrape::Base
|
|
45
|
+
self.resource_path = 'friends/ids'
|
|
46
|
+
self.hard_request_limit = 1
|
|
47
|
+
self.max_items = NO_LIMIT
|
|
48
|
+
# Returns 0 when +thing+ reports zero friends and 1 otherwise: a single
# call returns the whole id list.
def items_count(thing)
  if thing.friends_count == 0
    0
  else
    1
  end
end
|
|
49
|
+
# Request URL for this user's friend ids: resource path plus user id, with
# no page parameter.
def make_url
  endpoint = resource_path
  uid      = twitter_user_id
  "http://twitter.com/#{endpoint}/#{uid}.json"
end
|
|
50
|
+
def key
|
|
51
|
+
twitter_user_id
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
#
|
|
55
|
+
# friends_ids should be an array of user_id's
|
|
56
|
+
#
|
|
57
|
+
def healthy?()
|
|
58
|
+
parsed_contents && parsed_contents.is_a?(Array)
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
#
|
|
62
|
+
# unpacks the raw API response, yielding all the relationships.
|
|
63
|
+
#
|
|
64
|
+
#
# Unpacks the raw API response, yielding one AFollowsB per friend id.
#
# Each id in +parsed_contents+ is zero-padded to ten digits. Yields nothing
# when the response is not healthy (i.e. not an Array of ids).
#
def parse *args, &block
  return unless healthy?
  parsed_contents.each do |user_b_id|
    user_b_id = "%010d"%user_b_id.to_i
    # B is a friend: the requested user follows B. The requested user is
    # this request's +twitter_user_id+ (the original referenced +user_a_id+,
    # which is not a member of the request struct).
    yield AFollowsB.new(twitter_user_id, user_b_id)
  end
end
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
module Wuclan
|
|
2
|
+
module Twitter
|
|
3
|
+
module Scrape
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
# API request for the timeline from a user's followers.
|
|
7
|
+
#
|
|
8
|
+
# Produces max 100 TwitterUser,Profile,Style, their most recent Tweet,
|
|
9
|
+
# and an AFollowsB link
|
|
10
|
+
#
|
|
11
|
+
# Before early 2009, produced TwitterUserPartials, not full records
|
|
12
|
+
#
|
|
13
|
+
# http://apiwiki.twitter.com/Twitter-REST-API-Method%3A-statuses%C2%A0followers
|
|
14
|
+
#
|
|
15
|
+
class TwitterFollowersRequest < Wuclan::Twitter::Scrape::Base
|
|
16
|
+
self.resource_path = 'statuses/followers'
|
|
17
|
+
self.hard_request_limit = NO_LIMIT
|
|
18
|
+
self.max_items = 100
|
|
19
|
+
# Item count for this request: the follower count reported by +thing+.
def items_count(thing)
  thing.followers_count
end
|
|
20
|
+
|
|
21
|
+
# set max_total_items from the followers_count.
|
|
22
|
+
def set_total_items twitter_user_info
|
|
23
|
+
self.max_total_items = twitter_user_info['followers_count'].to_i rescue nil
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# Extracted JSON should be an array
|
|
27
|
+
def healthy?()
|
|
28
|
+
parsed_contents && parsed_contents.is_a?(Array)
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
#
|
|
32
|
+
# unpacks the raw API response, yielding all the interesting objects
|
|
33
|
+
# and relationships within.
|
|
34
|
+
#
|
|
35
|
+
def parse *args, &block
|
|
36
|
+
return unless healthy?
|
|
37
|
+
parsed_contents.each do |hsh|
|
|
38
|
+
json_obj = JsonUserWithTweet.new(hsh, 'scraped_at' => scraped_at)
|
|
39
|
+
next unless json_obj && json_obj.healthy?
|
|
40
|
+
#
|
|
41
|
+
# Extract user, tweet and relationship
|
|
42
|
+
yield AFollowsB.new(json_obj.user.id, self.twitter_user_id) if json_obj.user
|
|
43
|
+
json_obj.each(&block)
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
#
|
|
49
|
+
# API request for the timeline from a user's friends.
|
|
50
|
+
#
|
|
51
|
+
# Produces max 100 TwitterUser,Profile,Style and their most recent Tweet
|
|
52
|
+
#
|
|
53
|
+
# Before early 2009, produced TwitterUserPartials, not full records
|
|
54
|
+
#
|
|
55
|
+
# http://apiwiki.twitter.com/Twitter-REST-API-Method%3A-statuses%C2%A0friends
|
|
56
|
+
#
|
|
57
|
+
class TwitterFriendsRequest < Wuclan::Twitter::Scrape::Base
|
|
58
|
+
self.resource_path = 'statuses/friends'
|
|
59
|
+
self.hard_request_limit = NO_LIMIT
|
|
60
|
+
self.max_items = 100
|
|
61
|
+
# Item count for this request: the friend count reported by +thing+.
def items_count(thing)
  thing.friends_count
end
|
|
62
|
+
|
|
63
|
+
# set max_total_items from the friends_count.
|
|
64
|
+
def set_total_items twitter_user_info
|
|
65
|
+
self.max_total_items = twitter_user_info['friends_count'].to_i rescue nil
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
# Extracted JSON should be an array
|
|
69
|
+
def healthy?()
|
|
70
|
+
parsed_contents && parsed_contents.is_a?(Array)
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
#
|
|
74
|
+
# unpacks the raw API response, yielding all the interesting objects
|
|
75
|
+
# and relationships within.
|
|
76
|
+
#
|
|
77
|
+
def parse *args, &block
|
|
78
|
+
return unless healthy?
|
|
79
|
+
parsed_contents.each do |hsh|
|
|
80
|
+
json_obj = JsonUserWithTweet.new(hsh, 'scraped_at' => scraped_at)
|
|
81
|
+
next unless json_obj && json_obj.healthy?
|
|
82
|
+
#
|
|
83
|
+
# Extract user, tweet and relationship
|
|
84
|
+
yield AFollowsB.new(self.twitter_user_id, json_obj.user.id) if json_obj.user
|
|
85
|
+
json_obj.each(&block)
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
#
|
|
91
|
+
# API request for the tweets favorited by the given user. At 20 tweets
|
|
92
|
+
# per page, this is the worst bargain on the Twitter API call market.
|
|
93
|
+
#
|
|
94
|
+
# Produces max 20 TwitterUser,Profile,Style and the favorited Tweet.
|
|
95
|
+
#
|
|
96
|
+
# Before early 2009, produced TwitterUserPartials, not full records
|
|
97
|
+
#
|
|
98
|
+
# http://apiwiki.twitter.com/Twitter-REST-API-Method%3A-favorites
|
|
99
|
+
#
|
|
100
|
+
class TwitterFavoritesRequest < Wuclan::Twitter::Scrape::Base
|
|
101
|
+
self.resource_path = 'favorites'
|
|
102
|
+
self.hard_request_limit = NO_LIMIT
|
|
103
|
+
self.max_items = 20
|
|
104
|
+
# Item count for this request: the favourites count reported by +thing+.
def items_count(thing)
  thing.favourites_count
end
|
|
105
|
+
|
|
106
|
+
# Extracted JSON should be an array
|
|
107
|
+
def healthy?()
|
|
108
|
+
parsed_contents && parsed_contents.is_a?(Array)
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
# set max_total_items from the favourites_count.
|
|
112
|
+
def set_total_items twitter_user_info
|
|
113
|
+
self.max_total_items = twitter_user_info['favourites_count'].to_i rescue nil
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
#
|
|
117
|
+
# unpacks the raw API response, yielding all the interesting objects
|
|
118
|
+
# and relationships within.
|
|
119
|
+
#
|
|
120
|
+
def parse *args, &block
|
|
121
|
+
return unless healthy?
|
|
122
|
+
parsed_contents.each do |hsh|
|
|
123
|
+
json_obj = JsonTweetWithUser.new(hsh, 'scraped_at' => scraped_at)
|
|
124
|
+
next unless json_obj && json_obj.healthy?
|
|
125
|
+
#
|
|
126
|
+
# Extract user, tweet and relationship
|
|
127
|
+
yield AFavoritesB.new(self.twitter_user_id, json_obj.user.id, json_obj.tweet.id) if json_obj.user && json_obj.tweet
|
|
128
|
+
json_obj.each(&block)
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
end
|
|
134
|
+
end
|
|
135
|
+
end
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
require 'wukong/encoding'
|
|
2
|
+
module Wuclan::Twitter::Scrape
|
|
3
|
+
class JsonUserTweetPair
|
|
4
|
+
include Wuclan::Twitter::Model
|
|
5
|
+
attr_accessor :raw, :moreinfo
|
|
6
|
+
def initialize raw, moreinfo
|
|
7
|
+
self.raw = raw
|
|
8
|
+
self.moreinfo = moreinfo
|
|
9
|
+
fix_raw_user!
|
|
10
|
+
fix_raw_tweet!
|
|
11
|
+
# p ['new', self.class, raw, moreinfo]
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
# Extracted JSON should be a hash (and so should the tweet, if there is one)
|
|
15
|
+
def healthy?()
|
|
16
|
+
raw && raw.is_a?(Hash) && (raw_tweet.nil? || raw_tweet.is_a?(Hash))
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
#
|
|
20
|
+
# generate all the contained TwitterXXX objects
|
|
21
|
+
#
|
|
22
|
+
def each
|
|
23
|
+
if is_partial?
|
|
24
|
+
yield user
|
|
25
|
+
else
|
|
26
|
+
yield user
|
|
27
|
+
yield user_profile
|
|
28
|
+
yield user_style
|
|
29
|
+
end
|
|
30
|
+
yield tweet if tweet
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
#
|
|
34
|
+
# Before mid-2009, most calls returned only the fields in
|
|
35
|
+
# TwitterUserPartial. After a mid-2009 API update, most calls return a full
|
|
36
|
+
# user record: TwitterUser, TwitterUserStyle and TwitterUserProfile
|
|
37
|
+
#
|
|
38
|
+
# This method tries to guess, based on the fields in the raw_user, which it has.
|
|
39
|
+
#
|
|
40
|
+
def is_partial?
|
|
41
|
+
not raw_user.include?('friends_count')
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def tweet
|
|
46
|
+
Tweet.from_hash raw_tweet if raw_tweet
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# create TwitterUser object from raw info
|
|
50
|
+
def user
|
|
51
|
+
if is_partial?
|
|
52
|
+
TwitterUserPartial.from_hash raw_user
|
|
53
|
+
else
|
|
54
|
+
TwitterUser.from_hash raw_user
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
def user_profile
|
|
58
|
+
TwitterUserProfile.from_hash raw_user
|
|
59
|
+
end
|
|
60
|
+
def user_style
|
|
61
|
+
TwitterUserStyle.from_hash raw_user
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
#
|
|
65
|
+
# Standardize the raw user hash's fields for further processing with Wukong
|
|
66
|
+
#
|
|
67
|
+
#
# Standardize the raw user hash's fields for further processing with Wukong.
#
# Mutates +raw_user+ in place: stamps scraped_at from +moreinfo+, flattens
# created_at, zero-pads the id, converts the boolean flags and encodes the
# free-text fields. Does nothing when there is no raw user.
#
def fix_raw_user!
  return unless raw_user
  raw_user['scraped_at'] = self.moreinfo['scraped_at']
  raw_user['created_at'] = ModelCommon.flatten_date(raw_user['created_at'])
  raw_user['id']         = ModelCommon.zeropad_id(  raw_user['id'])
  raw_user['protected']  = ModelCommon.unbooleanize(raw_user['protected'])
  # Read the flag from raw_user, not raw: when the payload is tweet-first,
  # +raw+ is the tweet hash and the user lives under raw['user'], so the
  # original lookup on +raw+ always converted nil.
  unless raw_user['profile_background_tile'].nil?
    raw_user['profile_background_tile'] = ModelCommon.unbooleanize(raw_user['profile_background_tile'])
  end
  Wukong.encode_components raw_user, 'name', 'location', 'description', 'url'
  # There are several users with bogus screen names
  # These we need to **URL encode** -- not XML-encode.
  if raw_user['screen_name'] !~ /\A\w+\z/
    raw_user['screen_name'] = Wukong.encode_str(raw_user['screen_name'], :url)
  end
end
|
|
81
|
+
|
|
82
|
+
#
|
|
83
|
+
# Standardize the raw tweet hash's fields for further processing with Wukong
|
|
84
|
+
#
|
|
85
|
+
def fix_raw_tweet!
|
|
86
|
+
return unless raw_tweet
|
|
87
|
+
raw_tweet['id'] = ModelCommon.zeropad_id( raw_tweet['id'])
|
|
88
|
+
raw_tweet['created_at'] = ModelCommon.flatten_date(raw_tweet['created_at'])
|
|
89
|
+
raw_tweet['favorited'] = ModelCommon.unbooleanize(raw_tweet['favorited'])
|
|
90
|
+
raw_tweet['truncated'] = ModelCommon.unbooleanize(raw_tweet['truncated'])
|
|
91
|
+
raw_tweet['twitter_user_id'] = ModelCommon.zeropad_id( raw_tweet['twitter_user_id'] )
|
|
92
|
+
raw_tweet['in_reply_to_user_id'] = ModelCommon.zeropad_id( raw_tweet['in_reply_to_user_id']) unless raw_tweet['in_reply_to_user_id'].blank? || (raw_tweet['in_reply_to_user_id'].to_i == 0)
|
|
93
|
+
raw_tweet['in_reply_to_status_id'] = ModelCommon.zeropad_id( raw_tweet['in_reply_to_status_id']) unless raw_tweet['in_reply_to_status_id'].blank? || (raw_tweet['in_reply_to_status_id'].to_i == 0)
|
|
94
|
+
Wukong.encode_components raw_tweet, 'text', 'in_reply_to_screen_name'
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class JsonUserWithTweet < JsonUserTweetPair
|
|
101
|
+
|
|
102
|
+
def raw_tweet
|
|
103
|
+
return @raw_tweet if @raw_tweet
|
|
104
|
+
@raw_tweet = raw['status']
|
|
105
|
+
@raw_tweet['twitter_user_id'] = raw_user['id'] if @raw_tweet
|
|
106
|
+
@raw_tweet
|
|
107
|
+
end
|
|
108
|
+
def raw_user
|
|
109
|
+
@raw_user ||= raw
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class JsonTweetWithUser < JsonUserTweetPair
|
|
115
|
+
|
|
116
|
+
def raw_tweet
|
|
117
|
+
@raw_tweet ||= raw
|
|
118
|
+
end
|
|
119
|
+
def raw_user
|
|
120
|
+
return @raw_user if @raw_user
|
|
121
|
+
@raw_user = raw['user']
|
|
122
|
+
@raw_user
|
|
123
|
+
end
|
|
124
|
+
end
|