wuclan 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/LICENSE.textile +20 -0
  2. data/README.textile +28 -0
  3. data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
  4. data/examples/analyze/strong_links/main.rb +51 -0
  5. data/examples/analyze/word_count/dump_schema.rb +13 -0
  6. data/examples/analyze/word_count/freq_user.rb +31 -0
  7. data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
  8. data/examples/analyze/word_count/word_count.pig +43 -0
  9. data/examples/analyze/word_count/word_count.rb +34 -0
  10. data/examples/lastfm/scrape/load_lastfm.rb +31 -0
  11. data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
  12. data/examples/lastfm/scrape/seed.tsv +147 -0
  13. data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
  14. data/examples/twitter/old/scrape_twitter_api.rb +104 -0
  15. data/examples/twitter/old/scrape_twitter_search.rb +57 -0
  16. data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
  17. data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
  18. data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
  19. data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
  20. data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
  21. data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
  22. data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
  23. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
  24. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
  25. data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
  26. data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
  27. data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
  28. data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
  29. data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
  30. data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
  31. data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
  32. data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
  33. data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
  34. data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
  35. data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
  36. data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
  37. data/lib/old/twitter_api.rb +88 -0
  38. data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
  39. data/lib/wuclan/delicious/delicious_models.rb +26 -0
  40. data/lib/wuclan/delicious/delicious_request.rb +65 -0
  41. data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
  42. data/lib/wuclan/friendster.rb +7 -0
  43. data/lib/wuclan/lastfm/model/base.rb +49 -0
  44. data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
  45. data/lib/wuclan/lastfm/scrape/base.rb +195 -0
  46. data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
  47. data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
  48. data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
  49. data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
  50. data/lib/wuclan/lastfm/scrape.rb +12 -0
  51. data/lib/wuclan/lastfm.rb +7 -0
  52. data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
  53. data/lib/wuclan/metrics/user_metrics.rb +443 -0
  54. data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
  55. data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
  56. data/lib/wuclan/metrics.rb +0 -0
  57. data/lib/wuclan/myspace.rb +21 -0
  58. data/lib/wuclan/open_social/model/base.rb +0 -0
  59. data/lib/wuclan/open_social/scrape/base.rb +111 -0
  60. data/lib/wuclan/open_social/scrape_request.rb +6 -0
  61. data/lib/wuclan/open_social.rb +0 -0
  62. data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
  63. data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
  64. data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
  65. data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
  66. data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
  67. data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
  68. data/lib/wuclan/twitter/api_response_examples.textile +300 -0
  69. data/lib/wuclan/twitter/model/base.rb +72 -0
  70. data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
  71. data/lib/wuclan/twitter/model/relationship.rb +176 -0
  72. data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
  73. data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
  74. data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
  75. data/lib/wuclan/twitter/model/text_element.rb +38 -0
  76. data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
  77. data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
  78. data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
  79. data/lib/wuclan/twitter/model/tweet.rb +74 -0
  80. data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
  81. data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
  82. data/lib/wuclan/twitter/model.rb +21 -0
  83. data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
  84. data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
  85. data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
  86. data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
  87. data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
  88. data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
  89. data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
  90. data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
  91. data/lib/wuclan/twitter/scrape/base.rb +97 -0
  92. data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
  93. data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
  94. data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
  95. data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
  96. data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
  97. data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
  98. data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
  99. data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
  100. data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
  101. data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
  102. data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
  103. data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
  104. data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
  105. data/lib/wuclan/twitter/scrape.rb +27 -0
  106. data/lib/wuclan/twitter.rb +7 -0
  107. data/lib/wuclan.rb +1 -0
  108. data/spec/spec_helper.rb +9 -0
  109. data/spec/wuclan_spec.rb +7 -0
  110. data/wuclan.gemspec +184 -0
  111. metadata +219 -0
module Wuclan
  module JsonModel

    # ===========================================================================
    #
    # Public timeline is an array of tweets => users
    #
    class PublicTimelineParser < GenericJsonParser
      attr_accessor :scraped_at

      def initialize raw, context, scraped_at, *ignore
        super raw
        self.scraped_at = scraped_at
      end

      # Public timeline is an array of users with one tweet each
      def healthy?() raw && raw.is_a?(Array) end

      # Yields the user-partial and the tweet extracted from every healthy
      # entry in the raw array.
      def each &block
        raw.each do |hsh|
          parsed = JsonTweet.new(hsh, nil)
          next unless parsed && parsed.healthy?
          [ parsed.generate_user_partial,
            parsed.generate_tweet,
          ].each do |obj|
            yield obj
          end
        end
      end
    end

    #
    # Parses a single status: the raw payload is one JSON hash, not an array.
    #
    class SingleStatusParser < GenericJsonParser
      attr_accessor :scraped_at

      def initialize raw, context, scraped_at, *ignore
        super raw
        self.scraped_at = scraped_at
      end

      def healthy?() raw && raw.is_a?(Hash) end

      # Yields the user-partial and the tweet from the single raw hash.
      def each &block
        parsed = JsonTweet.new(raw, nil)
        # FIX: original used `next` here -- `next` outside an iterator block
        # is a SyntaxError in Ruby; `return` is the intended early exit.
        return unless parsed && parsed.healthy?
        [ parsed.generate_user_partial,
          parsed.generate_tweet,
        ].each do |obj|
          yield obj
        end
      end
    end
  end
end
require 'wukong/encoding'
module Wuclan
  module Twitter
    module Scrape

      #
      # ScrapeRequest for the twitter Search API.
      #
      # Examines the parsed contents to describe result
      #
      class TwitterSearchRequest < Monkeyshines::ScrapeRequest
        include Wuclan::Twitter::Model

        # Walks each search-result item, sanitizing it in place, then yields
        # the tweet, the search ids of the participants and -- when the tweet
        # is a reply -- an ARepliesBName edge between them.
        def parse *args, &block
          items.each do |item|
            self.encode_and_sanitize!(item)
            tweet         = tweet_from_parse(item)
            from_user_sid = TwitterUserSearchId.new(item['from_user'], item['from_user_id'])
            to_user_sid   = TwitterUserSearchId.new(item['to_user'],   item['to_user_id']) unless item['to_user_id'].blank?
            yield tweet
            yield from_user_sid
            next unless to_user_sid
            yield to_user_sid
            yield ARepliesBName.new(
              from_user_sid.screen_name, to_user_sid.screen_name,
              tweet.id, nil,
              from_user_sid.sid, to_user_sid.sid)
          end
        end

        # Builds a SearchTweet from one raw search-result hash.
        def tweet_from_parse item
          SearchTweet.new(item['id'], item['created_at'],
            nil, nil, nil, # twitter_user_id, favorited, truncated
            item[''], nil, item['text'], # FIXME(review): item[''] looks like a mistyped key -- confirm the intended field
            item['source'],
            item['to_user'],   item['to_user_id'],
            item['from_user'], item['from_user_id'], item['iso_language_code'])
        end

        # URL-encodes the user names, XML-encodes the text, and flattens the
        # timestamp. Mutates +item+ in place.
        def encode_and_sanitize! item
          item['from_user'].wukong_encode!(:url)
          item['to_user'].wukong_encode!(:url) unless item['to_user'].blank?
          item['text'].wukong_encode!
          item['created_at'] = Time.parse(item['created_at']).utc.to_flat
        end

        # URL-encode a screen name only when it contains non-word characters.
        def self.encode_screen_name screen_name
          screen_name.wukong_encode!(:url) if (screen_name =~ /\W/)
          screen_name
        end

      end # TwitterSearchRequest (parse)

    end
  end
end
module Wuclan
  module JsonModel

    # Parses a single twitter-user JSON hash into model objects.
    class UserParser < GenericJsonParser
      attr_accessor :scraped_at

      def initialize raw, context, scraped_at, *ignore
        super raw
        self.scraped_at = scraped_at
      end

      # A parseable response is a single JSON hash.
      def healthy?() raw && raw.is_a?(Hash) end

      # Memoized wrapper object around the raw user hash.
      def user
        @user ||= JsonTwitterUser.new(raw, scraped_at)
      end

      def generate_twitter_user
        user.generate_user_classes(TwitterUser).first
      end

      # Yields the user's profile and style objects, then the embedded
      # tweet if one is present.
      def each &block
        user.generate_user_profile_and_style.each{|obj| yield obj }
        tweet = user.generate_tweet
        yield tweet if tweet
      end
    end

  end
end
require 'monkeyshines/scrape_request/raw_json_contents'
module Wuclan
  module Twitter
    module Scrape
      # Effectively unlimited request maximum
      # NO_LIMIT = 2**31
      NO_LIMIT = (50_000_000 / 100) # controlled insanity

      #
      # Base class for twitter API requests
      #
      class Base < TypedStruct.new(
          [:twitter_user_id,  Integer],
          [:page,             Integer],
          [:moreinfo,         String],
          [:url,              String],
          [:scraped_at,       Bignum],
          [:response_code,    Integer],
          [:response_message, String],
          [:contents,         String]
          )
        class_inheritable_accessor :resource_path, :page_limit, :max_items
        include Monkeyshines::ScrapeRequestCore
        # Let us be peers with AFollowsB and TwitterUser and etc.
        include Wuclan::Twitter::Model
        # Contents are JSON
        include Monkeyshines::RawJsonContents
        # Requests are paginated
        include Monkeyshines::ScrapeRequest::Paginated

        # A request is healthy when it has a URL and either hasn't been
        # scraped yet or carries a response code or a response body.
        #
        # FIX: original called +url.blank+ (no such method -- NoMethodError
        # at runtime); the predicate is +blank?+.
        def healthy?
          (! url.blank?) && (           # has a URL and either:
            scraped_at.blank? ||        # hasn't been scraped,
            (! response_code.blank?) || # or has, with response code
            (! contents.blank?) )       # or has, with response
        end

        # Generate request URL from other attributes
        def make_url
          # This works for most of the twitter calls
          "http://twitter.com/#{resource_path}/#{twitter_user_id}.json?page=#{page||1}"
        end

        # Uniquely identifies this request: user id plus page number.
        def key
          [twitter_user_id, page||1].join('-')
        end

        # Characters to scrub from contents.
        # !! FIXME !! -- destructive.
        BAD_CHARS = { "\r" => "&#13;", "\n" => "&#10;", "\t" => "&#9;" }

        #
        # Set the contents from the fetch payload
        #
        def response= response
          self.contents = response.body.gsub(/[\r\n\t]/){|c| BAD_CHARS[c]}
        end

        #
        # Pagination
        #

        # creates the paginated request
        def request_for_page page, pageinfo=nil
          (page.to_i > 1) ? self.class.new(twitter_user_id, page) : self
        end

        # Number of items
        def num_items
          parsed_contents.length rescue 0
        end

        # if from_result has something to say about the max_total_items, fix the
        # value appropriately. (For example, a twitter_user's :statuses_count
        # sets the max_total_items for a TwitterUserTimelineRequest)
        def set_total_items from_result
        end
      end
    end
  end
end

# language: http://en.wikipedia.org/wiki/ISO_639-1
#
# * Find tweets containing a word: http://search.twitter.com/search.atom?q=twitter
# * Find tweets from a user: http://search.twitter.com/search.atom?q=from%3Aalexiskold
# * Find tweets to a user: http://search.twitter.com/search.atom?q=to%3Atechcrunch
# * Find tweets referencing a user: http://search.twitter.com/search.atom?q=%40mashable
# * Find tweets containing a hashtag: http://search.twitter.com/search.atom?q=%23haiku
# * Combine any of the operators together: http://search.twitter.com/search.atom?q=movie+%3A%29
#
# * lang: restricts tweets to the given language, given by an ISO 639-1 code. Ex: http://search.twitter.com/search.atom?lang=en&q=devo
# * rpp: the number of tweets to return per page, up to a max of 100. Ex: http://search.twitter.com/search.atom?lang=en&q=devo&rpp=15
# * page: the page number (starting at 1) to return, up to a max of roughly 1500 results (based on rpp * page)
# * since_id: returns tweets with status ids greater than the given id.
# * geocode: returns tweets by users located within a given radius of the given latitude/longitude, where the user's location is taken from their Twitter profile. The parameter value is specified by "latitide,longitude,radius", where radius units must be specified as either "mi" (miles) or "km" (kilometers). Ex: http://search.twitter.com/search.atom?geocode=40.757929%2C-73.985506%2C25km. Note that you cannot use the near operator via the API to geocode arbitrary locations; however you can use this geocode parameter to search near geocodes directly.
# * show_user: when "true", adds "<user>:" to the beginning of the tweet. This is useful for readers that do not display Atom's author field. The default is "false".
module Wuclan::Twitter::Scrape
  #
  # Older versions of wuclan had a slightly different request naming scheme, and
  # had additional specialized fields. This module adapts old to new; you're only
  # likely to need this if you're me, @mrflip.
  #
  module OldSkoolRequest
    def initialize(priority, twitter_user_id, page, screen_name, url, *args)
      self.twitter_user_id = twitter_user_id
      super(twitter_user_id, page, screen_name, url, *args)
      self.url = make_url
    end

    # Yields self once any special cases have been dealt with.
    def parse *args, &block
      handle_special_cases!(*args, &block) or return
      # super *args
      yield self
    end

    # Returns true when the record looks sane; otherwise yields a BadRecord
    # and returns nil so parse bails out.
    def handle_special_cases! *args, &block
      return true if scraped_at.to_s =~ /\d{14}/
      yield BadRecord.new({ :bad_date => scraped_at }.to_json, self.to_flat)
      nil
    end
  end

  class Followers    < TwitterFollowersRequest    ; include OldSkoolRequest ; end
  class Friends      < TwitterFriendsRequest      ; include OldSkoolRequest ; end
  class Favorites    < TwitterFavoritesRequest    ; include OldSkoolRequest ; end
  class UserTimeline < TwitterUserTimelineRequest ; include OldSkoolRequest ; end

  class Bogus < BadRecord ;
    # NOTE(review): assumes +suffix+ is always given despite the nil default
    # (nil would raise NoMethodError on split) -- confirm against callers.
    def parse suffix=nil, *args
      errors     = suffix.split('-')
      klass_name = errors.pop
      yield BadRecord.new("%-23s"%errors.to_json, klass_name, record)
    end
  end
end

#
# returns json for a hash / array as appropriate
#
class TwitterFakeFetcher < Monkeyshines::Fetcher::FakeFetcher

  # Synthesizes a plausible JSON body for the given request type.
  def fake_contents req
    case req
    when TwitterFollowersIdsRequest, TwitterFriendsIdsRequest
      # a random-length list of random user ids
      (1 .. rand(50)).map{ rand(1e6) }.to_json
    when TwitterUserRequest
      { :id              => (req.twitter_user_id.to_i == 0 ? rand(1e6) : req.twitter_user_id),
        :created_at      => Time.parse('08-08-2008 12:45'), :name => req.url, :protected => false,
        :followers_count => rand(2001), :friends_count => rand(401),
        :statuses_count  => rand(120),  :favourites_count => rand(50) }.to_json
    when TwitterFollowersRequest, TwitterFriendsRequest, TwitterFavoritesRequest
      (0..req.max_items).map{|idx| { :fetched => req.url, :id => idx } }.to_json
    else
      # later pages occasionally come back empty
      if (req[:page].to_i > 1) && (rand(8) == 0)
        [].to_json
      else
        (0..req.max_items).map{|idx| { :fetched => req.url, :id => idx } }.to_json
      end
    end
  end

  # Runs the parent fetch bookkeeping, then installs the fake body.
  def get req
    super req
    req.contents = fake_contents(req)
    req
  end
end
module Wuclan
  module Twitter
    module Scrape

      #
      # API request for the full list of a user's followers (IDs only).
      # One call gives the whole list.
      #
      # Produces a (possibly very large) number of AFollowsB.
      #
      class TwitterFollowersIdsRequest < Wuclan::Twitter::Scrape::Base
        self.resource_path      = 'followers/ids'
        self.hard_request_limit = 1
        self.max_items          = NO_LIMIT
        def items_count(thing) thing.followers_count == 0 ? 0 : 1 end
        def make_url() "http://twitter.com/#{resource_path}/#{twitter_user_id}.json" end
        def key
          twitter_user_id
        end

        # followers_ids should be an array of user_ids
        def healthy?()
          parsed_contents && parsed_contents.is_a?(Array)
        end

        #
        # unpacks the raw API response, yielding all the relationships.
        #
        def parse *args, &block
          # FIX: original referenced an undefined +user_a_id+ (NameError when
          # executed). This request's own user id, zero-padded the same way
          # as user_b_id, is the A side of each edge.
          user_a_id = "%010d"%twitter_user_id.to_i
          parsed_contents.each do |user_b_id|
            user_b_id = "%010d"%user_b_id.to_i
            # B is a follower: B follows user.
            yield AFollowsB.new(user_b_id, user_a_id)
          end
        end
      end

      #
      # API request for the full list of a user's friends (IDs only).
      # One call gives the whole list.
      #
      # Produces a (possibly very large) number of AFollowsB.
      #
      class TwitterFriendsIdsRequest < Wuclan::Twitter::Scrape::Base
        self.resource_path      = 'friends/ids'
        self.hard_request_limit = 1
        self.max_items          = NO_LIMIT
        def items_count(thing) thing.friends_count == 0 ? 0 : 1 end
        def make_url() "http://twitter.com/#{resource_path}/#{twitter_user_id}.json" end
        def key
          twitter_user_id
        end

        #
        # friends_ids should be an array of user_id's
        #
        def healthy?()
          parsed_contents && parsed_contents.is_a?(Array)
        end

        #
        # unpacks the raw API response, yielding all the relationships.
        #
        def parse *args, &block
          # FIX: same undefined +user_a_id+ as above -- derive from our own id.
          user_a_id = "%010d"%twitter_user_id.to_i
          parsed_contents.each do |user_b_id|
            user_b_id = "%010d"%user_b_id.to_i
            # B is a friend: user follows B
            yield AFollowsB.new(user_a_id, user_b_id)
          end
        end
      end

    end
  end
end
module Wuclan
  module Twitter
    module Scrape

      #
      # API request for the timeline from a user's followers.
      #
      # Produces max 100 TwitterUser,Profile,Style, their most recent Tweet,
      # and an AFollowsB link
      #
      # Before early 2009, produced TwitterUserPartials, not full records
      #
      # http://apiwiki.twitter.com/Twitter-REST-API-Method%3A-statuses%C2%A0followers
      #
      class TwitterFollowersRequest < Wuclan::Twitter::Scrape::Base
        self.resource_path      = 'statuses/followers'
        self.hard_request_limit = NO_LIMIT
        self.max_items          = 100
        def items_count(thing) thing.followers_count end

        # set max_total_items from the followers_count.
        def set_total_items twitter_user_info
          self.max_total_items = twitter_user_info['followers_count'].to_i rescue nil
        end

        # Extracted JSON should be an array
        def healthy?()
          parsed_contents && parsed_contents.is_a?(Array)
        end

        # unpacks the raw API response, yielding each follower's user objects,
        # their latest tweet, and the AFollowsB edge (follower -> this user).
        def parse *args, &block
          return unless healthy?
          parsed_contents.each do |user_hsh|
            json_obj = JsonUserWithTweet.new(user_hsh, 'scraped_at' => scraped_at)
            next unless json_obj && json_obj.healthy?
            yield AFollowsB.new(json_obj.user.id, self.twitter_user_id) if json_obj.user
            json_obj.each(&block)
          end
        end
      end

      #
      # API request for the timeline from a user's friends.
      #
      # Produces max 100 TwitterUser,Profile,Style and their most recent Tweet
      #
      # Before early 2009, produced TwitterUserPartials, not full records
      #
      # http://apiwiki.twitter.com/Twitter-REST-API-Method%3A-statuses%C2%A0friends
      #
      class TwitterFriendsRequest < Wuclan::Twitter::Scrape::Base
        self.resource_path      = 'statuses/friends'
        self.hard_request_limit = NO_LIMIT
        self.max_items          = 100
        def items_count(thing) thing.friends_count end

        # set max_total_items from the friends_count.
        def set_total_items twitter_user_info
          self.max_total_items = twitter_user_info['friends_count'].to_i rescue nil
        end

        # Extracted JSON should be an array
        def healthy?()
          parsed_contents && parsed_contents.is_a?(Array)
        end

        # unpacks the raw API response, yielding each friend's user objects,
        # their latest tweet, and the AFollowsB edge (this user -> friend).
        def parse *args, &block
          return unless healthy?
          parsed_contents.each do |user_hsh|
            json_obj = JsonUserWithTweet.new(user_hsh, 'scraped_at' => scraped_at)
            next unless json_obj && json_obj.healthy?
            yield AFollowsB.new(self.twitter_user_id, json_obj.user.id) if json_obj.user
            json_obj.each(&block)
          end
        end
      end

      #
      # API request for the tweets favorited by the given user. At 20 requests
      # per page, this is the worst bargain on the Twitter API call market.
      #
      # Produces max 20 TwitterUser,Profile,Style and the favorited Tweet.
      #
      # Before early 2009, produced TwitterUserPartials, not full records
      #
      # http://apiwiki.twitter.com/Twitter-REST-API-Method%3A-favorites
      #
      class TwitterFavoritesRequest < Wuclan::Twitter::Scrape::Base
        self.resource_path      = 'favorites'
        self.hard_request_limit = NO_LIMIT
        self.max_items          = 20
        def items_count(thing) thing.favourites_count end

        # Extracted JSON should be an array
        def healthy?()
          parsed_contents && parsed_contents.is_a?(Array)
        end

        # set max_total_items from the favourites_count.
        def set_total_items twitter_user_info
          self.max_total_items = twitter_user_info['favourites_count'].to_i rescue nil
        end

        # unpacks the raw API response, yielding each favorited tweet, its
        # author's user objects, and the AFavoritesB edge.
        def parse *args, &block
          return unless healthy?
          parsed_contents.each do |tweet_hsh|
            json_obj = JsonTweetWithUser.new(tweet_hsh, 'scraped_at' => scraped_at)
            next unless json_obj && json_obj.healthy?
            yield AFavoritesB.new(self.twitter_user_id, json_obj.user.id, json_obj.tweet.id) if json_obj.user && json_obj.tweet
            json_obj.each(&block)
          end
        end
      end

    end
  end
end
require 'wukong/encoding'
module Wuclan::Twitter::Scrape
  # Wraps one raw API hash that contains (some combination of) a user and a
  # tweet, normalizes it in place, and generates the model objects within.
  # Subclasses decide where the user hash and tweet hash live in +raw+.
  class JsonUserTweetPair
    include Wuclan::Twitter::Model
    attr_accessor :raw, :moreinfo
    def initialize raw, moreinfo
      self.raw      = raw
      self.moreinfo = moreinfo
      fix_raw_user!
      fix_raw_tweet!
      # p ['new', self.class, raw, moreinfo]
    end

    # Extracted JSON should be an array
    def healthy?()
      raw && raw.is_a?(Hash) && (raw_tweet.nil? || raw_tweet.is_a?(Hash))
    end

    #
    # generate all the contained TwitterXXX objects
    #
    def each
      if is_partial?
        yield user
      else
        yield user
        yield user_profile
        yield user_style
      end
      yield tweet if tweet
    end

    #
    # Before mid-2009, most calls returned only the fields in
    # TwitterUserPartial. After a mid-2009 API update, most calls return a full
    # user record: TwitterUser, TwitterUserStyle and TwitterUserProfile
    #
    # This method tries to guess, based on the fields in the raw_user, which it has.
    #
    def is_partial?
      not raw_user.include?('friends_count')
    end

    def tweet
      Tweet.from_hash raw_tweet if raw_tweet
    end

    # create TwitterUser object from raw info
    def user
      if is_partial?
        TwitterUserPartial.from_hash raw_user
      else
        TwitterUser.from_hash raw_user
      end
    end
    def user_profile
      TwitterUserProfile.from_hash raw_user
    end
    def user_style
      TwitterUserStyle.from_hash raw_user
    end

    #
    # Standardize the raw user hash's fields for further processing with Wukong
    #
    def fix_raw_user!
      return unless raw_user
      raw_user['scraped_at'] = self.moreinfo['scraped_at']
      raw_user['created_at'] = ModelCommon.flatten_date(raw_user['created_at'])
      raw_user['id']         = ModelCommon.zeropad_id( raw_user['id'])
      raw_user['protected']  = ModelCommon.unbooleanize(raw_user['protected'])
      # FIX: original read raw['profile_background_tile'] -- wrong hash when
      # the user record is nested (JsonTweetWithUser keeps it in raw['user']).
      raw_user['profile_background_tile'] = ModelCommon.unbooleanize(raw_user['profile_background_tile']) unless raw_user['profile_background_tile'].nil?
      Wukong.encode_components raw_user, 'name', 'location', 'description', 'url'
      # There are several users with bogus screen names
      # These we need to **URL encode** -- not XML-encode.
      if raw_user['screen_name'] !~ /\A\w+\z/
        raw_user['screen_name'] = Wukong.encode_str(raw_user['screen_name'], :url)
      end
    end

    #
    # Standardize the raw tweet hash's fields for further processing with Wukong
    #
    def fix_raw_tweet!
      return unless raw_tweet
      raw_tweet['id']              = ModelCommon.zeropad_id( raw_tweet['id'])
      raw_tweet['created_at']      = ModelCommon.flatten_date(raw_tweet['created_at'])
      raw_tweet['favorited']       = ModelCommon.unbooleanize(raw_tweet['favorited'])
      raw_tweet['truncated']       = ModelCommon.unbooleanize(raw_tweet['truncated'])
      raw_tweet['twitter_user_id'] = ModelCommon.zeropad_id( raw_tweet['twitter_user_id'] )
      raw_tweet['in_reply_to_user_id']   = ModelCommon.zeropad_id( raw_tweet['in_reply_to_user_id'])   unless raw_tweet['in_reply_to_user_id'].blank?   || (raw_tweet['in_reply_to_user_id'].to_i == 0)
      raw_tweet['in_reply_to_status_id'] = ModelCommon.zeropad_id( raw_tweet['in_reply_to_status_id']) unless raw_tweet['in_reply_to_status_id'].blank? || (raw_tweet['in_reply_to_status_id'].to_i == 0)
      Wukong.encode_components raw_tweet, 'text', 'in_reply_to_screen_name'
    end
  end
end
# FIX: these subclasses were defined at the top level, after the
# Wuclan::Twitter::Scrape namespace closed -- the superclass constant
# JsonUserTweetPair is not resolvable there, so loading the file would raise
# NameError. Reopen the namespace so both the superclass lookup and the
# unqualified references from the request classes resolve.
module Wuclan::Twitter::Scrape

  # A user hash with its most recent status nested under 'status'.
  class JsonUserWithTweet < JsonUserTweetPair

    def raw_tweet
      return @raw_tweet if @raw_tweet
      @raw_tweet = raw['status']
      @raw_tweet['twitter_user_id'] = raw_user['id'] if @raw_tweet
      @raw_tweet
    end
    def raw_user
      @raw_user ||= raw
    end
  end

  # A tweet hash with its author nested under 'user'.
  class JsonTweetWithUser < JsonUserTweetPair

    def raw_tweet
      @raw_tweet ||= raw
    end
    def raw_user
      return @raw_user if @raw_user
      @raw_user = raw['user']
      @raw_user
    end
  end
end