wuclan 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'set'

#
# Streams requests against the main twitter API: for each identifier drawn
# from the request store, issues a TwitterUserRequest first and then one
# request for every follow-on request class enabled via options[:fetches].
#
class TwitterRequestStream < Monkeyshines::RequestStream::SimpleRequestStream
  DEFAULT_REQUEST_SCOPE = Wuclan::Twitter::Scrape
  TwitterRequestStream::DEFAULT_OPTIONS = { :klass => TwitterUserRequest, }

  def initialize _options={}
    super _options
    self.request_klasses = options[:fetches]
  end

  # Set the list of follow-on requests from a name list such as
  # 'followers_ids,friends_ids'.
  def request_klasses= klass_names
    klasses = FactoryModule.list_of_classes(DEFAULT_REQUEST_SCOPE, klass_names, 'twitter', 'request')
    @request_klasses = klasses.to_set
    # the user request is always issued first by each_request, so never repeat it
    @request_klasses.delete TwitterUserRequest
  end

  # Get the user and then get all other requested classes.
  # The user's parameters (followers_count, etc.) fix the items to request.
  # The user's numeric ID replaces the supplied identifier (the first request
  # can be a screen_name, but we need the numeric ID for followers_request's, etc.)
  def each_request twitter_user_id, *args
    user_req = TwitterUserRequest.new(twitter_user_id)
    yield user_req
    return unless user_req.healthy?
    # prefer the numeric id from the response over the supplied identifier
    numeric_id = user_req.parsed_contents['id'].to_i
    twitter_user_id = numeric_id if numeric_id > 0
    @request_klasses.each do |follow_on_klass|
      follow_on = follow_on_klass.new(twitter_user_id)
      follow_on.set_total_items user_req.parsed_contents
      yield follow_on
    end
  end

  #
  # for the given user_id,
  # gets the user
  # and then each of the requests in @request_klasses
  #
  def each *args, &block
    request_store.each do |*raw_job_args|
      each_request(*raw_job_args, &block)
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
|
2
|
+
#
# Fake fetcher for the twitter search API: fabricates deterministic-looking
# JSON result pages so the scrape pipeline can be exercised offline.
#
class TwitterSearchFakeFetcher < Monkeyshines::Fetcher::FakeFetcher
  cattr_accessor :items_rate

  # Synthetic timestamp for the item_on_page'th item, counting backwards from
  # base (default 86_400, i.e. one day in seconds) at items_rate items/sec.
  def self.fake_time item_on_page, base=nil
    base ||= 86_400
    base - (item_on_page.to_f / items_rate)
  end

  # Build the fake parsed-response hash for the given request.
  # Special query terms '_no_results' and '_one_result' force those outcomes;
  # anything else yields a full page of 100 items.
  def fake_contents req
    max_time = self.class.fake_time((req.page - 1) * 105)
    max_id   = max_time.to_i
    n_results =
      case req.query_term
      when '_no_results' then return { :max_id => -1, :results => [],}
      when '_one_result' then 1
      else                    100
      end
    { :max_id => max_id,
      # :next_page => "?page=2&max_id=#{max_id}&rpp=100&q=#{req.query_term}",
      :results => (0 ... n_results).map do |i|
        { :text       => "%s-%04d-%03d"%[req.query_term, req.page, i],
          :created_at => Time.now - (86_400 - self.class.fake_time(i, max_time)),
          :id         => (self.class.fake_time(i, max_id)*100).to_i }
      end }
  end

  # Run the parent fetcher's bookkeeping, then stuff faked JSON into the
  # request and hand it back.
  def get req
    super req
    req.contents = fake_contents(req).to_json
    req
  end
end
|
34
|
+
|
35
|
+
# TwitterSearchRequestStream.class_eval do
|
36
|
+
# def do_faking job
|
37
|
+
# TwitterSearchFakeFetcher.items_rate = (1 / job.scheduling.delay) || 1
|
38
|
+
# # job.scheduling.prev_max = (TwitterSearchFakeFetcher.fake_time(rand(15) * 105)*100).to_i
|
39
|
+
# p [
|
40
|
+
# job.scheduling.prev_max,
|
41
|
+
# TwitterSearchFakeFetcher.fake_time(0).to_i
|
42
|
+
# ]
|
43
|
+
# end
|
44
|
+
# end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#
|
2
|
+
# #
|
3
|
+
# #
|
4
|
+
# # query terms must be URL-encoded
|
5
|
+
# # (use '+' for space; %23 # %27 ' etc)
|
6
|
+
# #
|
7
|
+
# def initialize *args
|
8
|
+
# super *args
|
9
|
+
# raise "Query term missing" if self.query_term.blank?
|
10
|
+
# self[:query_term].strip!
|
11
|
+
# [:priority, :prev_items, :prev_span_min, :prev_span_max].each{|attr| self[attr] = self[attr].to_i if self[attr] }
|
12
|
+
# self[:prev_rate] = self[:prev_rate].to_f
|
13
|
+
# self[:priority] = DEFAULT_PRIORITY if (self[:priority] == 0)
|
14
|
+
# self[:prev_rate] = nil if (self[:prev_rate] < 1e-6)
|
15
|
+
# end
|
16
|
+
#
|
17
|
+
# class TwitterSearchStream < Monkeyshines::RequestStream::SimpleRequestStream
|
18
|
+
# #
|
19
|
+
# # for the given user_id,
|
20
|
+
# # gets the user
|
21
|
+
# # and then each of the requests in more_request_klasses
|
22
|
+
# #
|
23
|
+
# def each *args, &block
|
24
|
+
# request_store.each do |*raw_job_args|
|
25
|
+
# job = klass.new(*raw_job_args)
|
26
|
+
# # do_faking(job)
|
27
|
+
# job.each_request(*args, &block)
|
28
|
+
# end
|
29
|
+
# end
|
30
|
+
# end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#
# A scheduled, paginated scrape of one twitter search query term.
#
class TwitterSearchJob < Edamame::Job
  #
  # Pagination
  #
  include Monkeyshines::ScrapeRequestCore::Paginating
  include Monkeyshines::ScrapeRequestCore::PaginatedWithLimit
  include Monkeyshines::ScrapeRequestCore::PaginatedWithRate

  # API max pages
  self.hard_request_limit = 15

  # Items to get each re-visit. If there are up to 50 items per page,
  # target_items_per_job of 1000 will try to reschedule so that its return
  # visit makes about twenty page requests.
  #
  # For Twitter, 1500 is the max, so 1000 gives a safety margin.
  self.target_items_per_job = 1000

  # creates the paginated request for the given page number
  def request_for_page page, info=nil
    search_req = TwitterSearchRequest.new(obj[:key], page)
    search_req.url << "&rpp=#{search_req.max_items}"
    # only fetch back to the newest item seen on the previous visit
    search_req.url << "&max_id=#{sess_span.min - 1}" if sess_span.min
    search_req
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module Wuclan
  module Twitter
    module Scrape
      #
      # ScrapeRequest for the twitter Search API.
      #
      # Examines the parsed contents to describe result
      #
      class TwitterSearchRequest < Monkeyshines::ScrapeRequest
        # Contents are JSON
        include Monkeyshines::RawJsonContents
        # Pagination
        include Monkeyshines::ScrapeRequestCore::Paginated
        # API max items per response
        self.max_items = 100
        # API max pages
        self.hard_request_limit = 15

        # Accepts either a bare query term or a full search URL; a URL has
        # its q= parameter extracted and used as the identifier.
        def initialize *args
          if args.first =~ %r{\Ahttp://.*q=([^&]+)&}
            super $1, nil, {}, *args
          else
            super *args
          end
        end

        # Base request URL for this query term (pagination parameters are
        # appended by the job; see TwitterSearchJob#request_for_page).
        def make_url
          "http://search.twitter.com/search.json?q=#{query_term}"
        end

        def query_term
          identifier
        end
        def key
          identifier
        end

        # Checks that the response parses and has the right data structure.
        # if healthy? is true things should generally work
        #
        # FIX: returns a strict boolean. The old `items && items.is_a?(Array)`
        # evaluated to nil (not false) whenever items was nil.
        def healthy?
          items.is_a?(Array)
        end

        #
        # Rescheduling
        #

        # Extract the actual search items returned
        def items
          parsed_contents['results'] if parsed_contents
        end
        # Span of IDs. Assumes the response has the ids in sort order oldest to newest
        # (which the twitter API provides). nil when the response is absent/malformed.
        def span
          [items.last['id'], items.first['id']] rescue nil
        end
        # Span of created_at times covered by this request.
        # Useful for rate estimation. nil when the response is absent/malformed.
        def timespan
          [Time.parse(items.last['created_at']).utc, Time.parse(items.first['created_at']).utc] rescue nil
        end

      end
    end
  end
end
|
70
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#
|
2
|
+
# TwitterSearchJob for the twitter Search API
|
3
|
+
#
|
4
|
+
# * Manages a series of paginated requests from first result back to last item in
|
5
|
+
# previous scrape scrape_job.
|
6
|
+
#
|
7
|
+
#
|
8
|
+
#
# Job stream for the twitter Search API.
#
# * Manages a series of paginated requests from first result back to last
#   item in previous scrape scrape_job.
#
class TwitterSearchRequestStream < Monkeyshines::RequestStream::EdamameQueue
  # priority for search jobs if not otherwise given
  QUEUE_PRIORITY = 65536

  # Pull jobs off the queue; emit every paginated request for each job.
  def each *args, &block
    work(queue_request_timeout, TwitterSearchJob) do |search_job|
      # do_faking(qjob)
      search_job.each_request(&block)
    end
  end
end
|
19
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module Wuclan
  module Twitter
    module Scrape

      # Shared behavior for timeline scrapes: the API response is a JSON
      # array of tweet-with-user hashes.
      class TimelineRequest < Wuclan::Twitter::Scrape::Base

        # Extracted JSON should be an array
        def healthy?
          parsed_contents && parsed_contents.is_a?(Array)
        end

        #
        # unpacks the raw API response, yielding all the interesting objects
        # and relationships within.
        #
        def parse *args, &block
          return unless healthy?
          parsed_contents.each do |tweet_hash|
            json_obj = JsonTweetWithUser.new(tweet_hash, 'scraped_at' => scraped_at)
            next unless json_obj && json_obj.healthy?
            # Extract user, tweet and relationship
            json_obj.each(&block)
          end
        end
      end

      #
      # API request for a user's status timeline.
      # Maximum 16 pages, 200 a pop.
      #
      # Produces up to 200 Tweets.
      #
      # http://apiwiki.twitter.com/Twitter-REST-API-Method%3A-statuses-user_timeline
      #
      class TwitterUserTimelineRequest < Wuclan::Twitter::Scrape::TimelineRequest
        self.resource_path      = 'statuses/user_timeline'
        self.hard_request_limit = 16
        self.max_items          = 200
        # NOTE(review): assumes the user object responds to #status_count;
        # the twitter API attribute elsewhere is 'statuses_count' -- confirm.
        def items_count(thing) thing.status_count end

        # Url from properties
        def make_url
          "http://twitter.com/#{resource_path}/#{twitter_user_id}.json?page=#{page}&count=#{max_items}"
        end

        # set max_total_items from the statuses_count.
        def set_total_items twitter_user_info
          self.max_total_items = twitter_user_info['statuses_count'].to_i rescue nil
        end
      end

      #
      # API request for public timeline
      #
      # Not available any more after May 2009 -- use Hosebird
      #
      class TwitterPublicTimelineRequest < Wuclan::Twitter::Scrape::TimelineRequest
        self.resource_path      = 'statuses/public_timeline'
        self.hard_request_limit = 1
        self.max_items          = 600
        def items_count(thing) 1 end
        def make_url() "http://twitter.com/#{resource_path}.json" end
      end

      # class HosebirdRequest < Wuclan::Twitter::Scrape::Base
      #   #self.resource_path = 'statuses/public_timeline'
      # end
    end

  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module Wuclan
  module Twitter
    module Scrape

      #
      # API request for a user profile.
      #
      # Produces a TwitterUser,Profile,Style
      #
      # http://apiwiki.twitter.com/Twitter-REST-API-Method%3A-users%C2%A0show
      #
      class TwitterUserRequest < Wuclan::Twitter::Scrape::Base
        self.resource_path = 'users/show'
        self.hard_request_limit = 1
        self.max_items = 1
        def items_count(thing) 1 end

        # Extracted JSON should be a single user_with_tweet hash
        def healthy?
          parsed_contents && parsed_contents.is_a?(Hash)
        end

        # Generate request URL
        def make_url
          "http://twitter.com/#{resource_path}/#{twitter_user_id}.json"
        end

        def key
          twitter_user_id
        end

        #
        # unpacks the raw API response, yielding all the interesting objects
        # and relationships within.
        #
        def parse *args, &block
          return unless healthy?
          json_obj = JsonUserWithTweet.new(parsed_contents, 'scraped_at' => scraped_at)
          # BUGFIX: was `next`, which is invalid directly inside a method body
          # (it only belongs in a block or loop) and blew up at runtime when
          # the extracted object was missing or unhealthy. `return` is the
          # early-exit actually intended here.
          return unless json_obj && json_obj.healthy?
          # Extract user and tweet
          json_obj.each(&block)
        end

      end
    end
  end
end
|
49
|
+
|
50
|
+
# language: http://en.wikipedia.org/wiki/ISO_639-1
|
51
|
+
#
|
52
|
+
# * Find tweets containing a word: http://search.twitter.com/search.atom?q=twitter
|
53
|
+
# * Find tweets from a user: http://search.twitter.com/search.atom?q=from%3Aalexiskold
|
54
|
+
# * Find tweets to a user: http://search.twitter.com/search.atom?q=to%3Atechcrunch
|
55
|
+
# * Find tweets referencing a user: http://search.twitter.com/search.atom?q=%40mashable
|
56
|
+
# * Find tweets containing a hashtag: http://search.twitter.com/search.atom?q=%23haiku
|
57
|
+
# * Combine any of the operators together: http://search.twitter.com/search.atom?q=movie+%3A%29
|
58
|
+
#
|
59
|
+
# * lang: restricts tweets to the given language, given by an ISO 639-1 code. Ex: http://search.twitter.com/search.atom?lang=en&q=devo
|
60
|
+
# * rpp: the number of tweets to return per page, up to a max of 100. Ex: http://search.twitter.com/search.atom?lang=en&q=devo&rpp=15
|
61
|
+
# * page: the page number (starting at 1) to return, up to a max of roughly 1500 results (based on rpp * page)
|
62
|
+
# * since_id: returns tweets with status ids greater than the given id.
|
63
|
+
# * geocode: returns tweets by users located within a given radius of the given latitude/longitude, where the user's location is taken from their Twitter profile. The parameter value is specified by "latitude,longitude,radius", where radius units must be specified as either "mi" (miles) or "km" (kilometers). Ex: http://search.twitter.com/search.atom?geocode=40.757929%2C-73.985506%2C25km. Note that you cannot use the near operator via the API to geocode arbitrary locations; however you can use this geocode parameter to search near geocodes directly.
|
64
|
+
# * show_user: when "true", adds "<user>:" to the beginning of the tweet. This is useful for readers that do not display Atom's author field. The default is "false".
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Autoload manifest for the twitter scrape layer: each constant is loaded
# lazily, on first reference, from the file named alongside it.
module Wuclan
  module Twitter
    module Scrape
      # Search API
      autoload :TwitterSearchRequest, 'wuclan/twitter/scrape/twitter_search_request'
      # NOTE(review): TwitterSearchJob is also autoloaded at top level below.
      # The file appears to define the class at top level, so this namespaced
      # autoload may raise NameError when triggered -- confirm before relying
      # on Wuclan::Twitter::Scrape::TwitterSearchJob.
      autoload :TwitterSearchJob, 'wuclan/twitter/scrape/twitter_search_job'
      # Main API
      autoload :Base, 'wuclan/twitter/scrape/base'
      autoload :TwitterUserRequest, 'wuclan/twitter/scrape/twitter_user_request'
      # Followers / friends / favorites all live in twitter_followers_request.rb
      autoload :TwitterFollowersRequest, 'wuclan/twitter/scrape/twitter_followers_request'
      autoload :TwitterFriendsRequest, 'wuclan/twitter/scrape/twitter_followers_request'
      autoload :TwitterFavoritesRequest, 'wuclan/twitter/scrape/twitter_followers_request'
      # Bulk id-list requests share twitter_ff_ids_request.rb
      autoload :TwitterFollowersIdsRequest, 'wuclan/twitter/scrape/twitter_ff_ids_request'
      autoload :TwitterFriendsIdsRequest, 'wuclan/twitter/scrape/twitter_ff_ids_request'
      # Timeline requests share twitter_timeline_request.rb
      autoload :TwitterUserTimelineRequest, 'wuclan/twitter/scrape/twitter_timeline_request'
      autoload :TwitterPublicTimelineRequest, 'wuclan/twitter/scrape/twitter_timeline_request'
      # JSON response wrappers
      autoload :JsonUserWithTweet, 'wuclan/twitter/scrape/twitter_json_response'
      autoload :JsonTweetWithUser, 'wuclan/twitter/scrape/twitter_json_response'

    end
  end
end
# These classes are defined at top level (not under Wuclan::Twitter::Scrape),
# so their autoloads are registered on Object.
autoload :TwitterRequestStream, 'wuclan/twitter/scrape/twitter_request_stream'
autoload :TwitterFakeFetcher, 'wuclan/twitter/scrape/twitter_fake_fetcher'
autoload :TwitterSearchRequestStream, 'wuclan/twitter/scrape/twitter_search_request_stream'
autoload :TwitterSearchFakeFetcher, 'wuclan/twitter/scrape/twitter_search_fake_fetcher'
autoload :TwitterSearchJob, 'wuclan/twitter/scrape/twitter_search_job'
|
data/lib/wuclan.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
|
data/spec/spec_helper.rb
ADDED
data/spec/wuclan_spec.rb
ADDED
data/wuclan.gemspec
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
# Generated by jeweler
# DO NOT EDIT THIS FILE
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
# -*- encoding: utf-8 -*-
#
# FIX: removed the duplicated "lib/wuclan/twitter/api_response_examples.textile"
# entry from s.files (it was listed twice). Everything else is verbatim.

Gem::Specification.new do |s|
  s.name = %q{wuclan}
  s.version = "0.2.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Philip (flip) Kromer"]
  s.date = %q{2009-10-12}
  s.description = %q{Massive-scale social network analysis. Nothing to f with.}
  s.email = %q{flip@infochimps.org}
  s.extra_rdoc_files = [
    "LICENSE.textile",
    "README.textile"
  ]
  s.files = [
    "LICENSE.textile",
    "README.textile",
    "examples/analyze/strong_links/gen_multi_edge.rb",
    "examples/analyze/strong_links/main.rb",
    "examples/analyze/word_count/dump_schema.rb",
    "examples/analyze/word_count/freq_user.rb",
    "examples/analyze/word_count/freq_whole_corpus.rb",
    "examples/analyze/word_count/word_count.pig",
    "examples/analyze/word_count/word_count.rb",
    "examples/lastfm/scrape/load_lastfm.rb",
    "examples/lastfm/scrape/scrape_lastfm.rb",
    "examples/lastfm/scrape/seed.tsv",
    "examples/twitter/old/load_twitter_search_jobs.rb",
    "examples/twitter/old/scrape_twitter_api.rb",
    "examples/twitter/old/scrape_twitter_search.rb",
    "examples/twitter/old/scrape_twitter_trending.rb",
    "examples/twitter/parse/parse_twitter_requests.rb",
    "examples/twitter/parse/parse_twitter_search_requests.rb",
    "examples/twitter/scrape_twitter_api/scrape_twitter_api.rb",
    "examples/twitter/scrape_twitter_api/seed.tsv",
    "examples/twitter/scrape_twitter_api/start_cache_twitter.sh",
    "examples/twitter/scrape_twitter_api/support/make_request_stats.rb",
    "examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb",
    "examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig",
    "examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv",
    "examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv",
    "examples/twitter/scrape_twitter_hosebird/edamame-killall",
    "examples/twitter/scrape_twitter_hosebird/foo.rb",
    "examples/twitter/scrape_twitter_hosebird/ps_emulation.rb",
    "examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb",
    "examples/twitter/scrape_twitter_hosebird/test_spewer.rb",
    "examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml",
    "examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb",
    "examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb",
    "examples/twitter/scrape_twitter_search/scrape_twitter_search.rb",
    "examples/twitter/scrape_twitter_search/twitter_search_daemons.god",
    "lib/old/twitter_api.rb",
    "lib/wuclan.rb",
    "lib/wuclan/delicious/delicious_html_request.rb",
    "lib/wuclan/delicious/delicious_models.rb",
    "lib/wuclan/delicious/delicious_request.rb",
    "lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb",
    "lib/wuclan/friendster.rb",
    "lib/wuclan/lastfm.rb",
    "lib/wuclan/lastfm/model/base.rb",
    "lib/wuclan/lastfm/model/sample_responses.txt",
    "lib/wuclan/lastfm/scrape.rb",
    "lib/wuclan/lastfm/scrape/base.rb",
    "lib/wuclan/lastfm/scrape/concrete.rb",
    "lib/wuclan/lastfm/scrape/lastfm_job.rb",
    "lib/wuclan/lastfm/scrape/lastfm_request_stream.rb",
    "lib/wuclan/lastfm/scrape/recursive_requests.rb",
    "lib/wuclan/metrics.rb",
    "lib/wuclan/metrics/user_graph_metrics.rb",
    "lib/wuclan/metrics/user_metrics.rb",
    "lib/wuclan/metrics/user_metrics_basic.rb",
    "lib/wuclan/metrics/user_scraping_metrics.rb",
    "lib/wuclan/myspace.rb",
    "lib/wuclan/open_social.rb",
    "lib/wuclan/open_social/model/base.rb",
    "lib/wuclan/open_social/scrape/base.rb",
    "lib/wuclan/open_social/scrape_request.rb",
    "lib/wuclan/rdf_output/relationship_rdf.rb",
    "lib/wuclan/rdf_output/text_element_rdf.rb",
    "lib/wuclan/rdf_output/tweet_rdf.rb",
    "lib/wuclan/rdf_output/twitter_rdf.rb",
    "lib/wuclan/rdf_output/twitter_user_rdf.rb",
    "lib/wuclan/shorturl/shorturl_request.rb",
    "lib/wuclan/twitter.rb",
    "lib/wuclan/twitter/api_response_examples.textile",
    "lib/wuclan/twitter/model.rb",
    "lib/wuclan/twitter/model/base.rb",
    "lib/wuclan/twitter/model/multi_edge.rb",
    "lib/wuclan/twitter/model/relationship.rb",
    "lib/wuclan/twitter/model/text_element.rb",
    "lib/wuclan/twitter/model/text_element/extract_info_tests.rb",
    "lib/wuclan/twitter/model/text_element/grok_tweets.rb",
    "lib/wuclan/twitter/model/text_element/more_regexes.rb",
    "lib/wuclan/twitter/model/tweet.rb",
    "lib/wuclan/twitter/model/tweet/tokenize.rb",
    "lib/wuclan/twitter/model/tweet/tweet_regexes.rb",
    "lib/wuclan/twitter/model/tweet/tweet_token.rb",
    "lib/wuclan/twitter/model/twitter_user.rb",
    "lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb",
    "lib/wuclan/twitter/parse/ff_ids_parser.rb",
    "lib/wuclan/twitter/parse/friends_followers_parser.rb",
    "lib/wuclan/twitter/parse/generic_json_parser.rb",
    "lib/wuclan/twitter/parse/json_tweet.rb",
    "lib/wuclan/twitter/parse/json_twitter_user.rb",
    "lib/wuclan/twitter/parse/public_timeline_parser.rb",
    "lib/wuclan/twitter/parse/twitter_search_parse.rb",
    "lib/wuclan/twitter/parse/user_parser.rb",
    "lib/wuclan/twitter/scrape.rb",
    "lib/wuclan/twitter/scrape/base.rb",
    "lib/wuclan/twitter/scrape/old_skool_request_classes.rb",
    "lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb",
    "lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb",
    "lib/wuclan/twitter/scrape/twitter_followers_request.rb",
    "lib/wuclan/twitter/scrape/twitter_json_response.rb",
    "lib/wuclan/twitter/scrape/twitter_request_stream.rb",
    "lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb",
    "lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb",
    "lib/wuclan/twitter/scrape/twitter_search_job.rb",
    "lib/wuclan/twitter/scrape/twitter_search_request.rb",
    "lib/wuclan/twitter/scrape/twitter_search_request_stream.rb",
    "lib/wuclan/twitter/scrape/twitter_timeline_request.rb",
    "lib/wuclan/twitter/scrape/twitter_user_request.rb",
    "spec/spec_helper.rb",
    "spec/wuclan_spec.rb",
    "wuclan.gemspec"
  ]
  s.homepage = %q{http://github.com/mrflip/wuclan}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Massive-scale social network analysis. Nothing to f with.}
  s.test_files = [
    "spec/spec_helper.rb",
    "spec/wuclan_spec.rb",
    "examples/analyze/strong_links/gen_multi_edge.rb",
    "examples/analyze/strong_links/main.rb",
    "examples/analyze/word_count/dump_schema.rb",
    "examples/analyze/word_count/freq_user.rb",
    "examples/analyze/word_count/freq_whole_corpus.rb",
    "examples/analyze/word_count/word_count.rb",
    "examples/lastfm/scrape/load_lastfm.rb",
    "examples/lastfm/scrape/scrape_lastfm.rb",
    "examples/twitter/old/load_twitter_search_jobs.rb",
    "examples/twitter/old/scrape_twitter_api.rb",
    "examples/twitter/old/scrape_twitter_search.rb",
    "examples/twitter/old/scrape_twitter_trending.rb",
    "examples/twitter/parse/parse_twitter_requests.rb",
    "examples/twitter/parse/parse_twitter_search_requests.rb",
    "examples/twitter/scrape_twitter_api/scrape_twitter_api.rb",
    "examples/twitter/scrape_twitter_api/support/make_request_stats.rb",
    "examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb",
    "examples/twitter/scrape_twitter_hosebird/foo.rb",
    "examples/twitter/scrape_twitter_hosebird/ps_emulation.rb",
    "examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb",
    "examples/twitter/scrape_twitter_hosebird/test_spewer.rb",
    "examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb",
    "examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb",
    "examples/twitter/scrape_twitter_search/scrape_twitter_search.rb"
  ]

  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3

    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<wukong>, [">= 0"])
      s.add_runtime_dependency(%q<monkeyshines>, [">= 0"])
      s.add_runtime_dependency(%q<edamame>, [">= 0"])
    else
      s.add_dependency(%q<wukong>, [">= 0"])
      s.add_dependency(%q<monkeyshines>, [">= 0"])
      s.add_dependency(%q<edamame>, [">= 0"])
    end
  else
    s.add_dependency(%q<wukong>, [">= 0"])
    s.add_dependency(%q<monkeyshines>, [">= 0"])
    s.add_dependency(%q<edamame>, [">= 0"])
  end
end
|