wuclan 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
require 'wuclan/models/tweet/tweet_regexes'
|
|
2
|
+
module Wuclan::Models

  #
  # TweetToken
  #
  # A flat record with the fields word, user_id, tweet_id and freq.
  #
  # Each subclass sets +extract_re+ to the regexp that recognises its kind of
  # token (the RE_* constants are presumably defined in TweetRegexes), and
  # TweetToken.extract_tokens! uses that regexp to pull tokens out of a string.
  #
  class TweetToken < TypedStruct.new(
      [:word,     String],
      [:user_id,  Integer],
      [:tweet_id, Integer],
      [:freq,     Integer]
      )
    include ModelCommon
    include TweetRegexes
    class_inheritable_accessor :extract_re

    #
    # Builds the record, defaulting freq to 1 when a word is given without a
    # frequency.
    #
    def initialize *args
      super(*args)
      # Go through the setter: a bare `freq = 1` only creates a local variable
      # (which also shadows the reader inside the condition), so the struct
      # field would never be set.
      self.freq = 1 if freq.blank? && !word.blank?
    end

    # NOTE(review): the struct declares four fields, yet this reports five key
    # fields -- confirm what ModelCommon does with this value.
    def num_key_fields() 5 end
    def numeric_id_fields() [] ; end

    #
    # Crawl through the string, removing each token and leaving a single space
    # behind in its place.
    #
    # str is modified in place. Returns the stripped first capture group of
    # every match of extract_re, in order of appearance.
    #
    def self.extract_tokens! str
      toks = []
      str.gsub!(extract_re) { toks << $1.strip; ' ' }
      toks
    end
  end

  # Tokens matched by RE_SMILIES.
  class SmilieToken < TweetToken
    self.extract_re = RE_SMILIES
  end

  # Tokens matched by RE_URL.
  class UrlToken < TweetToken
    self.extract_re = RE_URL
  end

  # Tokens matched by RE_RETWEET; each is reported with an 'RT_@' prefix.
  class RtToken < TweetToken
    self.extract_re = RE_RETWEET
    def self.extract_tokens! str
      super.map { |tok| 'RT_@' + tok }
    end
  end

  # Tokens matched by RE_ATSIGNS; each is reported with an '@' prefix.
  class AtsignToken < TweetToken
    self.extract_re = RE_ATSIGNS
    def self.extract_tokens! str
      super.map { |tok| '@' + tok }
    end
  end

  # Tokens matched by RE_HASHTAGS; each is reported with a '#' prefix.
  class HashtagToken < TweetToken
    self.extract_re = RE_HASHTAGS
    def self.extract_tokens! str
      super.map { |tok| '#' + tok }
    end
  end

  # Plain words: whatever is left once punctuation has been stripped.
  class WordToken < TweetToken
    self.extract_re = nil

    #
    # This is pretty simpleminded.
    #
    # Returns all downcased words of three or more characters ([] when str is
    # nil).
    # * terminal 't and 's (as in "don't" and "it's") are tokenised together
    #   with their word
    # * every other punctuation mark, hyphens included, splits words
    #
    # * FIXME -- this doesn't leave str as blank, as it should to behave like
    #   the other ! methods: the caller's string is never modified.
    #
    def self.extract_tokens! str
      return [] unless str
      str = str.downcase
      # Kill off all punctuation except 's and 't:
      #   1. anything that is not a word character, apostrophe or @ becomes a space
      #   2. a terminal 's / 't is parked as !s / !t ('!' is free after step 1)
      #   3. every remaining apostrophe becomes a space
      #   4. the parked '!' is turned back into an apostrophe
      str = str.gsub(/[^\w\'@]+/, ' ').gsub(/\'([st])\b/, '!\1').gsub(/\'/, ' ').gsub(/!/, "'")
      # Busticate at whitespace
      words = str.strip.split(/\s+/)
      words.reject { |w| w.blank? || (w.length < 3) }
    end
  end

end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
module Wuclan::Twitter::Model

  #
  # Tweet
  #
  # Text and metadata for a twitter status update
  #
  # NOTE(review): the Bignum constant no longer exists on current Rubies
  # (removed in 3.2); it is left as-is because TypedStruct may key its type
  # handling on it -- confirm before replacing it with Integer.
  #
  class Tweet < TypedStruct.new(
      [:id,                      Integer ],
      [:created_at,              Bignum  ],
      [:twitter_user_id,         Integer ],
      [:favorited,               Integer ],
      [:truncated,               Integer ],
      [:in_reply_to_user_id,     Integer ],
      [:in_reply_to_status_id,   Integer ],
      [:text,                    String  ],
      [:source,                  String  ],
      [:in_reply_to_screen_name, String  ]
      )
    include ModelCommon

    #
    # The text run through wukong_decode.
    # Memoized; if you change text you have to flush
    #
    def decoded_text
      @decoded_text ||= text.wukong_decode
    end

    # Key on id
    def num_key_fields() 1 end
    def numeric_id_fields() [:id, :twitter_user_id, :in_reply_to_status_id, :in_reply_to_user_id] ; end
  end


  #
  # SearchTweet
  #
  # Text and metadata for a twitter status update pulled from the search API
  #
  class SearchTweet < TypedStruct.new(
      [:id,                       Integer ],
      [:created_at,               Bignum  ],
      [:twitter_user_id,          Integer ],
      [:favorited,                Integer ],
      [:truncated,                Integer ],
      [:in_reply_to_user_id,      Integer ],
      [:in_reply_to_status_id,    Integer ],
      [:text,                     String  ],
      [:source,                   String  ],
      [:in_reply_to_screen_name,  String  ],
      [:in_reply_to_sid,          Integer ],
      [:twitter_user_screen_name, String  ],
      [:twitter_user_sid,         Integer ],
      [:iso_language_code,        String  ]
      )
    include ModelCommon

    #
    # The text run through wukong_decode.
    # Memoized; if you change text you have to flush
    #
    def decoded_text
      @decoded_text ||= text.wukong_decode
    end

    # Short aliases: sender / recipient, each by sid and by screen name.
    def from_sid()  twitter_user_sid         end
    def from_user() twitter_user_screen_name end
    def to_sid()    in_reply_to_sid          end
    # This was a second definition of to_sid, which silently replaced the one
    # above so that to_sid returned a screen name. Named to_user to mirror
    # from_sid / from_user.
    def to_user()   in_reply_to_screen_name  end

    # Key on id
    def num_key_fields() 1 end
    def numeric_id_fields() [:id, :twitter_user_id, :in_reply_to_status_id, :in_reply_to_user_id] ; end
  end
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
# Color::RGB.from_html("fed")
|
|
4
|
+
# Color::RGB.from_html("#fed")
|
|
5
|
+
# Color::RGB.from_html("#cabbed")
|
|
6
|
+
# Color::RGB.from_html("cabbed")
|
|
7
|
+
|
|
8
|
+
# Following stolen from the 'color-tools' gem
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# File lib/color/rgb.rb, line 45
|
|
12
|
+
#
# Parse an HTML/CSS hex colour into its three channel values.
#
# html_colour is a 3- or 6-digit hex string; any '#' or ';' characters are
# dropped first ("fed", "#fed", "cabbed" and "#cabbed" are all accepted).
# A 3-digit colour doubles every digit, so "fed" reads as "ffeedd".
#
# Returns [red, green, blue] as Integers in 0..255.
# Raises ArgumentError when what remains is not exactly 3 or 6 hex digits.
#
def from_html(html_colour)
  hex = html_colour.gsub(%r{[#;]}, '')
  # Validate the digits as well as the length: scan would otherwise skip
  # non-hex characters and hand back fewer than three channels.
  unless hex =~ /\A(?:[0-9A-Fa-f]{3}|[0-9A-Fa-f]{6})\z/
    raise ArgumentError, "not a 3- or 6-digit hex colour: #{html_colour.inspect}"
  end
  if hex.size == 3
    hex.scan(%r{[0-9A-Fa-f]}).map { |digit| (digit * 2).to_i(16) }
  else
    hex.scan(%r<[0-9A-Fa-f]{2}>).map { |pair| pair.to_i(16) }
  end
end
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# File lib/color/rgb.rb, line 167
|
|
27
|
+
#
# Convert an RGB colour to hue / saturation / luminosity.
#
# The saturation formula treats 1.0 as full scale (it divides by
# 2 - max - min and compares luminosity with 0.5), so r, g and b are expected
# as fractions in 0..1 -- not the 0..255 integers that from_html returns.
#
# Returns [hue, sat, lum], with hue expressed as a fraction of a full turn.
# A grey (no spread between the channels) gets hue 0 and sat 0.
#
def to_hsl r, g, b
  min   = [ r, g, b ].min
  max   = [ r, g, b ].max
  delta = (max - min).to_f

  lum   = (max + min) / 2.0

  if delta <= 1e-5 # close to 0.0, so it's a grey
    hue = 0
    sat = 0
  else
    if (lum - 0.5) <= 1e-5
      sat = delta / (max + min).to_f
    else
      sat = delta / (2 - max - min).to_f
    end

    # Position within the sixth-of-a-turn sectors: the sector offset (0, 2
    # or 4) is added AFTER dividing the channel difference by delta. Dividing
    # the offset as well puts every non-saturated green or blue on the wrong
    # hue.
    if r == max
      hue = (g - b) / delta
    elsif g == max
      hue = 2.0 + (b - r) / delta
    else # neither r nor g is the maximum, so b is
      hue = 4.0 + (r - g) / delta
    end
    hue /= 6.0

    # wrap into 0..1
    hue += 1 if hue < 0
    hue -= 1 if hue > 1
  end
  [ hue, sat, lum ]
end
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
|
|
2
|
+
module Wuclan::Twitter::Model

  #
  # Mixin: behaviour shared by each of the user representations / partitions
  # defined below.
  #
  module TwitterUserCommon
    #
    # Records are keyed on their first field
    #
    def num_key_fields
      1
    end

    #
    # Values of the fields that can change: everything after
    # position 0 (id) and position 1 (scraped_at).
    #
    # NOTE(review): TwitterUserId and TwitterUserSearchId do not have
    # scraped_at in position 1 -- confirm this is what callers expect there.
    #
    def mutable_fields
      values = to_a
      values[2..-1]
    end

    # name run through wukong_decode ('' when there is no name); memoized.
    def decoded_name
      @decoded_name ||= if name then name.wukong_decode else '' end
    end

    # location run through wukong_decode ('' when there is none); memoized.
    def decoded_location
      @decoded_location ||= if location then location.wukong_decode else '' end
    end

    # description run through wukong_decode ('' when there is none); memoized.
    def decoded_description
      @decoded_description ||= if description then description.wukong_decode else '' end
    end
  end

  #
  # Fundamental information on a user.
  #
  class TwitterUser < TypedStruct.new(
      [:id,               Integer],
      [:scraped_at,       Bignum],
      [:screen_name,      String],
      [:protected,        Integer],
      [:followers_count,  Integer],
      [:friends_count,    Integer],
      [:statuses_count,   Integer],
      [:favourites_count, Integer],
      [:created_at,       Bignum]
      )
    include ModelCommon
    include TwitterUserCommon

    # Friendlier names for two of the API's field names
    alias_method :tweets_count,    :statuses_count
    alias_method :favorites_count, :favourites_count

    #
    # Rate info: each count (as an integer) divided by days_since_created,
    # which is not defined in this file.
    #
    def friends_per_day
      friends_count.to_i / days_since_created
    end

    def followers_per_day
      followers_count.to_i / days_since_created
    end

    def favorites_per_day
      favorites_count.to_i / days_since_created
    end

    def tweets_per_day
      tweets_count.to_i / days_since_created
    end
  end

  #
  # Outside of a users/show page, when a user is mentioned
  # only this subset of fields appear.
  #
  class TwitterUserPartial < TypedStruct.new(
      # these also appear in TwitterUser
      [:id,                Integer],
      [:scraped_at,        Bignum],
      [:screen_name,       String],
      [:protected,         Integer],
      [:followers_count,   Integer],
      # these also appear in TwitterUserProfile
      [:name,              String],
      [:url,               String],
      [:location,          String],
      [:description,       String],
      # this also appears in TwitterUserStyle
      [:profile_image_url, String]
      )
    include ModelCommon
    include TwitterUserCommon
  end

  #
  # User-set information about a user
  #
  class TwitterUserProfile < TypedStruct.new(
      [:id,          Integer],
      [:scraped_at,  Bignum],
      [:name,        String],
      [:url,         String],
      [:location,    String],
      [:description, String],
      [:time_zone,   String],
      [:utc_offset,  String]
      )
    include ModelCommon
    include TwitterUserCommon
  end

  #
  # How the user has styled their page
  #
  class TwitterUserStyle < TypedStruct.new(
      [:id,                           Integer],
      [:scraped_at,                   Bignum],
      [:profile_background_color,     String],
      [:profile_text_color,           String],
      [:profile_link_color,           String],
      [:profile_sidebar_border_color, String],
      [:profile_sidebar_fill_color,   String],
      [:profile_background_tile,      String],
      [:profile_background_image_url, String],
      [:profile_image_url,            String]
      )
    include ModelCommon
    include TwitterUserCommon
  end

  #
  # For passing around just screen_name => id mapping
  # (plus a handful of other fields)
  #
  class TwitterUserId < TypedStruct.new(
      [:id,              Integer],
      [:screen_name,     String],
      [:full,            Integer],
      [:followers_count, Integer],
      [:created_at,      Bignum],
      [:protected,       Integer],
      [:status,          String]
      )
    include ModelCommon
    include TwitterUserCommon

    # Same value as the mixin's; restated on the class itself
    def num_key_fields
      1
    end
  end

  #
  # For passing around the screen_name => sid => id mapping.
  # Note the field order: screen_name comes first here.
  #
  class TwitterUserSearchId < TypedStruct.new(
      [:screen_name, String],
      [:sid,         Integer],
      [:id,          Integer]
      )
    include ModelCommon
    include TwitterUserCommon

    # Same value as the mixin's; restated on the class itself
    def num_key_fields
      1
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module Wuclan
  module Twitter
    #
    # Namespace for the twitter model classes. Nothing is loaded up front:
    # each constant below is registered for autoload, so its file is only
    # required the first time the constant is referenced.
    #
    module Model
      # [ file to require, constants that file defines ]
      [
        ['wuclan/twitter/model/base',
          [:ModelCommon]],
        ['wuclan/twitter/model/twitter_user',
          [:TwitterUser, :TwitterUserPartial, :TwitterUserProfile,
           :TwitterUserStyle, :TwitterUserSearchId, :TwitterUserId]],
        ['wuclan/twitter/model/tweet',
          [:Tweet, :SearchTweet]],
        ['wuclan/twitter/model/relationship',
          [:AFollowsB, :AFavoritesB, :ARepliesB,
           :AAtsignsB, :AAtsignsBId, :ARepliesBName]],
      ].each do |path, const_names|
        const_names.each { |const_name| autoload const_name, path }
      end
    end
  end
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
module Wuclan
  module JsonModel

    # ===========================================================================
    #
    # friends_ids or followers_ids is an array of user_id's; each one is
    # turned into an AFollowsB relationship.
    #
    # NOTE(review): contents, context and user_a_id are not defined by the
    # GenericJsonParser shown in this gem (which exposes raw) -- confirm
    # where they come from.
    #
    class FfIdsParser < GenericJsonParser

      # Extracted JSON should be an array
      def healthy?
        contents && contents.is_a?(Array)
      end

      #
      # Yields one AFollowsB per listed user id, with the id zero-padded to
      # ten digits:
      # * 'followers_ids' -- the listed user follows user_a
      # * 'friends_ids'   -- user_a follows the listed user
      # Any other context yields nothing.
      #
      def each &block
        contents.each do |listed_id|
          padded_id = '%010d' % listed_id.to_i
          kind      = context.to_s
          if    kind == 'followers_ids'
            yield AFollowsB.new(padded_id, user_a_id)
          elsif kind == 'friends_ids'
            yield AFollowsB.new(user_a_id, padded_id)
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module Wuclan
  module JsonModel
    # ===========================================================================
    #
    # Friends or Followers is a flat list of users => tweets
    # (for favorites, a flat list of tweets => users)
    #
    class FriendsFollowersParser < GenericJsonParser
      attr_accessor :scraped_at, :context, :owning_user_id

      #
      # raw            -- the decoded JSON (should be an Array)
      # context        -- stored as a symbol; :followers, :friends and
      #                   :favorites are the values handled below
      # scraped_at     -- passed on to each parsed user record
      # owning_user_id -- the user whose list this is
      # Any further arguments are ignored.
      #
      def initialize raw, context, scraped_at, owning_user_id, *ignore
        super raw
        self.context        = context.to_sym
        self.scraped_at     = scraped_at
        self.owning_user_id = owning_user_id
      end

      # Extracted JSON should be an array
      def healthy?() raw && raw.is_a?(Array) end

      #
      # Build the relationship between the owning user and +user+:
      # * :followers -- user follows the owner
      # * :friends   -- the owner follows user
      # * :favorites -- the owner favorites user's tweet
      #
      # Returns nil when there is no user to relate to (#each hands over a nil
      # user when a favorites tweet carries no embedded user record); calling
      # user.id on nil would otherwise raise, or on very old rubies return
      # nil's object id.
      #
      # Raises RuntimeError for any other context.
      #
      def generate_relationship user, tweet
        case context
        when :followers then user && AFollowsB.new(  user.id,        owning_user_id)
        when :friends   then user && AFollowsB.new(  owning_user_id, user.id)
        when :favorites then user && AFavoritesB.new(owning_user_id, user.id, (tweet ? tweet.id : nil))
        else raise "Can't make a relationship out of #{context}. Perhaps better communication is the key."
        end
      end

      #
      # Enumerate over users (each having one tweet)
      #
      # For every healthy entry, yields whichever of these exist, in order:
      # the partial user, the tweet, and the relationship to the owning user.
      #
      def each &block
        raw.each do |hsh|
          case context
          when :favorites then parsed = JsonTweet.new(      hsh, nil)
          else                 parsed = JsonTwitterUser.new(hsh, scraped_at)
          end
          next unless parsed && parsed.healthy?
          user_b = parsed.generate_user_partial
          tweet  = parsed.generate_tweet
          [ user_b,
            tweet,
            generate_relationship(user_b, tweet)
          ].compact.each do |obj|
            yield obj
          end
        end
      end
    end

  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
module Wuclan
  module JsonModel
    #
    # Base class for the JSON parsers: wraps the decoded JSON in +raw+.
    #
    class GenericJsonParser
      attr_accessor :raw

      # raw -- the already-decoded JSON object
      def initialize raw
        self.raw = raw
      end

      # By default, the decoded JSON should be a hash
      def healthy?() raw && raw.is_a?(Hash) end

      #
      # Safely parse the json object and instantiate with the raw hash
      #
      # json_str -- the JSON text
      # args     -- passed through to the constructor after the decoded object
      #
      # Returns the new parser, or nil when json_str is nil, cannot be parsed
      # (the error is reported with warn), or decodes to nil / false.
      #
      def self.new_from_json json_str, *args
        return unless json_str
        begin
          # NOTE(review): JSON.load is documented as intended for trusted
          # input, and this text comes from a scrape -- consider JSON.parse.
          raw = JSON.load(json_str)
        rescue StandardError => e
          # Parse failures are StandardErrors. Rescuing Exception here would
          # also swallow Interrupt and SystemExit and keep a job that should
          # be stopping alive.
          warn e
          return
        end
        return unless raw
        new raw, *args
      end
    end

  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
require 'wukong/encoding'
|
|
2
|
+
module Wuclan
  module JsonModel
    #
    # The JSON tweets records come off the wire a bit more heavyweight than we'd like.
    #
    # A sample JSON file, reformatted for clarity:
    #
    #
    #     {
    #       "id"                      : 1012519767,
    #       "created_at"              : "Wed Nov 19 07:16:58 +0000 2008",
    #       // twitter_user_id
    #       "favorited"               : false,
    #       "truncated"               : false,
    #       "in_reply_to_user_id"     : null,
    #       "in_reply_to_status_id"   : null,
    #       "text"                    : "[Our lander (RIP) had the best name. The next rover to Mars, @MarsScienceLab, needs a name. A contest for kids: http:\/\/is.gd\/85rQ ]",
    #       "source"                  : "web"
    #     }
    #
    class JsonTweet < GenericJsonParser
      attr_accessor :raw

      #
      # raw             -- the decoded tweet hash. It is modified in place.
      # twitter_user_id -- the author's id; when omitted it is taken from the
      #                    embedded 'user' record, if there is one.
      #
      # Nothing is touched when raw is not healthy.
      #
      def initialize raw, twitter_user_id = nil
        self.raw = raw; return unless healthy?
        if twitter_user_id
          raw['twitter_user_id'] = twitter_user_id
        elsif raw['user'] && raw['user']['id']
          raw['twitter_user_id'] = raw['user']['id']
        end
        self.fix_raw!
      end

      # Extracted JSON should be a hash
      def healthy?() raw && raw.is_a?(Hash) end

      #
      #
      # Make the data easier for batch flat-record processing
      #
      def fix_raw!
        raw['id']              = ModelCommon.zeropad_id(  raw['id'])
        raw['created_at']      = ModelCommon.flatten_date(raw['created_at'])
        raw['favorited']       = ModelCommon.unbooleanize(raw['favorited'])
        raw['truncated']       = ModelCommon.unbooleanize(raw['truncated'])
        raw['twitter_user_id'] = ModelCommon.zeropad_id(  raw['twitter_user_id'])
        # The reply ids are only padded when present and non-zero
        %w[in_reply_to_user_id in_reply_to_status_id].each do |key|
          next if raw[key].blank? || (raw[key].to_i == 0)
          raw[key] = ModelCommon.zeropad_id(raw[key])
        end
        Wukong.encode_components raw, 'text'
      end

      #
      # produce the Tweet for this record (nil when raw is not healthy)
      #
      def generate_tweet
        return unless healthy?
        Tweet.from_hash(raw)
      end

      #
      # produce the partial user record for the tweet's embedded author, using
      # the tweet's created_at as that user's scraped_at.
      #
      # Returns nil when raw is not healthy -- the same guard generate_tweet
      # has, so an unhealthy record never raises here -- or when the tweet
      # carries no 'user' record.
      #
      def generate_user_partial
        return unless healthy?
        raw_user = raw['user'] or return
        JsonTwitterUser.new(raw_user, raw['created_at']).generate_user_partial
      end
    end
  end
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
require 'wukong/encoding'
|
|
2
|
+
module Wuclan
  module JsonModel

    #
    # The JSON user records come off the wire a bit more heavyweight than we'd like.
    #
    # We vertically partition the single user record into three (TwitterUser,
    # TwitterUserProfile and TwitterUserStyle): one with the fundamental info,
    # one with user's personal info (name, location, etc) and one with the
    # styling they've applied to their homepage.
    #
    # A sample JSON file, reformatted for clarity:
    #
    #    {
    #      "id"                           : 14693823,
    #      // scraped_at added in processing
    #      "screen_name"                  : "MarsPhoenix",
    #      "protected"                    : false,
    #      "followers_count"              : 39452,
    #      "friends_count"                : 3,
    #      "statuses_count"               : 609,
    #      "favourites_count"             : 5,
    #      "created_at"                   : "Thu May 08 00:17:54 +0000 2008",
    #
    #      // "id"                        : 14693823,
    #      // scraped_at added in processing
    #      "name"                         : "MarsPhoenix",
    #      "url"                          : "http:\/\/tinyurl.com\/5wwaru",
    #      "location"                     : "Mars, Solar System",
    #      "description"                  : "I dig Mars! ",
    #      "time_zone"                    : "Pacific Time (US & Canada)",
    #      "utc_offset"                   : -28800,
    #
    #      // "id"                        : 14693823,
    #      // scraped_at added in processing
    #      "profile_background_color"     : "9ae4e8",
    #      "profile_text_color"           : "000000",
    #      "profile_link_color"           : "0000ff",
    #      "profile_sidebar_border_color" : "87bc44",
    #      "profile_sidebar_fill_color"   : "e0ff92",
    #      "profile_background_tile"      : true,
    #      "profile_image_url"            : "http:\/\/s3.amazonaws.com\/twitter_production\/profile_images\/55133915\/PIA09942_normal.jpg",
    #      "profile_background_image_url" : "http:\/\/s3.amazonaws.com\/twitter_production\/profile_background_images\/3069906\/PSP_008591_2485_RGB_Lander_Detail_516-387.jpg",
    #
    #      // Sometimes:
    #      "status" : { ... a tweet record: see tweet.tsv ... }
    #
    #    }
    #
    class JsonTwitterUser
      attr_accessor :raw

      #
      # raw        -- the decoded user hash. It is modified in place.
      # scraped_at -- stored into the hash under 'scraped_at'
      #
      # Nothing is touched when raw is not healthy.
      #
      def initialize raw, scraped_at
        self.raw = raw; return unless healthy?
        self.raw['scraped_at'] = scraped_at
        self.fix_raw!
      end

      # Extracted JSON should be a hash
      def healthy?() raw && raw.is_a?(Hash) end

      # user id from the raw hash
      def twitter_user_id
        raw['id']
      end

      #
      # Make the data easier for batch flat-record processing
      #
      def fix_raw!
        raw['created_at'] = ModelCommon.flatten_date(raw['created_at'])
        raw['id']         = ModelCommon.zeropad_id(  raw['id'])
        raw['protected']  = ModelCommon.unbooleanize(raw['protected'])
        Wukong.encode_components raw, 'name', 'location', 'description', 'url'
        # There are several users with bogus screen names
        # These we need to **URL encode** -- not XML-encode.
        if raw['screen_name'] !~ /\A\w+\z/
          raw['screen_name'] = Wukong.encode_str(raw['screen_name'], :url)
        end
      end

      #
      #
      # Expand a user .json record into model instances: one instance of each
      # class given, in the same order, each built with klass.from_hash(raw).
      # Returns [] when raw is not healthy.
      #
      # Ex.
      #   json_user = JsonTwitterUser.new(raw_hash, scraped_at)
      #
      #   # Parse a complete twitter users/show/foo.json record
      #   twitter_user, twitter_user_profile, twitter_user_style =
      #     json_user.generate_user_classes TwitterUser, TwitterUserProfile, TwitterUserStyle
      #
      #   # just get the id and screen_name
      #   json_user.generate_user_classes TwitterUserId
      #
      def generate_user_classes *klasses
        return [] unless healthy?
        klasses.map do |klass|
          klass.from_hash(raw)
        end
      end

      #
      # Create TwitterUser, TwitterUserProfile, and TwitterUserStyle
      # instances from this hash
      #
      def generate_user_profile_and_style
        generate_user_classes TwitterUser, TwitterUserProfile, TwitterUserStyle
      end

      #
      # Create TwitterUserPartial from this hash -- use this when you only have a
      # partial listing, for instance in the public timeline or another user's
      # followers list
      #
      def generate_user_partial
        generate_user_classes(TwitterUserPartial).first
      end

      #
      # produce the included last tweet
      #
      # Returns nil when raw is not healthy -- like generate_user_classes, an
      # unhealthy record yields nothing rather than raising -- or when the
      # record carries no 'status'.
      #
      def generate_tweet
        return unless healthy?
        raw_tweet = raw['status']
        JsonTweet.new(raw_tweet, twitter_user_id).generate_tweet if raw_tweet
      end
    end

  end
end
|