wuclan 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/LICENSE.textile +20 -0
  2. data/README.textile +28 -0
  3. data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
  4. data/examples/analyze/strong_links/main.rb +51 -0
  5. data/examples/analyze/word_count/dump_schema.rb +13 -0
  6. data/examples/analyze/word_count/freq_user.rb +31 -0
  7. data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
  8. data/examples/analyze/word_count/word_count.pig +43 -0
  9. data/examples/analyze/word_count/word_count.rb +34 -0
  10. data/examples/lastfm/scrape/load_lastfm.rb +31 -0
  11. data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
  12. data/examples/lastfm/scrape/seed.tsv +147 -0
  13. data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
  14. data/examples/twitter/old/scrape_twitter_api.rb +104 -0
  15. data/examples/twitter/old/scrape_twitter_search.rb +57 -0
  16. data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
  17. data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
  18. data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
  19. data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
  20. data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
  21. data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
  22. data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
  23. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
  24. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
  25. data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
  26. data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
  27. data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
  28. data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
  29. data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
  30. data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
  31. data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
  32. data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
  33. data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
  34. data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
  35. data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
  36. data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
  37. data/lib/old/twitter_api.rb +88 -0
  38. data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
  39. data/lib/wuclan/delicious/delicious_models.rb +26 -0
  40. data/lib/wuclan/delicious/delicious_request.rb +65 -0
  41. data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
  42. data/lib/wuclan/friendster.rb +7 -0
  43. data/lib/wuclan/lastfm/model/base.rb +49 -0
  44. data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
  45. data/lib/wuclan/lastfm/scrape/base.rb +195 -0
  46. data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
  47. data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
  48. data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
  49. data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
  50. data/lib/wuclan/lastfm/scrape.rb +12 -0
  51. data/lib/wuclan/lastfm.rb +7 -0
  52. data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
  53. data/lib/wuclan/metrics/user_metrics.rb +443 -0
  54. data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
  55. data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
  56. data/lib/wuclan/metrics.rb +0 -0
  57. data/lib/wuclan/myspace.rb +21 -0
  58. data/lib/wuclan/open_social/model/base.rb +0 -0
  59. data/lib/wuclan/open_social/scrape/base.rb +111 -0
  60. data/lib/wuclan/open_social/scrape_request.rb +6 -0
  61. data/lib/wuclan/open_social.rb +0 -0
  62. data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
  63. data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
  64. data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
  65. data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
  66. data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
  67. data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
  68. data/lib/wuclan/twitter/api_response_examples.textile +300 -0
  69. data/lib/wuclan/twitter/model/base.rb +72 -0
  70. data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
  71. data/lib/wuclan/twitter/model/relationship.rb +176 -0
  72. data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
  73. data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
  74. data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
  75. data/lib/wuclan/twitter/model/text_element.rb +38 -0
  76. data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
  77. data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
  78. data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
  79. data/lib/wuclan/twitter/model/tweet.rb +74 -0
  80. data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
  81. data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
  82. data/lib/wuclan/twitter/model.rb +21 -0
  83. data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
  84. data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
  85. data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
  86. data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
  87. data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
  88. data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
  89. data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
  90. data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
  91. data/lib/wuclan/twitter/scrape/base.rb +97 -0
  92. data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
  93. data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
  94. data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
  95. data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
  96. data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
  97. data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
  98. data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
  99. data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
  100. data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
  101. data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
  102. data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
  103. data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
  104. data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
  105. data/lib/wuclan/twitter/scrape.rb +27 -0
  106. data/lib/wuclan/twitter.rb +7 -0
  107. data/lib/wuclan.rb +1 -0
  108. data/spec/spec_helper.rb +9 -0
  109. data/spec/wuclan_spec.rb +7 -0
  110. data/wuclan.gemspec +184 -0
  111. metadata +219 -0
@@ -0,0 +1,79 @@
1
+ require 'wuclan/models/tweet/tweet_regexes'
2
+ module Wuclan::Models
3
+
4
#
# TweetToken
#
# Flat record for one token extracted from tweet text. Members are
# word, user_id, tweet_id and freq.
#
# Each subclass sets +extract_re+ to the regexp that recognizes its tokens;
# the regexp must expose the interesting part as capture group 1.
#
class TweetToken < TypedStruct.new(
    [:word,     String],
    [:user_id,  Integer],
    [:tweet_id, Integer],
    [:freq,     Integer]
    )
  include ModelCommon
  include TweetRegexes
  class_inheritable_accessor :extract_re

  #
  # Build the record, defaulting +freq+ to 1 when a word is present but no
  # count was supplied.
  #
  def initialize *args
    super(*args)
    # Go through the accessor: a bare `freq = 1` would only create a local
    # variable and leave the struct member unset.
    self.freq = 1 if self.freq.blank? && (! word.blank?)
  end

  # NOTE(review): 5 is larger than the four members declared above -- confirm.
  def num_key_fields() 5 end
  def numeric_id_fields() [] ; end

  #
  # Crawl through the string, removing each token and leaving a space behind.
  #
  # +str+ is modified in place: every match of +extract_re+ is replaced by a
  # single space. Returns an array holding capture group 1 of each match,
  # stripped of surrounding whitespace.
  #
  def self.extract_tokens! str
    toks = []
    str.gsub!(extract_re){|_match| toks << $1.strip ; ' ' }
    toks
  end
end
30
+
31
# Token class whose extraction pattern is RE_SMILIES.
class SmilieToken < TweetToken
  self.extract_re = RE_SMILIES
end
34
# Token class whose extraction pattern is RE_URL.
class UrlToken < TweetToken
  self.extract_re = RE_URL
end
37
# Token class whose extraction pattern is RE_RETWEET.
class RtToken < TweetToken
  self.extract_re = RE_RETWEET

  # Same extraction as the parent class (+str+ is modified in place), with
  # 'RT_@' prepended to every token returned.
  def self.extract_tokens! str
    names = super
    names.map { |name| 'RT_@' + name }
  end
end
43
# Token class whose extraction pattern is RE_ATSIGNS.
class AtsignToken < TweetToken
  self.extract_re = RE_ATSIGNS

  # Same extraction as the parent class (+str+ is modified in place), with
  # '@' prepended to every token returned.
  def self.extract_tokens! str
    names = super
    names.map { |name| '@' + name }
  end
end
49
# Token class whose extraction pattern is RE_HASHTAGS.
class HashtagToken < TweetToken
  self.extract_re = RE_HASHTAGS

  # Same extraction as the parent class (+str+ is modified in place), with
  # '#' prepended to every token returned.
  def self.extract_tokens! str
    tags = super
    tags.map { |tag| '#' + tag }
  end
end
55
# Plain-word tokens. There is no extraction regexp; the string is split on
# whitespace after punctuation has been normalized.
class WordToken < TweetToken
  self.extract_re = nil

  #
  # Simple-minded word splitter.
  #
  # Returns every lowercased word of three or more characters.
  # * a trailing 't or 's (as in "don't" and "it's") stays attached to its word
  # * every other run of non-word characters, apart from '@', separates words,
  #   so hyphenated words are split
  #
  # Returns [] when +str+ is nil.
  #
  # * FIXME -- unlike the other ! methods this does not modify the caller's
  #   string; it works on a downcased copy.
  #
  def self.extract_tokens! str
    return [] unless str
    lowered = str.downcase
    # Anything that is not a word character, apostrophe or '@' becomes a space.
    spaced = lowered.gsub(/[^\w\'@]+/, ' ')
    # Park the apostrophe of a trailing 's / 't as '!', drop every other
    # apostrophe, then restore the parked ones.
    parked   = spaced.gsub(/\'([st])\b/, '!\1')
    stripped = parked.gsub(/\'/, ' ')
    restored = stripped.gsub(/!/, "'")
    # Busticate at whitespace
    candidates = restored.strip.split(/\s+/)
    candidates.reject { |candidate| candidate.blank? || (candidate.length < 3) }
  end
end
78
+
79
+ end
@@ -0,0 +1,74 @@
1
+ module Wuclan::Twitter::Model
2
+
3
+ #
4
+ # Tweet
5
+ #
6
+ # Text and metadata for a twitter status update
7
+ #
8
class Tweet < TypedStruct.new(
    [:id,                      Integer],
    [:created_at,              Bignum],
    [:twitter_user_id,         Integer],
    [:favorited,               Integer],
    [:truncated,               Integer],
    [:in_reply_to_user_id,     Integer],
    [:in_reply_to_status_id,   Integer],
    [:text,                    String],
    [:source,                  String],
    [:in_reply_to_screen_name, String]
    )
  include ModelCommon

  #
  # +text+ run through wukong_decode. The result is cached in @decoded_text
  # and is NOT refreshed when +text+ is reassigned.
  #
  def decoded_text
    return @decoded_text if @decoded_text
    @decoded_text = text.wukong_decode
  end

  # Key on id
  def num_key_fields
    1
  end

  # Names of the members listed as numeric ids.
  def numeric_id_fields
    [:id, :twitter_user_id, :in_reply_to_status_id, :in_reply_to_user_id]
  end
end
33
+
34
+
35
+ #
36
+ # SearchTweet
37
+ #
38
+ # Text and metadata for a twitter status update pulled from the search API
39
+ #
40
class SearchTweet < TypedStruct.new(
    [:id,                       Integer],
    [:created_at,               Bignum],
    [:twitter_user_id,          Integer],
    [:favorited,                Integer],
    [:truncated,                Integer],
    [:in_reply_to_user_id,      Integer],
    [:in_reply_to_status_id,    Integer],
    [:text,                     String],
    [:source,                   String],
    [:in_reply_to_screen_name,  String],
    [:in_reply_to_sid,          Integer],
    [:twitter_user_screen_name, String],
    [:twitter_user_sid,         Integer],
    [:iso_language_code,        String]
    )
  include ModelCommon

  #
  # Memoized; if you change text you have to flush
  #
  def decoded_text
    @decoded_text ||= text.wukong_decode
  end

  # Short aliases for the sender / recipient members.
  def from_sid()  twitter_user_sid         end
  def from_user() twitter_user_screen_name end
  def to_sid()    in_reply_to_sid          end
  # This was a second definition of to_sid, which replaced the one above and
  # made to_sid return the screen name. Named to_user to parallel from_user.
  def to_user()   in_reply_to_screen_name  end

  # Key on id
  def num_key_fields() 1 end
  def numeric_id_fields() [:id, :twitter_user_id, :in_reply_to_status_id, :in_reply_to_user_id] ; end
end
74
+ end
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Color::RGB.from_html("fed")
4
+ # Color::RGB.from_html("#fed")
5
+ # Color::RGB.from_html("#cabbed")
6
+ # Color::RGB.from_html("cabbed")
7
+
8
+ # Following stolen from the 'color-tools' gem
9
+
10
+
11
+ # File lib/color/rgb.rb, line 45
12
#
# Parse an HTML colour string into its three channel values.
#
# Accepts 3- or 6-digit hex, with any '#' or ';' characters ignored:
#
#   from_html("fed")      # => [255, 238, 221]
#   from_html("#cabbed")  # => [202, 187, 237]
#
# Returns [red, green, blue] as Integers in 0..255. A 3-digit colour has
# each digit doubled ("f" => "ff").
#
# Raises ArgumentError unless exactly 3 or 6 hex digits remain after the
# '#' / ';' characters are removed. Non-hex characters used to be dropped
# silently, which produced an array with fewer than three channels.
#
def from_html(html_colour)
  hex = html_colour.gsub(%r{[#;]}, '')
  unless hex =~ /\A(?:[0-9A-Fa-f]{3}|[0-9A-Fa-f]{6})\z/
    raise ArgumentError, "expected 3 or 6 hex digits, got #{html_colour.inspect}"
  end
  if hex.size == 3
    hex.scan(%r{[0-9A-Fa-f]}).map { |digit| (digit * 2).to_i(16) }
  else
    hex.scan(%r<[0-9A-Fa-f]{2}>).map { |pair| pair.to_i(16) }
  end
end
24
+
25
+
26
+ # File lib/color/rgb.rb, line 167
27
#
# Convert red / green / blue channel values to [hue, saturation, luminosity].
#
# When the channels are (nearly) equal the colour is a grey and hue and
# saturation are both 0. Otherwise hue is expressed as a fraction of a full
# turn, wrapped into 0..1.
#
# NOTE(review): the `2 - max - min` term suggests channels are expected as
# fractions in 0.0..1.0, whereas from_html returns integers in 0..255 --
# confirm that callers rescale before calling this.
#
def to_hsl r, g, b
  channels = [ r, g, b ]
  lo = channels.min
  hi = channels.max
  spread = (hi - lo).to_f

  lum = (hi + lo) / 2.0

  # close to 0.0, so it's a grey
  return [ 0, 0, lum ] if spread <= 1e-5

  sat_base = ((lum - 0.5) <= 1e-5) ? (hi + lo) : (2 - hi - lo)
  sat = spread / sat_base.to_f

  # Which channel is the maximum selects the sextant offset (0, 2 or 4).
  hue =
    if r == hi
      (g - b) / spread.to_f
    elsif g == hi
      (2.0 + b - r) / spread.to_f
    elsif (b - hi) <= 1e-5
      (4.0 + r - g) / spread.to_f
    end
  hue /= 6.0

  hue += 1 if hue < 0
  hue -= 1 if hue > 1

  [ hue, sat, lum ]
end
@@ -0,0 +1,145 @@
1
+
2
+ module Wuclan::Twitter::Model
3
+
4
+ #
5
+ # Mixin: common methods for each of the user representations / partitions
6
+ #
7
module TwitterUserCommon
  #
  # Key on id
  #
  def num_key_fields
    1
  end

  #
  # Fields that can change value:
  # everything but member 0 (id) and member 1 (scraped_at)
  #
  def mutable_fields
    to_a.slice(2..-1)
  end

  #
  # The decoded_* readers return the member run through wukong_decode, or ''
  # when the member is nil/false. Each result is cached in an instance
  # variable and is not refreshed if the member is later reassigned.
  #
  def decoded_name
    return @decoded_name if @decoded_name
    @decoded_name = name ? name.wukong_decode : ''
  end

  def decoded_location
    return @decoded_location if @decoded_location
    @decoded_location = location ? location.wukong_decode : ''
  end

  def decoded_description
    return @decoded_description if @decoded_description
    @decoded_description = description ? description.wukong_decode : ''
  end
end
32
+
33
+ #
34
+ # Fundamental information on a user.
35
+ #
36
class TwitterUser < TypedStruct.new(
    [:id,               Integer],
    [:scraped_at,       Bignum],
    [:screen_name,      String],
    [:protected,        Integer],
    [:followers_count,  Integer],
    [:friends_count,    Integer],
    [:statuses_count,   Integer],
    [:favourites_count, Integer],
    [:created_at,       Bignum]
    )
  include ModelCommon
  include TwitterUserCommon

  # Alternate reader names for two of the counters.
  alias tweets_count    statuses_count
  alias favorites_count favourites_count

  #
  # Rate info: each counter (coerced with to_i) divided by days_since_created.
  #
  # NOTE(review): days_since_created is not defined in this class --
  # presumably it comes from ModelCommon; confirm, and confirm whether it can
  # be 0 or an Integer (which would make these integer divisions).
  #
  def friends_per_day
    friends_count.to_i / days_since_created
  end

  def followers_per_day
    followers_count.to_i / days_since_created
  end

  def favorites_per_day
    favorites_count.to_i / days_since_created
  end

  def tweets_per_day
    tweets_count.to_i / days_since_created
  end
end
59
+
60
+ #
61
+ # Outside of a users/show page, when a user is mentioned
62
+ # only this subset of fields appear.
63
+ #
64
class TwitterUserPartial < TypedStruct.new(
    # these also appear in TwitterUser
    [:id,                Integer],
    [:scraped_at,        Bignum],
    [:screen_name,       String],
    [:protected,         Integer],
    [:followers_count,   Integer],
    # these also appear in TwitterUserProfile
    [:name,              String],
    [:url,               String],
    [:location,          String],
    [:description,       String],
    # this also appears in TwitterUserStyle
    [:profile_image_url, String]
    )
  include ModelCommon
  include TwitterUserCommon
end
79
+
80
+ #
81
+ # User-set information about a user
82
+ #
83
class TwitterUserProfile < TypedStruct.new(
    [:id,          Integer],
    [:scraped_at,  Bignum],
    [:name,        String],
    [:url,         String],
    [:location,    String],
    [:description, String],
    [:time_zone,   String],
    [:utc_offset,  String]
    )
  include ModelCommon
  include TwitterUserCommon
end
96
+
97
+ #
98
+ # How the user has styled their page
99
+ #
100
class TwitterUserStyle < TypedStruct.new(
    [:id,                           Integer],
    [:scraped_at,                   Bignum],
    [:profile_background_color,     String],
    [:profile_text_color,           String],
    [:profile_link_color,           String],
    [:profile_sidebar_border_color, String],
    [:profile_sidebar_fill_color,   String],
    [:profile_background_tile,      String],
    [:profile_background_image_url, String],
    [:profile_image_url,            String]
    )
  include ModelCommon
  include TwitterUserCommon
end
115
+
116
+ #
117
+ # For passing around just screen_name => id mapping
118
+ #
119
class TwitterUserId < TypedStruct.new(
    [:id,              Integer],
    [:screen_name,     String],
    [:full,            Integer],
    [:followers_count, Integer],
    [:created_at,      Bignum],
    [:protected,       Integer],
    [:status,          String]
    )
  include ModelCommon
  include TwitterUserCommon

  # Key on id (restates the value TwitterUserCommon already provides).
  def num_key_fields
    1
  end
end
132
+
133
+ #
134
+ # For passing around just screen_name => id mapping
135
+ #
136
class TwitterUserSearchId < TypedStruct.new(
    [:screen_name, String],
    [:sid,         Integer],
    [:id,          Integer]
    )
  include ModelCommon
  include TwitterUserCommon

  # One key field; note the first member of this struct is screen_name.
  def num_key_fields
    1
  end
end
145
+ end
@@ -0,0 +1,21 @@
1
module Wuclan
  module Twitter
    #
    # Registers an autoload for every model constant, grouped here by the
    # file that defines it.
    #
    module Model
      [
        ['wuclan/twitter/model/base',
          [:ModelCommon]],
        ['wuclan/twitter/model/twitter_user',
          [:TwitterUser, :TwitterUserPartial, :TwitterUserProfile,
           :TwitterUserStyle, :TwitterUserSearchId, :TwitterUserId]],
        ['wuclan/twitter/model/tweet',
          [:Tweet, :SearchTweet]],
        ['wuclan/twitter/model/relationship',
          [:AFollowsB, :AFavoritesB, :ARepliesB, :AAtsignsB, :AAtsignsBId, :ARepliesBName]],
      ].each do |path, const_names|
        const_names.each { |const_name| autoload const_name, path }
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ module Wuclan
2
+ module JsonModel
3
+
4
+ # ===========================================================================
5
+ #
6
+ # A friends_ids or followers_ids response is a flat array of user ids
7
+ #
8
+ #
9
class FfIdsParser < GenericJsonParser
  # NOTE(review): +contents+, +context+ and +user_a_id+ are not defined by
  # the GenericJsonParser shown in this package (it only exposes +raw+) --
  # confirm where they come from before relying on this class.

  # friends_ids or followers_ids is an array of user_id's
  def healthy?
    contents && contents.is_a?(Array)
  end

  #
  # Yield one AFollowsB per id in +contents+, with the id zero-padded to ten
  # digits. For 'followers_ids' the listed user follows user_a_id; for
  # 'friends_ids' user_a_id follows the listed user. Any other context
  # yields nothing.
  #
  def each &block
    contents.each do |listed_id|
      padded_id = "%010d" % listed_id.to_i
      relation  = context.to_s
      if relation == 'followers_ids'
        yield AFollowsB.new(padded_id, user_a_id)
      elsif relation == 'friends_ids'
        yield AFollowsB.new(user_a_id, padded_id)
      end
    end
  end
end
26
+ end
27
+ end
@@ -0,0 +1,52 @@
1
+ module Wuclan
2
+ module JsonModel
3
+ # ===========================================================================
4
+ #
5
+ # Friends or Followers is a flat list of users => tweets
6
+ #
7
+ #
8
class FriendsFollowersParser < GenericJsonParser
  attr_accessor :scraped_at, :context, :owning_user_id

  #
  # raw::            the parsed JSON (expected to be an Array of hashes)
  # context::        :followers, :friends or :favorites (anything with to_sym)
  # scraped_at::     passed to JsonTwitterUser for each user record
  # owning_user_id:: id of the user whose list this is
  #
  # Any further arguments are ignored.
  #
  def initialize raw, context, scraped_at, owning_user_id, *ignore
    super raw
    self.context        = context.to_sym
    self.scraped_at     = scraped_at
    self.owning_user_id = owning_user_id
  end

  # Extracted JSON should be an array
  def healthy?() raw && raw.is_a?(Array) end

  #
  # Build the relationship between the owning user and +user+ implied by the
  # context.
  #
  # Returns nil when +user+ is nil (a favorites tweet with no embedded user
  # record); this used to raise NoMethodError on +user.id+.
  # Raises RuntimeError for a context other than the three handled here.
  #
  def generate_relationship user, tweet
    unless [:followers, :friends, :favorites].include?(context)
      raise "Can't make a relationship out of #{context}. Perhaps better communication is the key."
    end
    return unless user
    case context
    when :followers then AFollowsB.new(  user.id,        owning_user_id)
    when :friends   then AFollowsB.new(  owning_user_id, user.id)
    when :favorites then AFavoritesB.new(owning_user_id, user.id, (tweet ? tweet.id : nil))
    end
  end

  #
  # Enumerate over users (each having one tweet)
  #
  # For every healthy record, yields in turn whichever of the user partial,
  # the tweet and the relationship could be generated.
  #
  def each &block
    raw.each do |hsh|
      case context
      when :favorites then parsed = JsonTweet.new(      hsh, nil)
      else                 parsed = JsonTwitterUser.new(hsh, scraped_at)
      end
      next unless parsed && parsed.healthy?
      user_b = parsed.generate_user_partial
      tweet  = parsed.generate_tweet
      [ user_b,
        tweet,
        generate_relationship(user_b, tweet)
      ].compact.each do |obj|
        yield obj
      end
    end
  end
end
50
+
51
+ end
52
+ end
@@ -0,0 +1,26 @@
1
+ module Wuclan
2
+ module JsonModel
3
#
# Base class for the JSON record parsers: holds the parsed JSON in +raw+.
#
class GenericJsonParser
  attr_accessor :raw

  def initialize raw
    self.raw = raw
  end

  # Truthy only when +raw+ is a Hash.
  def healthy?() raw && raw.is_a?(Hash) end

  #
  # Safely parse the json object and instantiate with the raw hash
  #
  # json_str:: the JSON text; nil returns nil immediately
  # args::     passed through to +new+ after the parsed value
  #
  # Returns a new instance, or nil when +json_str+ is nil, when parsing
  # raises (the error is reported with +warn+), or when the parsed value is
  # nil/false.
  #
  # Only StandardError is rescued, so Interrupt, SystemExit and the like
  # propagate instead of being swallowed.
  #
  # NOTE(review): JSON.load is being fed scraped data; JSON.parse is the
  # stricter entry point -- consider switching once callers are checked.
  #
  def self.new_from_json json_str, *args
    return unless json_str
    begin
      raw = JSON.load(json_str)
    rescue StandardError => e
      warn e
      return
    end
    return unless raw
    new raw, *args
  end
end
24
+
25
+ end
26
+ end
@@ -0,0 +1,63 @@
1
+ require 'wukong/encoding'
2
+ module Wuclan
3
+ module JsonModel
4
+ #
5
+ # The JSON tweets records come off the wire a bit more heavyweight than we'd like.
6
+ #
7
+ # A sample JSON file, reformatted for clarity:
8
+ #
9
+ #
10
+ # {
11
+ # "id" : 1012519767,
12
+ # "created_at" : "Wed Nov 19 07:16:58 +0000 2008",
13
+ # // twitter_user_id
14
+ # "favorited" : false,
15
+ # "truncated" : false,
16
+ # "in_reply_to_user_id" : null,
17
+ # "in_reply_to_status_id" : null,
18
+ # "text" : "[Our lander (RIP) had the best name. The next rover to Mars, @MarsScienceLab, needs a name. A contest for kids: http:\/\/is.gd\/85rQ ]"
19
+ # "source" : "web",
20
+ # }
21
+ #
22
class JsonTweet < GenericJsonParser
  attr_accessor :raw

  #
  # raw::             the tweet hash as parsed from JSON
  # twitter_user_id:: id of the tweeting user; when omitted, the id of the
  #                   embedded 'user' record is used if there is one
  #
  # When +raw+ is not a Hash the instance is left unhealthy and untouched.
  #
  def initialize raw, twitter_user_id = nil
    self.raw = raw; return unless healthy?
    if twitter_user_id
      raw['twitter_user_id'] = twitter_user_id
    elsif raw['user'] && raw['user']['id']
      raw['twitter_user_id'] = raw['user']['id']
    end
    self.fix_raw!
  end
  def healthy?() raw && raw.is_a?(Hash) end

  #
  #
  # Make the data easier for batch flat-record processing
  #
  # Rewrites +raw+ in place. The in_reply_to_* ids are only zero-padded when
  # present and non-zero.
  #
  def fix_raw!
    raw['id']                    = ModelCommon.zeropad_id(  raw['id'])
    raw['created_at']            = ModelCommon.flatten_date(raw['created_at'])
    raw['favorited']             = ModelCommon.unbooleanize(raw['favorited'])
    raw['truncated']             = ModelCommon.unbooleanize(raw['truncated'])
    raw['twitter_user_id']       = ModelCommon.zeropad_id(  raw['twitter_user_id'] )
    raw['in_reply_to_user_id']   = ModelCommon.zeropad_id(  raw['in_reply_to_user_id'])   unless raw['in_reply_to_user_id'].blank?   || (raw['in_reply_to_user_id'].to_i   == 0)
    raw['in_reply_to_status_id'] = ModelCommon.zeropad_id(  raw['in_reply_to_status_id']) unless raw['in_reply_to_status_id'].blank? || (raw['in_reply_to_status_id'].to_i == 0)
    Wukong.encode_components raw, 'text'
  end

  # Tweet built from the fixed-up hash, or nil when unhealthy.
  def generate_tweet
    return unless healthy?
    Tweet.from_hash(raw)
  end

  #
  # produce the included user, as a TwitterUserPartial
  #
  # The tweet's (already flattened) created_at is passed as the user's
  # scraped_at. Returns nil when unhealthy or when there is no embedded
  # 'user' record; an unhealthy instance used to raise here on raw['user'].
  #
  def generate_user_partial
    return unless healthy?
    raw_user = raw['user']
    return unless raw_user
    JsonTwitterUser.new(raw_user, raw['created_at']).generate_user_partial
  end
end
62
+ end
63
+ end
@@ -0,0 +1,122 @@
1
+ require 'wukong/encoding'
2
+ module Wuclan
3
+ module JsonModel
4
+
5
+ #
6
+ # The JSON user records come off the wire a bit more heavyweight than we'd like.
7
+ #
8
+ # We vertically partition the single user record into three, as described above:
9
+ # one with the fundamental info, one with user's personal info (name, location,
10
+ # etc) and one with the styling they've applied to their homepage.
11
+ #
12
+ # A sample JSON file, reformatted for clarity:
13
+ #
14
+ # {
15
+ # "id" : 14693823,
16
+ # // scraped_at added in processing
17
+ # "screen_name" : "MarsPhoenix"
18
+ # "protected" : false,
19
+ # "followers_count" : 39452,
20
+ # "friends_count" : 3,
21
+ # "statuses_count" : 609,
22
+ # "favourites_count" : 5,
23
+ # "created_at" : "Thu May 08 00:17:54 +0000 2008",
24
+ #
25
+ # // "id" : 14693823,
26
+ # // scraped_at added in processing
27
+ # "name" : "MarsPhoenix",
28
+ # "url" : "http:\/\/tinyurl.com\/5wwaru",
29
+ # "location" : "Mars, Solar System",
30
+ # "description" : "I dig Mars! ",
31
+ # "time_zone" : "Pacific Time (US & Canada)",
32
+ # "utc_offset" : -28800,
33
+ #
34
+ # // "id" : 14693823,
35
+ # // scraped_at added in processing
36
+ # "profile_background_color" : "9ae4e8",
37
+ # "profile_text_color" : "000000",
38
+ # "profile_link_color" : "0000ff",
39
+ # "profile_sidebar_border_color" : "87bc44",
40
+ # "profile_sidebar_fill_color" : "e0ff92",
41
+ # "profile_background_tile" : true,
42
+ # "profile_image_url" : "http:\/\/s3.amazonaws.com\/twitter_production\/profile_images\/55133915\/PIA09942_normal.jpg",
43
+ # "profile_background_image_url" : "http:\/\/s3.amazonaws.com\/twitter_production\/profile_background_images\/3069906\/PSP_008591_2485_RGB_Lander_Detail_516-387.jpg",
44
+ #
45
+ # // Sometimes:
46
+ # "status" : { ... a tweet record: see tweet.tsv ... }
47
+ #
48
+ # }
49
+ #
50
class JsonTwitterUser
  attr_accessor :raw

  #
  # raw::        the user hash as parsed from JSON
  # scraped_at:: stored into the hash under 'scraped_at'
  #
  # When +raw+ is not a Hash the instance is left unhealthy and untouched.
  #
  def initialize raw, scraped_at
    self.raw = raw; return unless healthy?
    self.raw['scraped_at'] = scraped_at
    self.fix_raw!
  end
  def healthy?() raw && raw.is_a?(Hash) end

  # user id from the raw hash
  def twitter_user_id
    raw['id']
  end

  #
  # Make the data easier for batch flat-record processing
  #
  # Rewrites +raw+ in place.
  #
  def fix_raw!
    raw['created_at'] = ModelCommon.flatten_date(raw['created_at'])
    raw['id']         = ModelCommon.zeropad_id(  raw['id'])
    raw['protected']  = ModelCommon.unbooleanize(raw['protected'])
    Wukong.encode_components raw, 'name', 'location', 'description', 'url'
    # There are several users with bogus screen names
    # These we need to **URL encode** -- not XML-encode.
    if raw['screen_name'] !~ /\A\w+\z/
      raw['screen_name'] = Wukong.encode_str(raw['screen_name'], :url)
    end
  end

  #
  #
  # Expand a user .json record into model instances
  #
  # Ex.
  #   json_user = JsonTwitterUser.new(raw_hash, scraped_at)
  #
  #   # Parse a complete twitter users/show/foo.json record
  #   twitter_user, twitter_user_profile, twitter_user_style =
  #     json_user.generate_user_classes TwitterUser, TwitterUserProfile, TwitterUserStyle
  #
  #   # just get the id and screen_name
  #   json_user.generate_user_classes TwitterUserId
  #
  # Returns one instance per class given (via klass.from_hash), or [] when
  # unhealthy.
  #
  def generate_user_classes *klasses
    return [] unless healthy?
    klasses.map do |klass|
      klass.from_hash(raw)
    end
  end

  #
  # Create TwitterUser, TwitterUserProfile, and TwitterUserStyle
  # instances from this hash
  #
  def generate_user_profile_and_style
    generate_user_classes TwitterUser, TwitterUserProfile, TwitterUserStyle
  end

  #
  # Create TwitterUserPartial from this hash -- use this when you only have a
  # partial listing, for instance in the public timeline or another user's
  # followers list
  #
  def generate_user_partial
    generate_user_classes(TwitterUserPartial).first
  end

  #
  # produce the included last tweet
  #
  # Returns nil when unhealthy or when the record carries no 'status'; an
  # unhealthy instance used to raise here on raw['status'], unlike
  # generate_user_classes.
  #
  def generate_tweet
    return unless healthy?
    raw_tweet = raw['status']
    return unless raw_tweet
    JsonTweet.new(raw_tweet, twitter_user_id).generate_tweet
  end
end
120
+
121
+ end
122
+ end