wuclan 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/LICENSE.textile +20 -0
  2. data/README.textile +28 -0
  3. data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
  4. data/examples/analyze/strong_links/main.rb +51 -0
  5. data/examples/analyze/word_count/dump_schema.rb +13 -0
  6. data/examples/analyze/word_count/freq_user.rb +31 -0
  7. data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
  8. data/examples/analyze/word_count/word_count.pig +43 -0
  9. data/examples/analyze/word_count/word_count.rb +34 -0
  10. data/examples/lastfm/scrape/load_lastfm.rb +31 -0
  11. data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
  12. data/examples/lastfm/scrape/seed.tsv +147 -0
  13. data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
  14. data/examples/twitter/old/scrape_twitter_api.rb +104 -0
  15. data/examples/twitter/old/scrape_twitter_search.rb +57 -0
  16. data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
  17. data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
  18. data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
  19. data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
  20. data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
  21. data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
  22. data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
  23. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
  24. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
  25. data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
  26. data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
  27. data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
  28. data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
  29. data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
  30. data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
  31. data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
  32. data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
  33. data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
  34. data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
  35. data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
  36. data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
  37. data/lib/old/twitter_api.rb +88 -0
  38. data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
  39. data/lib/wuclan/delicious/delicious_models.rb +26 -0
  40. data/lib/wuclan/delicious/delicious_request.rb +65 -0
  41. data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
  42. data/lib/wuclan/friendster.rb +7 -0
  43. data/lib/wuclan/lastfm/model/base.rb +49 -0
  44. data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
  45. data/lib/wuclan/lastfm/scrape/base.rb +195 -0
  46. data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
  47. data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
  48. data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
  49. data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
  50. data/lib/wuclan/lastfm/scrape.rb +12 -0
  51. data/lib/wuclan/lastfm.rb +7 -0
  52. data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
  53. data/lib/wuclan/metrics/user_metrics.rb +443 -0
  54. data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
  55. data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
  56. data/lib/wuclan/metrics.rb +0 -0
  57. data/lib/wuclan/myspace.rb +21 -0
  58. data/lib/wuclan/open_social/model/base.rb +0 -0
  59. data/lib/wuclan/open_social/scrape/base.rb +111 -0
  60. data/lib/wuclan/open_social/scrape_request.rb +6 -0
  61. data/lib/wuclan/open_social.rb +0 -0
  62. data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
  63. data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
  64. data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
  65. data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
  66. data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
  67. data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
  68. data/lib/wuclan/twitter/api_response_examples.textile +300 -0
  69. data/lib/wuclan/twitter/model/base.rb +72 -0
  70. data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
  71. data/lib/wuclan/twitter/model/relationship.rb +176 -0
  72. data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
  73. data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
  74. data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
  75. data/lib/wuclan/twitter/model/text_element.rb +38 -0
  76. data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
  77. data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
  78. data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
  79. data/lib/wuclan/twitter/model/tweet.rb +74 -0
  80. data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
  81. data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
  82. data/lib/wuclan/twitter/model.rb +21 -0
  83. data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
  84. data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
  85. data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
  86. data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
  87. data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
  88. data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
  89. data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
  90. data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
  91. data/lib/wuclan/twitter/scrape/base.rb +97 -0
  92. data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
  93. data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
  94. data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
  95. data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
  96. data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
  97. data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
  98. data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
  99. data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
  100. data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
  101. data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
  102. data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
  103. data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
  104. data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
  105. data/lib/wuclan/twitter/scrape.rb +27 -0
  106. data/lib/wuclan/twitter.rb +7 -0
  107. data/lib/wuclan.rb +1 -0
  108. data/spec/spec_helper.rb +9 -0
  109. data/spec/wuclan_spec.rb +7 -0
  110. data/wuclan.gemspec +184 -0
  111. metadata +219 -0
@@ -0,0 +1,84 @@
1
require 'date'

module TwitterFriends
  #
  # Mixin that renders a struct-like model as N-Triples-style RDF lines.
  #
  # The instance methods below call +members_with_types+, +[]+,
  # +rdf_resource+ and +scraped_at+ on the including object; none of those
  # are defined in this module, so the including class must supply them.
  #
  module TwitterRdf

    #
    # Characters that have to be backslash-escaped inside a quoted
    # N-Triples string literal, mapped to their escaped form.
    #
    NTRIPLES_STRING_ESCAPES = {
      "\\" => "\\\\",
      '"'  => '\\"',
      "\n" => '\\n',
      "\r" => '\\r',
      "\t" => '\\t'
    }.freeze

    #
    # RDF-formatted date
    #
    # Returns DateTime.parse(dt).to_s, or nil when +dt+ cannot be parsed.
    # A nil / non-String +dt+ makes DateTime.parse raise TypeError rather
    # than ArgumentError, so both are treated as "no date".
    #
    def self.encode_datetime dt
      DateTime.parse(dt).to_s
    rescue ArgumentError, TypeError
      nil
    end

    #
    # Emit a component (subject or object) with the right semantic encoding
    #
    # Use :boolskip if a false property should just be left out.
    #
    # Returns the encoded string, or nil when the component should be left
    # out (a false :boolskip, or a :date that cannot be parsed).
    # Raises RuntimeError for an unknown +type+.
    #
    def rdf_component val, type
      case type
      when :tweet    then %Q{<http://twitter.com/statuses/show/#{val}.xml>}
      when :user     then %Q{<http://twitter.com/users/show/#{val}.xml>}
      when :bool     then ((!val) || (val==0) || (val=="0")) ? '"false"^^<xsd:boolean>' : '"true"^^<xsd:boolean>'
      when :boolskip then ((!val) || (val==0) || (val=="0")) ? nil                      : '"true"^^<xsd:boolean>'
      when :int      then %Q{"#{val.to_i}"^^<xsd:integer>}
      when :date
        # An unparseable date yields nil instead of an empty dateTime literal.
        encoded = TwitterRdf.encode_datetime(val)
        encoded && %Q{"#{encoded}"^^<xsd:dateTime>}
      when :str
        # Quotes, backslashes and line breaks would otherwise end the literal
        # (or the line) early and corrupt the triple.
        escaped = val.to_s.gsub(/[\\"\n\r\t]/) { |ch| NTRIPLES_STRING_ESCAPES[ch] }
        %Q{"#{escaped}"}
      else raise "Don't know how to encode #{type}"
      end
    end

    #
    # Express relationship (predicate) in RDF
    #
    # Every predicate is rendered the same way: as a fragment of
    # http://twitter.com/ named after the attribute.
    #
    def rdf_pred pred
      %Q{<http://twitter.com/##{pred}>}
    end

    #
    # RDF Triple string for the given (subject, predicate, object)
    # http://www.w3.org/TR/rdf-testcases/#ntriples
    #
    # The three parts are left-justified into tab-separated columns and
    # terminated with '.'; a non-blank +comment+ is appended as "\t# comment".
    #
    def self.rdf_triple subj, pred, obj, comment=nil
      comment = "\t# " + comment.to_s unless comment.blank?
      %Q{%-55s\t%-39s\t%-23s\t.%s} % [subj, pred, obj, comment]
    end

    #
    # Whether +attr+ can change between scrapes. Always false here;
    # including classes may override it.
    #
    def mutable?(attr)
      false
    end

    #
    # Extract [subject, predicate, object, (extra)] tuples.
    #
    # (extra) is set to +scraped at+ for #mutable? attributes, blank otherwise.
    #
    # Attributes whose value is blank, or whose encoded object is nil, are
    # skipped.
    #
    def to_rdf3_tuples
      members_with_types.map do |attr, type|
        next if self[attr].blank?
        subj    = rdf_resource
        pred    = rdf_pred(attr)
        obj     = rdf_component(self[attr], type) or next
        comment = scraped_at if mutable?(attr)
        [subj, pred, obj, comment]
      end.compact
    end

    #
    # Convert an object to rdf triples, one per line.
    #
    # Appends scraped at to #mutable? attributes
    #
    def to_rdf3
      # rdf_triple is a singleton method of this module (it is not added to
      # including classes) and takes the tuple's parts as separate arguments.
      to_rdf3_tuples.map do |tuple|
        TwitterRdf.rdf_triple(*tuple)
      end.join("\n")
    end

  end
end
@@ -0,0 +1,12 @@
1
module TwitterFriends::StructModel
  #
  # Supplies the RDF subject for a twitter user.
  #
  module TwitterUserCommon
    #
    # The object's +id+ encoded as a :user component by rdf_component.
    # The result is cached in @rdf_resource after the first call.
    #
    def rdf_resource
      @rdf_resource ||= rdf_component(id, :user)
    end
  end

  # Mix the RDF helpers into each of the twitter-user model classes.
  rdf_capable_models = [TwitterUser, TwitterUserProfile, TwitterUserStyle, TwitterUserPartial]
  rdf_capable_models.each do |model|
    model.send(:include, TwitterFriends::TwitterRdf)
  end
end
@@ -0,0 +1,271 @@
1
+ require 'net/http'
2
+ require 'addressable/uri'
3
+ #
4
+ #
5
+ # SELECT 'expanded_url', short_url, IFNULL(dest_url,""), IFNULL(scraped_at,"")
6
+ # FROM expanded_urls
7
+ # INTO OUTFILE '~/ics/pool/social/network/twitter_friends/fixd/dump/expanded_urls-20090113.tsv' ;
8
+ #
9
+ #
10
+
11
module TwitterFriends
  module Scrape
    include TwitterFriends::StructModel::ModelCommon

    #
    # A short url (+src_url+) together with +dest_url+ and +scraped_at+.
    #
    class ExpandedUrl < Struct.new(:src_url, :dest_url, :scraped_at)
      # src_url uniquely identifies us
      def num_key_fields() 1 end

      #
      # These are all the characters that belong in a URL
      #
      RE_URL_SANE_CHARS =
        Addressable::URI::CharacterClasses::UNRESERVED +
        Addressable::URI::CharacterClasses::RESERVED   + '%'
      #
      # These are illegal but *are* found in URLs. We're going to let them through.
      # Note that ' ' space is one of the tolerated miscreants.
      #
      RE_URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS = '\{\}\|\^\` '
      #
      # Matches any single character that is neither sane nor tolerated.
      # Built once here instead of on every call to scrub_url.
      #
      RE_URL_INSANE_CHAR = /[^#{RE_URL_SANE_CHARS+RE_URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS}]/

      #
      # Replace all url-insane characters by their %encoding.  We don't really
      # care here whether the URLs do anything: we just want to remove stuff that
      # absosmurfly don't belong.
      #
      # This code is stolen from Addressable::URI, which unfortunately has a bug
      # in exactly this method (fixed here).  (http://addressable.rubyforge.org)
      # Note that we are /not/ re-encoding characters like '%' -- it's assumed
      # that the url is encoded, but perhaps poorly.
      #
      # In practice the illegal characters most often seen are those in
      # RE_URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS plus
      #   <>"\t\\
      #
      # Returns a new string; +url+ itself is not modified.
      #
      def self.scrub_url url
        url.gsub(RE_URL_INSANE_CHAR) do |sequence|
          # Encode byte-by-byte, as upper-case %XX.
          sequence.unpack('C*').map{ |c| ("%%%02x"%c).upcase }.join("")
        end
      end

      #
      # Handle some known edge cases / simplifications with short urls
      #
      def fix_src_url!
        fix_isgd_url!
      end
      #
      # is.gd urls use a terminal '-' to indicate 'preview' -- but
      # we want the destination, so strip that.
      #
      # Modifies src_url in place.
      #
      def fix_isgd_url!
        self.src_url.gsub!(%r{(http://is\.gd/\w+)[-/]}, '\1')
      end

      #
      # The major shortening services
      #
      # Do any of the mainstream shorteners use in-band characters besides \w
      # alphanum and - dash?  (idek.net uses a ~ and pastoid.com a + but they
      # are not popular enough to justify the annoyance of allowing extra
      # chars).
      #
      # Capture 1 is the host, capture 2 the path token.  The alternation must
      # not start with '|': that would add an empty alternative and let
      # "http:///token" match with an empty host.  Dots are escaped so that
      # they only match a literal '.'.
      #
      TINY_URLISHES_RE = %r{\Ahttp://(
            tinyurl\.com            # 4969626
          | is\.gd                  #  406718
          | bit\.ly                 #  298590
          | twurl\.nl               #  169796
          | snipurl\.com            #  107961
          | tr\.im                  #   38793
          | snurl\.com              #   37576
          | snipr\.com              #   26897
          | jijr\.com               #   20965
          | cli\.gs                 #   19700
          | budurl\.com             #   19402
          | xrl\.us                 #   11621
        # | tiny\.cc                #    9140 # tiny.cc borks fetcher
          | zi\.ma                  #    8148
          | s3nt\.com               #    6922
          | ow\.ly                  #    6848
          | poprl\.com              #    6666
          | piurl\.com              #    5262
          | ur1\.ca                 #    4435
          | short\.to               #    4105
          | urlenco\.de             #    4087
          | zz\.gd                  #    4045
          | rubyurl\.com            #    3766
          | uris\.jp                #    2749
          | ub0\.cc                 #    2607
          | twurl\.cc               #    2545
          | moourl\.com             #    2280
          | rurl\.org               #    2271
          | url\.ie                 #    2156
          )/([\w\-]+)}ix

      #
      # Returns "http://host/token" (host lower-cased) when +url+ starts with
      # one of the known shorteners, nil otherwise.
      #
      def self.match_tinyurlish url
        m = TINY_URLISHES_RE.match(url) or return
        host, path = m.captures
        "http://#{host.downcase}/#{path}"
      end

      #
      # If the base part looks like a tinyurlish, return an instantiated object
      # Otherwise, return nil
      #
      # This will happily turn
      #   http://tinyurl.com/aaASDF/A-BUNCH_OF_BOGOSITY
      # into just the http://tinyurl.com/aaASDF
      #
      def self.new_if_tinyurlish url
        src_url = match_tinyurlish(url) or return
        new(src_url, nil, nil)
      end

    end
  end
end
125
+
126
+ #
127
+ # Frequency of host part from ~ 6M URLs.
128
+ # Just a rough guide -- don't go launchin' yer SEO campaign using these numbers.
129
+ #
130
+ # 4969626 tinyurl.com
131
+ # 406718 is.gd
132
+ # 298590 bit.ly
133
+ # 169796 twurl.nl
134
+ # 107961 snipurl.com
135
+ # 38793 tr.im
136
+ # 37576 snurl.com
137
+ # 26897 snipr.com
138
+ # 20965 jijr.com
139
+ # 19700 cli.gs
140
+ # 19402 budurl.com
141
+ # 11621 xrl.us
142
+ # 9140 tiny.cc
143
+ # 8148 zi.ma
144
+ # 6922 s3nt.com
145
+ # 6848 ow.ly
146
+ # 6666 poprl.com
147
+ # 5262 piurl.com
148
+ # 4435 ur1.ca
149
+ # 4105 short.to
150
+ # 4087 urlenco.de
151
+ # 4045 zz.gd
152
+ # 3766 rubyurl.com
153
+ # 2749 uris.jp
154
+ # 2607 ub0.cc
155
+ # 2545 twurl.cc
156
+ # 2280 moourl.com
157
+ # 2271 rurl.org
158
+ # 2156 url.ie
159
+ #
160
+ # 235192 ff.im
161
+ # 82062 bkite.com
162
+ # 81792 blip.fm
163
+ # 53928 ping.fm
164
+ # 28826 loopt.us
165
+ # 13724 ad.vu
166
+ # 8438 tgr.me
167
+ # 8418 adjix.com
168
+ # 5061 www.url.inc
169
+ # pastoid.com
170
+ #
171
+ # 339312 twitpic.com
172
+ # 28282 rsstotwitter.com
173
+ # 26641 twitter.com
174
+ # 22263 www.nicovideo.jp
175
+ # 21897 www.flickr.com
176
+ # 20910 live.nicovideo.jp
177
+ # 18604 book.akahoshitakuya.com
178
+ # 16674 movapic.com
179
+ # 15844 jobfeedr.com
180
+ # 15049 u.mavrev.com
181
+ # 14537 f.hatena.ne.jp
182
+ # 14454 www.last.fm
183
+ # 12003 be
184
+ # 11548 www.desktoptopia.com
185
+ # 10712 raptr.com
186
+ # 10340 hellotxt.com
187
+ # 10266 deals.clhmedia.com
188
+ # 9910 mrtweet.net
189
+ # 9818 echos.tumblr.com
190
+ # 9378 echomas.tumblr.com
191
+ # 9330 flickr.com
192
+ # 8695 weather.livedoor.com
193
+ # 8525 d.hatena.ne.jp
194
+ # 7524 radiopopbitch.com
195
+ # 7501 qik.com
196
+ # 7161 aweber.com
197
+ # 7086 www.myspace.com
198
+ # 6990 activerain.com
199
+ # 6811 ruwt.tv
200
+ # 6722 bbc.co.uk
201
+ # 6344 www.amazon.com
202
+ # 6328 photohito.com
203
+ # 6142 techwatching.com
204
+ # 6117 kexplorer.com
205
+ # 6009 EzineArticles.com
206
+ # 5964 www.squidoo.com
207
+ # 5929 news.bbc.co.uk
208
+ # 5756 mobypicture.com
209
+ # 5489 www.youtube.com
210
+ # 5454 robotbling.com
211
+ # 5433 www.timesoftheinternet.com
212
+ # 5182 www.blogtv.com
213
+ # 5105 tiny12.tv
214
+ # 5084 www.imdb.com
215
+ # 4894 www.ustream.tv
216
+ # 4800 vimeo.com
217
+ # 4796 yes.com
218
+ # 4665 5ver.com
219
+ # 4596 www.absurdtrivia.com
220
+ # 4585 twittgroups.com
221
+ # 4525 funp.com
222
+ # 4472 en.wikipedia.org
223
+ # 4431 hypem.com
224
+ # 4313 anond.hatelabo.jp
225
+ # 4222 twitxr.com
226
+ # 4045 twitter.grader.com
227
+ # 3987 yourinternetradio.com
228
+ # 3976 TwitPWR.com
229
+ # 3964 sfbay.craigslist.org
230
+ # 3876 x.imeem.com
231
+ # 3757 www.invertia.com
232
+ # 3556 timesurl.at
233
+ # 3531 www.jb.man.ac.uk
234
+ # 3528 bossalive.com
235
+ # 3410 buzztter.com
236
+ # 3337 www.accuweather.com
237
+ # 3324 drawr.net
238
+ # 3285 xkcd.com
239
+ # 3270 maps.google.com
240
+ # 3243 tobtr.com
241
+ # 3182 www.cnn.com
242
+ # 3180 www.stickam.com
243
+ # 3177 www.dailymugshot.com
244
+ # 3163 r.reuters.com
245
+ # 2963 148apps.com
246
+ # 2885 unvlog.com
247
+ # 2853 tweetwasters.com
248
+ # 2778 eloglife.net
249
+ # 2758 dihitt.com.br
250
+ # 2751 openzap.com
251
+ # 2727 blip.tv
252
+ # 2699 www.sailingxperience.com
253
+ # 2682 eepics.com
254
+ # 2638 blog.livedoor.jp
255
+ # 2552 iphone.robotbling.com
256
+ # 2528 phodroid.com
257
+ # 2490 twitter.digsby.com
258
+ # 2420 plazes.com
259
+ # 2391 www.google.com
260
+ # 2311 www.msnbc.msn.com
261
+ # 2228 gamerdna.com
262
+ # 2227 gyazo.com
263
+ # 2197 www.vimeo.com
264
+ # 2184 entertonement.com
265
+ # 2157 c2.koukokukaigisitsu.com
266
+ #
267
+
268
+ # def spread_key() self.src_url[-3..-1] end
269
+ # def output_form spread=false
270
+ # spread ? ("%s-%s\t%s"%[resource_name, spread_key, to_tsv]) : super()
271
+ # end