wuclan 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
module TwitterFriends
|
|
2
|
+
module TwitterRdf
|
|
3
|
+
|
|
4
|
+
#
|
|
5
|
+
# RDF-formatted date
|
|
6
|
+
#
|
|
7
|
+
# Render +dt+ as the string form of a parsed DateTime (ISO 8601 style,
# e.g. "2009-01-13T12:00:00+00:00").
#
# dt:: a date/time String, or any object whose #to_s is parseable
#      (nil is accepted and yields nil).
#
# Returns the DateTime#to_s rendering, or nil when +dt+ cannot be parsed.
def self.encode_datetime dt
  # Going through #to_s lets nil and non-String values take the same path as
  # strings instead of raising TypeError out of DateTime.parse.
  DateTime.parse(dt.to_s).to_s
rescue ArgumentError, TypeError
  nil
end
|
|
14
|
+
|
|
15
|
+
#
|
|
16
|
+
# Emit a component (subject or object) with the right semantic encoding
|
|
17
|
+
#
|
|
18
|
+
# Use :boolskip if a false property should just be left out.
|
|
19
|
+
#
|
|
20
|
+
# Encode +val+ as an RDF term (subject or object) according to +type+.
#
# type is one of:
#   :tweet    -- URI reference to the status with id +val+
#   :user     -- URI reference to the user with id +val+
#   :bool     -- xsd:boolean literal; nil, false, 0 and "0" count as false
#   :boolskip -- like :bool, but returns nil instead of a "false" literal
#   :int      -- xsd:integer literal built from val.to_i
#   :date     -- xsd:dateTime literal; nil when +val+ cannot be parsed
#   :str      -- plain literal, with N-Triples string escaping applied
#
# Returns the encoded String, or nil when the component should be left out.
# Raises RuntimeError for an unknown +type+.
def rdf_component val, type
  case type
  when :tweet    then %Q{<http://twitter.com/statuses/show/#{val}.xml>}
  when :user     then %Q{<http://twitter.com/users/show/#{val}.xml>}
  when :bool     then ((!val) || (val==0) || (val=="0")) ? '"false"^^<xsd:boolean>' : '"true"^^<xsd:boolean>'
  when :boolskip then ((!val) || (val==0) || (val=="0")) ? nil : '"true"^^<xsd:boolean>'
  when :int      then %Q{"#{val.to_i}"^^<xsd:integer>}
  when :date
    # An unparseable date used to come out as the empty literal
    # ""^^<xsd:dateTime>; return nil so the triple is skipped instead.
    stamp = TwitterRdf.encode_datetime(val)
    stamp && %Q{"#{stamp}"^^<xsd:dateTime>}
  when :str
    # Backslash, double quote and line-control characters must be escaped
    # inside an N-Triples string literal, or the line is unparseable.
    escaped = val.to_s.gsub(/[\\"\n\r\t]/) do |ch|
      case ch
      when "\\" then '\\\\'
      when '"'  then '\\"'
      when "\n" then '\\n'
      when "\r" then '\\r'
      when "\t" then '\\t'
      end
    end
    %Q{"#{escaped}"}
  else raise "Don't know how to encode #{type}"
  end
end
|
|
32
|
+
|
|
33
|
+
#
|
|
34
|
+
# Express relationship (predicate) in RDF
|
|
35
|
+
#
|
|
36
|
+
# Express the relationship +pred+ as an RDF predicate URI: the predicate name
# becomes a fragment on http://twitter.com/.
#
# Every predicate, :created_at included, is rendered the same way, so no
# per-predicate dispatch is needed.
def rdf_pred pred
  fragment = "##{pred}"
  "<http://twitter.com/" + fragment + ">"
end
|
|
42
|
+
|
|
43
|
+
#
|
|
44
|
+
# RDF Triple string for the given (subject, predicate, object)
|
|
45
|
+
# http://www.w3.org/TR/rdf-testcases/#ntriples
|
|
46
|
+
#
|
|
47
|
+
# Format one tab-separated, column-padded triple line terminated by ".".
#
# subj, pred, obj:: already-encoded terms, left-justified to 55, 39 and 23
#                   columns respectively.
# comment::         optional; when not blank it is appended after the "."
#                   as a tab followed by "# <comment>".
#
# Returns the formatted String.
def self.rdf_triple subj, pred, obj, comment=nil
  # A blank comment is passed through untouched (nil formats as "").
  suffix = comment.blank? ? comment : ("\t# " + comment.to_s)
  columns = [subj, pred, obj, suffix]
  "%-55s\t%-39s\t%-23s\t.%s" % columns
end
|
|
51
|
+
|
|
52
|
+
# Whether +attr+ counts as mutable. This implementation answers false for
# every attribute; to_rdf3_tuples only attaches scraped_at as the tuple's
# extra field when this returns true.
def mutable?(attr)
  return false
end
|
|
55
|
+
|
|
56
|
+
#
|
|
57
|
+
# Extract [subject, predicate, object, (extra)] tuples.
|
|
58
|
+
#
|
|
59
|
+
# (extra) is set to +scraped at+ for #mutable? attributes, blank otherwise.
|
|
60
|
+
#
|
|
61
|
+
# Build one [subject, predicate, object, extra] tuple per populated attribute.
#
# Attributes come from members_with_types as (name, type) pairs. An attribute
# is left out when its value is blank or when rdf_component declines to
# encode it (returns nil/false). +extra+ is scraped_at for attributes that
# are #mutable?, nil otherwise.
#
# Returns an Array of 4-element Arrays.
def to_rdf3_tuples
  tuples = []
  members_with_types.each do |attr, type|
    next if self[attr].blank?
    subject   = rdf_resource
    predicate = rdf_pred(attr)
    object    = rdf_component(self[attr], type)
    next unless object
    extra = mutable?(attr) ? scraped_at : nil
    tuples << [subject, predicate, object, extra]
  end
  tuples
end
|
|
71
|
+
|
|
72
|
+
#
|
|
73
|
+
# Convert an object to an rdf triple.
|
|
74
|
+
#
|
|
75
|
+
# Appends scraped at to #mutable? attributes
|
|
76
|
+
#
|
|
77
|
+
# Serialize this object as RDF triple lines, one line per tuple from
# #to_rdf3_tuples, joined with newlines.
#
# Returns a String ("" when there are no tuples).
def to_rdf3
  to_rdf3_tuples.map do |tuple|
    # rdf_triple is defined on the TwitterRdf module itself (including the
    # module does not copy it onto the including class), and it takes the
    # tuple's elements as separate arguments -- hence the splat.
    TwitterRdf.rdf_triple(*tuple)
  end.join("\n")
end
|
|
82
|
+
|
|
83
|
+
end
|
|
84
|
+
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
module TwitterFriends::StructModel
  module TwitterUserCommon
    # RDF resource term for this user: +id+ encoded through rdf_component
    # with the :user type. The result is memoized in @rdf_resource.
    def rdf_resource
      return @rdf_resource if @rdf_resource
      @rdf_resource = rdf_component(id, :user)
    end
  end

  # Mix the RDF helpers into each of the user model classes.
  # NOTE(review): TwitterUserCommon is not included here -- presumably the
  # model classes pick it up elsewhere; confirm.
  [TwitterUser, TwitterUserProfile, TwitterUserStyle, TwitterUserPartial].each do |klass|
    klass.send(:include, TwitterFriends::TwitterRdf)
  end
end
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
require 'net/http'
|
|
2
|
+
require 'addressable/uri'
|
|
3
|
+
#
|
|
4
|
+
#
|
|
5
|
+
# SELECT 'expanded_url', short_url, IFNULL(dest_url,""), IFNULL(scraped_at,"")
|
|
6
|
+
# FROM expanded_urls
|
|
7
|
+
# INTO OUTFILE '~/ics/pool/social/network/twitter_friends/fixd/dump/expanded_urls-20090113.tsv' ;
|
|
8
|
+
#
|
|
9
|
+
#
|
|
10
|
+
|
|
11
|
+
module TwitterFriends
|
|
12
|
+
module Scrape
|
|
13
|
+
include TwitterFriends::StructModel::ModelCommon
|
|
14
|
+
|
|
15
|
+
class ExpandedUrl < Struct.new(:src_url, :dest_url, :scraped_at)
|
|
16
|
+
# src_url uniquely identifies us
|
|
17
|
+
# Number of key fields: just one, since src_url alone identifies the record.
def num_key_fields
  1
end
|
|
18
|
+
|
|
19
|
+
#
|
|
20
|
+
# These are all the characters that belong in a URL
|
|
21
|
+
#
|
|
22
|
+
# Character-class source (interpolated inside [...] by scrub_url): the
# unreserved and reserved URI character sets, plus '%'.
RE_URL_SANE_CHARS = [
  Addressable::URI::CharacterClasses::UNRESERVED,
  Addressable::URI::CharacterClasses::RESERVED,
  '%'
].join
#
# These are illegal but *are* found in URLs. We're going to let them through.
# Note that ' ' space is one of the tolerated miscreants.
#
RE_URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS = '\{\}\|\^\` '
|
|
30
|
+
#
|
|
31
|
+
# Replace all url-insane characters by their %encoding. We don't really
|
|
32
|
+
# care here whether the URLs do anything: we just want to remove stuff that
|
|
33
|
+
# absosmurfly don't belong.
|
|
34
|
+
#
|
|
35
|
+
# This code is stolen from Addressable::URI, which unfortunately has a bug
|
|
36
|
+
# in exactly this method (fixed here). (http://addressable.rubyforge.org)
|
|
37
|
+
# Note that we are /not/ re-encoding characters like '%' -- it's assumed
|
|
38
|
+
# that the url is encoded, but perhaps poorly.
|
|
39
|
+
#
|
|
40
|
+
# In practice the illegal characters most often seen are those in
|
|
41
|
+
# RE_URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS plus
|
|
42
|
+
# <>"\t\\
|
|
43
|
+
#
|
|
44
|
+
# Percent-encode every character of +url+ that is neither in
# RE_URL_SANE_CHARS nor in RE_URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS.
#
# Each offending character is replaced by the upper-case %XX encoding of
# each of its bytes. Existing '%' characters are left alone.
#
# Returns a new String; +url+ itself is not modified.
def self.scrub_url url
  tolerated = RE_URL_SANE_CHARS+RE_URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS
  url.gsub(/[^#{tolerated}]/) do |char|
    encoded_bytes = char.unpack('C*').map { |byte| format('%%%02X', byte) }
    encoded_bytes.join
  end
end
|
|
49
|
+
|
|
50
|
+
#
|
|
51
|
+
# Handle some known edge cases / simplifications with short urls
|
|
52
|
+
#
|
|
53
|
+
# Apply the known short-url cleanups to src_url in place. At present the only
# cleanup is the is.gd one; its return value is passed straight through.
def fix_src_url!
  return fix_isgd_url!
end
|
|
56
|
+
#
|
|
57
|
+
# is.gd urls use a terminal '-' to indicate 'preview' -- but
|
|
58
|
+
# we want the destination, so strip that.
|
|
59
|
+
#
|
|
60
|
+
# Strip the terminal '-' (or '/') from an is.gd url, in place.
#
# Only a trailing marker is removed: the pattern is anchored to the end of
# the string so a '/' in the middle of a longer path is not deleted (which
# would glue two path segments together), and the dot in the host name is
# escaped so look-alike hosts are not rewritten.
#
# Returns src_url when a change was made, nil otherwise (String#gsub!).
def fix_isgd_url!
  self.src_url.gsub!(%r{(http://is\.gd/\w+)[-/]\z}, '\1')
end
|
|
63
|
+
|
|
64
|
+
#
|
|
65
|
+
# The major shortening services
|
|
66
|
+
#
|
|
67
|
+
# Do any of the mainstream shorteners use in-band characters besides \w
|
|
68
|
+
# alphanum and - dash? (idek.net uses a ~ and pastoid.com a + but they
|
|
69
|
+
# are not popular enough to justify the annoyance of allowing extra
|
|
70
|
+
# chars).
|
|
71
|
+
#
|
|
72
|
+
# Matches an http:// url on one of the listed shortener hosts. Capture 1 is
# the host, capture 2 the first path segment (word characters and dashes).
# The trailing numbers are each host's observed frequency.
#
# Dots are escaped so they match only a literal '.', and the alternation has
# no leading '|' -- a leading bar adds an empty alternative, which lets
# "http:///foo" match with an empty host.
TINY_URLISHES_RE = %r{\Ahttp://(
    tinyurl\.com   # 4969626
  | is\.gd         #  406718
  | bit\.ly        #  298590
  | twurl\.nl      #  169796
  | snipurl\.com   #  107961
  | tr\.im         #   38793
  | snurl\.com     #   37576
  | snipr\.com     #   26897
  | jijr\.com      #   20965
  | cli\.gs        #   19700
  | budurl\.com    #   19402
  | xrl\.us        #   11621
  # | tiny\.cc     #    9140 -- tiny.cc borks fetcher
  | zi\.ma         #    8148
  | s3nt\.com      #    6922
  | ow\.ly         #    6848
  | poprl\.com     #    6666
  | piurl\.com     #    5262
  | ur1\.ca        #    4435
  | short\.to      #    4105
  | urlenco\.de    #    4087
  | zz\.gd         #    4045
  | rubyurl\.com   #    3766
  | uris\.jp       #    2749
  | ub0\.cc        #    2607
  | twurl\.cc      #    2545
  | moourl\.com    #    2280
  | rurl\.org      #    2271
  | url\.ie        #    2156
)/([\w\-]+)}ix

# Canonicalize +url+ if it points at one of the known shorteners.
#
# Returns "http://<lower-cased host>/<first path segment>", dropping anything
# after that segment, or nil when +url+ does not match TINY_URLISHES_RE.
def self.match_tinyurlish url
  m = TINY_URLISHES_RE.match(url) or return
  host, path = m.captures
  "http://#{host.downcase}/#{path}"
end
|
|
108
|
+
|
|
109
|
+
#
|
|
110
|
+
# If the base part looks like a tinyurlish, return an instantiated object
|
|
111
|
+
# Otherwise, return nil
|
|
112
|
+
#
|
|
113
|
+
# This will happily turn
|
|
114
|
+
# http://tinyurl.com/aaASDF/A-BUNCH_OF_BOGOSITY
|
|
115
|
+
# into just the http://tinyurl.com/aaASDF
|
|
116
|
+
#
|
|
117
|
+
# Build an instance for +url+ when match_tinyurlish recognizes it.
#
# The instance's src_url is the canonical form returned by match_tinyurlish;
# dest_url and scraped_at start out nil.
#
# Returns the new instance, or nil when +url+ is not recognized.
def self.new_if_tinyurlish url
  canonical = match_tinyurlish(url)
  return unless canonical
  new(canonical, nil, nil)
end
|
|
121
|
+
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
#
|
|
127
|
+
# Frequency of host part from ~ 6M URLs.
|
|
128
|
+
# Just a rough guide -- don't go launchin' yer SEO campaign using these numbers.
|
|
129
|
+
#
|
|
130
|
+
# 4969626 tinyurl.com
|
|
131
|
+
# 406718 is.gd
|
|
132
|
+
# 298590 bit.ly
|
|
133
|
+
# 169796 twurl.nl
|
|
134
|
+
# 107961 snipurl.com
|
|
135
|
+
# 38793 tr.im
|
|
136
|
+
# 37576 snurl.com
|
|
137
|
+
# 26897 snipr.com
|
|
138
|
+
# 20965 jijr.com
|
|
139
|
+
# 19700 cli.gs
|
|
140
|
+
# 19402 budurl.com
|
|
141
|
+
# 11621 xrl.us
|
|
142
|
+
# 9140 tiny.cc
|
|
143
|
+
# 8148 zi.ma
|
|
144
|
+
# 6922 s3nt.com
|
|
145
|
+
# 6848 ow.ly
|
|
146
|
+
# 6666 poprl.com
|
|
147
|
+
# 5262 piurl.com
|
|
148
|
+
# 4435 ur1.ca
|
|
149
|
+
# 4105 short.to
|
|
150
|
+
# 4087 urlenco.de
|
|
151
|
+
# 4045 zz.gd
|
|
152
|
+
# 3766 rubyurl.com
|
|
153
|
+
# 2749 uris.jp
|
|
154
|
+
# 2607 ub0.cc
|
|
155
|
+
# 2545 twurl.cc
|
|
156
|
+
# 2280 moourl.com
|
|
157
|
+
# 2271 rurl.org
|
|
158
|
+
# 2156 url.ie
|
|
159
|
+
#
|
|
160
|
+
# 235192 ff.im
|
|
161
|
+
# 82062 bkite.com
|
|
162
|
+
# 81792 blip.fm
|
|
163
|
+
# 53928 ping.fm
|
|
164
|
+
# 28826 loopt.us
|
|
165
|
+
# 13724 ad.vu
|
|
166
|
+
# 8438 tgr.me
|
|
167
|
+
# 8418 adjix.com
|
|
168
|
+
# 5061 www.url.inc
|
|
169
|
+
# pastoid.com
|
|
170
|
+
#
|
|
171
|
+
# 339312 twitpic.com
|
|
172
|
+
# 28282 rsstotwitter.com
|
|
173
|
+
# 26641 twitter.com
|
|
174
|
+
# 22263 www.nicovideo.jp
|
|
175
|
+
# 21897 www.flickr.com
|
|
176
|
+
# 20910 live.nicovideo.jp
|
|
177
|
+
# 18604 book.akahoshitakuya.com
|
|
178
|
+
# 16674 movapic.com
|
|
179
|
+
# 15844 jobfeedr.com
|
|
180
|
+
# 15049 u.mavrev.com
|
|
181
|
+
# 14537 f.hatena.ne.jp
|
|
182
|
+
# 14454 www.last.fm
|
|
183
|
+
# 12003 be
|
|
184
|
+
# 11548 www.desktoptopia.com
|
|
185
|
+
# 10712 raptr.com
|
|
186
|
+
# 10340 hellotxt.com
|
|
187
|
+
# 10266 deals.clhmedia.com
|
|
188
|
+
# 9910 mrtweet.net
|
|
189
|
+
# 9818 echos.tumblr.com
|
|
190
|
+
# 9378 echomas.tumblr.com
|
|
191
|
+
# 9330 flickr.com
|
|
192
|
+
# 8695 weather.livedoor.com
|
|
193
|
+
# 8525 d.hatena.ne.jp
|
|
194
|
+
# 7524 radiopopbitch.com
|
|
195
|
+
# 7501 qik.com
|
|
196
|
+
# 7161 aweber.com
|
|
197
|
+
# 7086 www.myspace.com
|
|
198
|
+
# 6990 activerain.com
|
|
199
|
+
# 6811 ruwt.tv
|
|
200
|
+
# 6722 bbc.co.uk
|
|
201
|
+
# 6344 www.amazon.com
|
|
202
|
+
# 6328 photohito.com
|
|
203
|
+
# 6142 techwatching.com
|
|
204
|
+
# 6117 kexplorer.com
|
|
205
|
+
# 6009 EzineArticles.com
|
|
206
|
+
# 5964 www.squidoo.com
|
|
207
|
+
# 5929 news.bbc.co.uk
|
|
208
|
+
# 5756 mobypicture.com
|
|
209
|
+
# 5489 www.youtube.com
|
|
210
|
+
# 5454 robotbling.com
|
|
211
|
+
# 5433 www.timesoftheinternet.com
|
|
212
|
+
# 5182 www.blogtv.com
|
|
213
|
+
# 5105 tiny12.tv
|
|
214
|
+
# 5084 www.imdb.com
|
|
215
|
+
# 4894 www.ustream.tv
|
|
216
|
+
# 4800 vimeo.com
|
|
217
|
+
# 4796 yes.com
|
|
218
|
+
# 4665 5ver.com
|
|
219
|
+
# 4596 www.absurdtrivia.com
|
|
220
|
+
# 4585 twittgroups.com
|
|
221
|
+
# 4525 funp.com
|
|
222
|
+
# 4472 en.wikipedia.org
|
|
223
|
+
# 4431 hypem.com
|
|
224
|
+
# 4313 anond.hatelabo.jp
|
|
225
|
+
# 4222 twitxr.com
|
|
226
|
+
# 4045 twitter.grader.com
|
|
227
|
+
# 3987 yourinternetradio.com
|
|
228
|
+
# 3976 TwitPWR.com
|
|
229
|
+
# 3964 sfbay.craigslist.org
|
|
230
|
+
# 3876 x.imeem.com
|
|
231
|
+
# 3757 www.invertia.com
|
|
232
|
+
# 3556 timesurl.at
|
|
233
|
+
# 3531 www.jb.man.ac.uk
|
|
234
|
+
# 3528 bossalive.com
|
|
235
|
+
# 3410 buzztter.com
|
|
236
|
+
# 3337 www.accuweather.com
|
|
237
|
+
# 3324 drawr.net
|
|
238
|
+
# 3285 xkcd.com
|
|
239
|
+
# 3270 maps.google.com
|
|
240
|
+
# 3243 tobtr.com
|
|
241
|
+
# 3182 www.cnn.com
|
|
242
|
+
# 3180 www.stickam.com
|
|
243
|
+
# 3177 www.dailymugshot.com
|
|
244
|
+
# 3163 r.reuters.com
|
|
245
|
+
# 2963 148apps.com
|
|
246
|
+
# 2885 unvlog.com
|
|
247
|
+
# 2853 tweetwasters.com
|
|
248
|
+
# 2778 eloglife.net
|
|
249
|
+
# 2758 dihitt.com.br
|
|
250
|
+
# 2751 openzap.com
|
|
251
|
+
# 2727 blip.tv
|
|
252
|
+
# 2699 www.sailingxperience.com
|
|
253
|
+
# 2682 eepics.com
|
|
254
|
+
# 2638 blog.livedoor.jp
|
|
255
|
+
# 2552 iphone.robotbling.com
|
|
256
|
+
# 2528 phodroid.com
|
|
257
|
+
# 2490 twitter.digsby.com
|
|
258
|
+
# 2420 plazes.com
|
|
259
|
+
# 2391 www.google.com
|
|
260
|
+
# 2311 www.msnbc.msn.com
|
|
261
|
+
# 2228 gamerdna.com
|
|
262
|
+
# 2227 gyazo.com
|
|
263
|
+
# 2197 www.vimeo.com
|
|
264
|
+
# 2184 entertonement.com
|
|
265
|
+
# 2157 c2.koukokukaigisitsu.com
|
|
266
|
+
#
|
|
267
|
+
|
|
268
|
+
# def spread_key() self.src_url[-3..-1] end
|
|
269
|
+
# def output_form spread=false
|
|
270
|
+
# spread ? ("%s-%s\t%s"%[resource_name, spread_key, to_tsv]) : super()
|
|
271
|
+
# end
|