wuclan 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
require 'date'
|
|
2
|
+
module Wuclan::Twitter::Model
|
|
3
|
+
module ModelCommon
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
# By default, take the front num_key_fields of the flattened struct
|
|
7
|
+
#
|
|
8
|
+
def key
  # Join the leading +num_key_fields+ values of +to_a+ with "-".
  # +first(n)+ is used instead of the range 0..(n-1): when n is 0 that range
  # becomes 0..-1, which selects *every* field rather than none.
  to_a.first(num_key_fields).join("-")
end
|
|
11
|
+
|
|
12
|
+
# ===========================================================================
|
|
13
|
+
#
|
|
14
|
+
# Metrics
|
|
15
|
+
#
|
|
16
|
+
def scrape_age
  # Time elapsed between +scraped_at+ and now, as a Float.
  # Presumably a day count (DateTime subtraction yields days) -- assumes
  # DateTime.parse_safely returns a DateTime; confirm against its definition.
  now        = DateTime.now
  scraped_on = DateTime.parse_safely(scraped_at)
  (now - scraped_on).to_f
end
|
|
19
|
+
|
|
20
|
+
def days_since_created
  # Time elapsed between +created_at+ and now, as a Float.
  # Presumably a day count (DateTime subtraction yields days) -- assumes
  # DateTime.parse_safely returns a DateTime; confirm against its definition.
  now        = DateTime.now
  created_on = DateTime.parse_safely(created_at)
  (now - created_on).to_f
end
|
|
23
|
+
|
|
24
|
+
# ===========================================================================
|
|
25
|
+
#
|
|
26
|
+
# Field conversions
|
|
27
|
+
#
|
|
28
|
+
# Make the data easier for batch flat-record processing
|
|
29
|
+
#
|
|
30
|
+
|
|
31
|
+
#
|
|
32
|
+
# Convert date into flat, uniform format
|
|
33
|
+
# This method is idempotent: repeated calls give same result.
|
|
34
|
+
#
|
|
35
|
+
def self.flatten_date dt
  # nil (or false) in, nil out.
  return unless dt
  flat = dt.to_s
  # Already flat: exactly 14 digits. The pattern is anchored so that a longer
  # string that merely *contains* 14 consecutive digits is not mistaken for a
  # flattened date, and the match is done on +to_s+ so non-String input
  # (e.g. an Integer) neither raises on =~ nor on DateTime.parse.
  return flat if flat =~ /\A\d{14}\z/
  # +to_flat+ is not part of Ruby's DateTime; presumably a project extension
  # that renders the 14-digit form -- confirm against its definition.
  DateTime.parse(flat).to_flat
end
|
|
39
|
+
|
|
40
|
+
#
|
|
41
|
+
# Zero-pad IDs to a full 10 digits (the max digits for an unsigned 32-bit
|
|
42
|
+
# integer).
|
|
43
|
+
#
|
|
44
|
+
# nil id will be encoded as 0. Shit happens and we'd rather be idempotent
|
|
45
|
+
# than picky.
|
|
46
|
+
#
|
|
47
|
+
# Note that sometime in 2010 (or sooner, depending on its growth rate: in 2008
|
|
48
|
+
# Dec it was 1.8M/day) the status_id will exceed 32 bits. Something will
|
|
49
|
+
# happen then.
|
|
50
|
+
# This method is idempotent: repeated calls give same result.
|
|
51
|
+
#
|
|
52
|
+
def self.zeropad_id id
  # nil/false fall back to 0; everything else is coerced with to_i and
  # rendered at least 10 digits wide, left-padded with zeros.
  numeric = (id || 0).to_i
  format('%010d', numeric)
end
|
|
56
|
+
|
|
57
|
+
#
|
|
58
|
+
# Express boolean as 1 (true) or 0 (false). In contravention of typical ruby
|
|
59
|
+
# semantics (but in a way that is more robust for wukong-like batch
|
|
60
|
+
# processing), the number 0, the string '0', nil and false are all considered
|
|
61
|
+
# false. (This also makes the method idempotent: repeated calls give same result.)
|
|
62
|
+
#
|
|
63
|
+
def self.unbooleanize bool
  # 0, '0', false and nil count as false (=> 0); anything else => 1.
  # Case equality (===) is used for the comparison.
  falsy = [0, '0', false, nil].any? { |candidate| candidate === bool }
  falsy ? 0 : 1
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
module Wuclan
|
|
2
|
+
module Models
|
|
3
|
+
# A two-field record: +src+ and +dest+, both declared Integer.
class Edge < TypedStruct.new([:src, Integer], [:dest, Integer]); end
|
|
8
|
+
|
|
9
|
+
# One record per (src, dest) pair, with an Integer field for each
# relationship type in each direction (a->b and b->a).
# NOTE(review): whether these fields hold counts or 0/1 flags is not visible
# here -- confirm against the code that generates MultiEdge records.
class MultiEdge < TypedStruct.new(
    [:src,           Integer],
    [:dest,          Integer],
    [:a_follows_b,   Integer],
    [:b_follows_a,   Integer],
    [:a_replies_b,   Integer],
    [:b_replies_a,   Integer],
    [:a_atsigns_b,   Integer],
    [:b_atsigns_a,   Integer],
    [:a_retweets_b,  Integer],
    [:b_retweets_a,  Integer],
    [:a_favorites_b, Integer],
    [:b_favorites_a, Integer]
  )

  #
  # True when the follow relationship is symmetric: both +a_follows_b+ and
  # +b_follows_a+ are greater than zero.
  #
  # The previous body ANDed the follow check with an empty pair of
  # parentheses, which evaluates to nil, so the method could never return a
  # truthy value. It also relied on plain truthiness, under which 0 counts as
  # true in Ruby. Values are compared numerically via to_i so nil, 0 and "0"
  # all count as "not following".
  #
  # NOTE(review): the empty parentheses suggest a second criterion was
  # planned but never written; only mutual following is tested here.
  #
  def strong?
    [a_follows_b, b_follows_a].all? { |count| count.to_i > 0 }
  end
end
|
|
29
|
+
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
module Wuclan::Twitter::Model
|
|
2
|
+
# features common to all user-user relationships.
|
|
3
|
+
module RelationshipBase
  # Class-level helpers; added to every class that includes RelationshipBase.
  module ClassMethods
    # The class's +resource_name+ as a string, with any "a_<x>_b" portion
    # replaced by "<x>". Computed once and cached in @rel_name.
    def rel_name
      return @rel_name if @rel_name
      @rel_name = resource_name.to_s.gsub(/a_(.*)_b/, '\1')
    end
  end

  # Hook run on +include+: extends the including class with ClassMethods.
  def self.included base
    base.extend(ClassMethods)
  end
end
|
|
14
|
+
|
|
15
|
+
# Follower/Friend relationship
|
|
16
|
+
class AFollowsB < TypedStruct.new([:user_a_id, Integer], [:user_b_id, Integer])
  include ModelCommon
  include RelationshipBase

  # The key is the user_a/user_b pair: the first two fields.
  def num_key_fields
    2
  end

  # Names of the fields that hold numeric ids.
  def numeric_id_fields
    [:user_a_id, :user_b_id]
  end
end
|
|
26
|
+
|
|
27
|
+
# User ==favorites_tweet=> tweet ==by_user=>b
|
|
28
|
+
class AFavoritesB < TypedStruct.new(
    [:user_a_id, Integer],
    [:user_b_id, Integer],
    [:status_id, Integer]
  )
  include ModelCommon
  include RelationshipBase

  # The key is user_a-user_b-status_id: the first three fields.
  # (user_a-status_id alone would already be enough to identify a record.)
  def num_key_fields
    3
  end

  # Names of the fields that hold numeric ids.
  def numeric_id_fields
    [:user_a_id, :user_b_id, :status_id]
  end
end
|
|
39
|
+
|
|
40
|
+
# Direct (threaded) replies: occur at the start of a tweet.
|
|
41
|
+
class ARepliesB < TypedStruct.new(
    [:user_a_id,             Integer],
    [:user_b_id,             Integer],
    [:status_id,             Integer],
    [:in_reply_to_status_id, Integer]
  )
  include ModelCommon
  include RelationshipBase

  # The key is user_a-user_b-status_id: the first three fields.
  def num_key_fields
    3
  end

  # Names of the fields that hold numeric ids.
  def numeric_id_fields
    [:user_a_id, :user_b_id, :status_id, :in_reply_to_status_id]
  end
end
|
|
53
|
+
|
|
54
|
+
# Direct (threaded) replies, occurring at the start of a tweet, with the two
# users identified by name rather than by id.
#
# The name fields are declared String (they were declared Integer, unlike the
# String-typed +user_b_name+ of AAtsignsB and ARetweetsB).
class ARepliesBName < TypedStruct.new(
    [:user_a_name,           String],
    [:user_b_name,           String],
    [:status_id,             Integer],
    [:in_reply_to_status_id, Integer],
    [:user_a_sid,            Integer],
    [:user_b_sid,            Integer]
  )
  include ModelCommon
  include RelationshipBase

  # The key is user_a_name-user_b_name-status_id: the first three fields.
  def num_key_fields
    3
  end

  # Names of the fields that hold numeric ids. This struct has no
  # +user_a_id+ / +user_b_id+ fields, so they are not listed (the previous
  # list named them anyway).
  # NOTE(review): +user_a_sid+ / +user_b_sid+ are Integer-typed and may belong
  # here as well -- confirm what callers of numeric_id_fields expect.
  def numeric_id_fields
    [:status_id, :in_reply_to_status_id]
  end
end
|
|
69
|
+
|
|
70
|
+
# Atsign mentions anywhere in the tweet
|
|
71
|
+
# note we have no user_b_id for @foo
|
|
72
|
+
class AAtsignsB < TypedStruct.new(
    [:user_a_id,   Integer],
    [:user_b_name, String],
    [:status_id,   Integer]
  )
  include ModelCommon
  include RelationshipBase

  # The key is user_a_id-user_b_name-status_id: the first three fields.
  def num_key_fields
    3
  end

  # Names of the fields that hold numeric ids (user_b is known by name only).
  def numeric_id_fields
    [:user_a_id, :status_id]
  end
end
|
|
83
|
+
|
|
84
|
+
# Atsign mentions anywhere in the tweet, with the mentioned user identified
# by +user_b_id+ instead of by name.
class AAtsignsBId < TypedStruct.new(
    [:user_a_id, Integer],
    [:user_b_id, Integer],
    [:status_id, Integer]
  )
  include ModelCommon
  include RelationshipBase

  # The key is user_a-user_b-status_id: the first three fields.
  def num_key_fields
    3
  end

  # Names of the fields that hold numeric ids.
  def numeric_id_fields
    [:user_a_id, :user_b_id, :status_id]
  end
end
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
#
|
|
100
|
+
# A re-tweet is /sent/ by user_a, repeating an earlier message by user_b
|
|
101
|
+
# Any tweet containing text roughly similar to
|
|
102
|
+
# RT @user <stuff>
|
|
103
|
+
# with equivalently for RT: retweet, via, retweeting
|
|
104
|
+
#
|
|
105
|
+
# !!! OR !!!
|
|
106
|
+
#
|
|
107
|
+
# A retweet whore request, something like
|
|
108
|
+
# pls RT Hey lookit me
|
|
109
|
+
#
|
|
110
|
+
# We just pass along both in the same data structure; the heuristic is poor
|
|
111
|
+
# enough that we leave it to later steps to be clever. (Note retweets and
|
|
112
|
+
# non-retweet-whore-requests have user_b_name set and unset respectively.)
|
|
113
|
+
#
|
|
114
|
+
# +user_a_id:+ the user who sent the re-tweet
|
|
115
|
+
# +status_id:+ the id of the tweet *containing* the re-tweet (for the ID of the original tweet you're on your own.)
|
|
116
|
+
# +user_b_name:+ the user cited as originating: RT @user_b_name
|
|
117
|
+
# +please_flag:+ a 1 if the text contains 'please' or 'plz' as a stand-alone word
|
|
118
|
+
# +text:+ the *full* text of the tweet
|
|
119
|
+
#
|
|
120
|
+
class ARetweetsB < TypedStruct.new(
    [:user_a_id,   Integer],
    [:user_b_name, String],
    [:status_id,   Integer],
    [:please_flag, Integer],
    [:text,        String]
  )
  include ModelCommon
  include RelationshipBase

  # Builds the struct as usual, then normalizes +please_flag+ to 0 or 1 via
  # ModelCommon.unbooleanize.
  def initialize *args
    super(*args)
    self.please_flag = ModelCommon.unbooleanize(please_flag)
  end

  # The key is user_a_id-user_b_name-status_id: the first three fields.
  def num_key_fields
    3
  end

  # Names of the fields that hold numeric ids (user_b is known by name only).
  def numeric_id_fields
    [:user_a_id, :status_id]
  end

  #
  # A record naming an originating user is treated as a retweet; one with a
  # blank +user_b_name+ is not (it is a bare "please RT" request).
  #
  def is_retweet?
    return false if user_b_name.blank?
    true
  end
end
|
|
145
|
+
|
|
146
|
+
# Retweet record with the originating user identified by +user_b_id+ instead
# of by name; the remaining fields match ARetweetsB.
class ARetweetsBId < TypedStruct.new(
    [:user_a_id,   Integer],
    [:user_b_id,   Integer],
    [:status_id,   Integer],
    [:please_flag, Integer],
    [:text,        String]
  )
  include ModelCommon
  include RelationshipBase

  # Builds the struct as usual, then normalizes +please_flag+ to 0 or 1 via
  # ModelCommon.unbooleanize.
  def initialize *args
    super(*args)
    self.please_flag = ModelCommon.unbooleanize(please_flag)
  end

  # The key is user_a-user_b-status_id: the first three fields.
  def num_key_fields
    3
  end

  # Names of the fields that hold numeric ids.
  def numeric_id_fields
    [:user_a_id, :user_b_id, :status_id]
  end

  #
  # A record naming an originating user is treated as a retweet; one without
  # is not (it is a bare "please RT" request).
  #
  # This struct has no +user_b_name+ field (the previous body read it and
  # would raise NameError), so presence is judged on +user_b_id+. An id of 0
  # is treated as absent, the same test Tweet#replies applies to
  # in_reply_to_user_id.
  #
  def is_retweet?
    !(user_b_id.blank? || user_b_id.to_i == 0)
  end
end
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
end
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
|
|
2
|
+
# ===========================================================================
|
|
3
|
+
#
|
|
4
|
+
# tests = [
|
|
5
|
+
# 'http://foo.us/',
|
|
6
|
+
# 'http://foo.us/a',
|
|
7
|
+
# 'http://foo.us/a?q=a',
|
|
8
|
+
# 'http://foo.us/a#a',
|
|
9
|
+
# 'http://foo.us/a?q=a&b=3#a',
|
|
10
|
+
# 'http://foo.us/a;2/~m-_.\':%+@,;?q=a&b=3#a',
|
|
11
|
+
# 'http://foo.us/a?q=a?',
|
|
12
|
+
# 'http://foo.us/=a#a',
|
|
13
|
+
# 'http://foo.us/a&?q=a&b=3#a',
|
|
14
|
+
# 'http://foo.us/a;2/~m-_.\':%+@,;?q=a&b=3#a&',
|
|
15
|
+
# ]
|
|
16
|
+
# tests.each do |test_str|
|
|
17
|
+
# p test_str.scan(RE_URL)
|
|
18
|
+
# end
|
|
19
|
+
#
|
|
20
|
+
# atsign_tests = [
|
|
21
|
+
# '@foo hello',
|
|
22
|
+
# ' @foo @hello ',
|
|
23
|
+
# ' @foo, @hello ',
|
|
24
|
+
# '-@foo,@hello',
|
|
25
|
+
# '@foo@bar ',
|
|
26
|
+
# 'a basdf@foo b',
|
|
27
|
+
# 'http://@foo',
|
|
28
|
+
# 'foo@bar @bar@foo @zz+',
|
|
29
|
+
# ].each do |test_str|
|
|
30
|
+
# p test_str.scan(RE_ATSIGNS)
|
|
31
|
+
# end
|
|
32
|
+
#
|
|
33
|
+
# hash_tag_tests = [
|
|
34
|
+
# '#downtown',
|
|
35
|
+
# '#downtown?',
|
|
36
|
+
# '#downtown.',
|
|
37
|
+
# '#downtown]',
|
|
38
|
+
# '#downtown}',
|
|
39
|
+
# '#downtown)',
|
|
40
|
+
# '#downtown,',
|
|
41
|
+
# '#downtown;',
|
|
42
|
+
# '#downtown\'',
|
|
43
|
+
# '#downtown\'s',
|
|
44
|
+
# '#downtown_',
|
|
45
|
+
# '#down+town',
|
|
46
|
+
# '#down_town',
|
|
47
|
+
# '#down-town',
|
|
48
|
+
# '#www.downtown.com',
|
|
49
|
+
# '#www.downtown.com.',
|
|
50
|
+
# '##',
|
|
51
|
+
# '#.',
|
|
52
|
+
# '#taxonomy:binomial=Alcedo_atthis',
|
|
53
|
+
# '#geo:lat=52.478342',
|
|
54
|
+
# '#geo:lon=53.609130',
|
|
55
|
+
# 'a#www.downtown.com.',
|
|
56
|
+
# ' #www.downtown.com.',
|
|
57
|
+
# ' =#www.downtown.com.!',
|
|
58
|
+
# ].each do |test_str|
|
|
59
|
+
# p test_str.scan(RE_HASHTAGS)
|
|
60
|
+
# end
|
|
61
|
+
#
|
|
62
|
+
#
|
|
63
|
+
# # #downtown
|
|
64
|
+
# # #downtown
|
|
65
|
+
# # #downtown
|
|
66
|
+
# # #downtown
|
|
67
|
+
# # #downtown
|
|
68
|
+
# # #downtown
|
|
69
|
+
# # #downtown
|
|
70
|
+
# # #downtown
|
|
71
|
+
# # #downtown
|
|
72
|
+
# # #downtown
|
|
73
|
+
# # #downtown_
|
|
74
|
+
# # #down+town
|
|
75
|
+
# # #down_town
|
|
76
|
+
# # #down-town
|
|
77
|
+
# # #www.downtown.com
|
|
78
|
+
# # #www.downtown.com
|
|
79
|
+
# #
|
|
80
|
+
# #
|
|
81
|
+
# # #taxonomy:binomial=Alcedo_atthis
|
|
82
|
+
# # #geo:lat=52.478342
|
|
83
|
+
# # #geo:lon=53.609130
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
require 'wuclan/grok/tweet_regexes'
|
|
2
|
+
require 'wuclan/models'
|
|
3
|
+
require 'wukong/encoding'
|
|
4
|
+
include Wuclan::Grok::TweetRegexes
|
|
5
|
+
include Wuclan::Models
|
|
6
|
+
|
|
7
|
+
Tweet.class_eval do
|
|
8
|
+
|
|
9
|
+
#
|
|
10
|
+
#
|
|
11
|
+
#
|
|
12
|
+
def tweet_len
  # Length of the tweet's decoded text.
  decoded = decoded_text
  decoded.length
end
|
|
15
|
+
|
|
16
|
+
#
|
|
17
|
+
# A direct (threaded) reply: an ARepliesB is built only when the tweet has a
# non-blank, non-zero in_reply_to_user_id; otherwise the result is nil.
|
|
19
|
+
#
|
|
20
|
+
def replies
  # No reply record when the replied-to user id is blank or zero.
  return if in_reply_to_user_id.blank? || in_reply_to_user_id.to_i == 0
  ARepliesB.new(twitter_user_id, in_reply_to_user_id, self.id, in_reply_to_status_id)
end
|
|
25
|
+
|
|
26
|
+
#
|
|
27
|
+
# Any mention of another user, whether at the beginning of a line (and thus
|
|
28
|
+
# *also* an ARepliesB), a retweet, or just somewhere in the body of the text
|
|
29
|
+
#
|
|
30
|
+
def atsigns
  # One AAtsignsB per RE_ATSIGNS match in the decoded text; the first capture
  # of each match is the mentioned user's name.
  decoded_text.scan(RE_ATSIGNS).map do |captures|
    mentioned = captures.first.wukong_encode
    AAtsignsB.new(twitter_user_id, mentioned, self.id)
  end
end
|
|
37
|
+
|
|
38
|
+
#
|
|
39
|
+
# Remember that a retweet could be an actual retweet, a retweet whore request,
|
|
40
|
+
# or a retweet of a retweet whore request.
|
|
41
|
+
#
|
|
42
|
+
# Or, it could have just fooled us.
|
|
43
|
+
#
|
|
44
|
+
# Anyway you can take it from here.
|
|
45
|
+
#
|
|
46
|
+
def retweets
  plea_match    = RE_RTWHORE.match(decoded_text)
  retweet_match = RE_RETWEET.match(decoded_text)
  # Neither pattern matched: nothing to report.
  return if !plea_match && !retweet_match
  # user_b_name stays nil when only the "please RT" pattern matched.
  cited = retweet_match ? retweet_match.captures.first.wukong_encode : nil
  # The raw match (or nil) is passed as please_flag; ARetweetsB#initialize
  # normalizes it through ModelCommon.unbooleanize.
  ARetweetsB.new(twitter_user_id, cited, self.id, plea_match, self.text)
end
|
|
53
|
+
|
|
54
|
+
#
|
|
55
|
+
# Hashtags indicate a topic: #hashtag
|
|
56
|
+
#
|
|
57
|
+
def hashtags
  # One Hashtag per RE_HASHTAGS match in the decoded text; the first capture
  # of each match is the tag text.
  decoded_text.scan(RE_HASHTAGS).map do |captures|
    tag = captures.first.wukong_encode
    Hashtag.new(tag, self.id, twitter_user_id)
  end
end
|
|
64
|
+
|
|
65
|
+
#
|
|
66
|
+
# URLs within a tweet.
|
|
67
|
+
# can be multiple per tweet.
|
|
68
|
+
#
|
|
69
|
+
# Uses a regexp more selective than all canonically allowed - see
|
|
70
|
+
# tweet_regexes
|
|
71
|
+
#
|
|
72
|
+
def tweet_urls
  # One TweetUrl per RE_URL match in the decoded text; the first capture of
  # each match is the URL text.
  decoded_text.scan(RE_URL).map do |captures|
    url = captures.first.wukong_encode
    TweetUrl.new(url, self.id, twitter_user_id)
  end
end
|
|
79
|
+
|
|
80
|
+
def text_elements
  # Gathers replies, atsigns, retweets, hashtags and tweet_urls into one flat
  # list. replies and retweets may each be nil; those are dropped by compact
  # before the per-type arrays are flattened together.
  elements = [replies, atsigns, retweets, hashtags, tweet_urls]
  elements.compact.flatten
end
|
|
95
|
+
|
|
96
|
+
end
|