twitter_to_csv 0.0.5 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +0 -2
- data/.rvmrc +1 -1
- data/Gemfile +1 -3
- data/README.markdown +34 -4
- data/bin/twitter_to_csv +60 -11
- data/lib/twitter_to_csv/afinn/AFINN-111.txt +2478 -0
- data/lib/twitter_to_csv/afinn/AFINN-96.txt +1480 -0
- data/lib/twitter_to_csv/afinn/AFINN-README.txt +43 -0
- data/lib/twitter_to_csv/csv_builder.rb +137 -17
- data/lib/twitter_to_csv/twitter_watcher.rb +24 -16
- data/lib/twitter_to_csv/version.rb +1 -1
- data/spec/csv_builder_spec.rb +253 -9
- data/twitter_to_csv.gemspec +4 -3
- metadata +53 -13
@@ -0,0 +1,43 @@
|
|
1
|
+
AFINN is a list of English words rated for valence with an integer
|
2
|
+
between minus five (negative) and plus five (positive). The words have
|
3
|
+
been manually labeled by Finn Årup Nielsen in 2009-2011. The file
|
4
|
+
is tab-separated. There are two versions:
|
5
|
+
|
6
|
+
AFINN-111: Newest version with 2477 words and phrases.
|
7
|
+
|
8
|
+
AFINN-96: 1468 unique words and phrases on 1480 lines. Note that there
|
9
|
+
are 1480 lines, as some words are listed twice. The word list is not
|
10
|
+
entirely in alphabetic ordering.
|
11
|
+
|
12
|
+
An evaluation of the word list is available in:
|
13
|
+
|
14
|
+
Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
|
15
|
+
sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903
|
16
|
+
|
17
|
+
The list was used in:
|
18
|
+
|
19
|
+
Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni,
|
20
|
+
Michael Etter, "Good Friends, Bad News - Affect and Virality in
|
21
|
+
Twitter", The 2011 International Workshop on Social Computing,
|
22
|
+
Network, and Services (SocialComNet 2011).
|
23
|
+
|
24
|
+
|
25
|
+
This database of words is copyright protected and distributed under
|
26
|
+
"Open Database License (ODbL) v1.0"
|
27
|
+
http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar
|
28
|
+
copyleft license.
|
29
|
+
|
30
|
+
See comments on the word list here:
|
31
|
+
http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis
|
32
|
+
|
33
|
+
|
34
|
+
In Python the file may be read into a dictionary with:
|
35
|
+
|
36
|
+
>>> afinn = dict(map(lambda (k,v): (k,int(v)),
|
37
|
+
[ line.split('\t') for line in open("AFINN-111.txt") ]))
|
38
|
+
>>> afinn["Good".lower()]
|
39
|
+
3
|
40
|
+
>>> sum(map(lambda word: afinn.get(word, 0), "Rainy day but still in a good mood".lower().split()))
|
41
|
+
2
|
42
|
+
|
43
|
+
|
@@ -1,17 +1,18 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
require 'pp'
|
3
|
+
require 'elif'
|
4
|
+
require 'time'
|
3
5
|
|
4
6
|
module TwitterToCsv
|
5
7
|
class CsvBuilder
|
6
8
|
attr_accessor :options, :sampled_fields
|
7
9
|
|
8
|
-
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
|
9
|
-
URL_REGEX = %r"\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s\`\!\(\)\[\]\{\};:'\".,<>\?«»“”‘’]))"i
|
10
|
-
|
11
10
|
# Creates a builder holding the caller's option hash and empty bookkeeping state.
#
# options - Hash of settings that the other methods read through `options`
#           (defaults to an empty Hash).
def initialize(options = {})
  @options = options

  # Retweet bookkeeping used by the rollup mode; both hashes start empty.
  @retweet_hour_counts = {}
  @retweet_counts = {}

  # Presumably consumed by sample_fields (its body is not visible here).
  @num_samples = 0
  @sampled_fields = {}
end
|
16
17
|
|
17
18
|
def run(&block)
|
@@ -35,22 +36,79 @@ module TwitterToCsv
|
|
35
36
|
end
|
36
37
|
end
|
37
38
|
|
39
|
+
# Tells whether a status falls inside the optional :start / :end window.
#
# status - Hash whose 'created_at' entry is either a Time or a String that
#          Time.parse understands.
#
# Returns true when neither bound is configured (the timestamp is not even
# parsed in that case). Otherwise :start is inclusive and :end is exclusive.
def within_time_window?(status)
  return true unless options[:start] || options[:end]

  raw = status['created_at']
  created_at = raw.is_a?(Time) ? raw : Time.parse(raw)

  too_early = options[:start] && created_at < options[:start]
  return false if too_early

  too_late = options[:end] && created_at >= options[:end]
  return false if too_late

  true
end
|
47
|
+
|
48
|
+
# Decides whether a status should be emitted while retweets are being rolled
# up into their original status, and records retweet counts along the way.
#
# status - Hash for one status. 'created_at' (and the nested
#          'retweeted_status'/'created_at') may be a Time or a parseable String.
#
# Retweets (statuses carrying a 'retweeted_status' with an 'id') are never
# displayed; they only update @retweet_counts and, when :retweet_counts_at is
# set, the per-hour-mark maxima in @retweet_hour_counts.
#
# An original status is displayed when its recorded retweet count reaches
# :retweet_threshold and, if :retweet_window (in days) is set, it is at least
# that much older than the first status this builder saw. Before returning
# true the status is updated in place: 'retweet_count' is raised to the
# recorded maximum and '_retweet_hour_counts' is filled in.
#
# Returns true or false.
def display_rolledup_status?(status)
  to_time = lambda { |value| value.is_a?(Time) ? value : Time.parse(value) }

  created_at = to_time.call(status['created_at'])
  # Only ever set once: the first status seen is the reference point.
  @newest_status_at = created_at if @newest_status_at.nil?

  window_seconds = options[:retweet_window] && options[:retweet_window] * 60 * 60 * 24
  hour_marks = options[:retweet_counts_at]
  retweeted = status['retweeted_status']

  if retweeted && retweeted['id']
    # This is a retweet.
    original_id = retweeted['id']
    original_created_at = to_time.call(retweeted['created_at'])
    # to_i keeps a missing or non-Integer count from blowing up the comparisons.
    reported = retweeted['retweet_count'].to_i

    if !window_seconds || created_at <= original_created_at + window_seconds
      @retweet_counts[original_id] = [@retweet_counts[original_id] || 0, reported].max

      if hour_marks
        per_mark = (@retweet_hour_counts[original_id] ||= hour_marks.map { 0 })
        hour_marks.each_with_index do |hour_mark, index|
          next unless created_at <= original_created_at + hour_mark * 60 * 60
          per_mark[index] = reported if reported > per_mark[index]
        end
      end
    end

    return false
  end

  # This is an original status.
  recorded = @retweet_counts[status['id']]
  return false unless (recorded || 0) >= (options[:retweet_threshold] || 0)
  return false if window_seconds && created_at > @newest_status_at - window_seconds

  status['retweet_count'] = recorded if recorded && recorded > status['retweet_count'].to_i
  if hour_marks
    # A status nobody retweeted has no entry; fall back to zeros so the CSV
    # row can still append one value per hour mark instead of hitting nil.
    status['_retweet_hour_counts'] = @retweet_hour_counts.delete(status['id']) || hour_marks.map { 0 }
  end
  true
end
|
85
|
+
|
38
86
|
# Runs one status through the filters and, if it passes, through every
# configured output.
#
# status - Hash for one status.
# block  - optional block handed on to yield_status.
#
# Filters are checked in order: time window, English-only (when
# :require_english is set), rollup display (when :retweet_mode is :rollup).
# Always returns nil.
def handle_status(status, &block)
  return unless within_time_window?(status)
  return if options[:require_english] && !is_english?(status)
  return if options[:retweet_mode] == :rollup && !display_rolledup_status?(status)

  log_json(status) if options[:json]
  log_csv(status) if options[:csv]
  yield_status(status, &block) if block
  sample_fields(status) if options[:sample_fields]

  min_gap = options[:analyze_gaps]
  analyze_gaps(status, options[:analyze_gaps]) if min_gap

  if options[:verbose]
    STDERR.puts "Logging: #{status['text']}"
  end
end
|
47
100
|
|
48
101
|
# Writes the CSV header row to options[:csv].
#
# The row starts with a copy of options[:fields] and then appends, in this
# order, the optional column groups: sentiment, word count, one column per
# :retweet_counts_at hour mark, then the numbered url / hash_tag /
# user_mention columns. Each numbered group is governed only by its own
# *_columns option, so asking for hashtag columns without url columns works.
def log_csv_header
  header_labels = options[:fields].dup

  header_labels += ["average_sentiment", "sentiment_words"] if options[:compute_sentiment]
  header_labels << "word_count" if options[:compute_word_count]

  (options[:retweet_counts_at] || []).each do |hours|
    header_labels << "retweets_at_#{hours}_hours"
  end

  numbered_groups = [
    [:url_columns, "url"],
    [:hashtag_columns, "hash_tag"],
    [:user_mention_columns, "user_mention"]
  ]
  numbered_groups.each do |option_key, prefix|
    column_count = options[option_key]
    next unless column_count && column_count > 0

    column_count.times { |i| header_labels << "#{prefix}_#{i + 1}" }
  end

  options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
end
|
@@ -70,22 +128,84 @@ module TwitterToCsv
|
|
70
128
|
}.to_s
|
71
129
|
end
|
72
130
|
|
131
|
+
row += compute_sentiment(status["text"]) if options[:compute_sentiment]
|
132
|
+
|
133
|
+
row << status["text"].split(/\s+/).length if options[:compute_word_count]
|
134
|
+
|
135
|
+
row += status["_retweet_hour_counts"] if options[:retweet_counts_at]
|
136
|
+
|
73
137
|
if options[:url_columns] && options[:url_columns] > 0
|
74
|
-
urls = status[
|
138
|
+
urls = (status["entities"] && (status["entities"]["urls"] || []).map {|i| i["expanded_url"] || i["url"] }) || []
|
75
139
|
options[:url_columns].times { |i| row << urls[i].to_s }
|
76
140
|
end
|
77
141
|
|
142
|
+
if options[:hashtag_columns] && options[:hashtag_columns] > 0
|
143
|
+
hashes = (status["entities"] && (status["entities"]["hashtags"] || []).map {|i| i["text"] }) || []
|
144
|
+
options[:hashtag_columns].times { |i| row << hashes[i].to_s }
|
145
|
+
end
|
146
|
+
|
147
|
+
if options[:user_mention_columns] && options[:user_mention_columns] > 0
|
148
|
+
users = (status["entities"] && (status["entities"]["user_mentions"] || []).map {|i| i["screen_name"] }) || []
|
149
|
+
options[:user_mention_columns].times { |i| row << users[i].to_s }
|
150
|
+
end
|
151
|
+
|
78
152
|
row
|
79
153
|
end
|
80
154
|
|
155
|
+
# Loads (once) the AFINN-111 word list that sits in the "afinn" directory
# next to this file and turns it into matchable entries.
#
# Each tab-separated line becomes [regexp, pattern_length, valence]:
# hyphens in the word or phrase are replaced by spaces and apostrophes are
# dropped before the text is regexp-escaped and wrapped in word boundaries.
#
# Returns the Array of entries ordered by descending pattern length, so
# longer phrases are tried before the shorter words they contain. The result
# is memoized in @afinn_cache.
def afinn
  @afinn_cache ||= begin
    lexicon_path = File.expand_path(File.join(File.dirname(__FILE__), "afinn", "AFINN-111.txt"))

    entries = File.read(lexicon_path).each_line.map do |line|
      word_or_phrase, valence = line.split(/\t/)
      pattern = Regexp::escape word_or_phrase.gsub(/-/, " ").gsub(/'/, '')
      [/\b#{pattern}\b/i, pattern.length, valence.to_f]
    end

    entries.sort { |first, second| second[1] <=> first[1] }
  end
end
|
166
|
+
|
167
|
+
# Scores a piece of text against the entries returned by `afinn`.
#
# original_text - String to score. It is lower-cased, apostrophes are removed,
#                 every other character outside a-z0-9 becomes a space, and
#                 whitespace runs are collapsed before matching.
#
# Every occurrence of an entry's pattern is removed from the working text as
# it is counted, so text claimed by an earlier entry cannot match again.
#
# Returns [average_valence, match_count], or [0, 0] when nothing matched.
def compute_sentiment(original_text)
  without_apostrophes = original_text.downcase.gsub(/'/, '')
  text = without_apostrophes.gsub(/[^a-z0-9]/, ' ').gsub(/\s+/, ' ').strip

  matches = 0
  total = 0
  afinn.each do |pattern, _length, valence|
    # sub! returns nil as soon as no occurrence is left.
    while text.sub!(pattern, '')
      total += valence
      matches += 1
    end
  end

  return [0, 0] unless matches > 0

  [total / matches.to_f, matches]
end
|
184
|
+
|
81
185
|
# Feeds every status stored in a file (one JSON document per line) through
# handle_status.
#
# filename - path of the file to replay.
# block    - optional block forwarded to handle_status.
#
# Separator lines are skipped. When any retweet mode is configured the file is
# opened with Elif instead of File, which (per the Elif gem) reads it
# backwards. After the replay, the last seen timestamp is printed if gap
# analysis was requested and a status was seen. Returns nil.
def replay_from(filename, &block)
  # If a retweet mode is being used, we read the file backwards using the Elif gem.
  reader = options[:retweet_mode] ? Elif : File

  reader.open(filename, "r") do |file|
    file.each do |line|
      unless line =~ /\A------SEP.RATOR------\Z/i
        handle_status(JSON.parse(line), &block)
      end
    end
  end

  if options[:analyze_gaps] && @last_status_seen_at
    puts "Last status seen at #{@last_status_seen_at}."
  end
end
|
197
|
+
|
198
|
+
# Reports silent stretches between consecutive statuses on standard output.
#
# status                  - Hash whose 'created_at' is a Time or a String that
#                           Time.parse understands (the same two forms the
#                           other status-handling methods accept).
# min_gap_size_in_minutes - gaps longer than this many minutes are printed.
#
# The first call only announces the first timestamp. Every call remembers the
# status time in @last_status_seen_at for the next comparison.
#
# NOTE(review): the gap is computed as current minus previous, so statuses
# arriving in reverse chronological order yield negative gaps and are never
# reported - confirm whether that is intended for backwards replays.
def analyze_gaps(status, min_gap_size_in_minutes)
  created_at = status['created_at']
  # Passing a Time to Time.parse raises TypeError, so only parse Strings.
  time = created_at.is_a?(Time) ? created_at : Time.parse(created_at)

  if !@last_status_seen_at
    puts "First status seen at #{time}."
  else
    gap_length = (time - @last_status_seen_at) / 60
    if gap_length > min_gap_size_in_minutes
      puts "Gap of #{gap_length.to_i} minutes from #{@last_status_seen_at} to #{time}."
    end
  end
  @last_status_seen_at = time
end
|
90
210
|
|
91
211
|
def sample_fields(status)
|
@@ -142,4 +262,4 @@ module TwitterToCsv
|
|
142
262
|
true
|
143
263
|
end
|
144
264
|
end
|
145
|
-
end
|
265
|
+
end
|
@@ -17,25 +17,33 @@ module TwitterToCsv
|
|
17
17
|
end
|
18
18
|
|
19
19
|
# Connects to the Twitter streaming API and hands every received item to
# handle_status, reconnecting in an endless loop.
#
# block - passed to handle_status as its second argument for each item.
#
# The filter endpoint (with a ?track= list) is used when `filter` is
# non-empty, the sample endpoint otherwise. When the stream reports no data
# or gives up reconnecting, the event loop is stopped, which lets the
# surrounding loop start a fresh connection. Stream errors are only logged.
def run(&block)
  while true
    EventMachine::run do
      stream = Twitter::JSONStream.connect(
        :path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.join(",")}" if filter && filter.length > 0}",
        :auth => "#{username}:#{password}",
        :ssl => true
      )

      stream.each_item { |item| handle_status JSON.parse(item), block }

      stream.on_error { |message| STDERR.puts " --> Twitter error: #{message} <--" }

      stream.on_no_data do |_message|
        STDERR.puts " --> Got no data for awhile; trying to reconnect."
        EventMachine::stop_event_loop
      end

      stream.on_max_reconnects do |_timeout, _retries|
        STDERR.puts " --> Oops, tried too many times! <--"
        EventMachine::stop_event_loop
      end
    end

    # Reached only after the event loop above has been stopped.
    puts " --> Reconnecting..."
  end
end
|
41
49
|
|
data/spec/csv_builder_spec.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
require 'spec_helper'
|
3
|
+
require 'time'
|
3
4
|
|
4
5
|
describe TwitterToCsv::CsvBuilder do
|
5
6
|
describe "#handle_status" do
|
@@ -15,6 +16,23 @@ describe TwitterToCsv::CsvBuilder do
|
|
15
16
|
string_io.rewind
|
16
17
|
string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
|
17
18
|
end
|
19
|
+
|
20
|
+
it "honors start_time and end_time" do
|
21
|
+
string_io = StringIO.new
|
22
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
|
23
|
+
:start => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
|
24
|
+
:end => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
|
25
|
+
|
26
|
+
# Order shouldn't matter
|
27
|
+
csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
|
28
|
+
csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
|
29
|
+
csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
|
30
|
+
csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
|
31
|
+
csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
|
32
|
+
csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
|
33
|
+
string_io.rewind
|
34
|
+
string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
|
35
|
+
end
|
18
36
|
end
|
19
37
|
|
20
38
|
describe "log_csv_header" do
|
@@ -33,10 +51,23 @@ describe TwitterToCsv::CsvBuilder do
|
|
33
51
|
string_io.rewind
|
34
52
|
string_io.read.should == '"something","url_1","url_2"' + "\n"
|
35
53
|
end
|
54
|
+
|
55
|
+
it "includes columns for the retweet_counts_at entries, if present" do
|
56
|
+
string_io = StringIO.new
|
57
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io,
|
58
|
+
:fields => %w[something],
|
59
|
+
:retweet_mode => :rollup,
|
60
|
+
:retweet_threshold => 1,
|
61
|
+
:retweet_window => 4,
|
62
|
+
:retweet_counts_at => [0.5, 24, 48])
|
63
|
+
csv_builder.log_csv_header
|
64
|
+
string_io.rewind
|
65
|
+
string_io.read.should == '"something","retweets_at_0.5_hours","retweets_at_24_hours","retweets_at_48_hours"' + "\n"
|
66
|
+
end
|
36
67
|
end
|
37
68
|
|
38
69
|
describe "logging to a CSV" do
|
39
|
-
it "outputs the requested fields when
|
70
|
+
it "outputs the requested fields when specified in dot-notation" do
|
40
71
|
string_io = StringIO.new
|
41
72
|
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something something_else.a something_else.c.d])
|
42
73
|
csv_builder.handle_status({
|
@@ -54,20 +85,233 @@ describe TwitterToCsv::CsvBuilder do
|
|
54
85
|
string_io.read.should == "\"hello\",\"b\",\"foo\"\n"
|
55
86
|
end
|
56
87
|
|
57
|
-
it "can extract URLs" do
|
88
|
+
it "can extract URLs, hashtags, and user mentions" do
|
58
89
|
string_io = StringIO.new
|
59
|
-
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2)
|
90
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2, :hashtag_columns => 2, :user_mention_columns => 1)
|
60
91
|
csv_builder.handle_status({
|
61
|
-
'something' => "
|
62
|
-
|
92
|
+
'something' => "hello1",
|
93
|
+
"entities" => {
|
94
|
+
"hashtags" => [
|
95
|
+
{ "text" => "AHashTag" },
|
96
|
+
{ "text" => "AnotherHashTag" },
|
97
|
+
{ "text" => "AThirdHashTag" }
|
98
|
+
],
|
99
|
+
"user_mentions" => [
|
100
|
+
{ "screen_name" => "ScreenNameOne" },
|
101
|
+
{ "screen_name" => "ScreenNameTwo" },
|
102
|
+
{ "screen_name" => "ScreenNameThree" }
|
103
|
+
],
|
104
|
+
"urls" => [
|
105
|
+
{ "url" => "http://t.co/1231" },
|
106
|
+
{ "url" => "http://t.co/1232", "expanded_url" => "http://a.real.url2" },
|
107
|
+
{ "url" => "http://t.co/1233", "expanded_url" => "http://a.real.url3" }
|
108
|
+
]
|
109
|
+
},
|
110
|
+
'text' => 'some text'
|
111
|
+
|
63
112
|
})
|
64
113
|
csv_builder.handle_status({
|
65
|
-
'something' => "
|
66
|
-
|
114
|
+
'something' => "hello2",
|
115
|
+
"entities" => {
|
116
|
+
"hashtags" => [],
|
117
|
+
"user_mentions" => [],
|
118
|
+
"urls" => []
|
119
|
+
},
|
120
|
+
'text' => 'this is another status'
|
121
|
+
})
|
122
|
+
string_io.rewind
|
123
|
+
string_io.read.should == "\"hello1\",\"http://t.co/1231\",\"http://a.real.url2\",\"AHashTag\",\"AnotherHashTag\",\"ScreenNameOne\"\n" +
|
124
|
+
"\"hello2\",\"\",\"\",\"\",\"\",\"\"\n"
|
125
|
+
end
|
126
|
+
|
127
|
+
it "can compute the average sentiment" do
|
128
|
+
string_io = StringIO.new
|
129
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :compute_sentiment => true)
|
130
|
+
csv_builder.handle_status({
|
131
|
+
'something' => "hello1",
|
132
|
+
'text' => 'i love cheese'
|
133
|
+
|
134
|
+
})
|
135
|
+
csv_builder.handle_status({
|
136
|
+
'something' => "hello2",
|
137
|
+
'text' => 'i love cheese and like bread'
|
138
|
+
})
|
139
|
+
csv_builder.handle_status({
|
140
|
+
'something' => "hello3",
|
141
|
+
'text' => 'some kind of once-in-a-lifetime cool-fest in the right direction or the right-direction or the son_of a bitch' # it tries both hyphenated and non-hyphenated, and does phrases
|
67
142
|
})
|
68
143
|
string_io.rewind
|
69
|
-
string_io.read.should == "\"
|
70
|
-
"\"
|
144
|
+
string_io.read.should == "\"hello1\",\"3.0\",\"1\"\n" +
|
145
|
+
"\"hello2\",\"#{(3 + 2) / 2.0}\",\"2\"\n" +
|
146
|
+
"\"hello3\",\"#{(0 + 3 + 1 + 3 + 3 + -5) / 6.0}\",\"6\"\n"
|
147
|
+
end
|
148
|
+
|
149
|
+
it "can compute word count" do
|
150
|
+
string_io = StringIO.new
|
151
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :compute_word_count => true)
|
152
|
+
csv_builder.handle_status({
|
153
|
+
'something' => "hello1",
|
154
|
+
'text' => 'i love cheese'
|
155
|
+
|
156
|
+
})
|
157
|
+
csv_builder.handle_status({
|
158
|
+
'something' => "hello2",
|
159
|
+
'text' => 'foo_bar baz9bing'
|
160
|
+
})
|
161
|
+
string_io.rewind
|
162
|
+
string_io.read.should == "\"hello1\",\"3\"\n" +
|
163
|
+
"\"hello2\",\"2\"\n"
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
describe "retweet handling" do
|
168
|
+
# Replays a fixed, newest-first timeline of retweets and original statuses
# into the builder, one handle_status call per status.
#
# builder - object responding to handle_status(status_hash).
#
# Timestamps are expressed relative to Time.now. Returns whatever the final
# handle_status call returns.
def play_data(builder)
  days = 60 * 60 * 24
  now = Time.now
  ago = lambda { |count| now - count * days }

  # A retweet seen at `seen_at` of original `id`, which was posted at `posted_at`.
  retweet = lambda do |seen_at, id, posted_at, retweets, text|
    {
      'created_at' => seen_at,
      'retweeted_status' => {
        'id' => id,
        'created_at' => posted_at,
        'retweet_count' => retweets
      },
      'text' => text
    }
  end

  # An original status; its own retweet_count is always reported as 0.
  original = lambda do |id, posted_at, text|
    {
      'id' => id,
      'created_at' => posted_at,
      'text' => text,
      'retweet_count' => 0
    }
  end

  timeline = [
    retweet.call(now, 3, ago[1], 1, 'RT not enough time has passed'),
    original.call(3, ago[1], 'not enough time has passed'),
    retweet.call(ago[1], 2, ago[4], 3, 'RT 2 retweets'),
    retweet.call(ago[2], 4, ago[5], 1, 'RT 1 retweet'),
    retweet.call(ago[3], 2, ago[4], 2, 'RT 2 retweets'),
    retweet.call(ago[3.99], 2, ago[4], 1, 'RT 2 retweets'),
    original.call(2, ago[4], '2 retweets'),
    original.call(4, ago[5], '1 retweet'),
    original.call(5, ago[5.1], 'no retweets')
  ]

  timeline.map { |status| builder.handle_status(status) }.last
end
|
250
|
+
|
251
|
+
it "skips statuses with fewer than :retweet_threshold retweets and ignores statues that haven't been seen for retweet_window yet" do
|
252
|
+
string_io = StringIO.new
|
253
|
+
builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
|
254
|
+
:retweet_threshold => 2,
|
255
|
+
:retweet_window => 2,
|
256
|
+
:csv => string_io,
|
257
|
+
:fields => %w[id retweet_count])
|
258
|
+
play_data builder
|
259
|
+
string_io.rewind
|
260
|
+
string_io.read.should == "\"2\",\"2\"\n"
|
261
|
+
|
262
|
+
string_io = StringIO.new
|
263
|
+
builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
|
264
|
+
:retweet_threshold => 1,
|
265
|
+
:retweet_window => 3,
|
266
|
+
:csv => string_io,
|
267
|
+
:fields => %w[id retweet_count])
|
268
|
+
play_data builder
|
269
|
+
string_io.rewind
|
270
|
+
string_io.read.should == "\"2\",\"3\"\n" + "\"4\",\"1\"\n"
|
271
|
+
|
272
|
+
string_io = StringIO.new
|
273
|
+
builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
|
274
|
+
:retweet_threshold => 1,
|
275
|
+
:retweet_window => 20,
|
276
|
+
:csv => string_io,
|
277
|
+
:fields => %w[id retweet_count])
|
278
|
+
play_data builder
|
279
|
+
string_io.rewind
|
280
|
+
string_io.read.should == ""
|
281
|
+
|
282
|
+
string_io = StringIO.new
|
283
|
+
builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
|
284
|
+
:retweet_threshold => 1,
|
285
|
+
:retweet_window => nil,
|
286
|
+
:csv => string_io,
|
287
|
+
:fields => %w[id retweet_count])
|
288
|
+
play_data builder
|
289
|
+
string_io.rewind
|
290
|
+
string_io.read.should == "\"3\",\"1\"\n\"2\",\"3\"\n\"4\",\"1\"\n"
|
291
|
+
|
292
|
+
string_io = StringIO.new
|
293
|
+
builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
|
294
|
+
:retweet_threshold => 0,
|
295
|
+
:retweet_window => nil,
|
296
|
+
:csv => string_io,
|
297
|
+
:fields => %w[id retweet_count])
|
298
|
+
play_data builder
|
299
|
+
string_io.rewind
|
300
|
+
string_io.read.should == "\"3\",\"1\"\n\"2\",\"3\"\n\"4\",\"1\"\n\"5\",\"0\"\n"
|
301
|
+
end
|
302
|
+
|
303
|
+
it "logs at the hourly marks requested in retweet_counts_at" do
|
304
|
+
string_io = StringIO.new
|
305
|
+
builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
|
306
|
+
:retweet_threshold => 1,
|
307
|
+
:retweet_window => 4,
|
308
|
+
:retweet_counts_at => [0.5, 23, 24, 48, 73, 1000],
|
309
|
+
:csv => string_io,
|
310
|
+
:fields => %w[id retweet_count])
|
311
|
+
play_data builder
|
312
|
+
string_io.rewind
|
313
|
+
string_io.read.should == "\"2\",\"3\",\"1\",\"1\",\"2\",\"2\",\"3\",\"3\"\n" +
|
314
|
+
"\"4\",\"1\",\"0\",\"0\",\"0\",\"0\",\"1\",\"1\"\n"
|
71
315
|
end
|
72
316
|
end
|
73
317
|
end
|