twitter_to_csv 0.0.5 → 0.1.0
This diff shows the changes between publicly available versions of the package as they were released to their public registry. It is provided for informational purposes only.
- data/.gitignore +0 -2
- data/.rvmrc +1 -1
- data/Gemfile +1 -3
- data/README.markdown +34 -4
- data/bin/twitter_to_csv +60 -11
- data/lib/twitter_to_csv/afinn/AFINN-111.txt +2478 -0
- data/lib/twitter_to_csv/afinn/AFINN-96.txt +1480 -0
- data/lib/twitter_to_csv/afinn/AFINN-README.txt +43 -0
- data/lib/twitter_to_csv/csv_builder.rb +137 -17
- data/lib/twitter_to_csv/twitter_watcher.rb +24 -16
- data/lib/twitter_to_csv/version.rb +1 -1
- data/spec/csv_builder_spec.rb +253 -9
- data/twitter_to_csv.gemspec +4 -3
- metadata +53 -13
data/lib/twitter_to_csv/afinn/AFINN-README.txt ADDED

@@ -0,0 +1,43 @@
+AFINN is a list of English words rated for valence with an integer
+between minus five (negative) and plus five (positive). The words have
+been manually labeled by Finn Årup Nielsen in 2009-2011. The file
+is tab-separated. There are two versions:
+
+AFINN-111: Newest version with 2477 words and phrases.
+
+AFINN-96: 1468 unique words and phrases on 1480 lines. Note that there
+are 1480 lines, as some words are listed twice. The word list in not
+entirely in alphabetic ordering.
+
+An evaluation of the word list is available in:
+
+Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
+sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903
+
+The list was used in:
+
+Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni,
+Michael Etter, "Good Friends, Bad News - Affect and Virality in
+Twitter", The 2011 International Workshop on Social Computing,
+Network, and Services (SocialComNet 2011).
+
+
+This database of words is copyright protected and distributed under
+"Open Database License (ODbL) v1.0"
+http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar
+copyleft license.
+
+See comments on the word list here:
+http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis
+
+
+In Python the file may be read into a dictionary with:
+
+>>> afinn = dict(map(lambda (k,v): (k,int(v)),
+                     [ line.split('\t') for line in open("AFINN-111.txt") ]))
+>>> afinn["Good".lower()]
+3
+>>> sum(map(lambda word: afinn.get(word, 0), "Rainy day but still in a good mood".lower().split()))
+2
+
+
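For comparison, the same tab-separated file can be read in Ruby, the language the rest of this gem is written in. A minimal sketch, assuming AFINN-111.txt sits in the working directory (this is an illustration, not part of the gem):

    # Build a Hash mapping each word or phrase to its integer valence.
    afinn = File.readlines("AFINN-111.txt").each_with_object({}) do |line, scores|
      word_or_phrase, valence = line.chomp.split("\t")
      scores[word_or_phrase] = valence.to_i
    end

    afinn["Good".downcase]
    # => 3
    "Rainy day but still in a good mood".downcase.split.sum { |word| afinn.fetch(word, 0) }
    # => 2

As in the README's Python example, the per-word lookup simply ignores keys that are multi-word phrases.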
data/lib/twitter_to_csv/csv_builder.rb CHANGED

@@ -1,17 +1,18 @@
 # encoding: UTF-8
 require 'pp'
+require 'elif'
+require 'time'
 
 module TwitterToCsv
   class CsvBuilder
     attr_accessor :options, :sampled_fields
 
-    # http://daringfireball.net/2010/07/improved_regex_for_matching_urls
-    URL_REGEX = %r"\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s\`\!\(\)\[\]\{\};:'\".,<>\?«»“”‘’]))"i
-
     def initialize(options = {})
       @options = options
       @sampled_fields = {}
       @num_samples = 0
+      @retweet_counts = {}
+      @retweet_hour_counts = {}
     end
 
     def run(&block)
@@ -35,22 +36,79 @@ module TwitterToCsv
       end
     end
 
+    def within_time_window?(status)
+      if options[:start] || options[:end]
+        created_at = status['created_at'].is_a?(Time) ? status['created_at'] : Time.parse(status['created_at'])
+        return false if options[:start] && created_at < options[:start]
+        return false if options[:end] && created_at >= options[:end]
+      end
+      true
+    end
+
+    def display_rolledup_status?(status)
+      created_at = status['created_at'].is_a?(Time) ? status['created_at'] : Time.parse(status['created_at'])
+      @newest_status_at = created_at if @newest_status_at.nil?
+
+      if status['retweeted_status'] && status['retweeted_status']['id']
+        # This is a retweet.
+        original_created_at = status['retweeted_status']['created_at'].is_a?(Time) ? status['retweeted_status']['created_at'] : Time.parse(status['retweeted_status']['created_at'])
+        if !options[:retweet_window] || created_at <= original_created_at + options[:retweet_window] * 60 * 60 * 24
+          @retweet_counts[status['retweeted_status']['id']] ||= 0
+          @retweet_counts[status['retweeted_status']['id']] = status['retweeted_status']['retweet_count'] if status['retweeted_status']['retweet_count'] > @retweet_counts[status['retweeted_status']['id']]
+
+
+          if options[:retweet_counts_at]
+            @retweet_hour_counts[status['retweeted_status']['id']] ||= options[:retweet_counts_at].map { 0 }
+            options[:retweet_counts_at].each.with_index do |hour_mark, index|
+              if created_at <= original_created_at + hour_mark * 60 * 60 && status['retweeted_status']['retweet_count'] > @retweet_hour_counts[status['retweeted_status']['id']][index]
+                @retweet_hour_counts[status['retweeted_status']['id']][index] = status['retweeted_status']['retweet_count']
+              end
+            end
+          end
+        end
+        false
+      else
+        # This is an original status.
+        if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
+          if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
+            status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
+            status['_retweet_hour_counts'] = @retweet_hour_counts.delete(status['id']) if options[:retweet_counts_at]
+            true
+          else
+            false
+          end
+        else
+          false
+        end
+      end
+    end
+
     def handle_status(status, &block)
-      if
-
-
-
-
-
+      if within_time_window?(status)
+        if (options[:require_english] && is_english?(status)) || !options[:require_english]
+          if options[:retweet_mode] != :rollup || display_rolledup_status?(status)
+            log_json(status) if options[:json]
+            log_csv(status) if options[:csv]
+            yield_status(status, &block) if block
+            sample_fields(status) if options[:sample_fields]
+            analyze_gaps(status, options[:analyze_gaps]) if options[:analyze_gaps]
+            STDERR.puts "Logging: #{status['text']}" if options[:verbose]
+          end
+        end
       end
     end
 
     def log_csv_header
       header_labels = options[:fields].dup
 
-
-
-
+      header_labels += ["average_sentiment", "sentiment_words"] if options[:compute_sentiment]
+      header_labels << "word_count" if options[:compute_word_count]
+
+      options[:retweet_counts_at].each { |hours| header_labels << "retweets_at_#{hours}_hours" } if options[:retweet_counts_at]
+
+      options[:url_columns].times { |i| header_labels << "url_#{i+1}" } if options[:url_columns] && options[:url_columns] > 0
+      options[:hashtag_columns].times { |i| header_labels << "hash_tag_#{i+1}" } if options[:hashtag_columns] && options[:url_columns] > 0
+      options[:user_mention_columns].times { |i| header_labels << "user_mention_#{i+1}" } if options[:user_mention_columns] && options[:user_mention_columns] > 0
 
       options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
     end
@@ -70,22 +128,84 @@ module TwitterToCsv
         }.to_s
       end
 
+      row += compute_sentiment(status["text"]) if options[:compute_sentiment]
+
+      row << status["text"].split(/\s+/).length if options[:compute_word_count]
+
+      row += status["_retweet_hour_counts"] if options[:retweet_counts_at]
+
       if options[:url_columns] && options[:url_columns] > 0
-        urls = status[
+        urls = (status["entities"] && (status["entities"]["urls"] || []).map {|i| i["expanded_url"] || i["url"] }) || []
         options[:url_columns].times { |i| row << urls[i].to_s }
       end
 
+      if options[:hashtag_columns] && options[:hashtag_columns] > 0
+        hashes = (status["entities"] && (status["entities"]["hashtags"] || []).map {|i| i["text"] }) || []
+        options[:hashtag_columns].times { |i| row << hashes[i].to_s }
+      end
+
+      if options[:user_mention_columns] && options[:user_mention_columns] > 0
+        users = (status["entities"] && (status["entities"]["user_mentions"] || []).map {|i| i["screen_name"] }) || []
+        options[:user_mention_columns].times { |i| row << users[i].to_s }
+      end
+
       row
     end
 
+    def afinn
+      @afinn_cache ||= begin
+        words_or_phrases = []
+        File.read(File.expand_path(File.join(File.dirname(__FILE__), "afinn", "AFINN-111.txt"))).each_line do |line|
+          word_or_phrase, valence = line.split(/\t/)
+          pattern = Regexp::escape word_or_phrase.gsub(/-/, " ").gsub(/'/, '')
+          words_or_phrases << [/\b#{pattern}\b/i, pattern.length, valence.to_f]
+        end
+        words_or_phrases.sort {|b, a| a[1] <=> b[1] }
+      end
+    end
+
+    def compute_sentiment(original_text)
+      text = original_text.downcase.gsub(/'/, '').gsub(/[^a-z0-9]/, ' ').gsub(/\s+/, ' ').strip
+      count = 0
+      valence_sum = 0
+      afinn.each do |pattern, length, valence|
+        while text =~ pattern
+          text.sub! pattern, ''
+          valence_sum += valence
+          count += 1
+        end
+      end
+      if count > 0
+        [valence_sum / count.to_f, count]
+      else
+        [0, 0]
+      end
+    end
+
     def replay_from(filename, &block)
-
-
-
+      # If a retweet mode is being used, we read the file backwards using the Elif gem.
+      opener = options[:retweet_mode] ? Elif : File
+
+      opener.open(filename, "r") do |file|
+        file.each do |line|
           next if line =~ /\A------SEP.RATOR------\Z/i
           handle_status JSON.parse(line), &block
         end
       end
+      puts "Last status seen at #{@last_status_seen_at}." if options[:analyze_gaps] && @last_status_seen_at
+    end
+
+    def analyze_gaps(status, min_gap_size_in_minutes)
+      time = Time.parse(status['created_at'])
+      if !@last_status_seen_at
+        puts "First status seen at #{time}."
+      else
+        gap_length = (time - @last_status_seen_at) / 60
+        if gap_length > min_gap_size_in_minutes
+          puts "Gap of #{gap_length.to_i} minutes from #{@last_status_seen_at} to #{time}."
+        end
+      end
+      @last_status_seen_at = time
     end
 
     def sample_fields(status)
@@ -142,4 +262,4 @@ module TwitterToCsv
       true
     end
   end
-end
+end
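Pulling the new CsvBuilder pieces together, a usage sketch along these lines should work. It is illustrative rather than taken from the gem's documentation: the require is assumed to load CsvBuilder, out.csv and statuses.json are hypothetical file names, and the option keys are the ones introduced above.

    require 'twitter_to_csv'

    # Replay a previously captured log (one JSON-encoded status per line) and
    # write a CSV of original tweets that reached at least 2 retweets within a
    # 4-day window, with sentiment, word count, and retweets-at-hour columns.
    File.open("out.csv", "w") do |csv|
      builder = TwitterToCsv::CsvBuilder.new(
        :csv                => csv,
        :fields             => %w[id text retweet_count],
        :retweet_mode       => :rollup,
        :retweet_threshold  => 2,
        :retweet_window     => 4,        # days
        :retweet_counts_at  => [24, 48], # hours after the original tweet
        :compute_sentiment  => true,
        :compute_word_count => true
      )
      builder.log_csv_header
      builder.replay_from "statuses.json"
    end

Because :retweet_mode is set, replay_from reads the log backwards through Elif, so the rolled-up retweet counts have already been accumulated by the time each original status is reached.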
data/lib/twitter_to_csv/twitter_watcher.rb CHANGED

@@ -17,25 +17,33 @@ module TwitterToCsv
     end
 
     def run(&block)
-
-
-
-
-
-
-
-      stream.each_item do |item|
-        handle_status JSON.parse(item), block
-      end
+      while true
+        EventMachine::run do
+          stream = Twitter::JSONStream.connect(
+            :path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.join(",")}" if filter && filter.length > 0}",
+            :auth => "#{username}:#{password}",
+            :ssl => true
+          )
 
-
-
-
+          stream.each_item do |item|
+            handle_status JSON.parse(item), block
+          end
+
+          stream.on_error do |message|
+            STDERR.puts " --> Twitter error: #{message} <--"
+          end
+
+          stream.on_no_data do |message|
+            STDERR.puts " --> Got no data for awhile; trying to reconnect."
+            EventMachine::stop_event_loop
+          end
 
-
-
-
+          stream.on_max_reconnects do |timeout, retries|
+            STDERR.puts " --> Oops, tried too many times! <--"
+            EventMachine::stop_event_loop
+          end
         end
+        puts " --> Reconnecting..."
       end
     end
 
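The new run loop follows a simple pattern: EventMachine::run is wrapped in an outer loop, so any handler that calls EventMachine::stop_event_loop drops out of the reactor and the next iteration reconnects. A minimal, self-contained sketch of just that pattern (not the gem's API; a timer stands in for the stream's on_no_data / on_max_reconnects handlers):

    require 'eventmachine'

    3.times do |attempt|
      EventMachine::run do
        puts "connecting (attempt #{attempt + 1})..."
        # In TwitterWatcher#run this is where Twitter::JSONStream.connect happens;
        # stopping the event loop from any handler ends this run block.
        EventMachine::add_timer(1) { EventMachine::stop_event_loop }
      end
      puts " --> Reconnecting..."
    end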
data/spec/csv_builder_spec.rb CHANGED

@@ -1,5 +1,6 @@
 # encoding: utf-8
 require 'spec_helper'
+require 'time'
 
 describe TwitterToCsv::CsvBuilder do
   describe "#handle_status" do
@@ -15,6 +16,23 @@ describe TwitterToCsv::CsvBuilder do
       string_io.rewind
       string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
     end
+
+    it "honors start_time and end_time" do
+      string_io = StringIO.new
+      csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
+                                                 :start => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
+                                                 :end => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
+
+      # Order shouldn't matter
+      csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
+      csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
+      csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
+      csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
+      csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
+      csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
+      string_io.rewind
+      string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
+    end
   end
 
   describe "log_csv_header" do
@@ -33,10 +51,23 @@ describe TwitterToCsv::CsvBuilder do
       string_io.rewind
       string_io.read.should == '"something","url_1","url_2"' + "\n"
     end
+
+    it "includes columns for the retweet_counts_at entries, if present" do
+      string_io = StringIO.new
+      csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io,
+                                                 :fields => %w[something],
+                                                 :retweet_mode => :rollup,
+                                                 :retweet_threshold => 1,
+                                                 :retweet_window => 4,
+                                                 :retweet_counts_at => [0.5, 24, 48])
+      csv_builder.log_csv_header
+      string_io.rewind
+      string_io.read.should == '"something","retweets_at_0.5_hours","retweets_at_24_hours","retweets_at_48_hours"' + "\n"
+    end
   end
 
   describe "logging to a CSV" do
-    it "outputs the requested fields when
+    it "outputs the requested fields when specified in dot-notation" do
       string_io = StringIO.new
       csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something something_else.a something_else.c.d])
       csv_builder.handle_status({
@@ -54,20 +85,233 @@ describe TwitterToCsv::CsvBuilder do
       string_io.read.should == "\"hello\",\"b\",\"foo\"\n"
     end
 
-    it "can extract URLs" do
+    it "can extract URLs, hashtags, and user mentions" do
       string_io = StringIO.new
-      csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2)
+      csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2, :hashtag_columns => 2, :user_mention_columns => 1)
       csv_builder.handle_status({
-        'something' => "
-
+        'something' => "hello1",
+        "entities" => {
+          "hashtags" => [
+            { "text" => "AHashTag" },
+            { "text" => "AnotherHashTag" },
+            { "text" => "AThirdHashTag" }
+          ],
+          "user_mentions" => [
+            { "screen_name" => "ScreenNameOne" },
+            { "screen_name" => "ScreenNameTwo" },
+            { "screen_name" => "ScreenNameThree" }
+          ],
+          "urls" => [
+            { "url" => "http://t.co/1231" },
+            { "url" => "http://t.co/1232", "expanded_url" => "http://a.real.url2" },
+            { "url" => "http://t.co/1233", "expanded_url" => "http://a.real.url3" }
+          ]
+        },
+        'text' => 'some text'
       })
       csv_builder.handle_status({
-        'something' => "
-
+        'something' => "hello2",
+        "entities" => {
+          "hashtags" => [],
+          "user_mentions" => [],
+          "urls" => []
+        },
+        'text' => 'this is another status'
+      })
+      string_io.rewind
+      string_io.read.should == "\"hello1\",\"http://t.co/1231\",\"http://a.real.url2\",\"AHashTag\",\"AnotherHashTag\",\"ScreenNameOne\"\n" +
+                               "\"hello2\",\"\",\"\",\"\",\"\",\"\"\n"
+    end
+
+    it "can compute the average sentiment" do
+      string_io = StringIO.new
+      csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :compute_sentiment => true)
+      csv_builder.handle_status({
+        'something' => "hello1",
+        'text' => 'i love cheese'
+
+      })
+      csv_builder.handle_status({
+        'something' => "hello2",
+        'text' => 'i love cheese and like bread'
+      })
+      csv_builder.handle_status({
+        'something' => "hello3",
+        'text' => 'some kind of once-in-a-lifetime cool-fest in the right direction or the right-direction or the son_of a bitch' # it tries both hyphenated and non-hyphenated, and does phrases
       })
       string_io.rewind
-      string_io.read.should == "\"
-        "\"
+      string_io.read.should == "\"hello1\",\"3.0\",\"1\"\n" +
+                               "\"hello2\",\"#{(3 + 2) / 2.0}\",\"2\"\n" +
+                               "\"hello3\",\"#{(0 + 3 + 1 + 3 + 3 + -5) / 6.0}\",\"6\"\n"
+    end
+
+    it "can compute word count" do
+      string_io = StringIO.new
+      csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :compute_word_count => true)
+      csv_builder.handle_status({
+        'something' => "hello1",
+        'text' => 'i love cheese'
+
+      })
+      csv_builder.handle_status({
+        'something' => "hello2",
+        'text' => 'foo_bar baz9bing'
+      })
+      string_io.rewind
+      string_io.read.should == "\"hello1\",\"3\"\n" +
+                               "\"hello2\",\"2\"\n"
+    end
+  end
+
+  describe "retweet handling" do
+    def play_data(builder)
+      days = 60 * 60 * 24
+      now = Time.now
+
+      builder.handle_status({
+        'created_at' => now,
+        'retweeted_status' => {
+          'id' => 3,
+          'created_at' => now - 1 * days,
+          'retweet_count' => 1
+        },
+        'text' => 'RT not enough time has passed'
+      })
+
+      builder.handle_status({
+        'id' => 3,
+        'created_at' => now - 1 * days,
+        'text' => 'not enough time has passed',
+        'retweet_count' => 0
+      })
+
+      builder.handle_status({
+        'created_at' => now - 1 * days,
+        'retweeted_status' => {
+          'id' => 2,
+          'created_at' => now - 4 * days,
+          'retweet_count' => 3
+        },
+        'text' => 'RT 2 retweets'
+      })
+
+      builder.handle_status({
+        'created_at' => now - 2 * days,
+        'retweeted_status' => {
+          'id' => 4,
+          'created_at' => now - 5 * days,
+          'retweet_count' => 1
+        },
+        'text' => 'RT 1 retweet'
+      })
+
+      builder.handle_status({
+        'created_at' => now - 3 * days,
+        'retweeted_status' => {
+          'id' => 2,
+          'created_at' => now - 4 * days,
+          'retweet_count' => 2
+        },
+        'text' => 'RT 2 retweets'
+      })
+
+      builder.handle_status({
+        'created_at' => now - 3.99 * days,
+        'retweeted_status' => {
+          'id' => 2,
+          'created_at' => now - 4 * days,
+          'retweet_count' => 1
+        },
+        'text' => 'RT 2 retweets'
+      })
+
+      builder.handle_status({
+        'id' => 2,
+        'created_at' => now - 4 * days,
+        'text' => '2 retweets',
+        'retweet_count' => 0
+      })
+
+      builder.handle_status({
+        'id' => 4,
+        'created_at' => now - 5 * days,
+        'text' => '1 retweet',
+        'retweet_count' => 0
+      })
+
+      builder.handle_status({
+        'id' => 5,
+        'created_at' => now - 5.1 * days,
+        'text' => 'no retweets',
+        'retweet_count' => 0
+      })
+    end
+
+    it "skips statuses with fewer than :retweet_threshold retweets and ignores statues that haven't been seen for retweet_window yet" do
+      string_io = StringIO.new
+      builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
+                                             :retweet_threshold => 2,
+                                             :retweet_window => 2,
+                                             :csv => string_io,
+                                             :fields => %w[id retweet_count])
+      play_data builder
+      string_io.rewind
+      string_io.read.should == "\"2\",\"2\"\n"
+
+      string_io = StringIO.new
+      builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
+                                             :retweet_threshold => 1,
+                                             :retweet_window => 3,
+                                             :csv => string_io,
+                                             :fields => %w[id retweet_count])
+      play_data builder
+      string_io.rewind
+      string_io.read.should == "\"2\",\"3\"\n" + "\"4\",\"1\"\n"
+
+      string_io = StringIO.new
+      builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
+                                             :retweet_threshold => 1,
+                                             :retweet_window => 20,
+                                             :csv => string_io,
+                                             :fields => %w[id retweet_count])
+      play_data builder
+      string_io.rewind
+      string_io.read.should == ""
+
+      string_io = StringIO.new
+      builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
+                                             :retweet_threshold => 1,
+                                             :retweet_window => nil,
+                                             :csv => string_io,
+                                             :fields => %w[id retweet_count])
+      play_data builder
+      string_io.rewind
+      string_io.read.should == "\"3\",\"1\"\n\"2\",\"3\"\n\"4\",\"1\"\n"
+
+      string_io = StringIO.new
+      builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
+                                             :retweet_threshold => 0,
+                                             :retweet_window => nil,
+                                             :csv => string_io,
+                                             :fields => %w[id retweet_count])
+      play_data builder
+      string_io.rewind
+      string_io.read.should == "\"3\",\"1\"\n\"2\",\"3\"\n\"4\",\"1\"\n\"5\",\"0\"\n"
+    end
+
+    it "logs at the hourly marks requested in retweet_counts_at" do
+      string_io = StringIO.new
+      builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
+                                             :retweet_threshold => 1,
+                                             :retweet_window => 4,
+                                             :retweet_counts_at => [0.5, 23, 24, 48, 73, 1000],
+                                             :csv => string_io,
+                                             :fields => %w[id retweet_count])
+      play_data builder
+      string_io.rewind
+      string_io.read.should == "\"2\",\"3\",\"1\",\"1\",\"2\",\"2\",\"3\",\"3\"\n" +
+                               "\"4\",\"1\",\"0\",\"0\",\"0\",\"0\",\"1\",\"1\"\n"
     end
   end
 end