twitter_to_csv 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ AFINN is a list of English words rated for valence with an integer
2
+ between minus five (negative) and plus five (positive). The words have
3
+ been manually labeled by Finn Årup Nielsen in 2009-2011. The file
4
+ is tab-separated. There are two versions:
5
+
6
+ AFINN-111: Newest version with 2477 words and phrases.
7
+
8
+ AFINN-96: 1468 unique words and phrases on 1480 lines. Note that there
9
+ are 1480 lines, as some words are listed twice. The word list is not
10
+ entirely in alphabetic ordering.
11
+
12
+ An evaluation of the word list is available in:
13
+
14
+ Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
15
+ sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903
16
+
17
+ The list was used in:
18
+
19
+ Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni,
20
+ Michael Etter, "Good Friends, Bad News - Affect and Virality in
21
+ Twitter", The 2011 International Workshop on Social Computing,
22
+ Network, and Services (SocialComNet 2011).
23
+
24
+
25
+ This database of words is copyright protected and distributed under
26
+ "Open Database License (ODbL) v1.0"
27
+ http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar
28
+ copyleft license.
29
+
30
+ See comments on the word list here:
31
+ http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis
32
+
33
+
34
+ In Python the file may be read into a dictionary with:
35
+
36
+ >>> afinn = dict(map(lambda (k,v): (k,int(v)),
37
+ [ line.split('\t') for line in open("AFINN-111.txt") ]))
38
+ >>> afinn["Good".lower()]
39
+ 3
40
+ >>> sum(map(lambda word: afinn.get(word, 0), "Rainy day but still in a good mood".lower().split()))
41
+ 2
42
+
43
+
@@ -1,17 +1,18 @@
1
1
  # encoding: UTF-8
2
2
  require 'pp'
3
+ require 'elif'
4
+ require 'time'
3
5
 
4
6
  module TwitterToCsv
5
7
  class CsvBuilder
6
8
  attr_accessor :options, :sampled_fields
7
9
 
8
- # http://daringfireball.net/2010/07/improved_regex_for_matching_urls
9
- URL_REGEX = %r"\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s\`\!\(\)\[\]\{\};:'\".,<>\?«»“”‘’]))"i
10
-
11
10
  def initialize(options = {})
12
11
  @options = options
13
12
  @sampled_fields = {}
14
13
  @num_samples = 0
14
+ @retweet_counts = {}
15
+ @retweet_hour_counts = {}
15
16
  end
16
17
 
17
18
  def run(&block)
@@ -35,22 +36,79 @@ module TwitterToCsv
35
36
  end
36
37
  end
37
38
 
39
+ def within_time_window?(status)
40
+ if options[:start] || options[:end]
41
+ created_at = status['created_at'].is_a?(Time) ? status['created_at'] : Time.parse(status['created_at'])
42
+ return false if options[:start] && created_at < options[:start]
43
+ return false if options[:end] && created_at >= options[:end]
44
+ end
45
+ true
46
+ end
47
+
48
+ def display_rolledup_status?(status)
49
+ created_at = status['created_at'].is_a?(Time) ? status['created_at'] : Time.parse(status['created_at'])
50
+ @newest_status_at = created_at if @newest_status_at.nil?
51
+
52
+ if status['retweeted_status'] && status['retweeted_status']['id']
53
+ # This is a retweet.
54
+ original_created_at = status['retweeted_status']['created_at'].is_a?(Time) ? status['retweeted_status']['created_at'] : Time.parse(status['retweeted_status']['created_at'])
55
+ if !options[:retweet_window] || created_at <= original_created_at + options[:retweet_window] * 60 * 60 * 24
56
+ @retweet_counts[status['retweeted_status']['id']] ||= 0
57
+ @retweet_counts[status['retweeted_status']['id']] = status['retweeted_status']['retweet_count'] if status['retweeted_status']['retweet_count'] > @retweet_counts[status['retweeted_status']['id']]
58
+
59
+
60
+ if options[:retweet_counts_at]
61
+ @retweet_hour_counts[status['retweeted_status']['id']] ||= options[:retweet_counts_at].map { 0 }
62
+ options[:retweet_counts_at].each.with_index do |hour_mark, index|
63
+ if created_at <= original_created_at + hour_mark * 60 * 60 && status['retweeted_status']['retweet_count'] > @retweet_hour_counts[status['retweeted_status']['id']][index]
64
+ @retweet_hour_counts[status['retweeted_status']['id']][index] = status['retweeted_status']['retweet_count']
65
+ end
66
+ end
67
+ end
68
+ end
69
+ false
70
+ else
71
+ # This is an original status.
72
+ if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
73
+ if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
74
+ status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
75
+ status['_retweet_hour_counts'] = @retweet_hour_counts.delete(status['id']) if options[:retweet_counts_at]
76
+ true
77
+ else
78
+ false
79
+ end
80
+ else
81
+ false
82
+ end
83
+ end
84
+ end
85
+
38
86
  def handle_status(status, &block)
39
- if (options[:require_english] && is_english?(status)) || !options[:require_english]
40
- log_json(status) if options[:json]
41
- log_csv(status) if options[:csv]
42
- yield_status(status, &block) if block
43
- sample_fields(status) if options[:sample_fields]
44
- STDERR.puts "Logging: #{status['text']}" if options[:verbose]
87
+ if within_time_window?(status)
88
+ if (options[:require_english] && is_english?(status)) || !options[:require_english]
89
+ if options[:retweet_mode] != :rollup || display_rolledup_status?(status)
90
+ log_json(status) if options[:json]
91
+ log_csv(status) if options[:csv]
92
+ yield_status(status, &block) if block
93
+ sample_fields(status) if options[:sample_fields]
94
+ analyze_gaps(status, options[:analyze_gaps]) if options[:analyze_gaps]
95
+ STDERR.puts "Logging: #{status['text']}" if options[:verbose]
96
+ end
97
+ end
45
98
  end
46
99
  end
47
100
 
48
101
  def log_csv_header
49
102
  header_labels = options[:fields].dup
50
103
 
51
- if options[:url_columns] && options[:url_columns] > 0
52
- options[:url_columns].times { |i| header_labels << "url_#{i+1}" }
53
- end
104
+ header_labels += ["average_sentiment", "sentiment_words"] if options[:compute_sentiment]
105
+ header_labels << "word_count" if options[:compute_word_count]
106
+
107
+ options[:retweet_counts_at].each { |hours| header_labels << "retweets_at_#{hours}_hours" } if options[:retweet_counts_at]
108
+
109
+ options[:url_columns].times { |i| header_labels << "url_#{i+1}" } if options[:url_columns] && options[:url_columns] > 0
110
+ options[:hashtag_columns].times { |i| header_labels << "hash_tag_#{i+1}" } if options[:hashtag_columns] && options[:url_columns] > 0
111
+ options[:user_mention_columns].times { |i| header_labels << "user_mention_#{i+1}" } if options[:user_mention_columns] && options[:user_mention_columns] > 0
54
112
 
55
113
  options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
56
114
  end
@@ -70,22 +128,84 @@ module TwitterToCsv
70
128
  }.to_s
71
129
  end
72
130
 
131
+ row += compute_sentiment(status["text"]) if options[:compute_sentiment]
132
+
133
+ row << status["text"].split(/\s+/).length if options[:compute_word_count]
134
+
135
+ row += status["_retweet_hour_counts"] if options[:retweet_counts_at]
136
+
73
137
  if options[:url_columns] && options[:url_columns] > 0
74
- urls = status['text'].scan(URL_REGEX).flatten.compact
138
+ urls = (status["entities"] && (status["entities"]["urls"] || []).map {|i| i["expanded_url"] || i["url"] }) || []
75
139
  options[:url_columns].times { |i| row << urls[i].to_s }
76
140
  end
77
141
 
142
+ if options[:hashtag_columns] && options[:hashtag_columns] > 0
143
+ hashes = (status["entities"] && (status["entities"]["hashtags"] || []).map {|i| i["text"] }) || []
144
+ options[:hashtag_columns].times { |i| row << hashes[i].to_s }
145
+ end
146
+
147
+ if options[:user_mention_columns] && options[:user_mention_columns] > 0
148
+ users = (status["entities"] && (status["entities"]["user_mentions"] || []).map {|i| i["screen_name"] }) || []
149
+ options[:user_mention_columns].times { |i| row << users[i].to_s }
150
+ end
151
+
78
152
  row
79
153
  end
80
154
 
155
+ def afinn
156
+ @afinn_cache ||= begin
157
+ words_or_phrases = []
158
+ File.read(File.expand_path(File.join(File.dirname(__FILE__), "afinn", "AFINN-111.txt"))).each_line do |line|
159
+ word_or_phrase, valence = line.split(/\t/)
160
+ pattern = Regexp::escape word_or_phrase.gsub(/-/, " ").gsub(/'/, '')
161
+ words_or_phrases << [/\b#{pattern}\b/i, pattern.length, valence.to_f]
162
+ end
163
+ words_or_phrases.sort {|b, a| a[1] <=> b[1] }
164
+ end
165
+ end
166
+
167
+ def compute_sentiment(original_text)
168
+ text = original_text.downcase.gsub(/'/, '').gsub(/[^a-z0-9]/, ' ').gsub(/\s+/, ' ').strip
169
+ count = 0
170
+ valence_sum = 0
171
+ afinn.each do |pattern, length, valence|
172
+ while text =~ pattern
173
+ text.sub! pattern, ''
174
+ valence_sum += valence
175
+ count += 1
176
+ end
177
+ end
178
+ if count > 0
179
+ [valence_sum / count.to_f, count]
180
+ else
181
+ [0, 0]
182
+ end
183
+ end
184
+
81
185
  def replay_from(filename, &block)
82
- File.open(filename, "r") do |file|
83
- until file.eof?
84
- line = file.readline
186
+ # If a retweet mode is being used, we read the file backwards using the Elif gem.
187
+ opener = options[:retweet_mode] ? Elif : File
188
+
189
+ opener.open(filename, "r") do |file|
190
+ file.each do |line|
85
191
  next if line =~ /\A------SEP.RATOR------\Z/i
86
192
  handle_status JSON.parse(line), &block
87
193
  end
88
194
  end
195
+ puts "Last status seen at #{@last_status_seen_at}." if options[:analyze_gaps] && @last_status_seen_at
196
+ end
197
+
198
+ def analyze_gaps(status, min_gap_size_in_minutes)
199
+ time = Time.parse(status['created_at'])
200
+ if !@last_status_seen_at
201
+ puts "First status seen at #{time}."
202
+ else
203
+ gap_length = (time - @last_status_seen_at) / 60
204
+ if gap_length > min_gap_size_in_minutes
205
+ puts "Gap of #{gap_length.to_i} minutes from #{@last_status_seen_at} to #{time}."
206
+ end
207
+ end
208
+ @last_status_seen_at = time
89
209
  end
90
210
 
91
211
  def sample_fields(status)
@@ -142,4 +262,4 @@ module TwitterToCsv
142
262
  true
143
263
  end
144
264
  end
145
- end
265
+ end
@@ -17,25 +17,33 @@ module TwitterToCsv
17
17
  end
18
18
 
19
19
  def run(&block)
20
- EventMachine::run do
21
- stream = Twitter::JSONStream.connect(
22
- :path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.join(",")}" if filter && filter.length > 0}",
23
- :auth => "#{username}:#{password}",
24
- :ssl => true
25
- )
26
-
27
- stream.each_item do |item|
28
- handle_status JSON.parse(item), block
29
- end
20
+ while true
21
+ EventMachine::run do
22
+ stream = Twitter::JSONStream.connect(
23
+ :path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.join(",")}" if filter && filter.length > 0}",
24
+ :auth => "#{username}:#{password}",
25
+ :ssl => true
26
+ )
30
27
 
31
- stream.on_error do |message|
32
- STDERR.puts " --> Twitter error: #{message} <--"
33
- end
28
+ stream.each_item do |item|
29
+ handle_status JSON.parse(item), block
30
+ end
31
+
32
+ stream.on_error do |message|
33
+ STDERR.puts " --> Twitter error: #{message} <--"
34
+ end
35
+
36
+ stream.on_no_data do |message|
37
+ STDERR.puts " --> Got no data for awhile; trying to reconnect."
38
+ EventMachine::stop_event_loop
39
+ end
34
40
 
35
- stream.on_max_reconnects do |timeout, retries|
36
- STDERR.puts " --> Oops, tried too many times! <--"
37
- EventMachine::stop_event_loop
41
+ stream.on_max_reconnects do |timeout, retries|
42
+ STDERR.puts " --> Oops, tried too many times! <--"
43
+ EventMachine::stop_event_loop
44
+ end
38
45
  end
46
+ puts " --> Reconnecting..."
39
47
  end
40
48
  end
41
49
 
@@ -1,3 +1,3 @@
1
1
  module TwitterToCsv
2
- VERSION = "0.0.5"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -1,5 +1,6 @@
1
1
  # encoding: utf-8
2
2
  require 'spec_helper'
3
+ require 'time'
3
4
 
4
5
  describe TwitterToCsv::CsvBuilder do
5
6
  describe "#handle_status" do
@@ -15,6 +16,23 @@ describe TwitterToCsv::CsvBuilder do
15
16
  string_io.rewind
16
17
  string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
17
18
  end
19
+
20
+ it "honors start_time and end_time" do
21
+ string_io = StringIO.new
22
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
23
+ :start => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
24
+ :end => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
25
+
26
+ # Order shouldn't matter
27
+ csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
28
+ csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
29
+ csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
30
+ csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
31
+ csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
32
+ csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
33
+ string_io.rewind
34
+ string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
35
+ end
18
36
  end
19
37
 
20
38
  describe "log_csv_header" do
@@ -33,10 +51,23 @@ describe TwitterToCsv::CsvBuilder do
33
51
  string_io.rewind
34
52
  string_io.read.should == '"something","url_1","url_2"' + "\n"
35
53
  end
54
+
55
+ it "includes columns for the retweet_counts_at entries, if present" do
56
+ string_io = StringIO.new
57
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io,
58
+ :fields => %w[something],
59
+ :retweet_mode => :rollup,
60
+ :retweet_threshold => 1,
61
+ :retweet_window => 4,
62
+ :retweet_counts_at => [0.5, 24, 48])
63
+ csv_builder.log_csv_header
64
+ string_io.rewind
65
+ string_io.read.should == '"something","retweets_at_0.5_hours","retweets_at_24_hours","retweets_at_48_hours"' + "\n"
66
+ end
36
67
  end
37
68
 
38
69
  describe "logging to a CSV" do
39
- it "outputs the requested fields when requested in dot-notation" do
70
+ it "outputs the requested fields when specified in dot-notation" do
40
71
  string_io = StringIO.new
41
72
  csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something something_else.a something_else.c.d])
42
73
  csv_builder.handle_status({
@@ -54,20 +85,233 @@ describe TwitterToCsv::CsvBuilder do
54
85
  string_io.read.should == "\"hello\",\"b\",\"foo\"\n"
55
86
  end
56
87
 
57
- it "can extract URLs" do
88
+ it "can extract URLs, hashtags, and user mentions" do
58
89
  string_io = StringIO.new
59
- csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2)
90
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2, :hashtag_columns => 2, :user_mention_columns => 1)
60
91
  csv_builder.handle_status({
61
- 'something' => "hello",
62
- 'text' => 'this is http://a.com/url and http://a.com/nother'
92
+ 'something' => "hello1",
93
+ "entities" => {
94
+ "hashtags" => [
95
+ { "text" => "AHashTag" },
96
+ { "text" => "AnotherHashTag" },
97
+ { "text" => "AThirdHashTag" }
98
+ ],
99
+ "user_mentions" => [
100
+ { "screen_name" => "ScreenNameOne" },
101
+ { "screen_name" => "ScreenNameTwo" },
102
+ { "screen_name" => "ScreenNameThree" }
103
+ ],
104
+ "urls" => [
105
+ { "url" => "http://t.co/1231" },
106
+ { "url" => "http://t.co/1232", "expanded_url" => "http://a.real.url2" },
107
+ { "url" => "http://t.co/1233", "expanded_url" => "http://a.real.url3" }
108
+ ]
109
+ },
110
+ 'text' => 'some text'
111
+
63
112
  })
64
113
  csv_builder.handle_status({
65
- 'something' => "hello",
66
- 'text' => 'this is http://a.com/url/again'
114
+ 'something' => "hello2",
115
+ "entities" => {
116
+ "hashtags" => [],
117
+ "user_mentions" => [],
118
+ "urls" => []
119
+ },
120
+ 'text' => 'this is another status'
121
+ })
122
+ string_io.rewind
123
+ string_io.read.should == "\"hello1\",\"http://t.co/1231\",\"http://a.real.url2\",\"AHashTag\",\"AnotherHashTag\",\"ScreenNameOne\"\n" +
124
+ "\"hello2\",\"\",\"\",\"\",\"\",\"\"\n"
125
+ end
126
+
127
+ it "can compute the average sentiment" do
128
+ string_io = StringIO.new
129
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :compute_sentiment => true)
130
+ csv_builder.handle_status({
131
+ 'something' => "hello1",
132
+ 'text' => 'i love cheese'
133
+
134
+ })
135
+ csv_builder.handle_status({
136
+ 'something' => "hello2",
137
+ 'text' => 'i love cheese and like bread'
138
+ })
139
+ csv_builder.handle_status({
140
+ 'something' => "hello3",
141
+ 'text' => 'some kind of once-in-a-lifetime cool-fest in the right direction or the right-direction or the son_of a bitch' # it tries both hyphenated and non-hyphenated, and does phrases
67
142
  })
68
143
  string_io.rewind
69
- string_io.read.should == "\"hello\",\"http://a.com/url\",\"http://a.com/nother\"\n" +
70
- "\"hello\",\"http://a.com/url/again\",\"\"\n"
144
+ string_io.read.should == "\"hello1\",\"3.0\",\"1\"\n" +
145
+ "\"hello2\",\"#{(3 + 2) / 2.0}\",\"2\"\n" +
146
+ "\"hello3\",\"#{(0 + 3 + 1 + 3 + 3 + -5) / 6.0}\",\"6\"\n"
147
+ end
148
+
149
+ it "can compute word count" do
150
+ string_io = StringIO.new
151
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :compute_word_count => true)
152
+ csv_builder.handle_status({
153
+ 'something' => "hello1",
154
+ 'text' => 'i love cheese'
155
+
156
+ })
157
+ csv_builder.handle_status({
158
+ 'something' => "hello2",
159
+ 'text' => 'foo_bar baz9bing'
160
+ })
161
+ string_io.rewind
162
+ string_io.read.should == "\"hello1\",\"3\"\n" +
163
+ "\"hello2\",\"2\"\n"
164
+ end
165
+ end
166
+
167
+ describe "retweet handling" do
168
+ def play_data(builder)
169
+ days = 60 * 60 * 24
170
+ now = Time.now
171
+
172
+ builder.handle_status({
173
+ 'created_at' => now,
174
+ 'retweeted_status' => {
175
+ 'id' => 3,
176
+ 'created_at' => now - 1 * days,
177
+ 'retweet_count' => 1
178
+ },
179
+ 'text' => 'RT not enough time has passed'
180
+ })
181
+
182
+ builder.handle_status({
183
+ 'id' => 3,
184
+ 'created_at' => now - 1 * days,
185
+ 'text' => 'not enough time has passed',
186
+ 'retweet_count' => 0
187
+ })
188
+
189
+ builder.handle_status({
190
+ 'created_at' => now - 1 * days,
191
+ 'retweeted_status' => {
192
+ 'id' => 2,
193
+ 'created_at' => now - 4 * days,
194
+ 'retweet_count' => 3
195
+ },
196
+ 'text' => 'RT 2 retweets'
197
+ })
198
+
199
+ builder.handle_status({
200
+ 'created_at' => now - 2 * days,
201
+ 'retweeted_status' => {
202
+ 'id' => 4,
203
+ 'created_at' => now - 5 * days,
204
+ 'retweet_count' => 1
205
+ },
206
+ 'text' => 'RT 1 retweet'
207
+ })
208
+
209
+ builder.handle_status({
210
+ 'created_at' => now - 3 * days,
211
+ 'retweeted_status' => {
212
+ 'id' => 2,
213
+ 'created_at' => now - 4 * days,
214
+ 'retweet_count' => 2
215
+ },
216
+ 'text' => 'RT 2 retweets'
217
+ })
218
+
219
+ builder.handle_status({
220
+ 'created_at' => now - 3.99 * days,
221
+ 'retweeted_status' => {
222
+ 'id' => 2,
223
+ 'created_at' => now - 4 * days,
224
+ 'retweet_count' => 1
225
+ },
226
+ 'text' => 'RT 2 retweets'
227
+ })
228
+
229
+ builder.handle_status({
230
+ 'id' => 2,
231
+ 'created_at' => now - 4 * days,
232
+ 'text' => '2 retweets',
233
+ 'retweet_count' => 0
234
+ })
235
+
236
+ builder.handle_status({
237
+ 'id' => 4,
238
+ 'created_at' => now - 5 * days,
239
+ 'text' => '1 retweet',
240
+ 'retweet_count' => 0
241
+ })
242
+
243
+ builder.handle_status({
244
+ 'id' => 5,
245
+ 'created_at' => now - 5.1 * days,
246
+ 'text' => 'no retweets',
247
+ 'retweet_count' => 0
248
+ })
249
+ end
250
+
251
+ it "skips statuses with fewer than :retweet_threshold retweets and ignores statues that haven't been seen for retweet_window yet" do
252
+ string_io = StringIO.new
253
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
254
+ :retweet_threshold => 2,
255
+ :retweet_window => 2,
256
+ :csv => string_io,
257
+ :fields => %w[id retweet_count])
258
+ play_data builder
259
+ string_io.rewind
260
+ string_io.read.should == "\"2\",\"2\"\n"
261
+
262
+ string_io = StringIO.new
263
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
264
+ :retweet_threshold => 1,
265
+ :retweet_window => 3,
266
+ :csv => string_io,
267
+ :fields => %w[id retweet_count])
268
+ play_data builder
269
+ string_io.rewind
270
+ string_io.read.should == "\"2\",\"3\"\n" + "\"4\",\"1\"\n"
271
+
272
+ string_io = StringIO.new
273
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
274
+ :retweet_threshold => 1,
275
+ :retweet_window => 20,
276
+ :csv => string_io,
277
+ :fields => %w[id retweet_count])
278
+ play_data builder
279
+ string_io.rewind
280
+ string_io.read.should == ""
281
+
282
+ string_io = StringIO.new
283
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
284
+ :retweet_threshold => 1,
285
+ :retweet_window => nil,
286
+ :csv => string_io,
287
+ :fields => %w[id retweet_count])
288
+ play_data builder
289
+ string_io.rewind
290
+ string_io.read.should == "\"3\",\"1\"\n\"2\",\"3\"\n\"4\",\"1\"\n"
291
+
292
+ string_io = StringIO.new
293
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
294
+ :retweet_threshold => 0,
295
+ :retweet_window => nil,
296
+ :csv => string_io,
297
+ :fields => %w[id retweet_count])
298
+ play_data builder
299
+ string_io.rewind
300
+ string_io.read.should == "\"3\",\"1\"\n\"2\",\"3\"\n\"4\",\"1\"\n\"5\",\"0\"\n"
301
+ end
302
+
303
+ it "logs at the hourly marks requested in retweet_counts_at" do
304
+ string_io = StringIO.new
305
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
306
+ :retweet_threshold => 1,
307
+ :retweet_window => 4,
308
+ :retweet_counts_at => [0.5, 23, 24, 48, 73, 1000],
309
+ :csv => string_io,
310
+ :fields => %w[id retweet_count])
311
+ play_data builder
312
+ string_io.rewind
313
+ string_io.read.should == "\"2\",\"3\",\"1\",\"1\",\"2\",\"2\",\"3\",\"3\"\n" +
314
+ "\"4\",\"1\",\"0\",\"0\",\"0\",\"0\",\"1\",\"1\"\n"
71
315
  end
72
316
  end
73
317
  end