twitter_to_csv 0.0.5 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,43 @@
1
+ AFINN is a list of English words rated for valence with an integer
2
+ between minus five (negative) and plus five (positive). The words have
3
+ been manually labeled by Finn Årup Nielsen in 2009-2011. The file
4
+ is tab-separated. There are two versions:
5
+
6
+ AFINN-111: Newest version with 2477 words and phrases.
7
+
8
+ AFINN-96: 1468 unique words and phrases on 1480 lines. Note that there
9
+ are 1480 lines, as some words are listed twice. The word list is not
10
+ entirely in alphabetic ordering.
11
+
12
+ An evaluation of the word list is available in:
13
+
14
+ Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
15
+ sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903
16
+
17
+ The list was used in:
18
+
19
+ Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni,
20
+ Michael Etter, "Good Friends, Bad News - Affect and Virality in
21
+ Twitter", The 2011 International Workshop on Social Computing,
22
+ Network, and Services (SocialComNet 2011).
23
+
24
+
25
+ This database of words is copyright protected and distributed under
26
+ "Open Database License (ODbL) v1.0"
27
+ http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar
28
+ copyleft license.
29
+
30
+ See comments on the word list here:
31
+ http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis
32
+
33
+
34
+ In Python the file may be read into a dictionary with:
35
+
36
+ >>> afinn = dict(map(lambda (k,v): (k,int(v)),
37
+ [ line.split('\t') for line in open("AFINN-111.txt") ]))
38
+ >>> afinn["Good".lower()]
39
+ 3
40
+ >>> sum(map(lambda word: afinn.get(word, 0), "Rainy day but still in a good mood".lower().split()))
41
+ 2
42
+
43
+
@@ -1,17 +1,18 @@
1
1
  # encoding: UTF-8
2
2
  require 'pp'
3
+ require 'elif'
4
+ require 'time'
3
5
 
4
6
  module TwitterToCsv
5
7
  class CsvBuilder
6
8
  attr_accessor :options, :sampled_fields
7
9
 
8
- # http://daringfireball.net/2010/07/improved_regex_for_matching_urls
9
- URL_REGEX = %r"\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s\`\!\(\)\[\]\{\};:'\".,<>\?«»“”‘’]))"i
10
-
11
10
  def initialize(options = {})
12
11
  @options = options
13
12
  @sampled_fields = {}
14
13
  @num_samples = 0
14
+ @retweet_counts = {}
15
+ @retweet_hour_counts = {}
15
16
  end
16
17
 
17
18
  def run(&block)
@@ -35,22 +36,79 @@ module TwitterToCsv
35
36
  end
36
37
  end
37
38
 
39
+ def within_time_window?(status)
40
+ if options[:start] || options[:end]
41
+ created_at = status['created_at'].is_a?(Time) ? status['created_at'] : Time.parse(status['created_at'])
42
+ return false if options[:start] && created_at < options[:start]
43
+ return false if options[:end] && created_at >= options[:end]
44
+ end
45
+ true
46
+ end
47
+
48
    # Decides whether a status should be emitted when :retweet_mode is :rollup.
    #
    # Retweets are never emitted themselves; they only update the running
    # counters (@retweet_counts / @retweet_hour_counts) keyed by the original
    # status id.  An original status is emitted once its tally clears
    # :retweet_threshold and, when :retweet_window (days) is set, once it is
    # old enough relative to the newest status seen that its tally is complete.
    #
    # NOTE(review): this assumes statuses arrive newest-first (replay_from uses
    # Elif when a retweet mode is on, so the first status fixes
    # @newest_status_at) — confirm before reusing in a forward-order pipeline.
    def display_rolledup_status?(status)
      created_at = status['created_at'].is_a?(Time) ? status['created_at'] : Time.parse(status['created_at'])
      # First status processed pins the "newest" timestamp for window math below.
      @newest_status_at = created_at if @newest_status_at.nil?

      if status['retweeted_status'] && status['retweeted_status']['id']
        # This is a retweet.
        original_created_at = status['retweeted_status']['created_at'].is_a?(Time) ? status['retweeted_status']['created_at'] : Time.parse(status['retweeted_status']['created_at'])
        # Count it only if it occurred within :retweet_window days of the original.
        if !options[:retweet_window] || created_at <= original_created_at + options[:retweet_window] * 60 * 60 * 24
          @retweet_counts[status['retweeted_status']['id']] ||= 0
          # Keep the maximum retweet_count reported for the original.
          @retweet_counts[status['retweeted_status']['id']] = status['retweeted_status']['retweet_count'] if status['retweeted_status']['retweet_count'] > @retweet_counts[status['retweeted_status']['id']]


          if options[:retweet_counts_at]
            # One slot per requested hour mark; track the highest count observed
            # within each mark's horizon after the original's creation.
            @retweet_hour_counts[status['retweeted_status']['id']] ||= options[:retweet_counts_at].map { 0 }
            options[:retweet_counts_at].each.with_index do |hour_mark, index|
              if created_at <= original_created_at + hour_mark * 60 * 60 && status['retweeted_status']['retweet_count'] > @retweet_hour_counts[status['retweeted_status']['id']][index]
                @retweet_hour_counts[status['retweeted_status']['id']][index] = status['retweeted_status']['retweet_count']
              end
            end
          end
        end
        false
      else
        # This is an original status.
        if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
          if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
            # Fold the rolled-up tallies into the status before it is emitted.
            status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
            status['_retweet_hour_counts'] = @retweet_hour_counts.delete(status['id']) if options[:retweet_counts_at]
            true
          else
            false
          end
        else
          false
        end
      end
    end
85
+
38
86
  def handle_status(status, &block)
39
- if (options[:require_english] && is_english?(status)) || !options[:require_english]
40
- log_json(status) if options[:json]
41
- log_csv(status) if options[:csv]
42
- yield_status(status, &block) if block
43
- sample_fields(status) if options[:sample_fields]
44
- STDERR.puts "Logging: #{status['text']}" if options[:verbose]
87
+ if within_time_window?(status)
88
+ if (options[:require_english] && is_english?(status)) || !options[:require_english]
89
+ if options[:retweet_mode] != :rollup || display_rolledup_status?(status)
90
+ log_json(status) if options[:json]
91
+ log_csv(status) if options[:csv]
92
+ yield_status(status, &block) if block
93
+ sample_fields(status) if options[:sample_fields]
94
+ analyze_gaps(status, options[:analyze_gaps]) if options[:analyze_gaps]
95
+ STDERR.puts "Logging: #{status['text']}" if options[:verbose]
96
+ end
97
+ end
45
98
  end
46
99
  end
47
100
 
48
101
  def log_csv_header
49
102
  header_labels = options[:fields].dup
50
103
 
51
- if options[:url_columns] && options[:url_columns] > 0
52
- options[:url_columns].times { |i| header_labels << "url_#{i+1}" }
53
- end
104
+ header_labels += ["average_sentiment", "sentiment_words"] if options[:compute_sentiment]
105
+ header_labels << "word_count" if options[:compute_word_count]
106
+
107
+ options[:retweet_counts_at].each { |hours| header_labels << "retweets_at_#{hours}_hours" } if options[:retweet_counts_at]
108
+
109
+ options[:url_columns].times { |i| header_labels << "url_#{i+1}" } if options[:url_columns] && options[:url_columns] > 0
110
+ options[:hashtag_columns].times { |i| header_labels << "hash_tag_#{i+1}" } if options[:hashtag_columns] && options[:url_columns] > 0
111
+ options[:user_mention_columns].times { |i| header_labels << "user_mention_#{i+1}" } if options[:user_mention_columns] && options[:user_mention_columns] > 0
54
112
 
55
113
  options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
56
114
  end
@@ -70,22 +128,84 @@ module TwitterToCsv
70
128
  }.to_s
71
129
  end
72
130
 
131
+ row += compute_sentiment(status["text"]) if options[:compute_sentiment]
132
+
133
+ row << status["text"].split(/\s+/).length if options[:compute_word_count]
134
+
135
+ row += status["_retweet_hour_counts"] if options[:retweet_counts_at]
136
+
73
137
  if options[:url_columns] && options[:url_columns] > 0
74
- urls = status['text'].scan(URL_REGEX).flatten.compact
138
+ urls = (status["entities"] && (status["entities"]["urls"] || []).map {|i| i["expanded_url"] || i["url"] }) || []
75
139
  options[:url_columns].times { |i| row << urls[i].to_s }
76
140
  end
77
141
 
142
+ if options[:hashtag_columns] && options[:hashtag_columns] > 0
143
+ hashes = (status["entities"] && (status["entities"]["hashtags"] || []).map {|i| i["text"] }) || []
144
+ options[:hashtag_columns].times { |i| row << hashes[i].to_s }
145
+ end
146
+
147
+ if options[:user_mention_columns] && options[:user_mention_columns] > 0
148
+ users = (status["entities"] && (status["entities"]["user_mentions"] || []).map {|i| i["screen_name"] }) || []
149
+ options[:user_mention_columns].times { |i| row << users[i].to_s }
150
+ end
151
+
78
152
  row
79
153
  end
80
154
 
155
+ def afinn
156
+ @afinn_cache ||= begin
157
+ words_or_phrases = []
158
+ File.read(File.expand_path(File.join(File.dirname(__FILE__), "afinn", "AFINN-111.txt"))).each_line do |line|
159
+ word_or_phrase, valence = line.split(/\t/)
160
+ pattern = Regexp::escape word_or_phrase.gsub(/-/, " ").gsub(/'/, '')
161
+ words_or_phrases << [/\b#{pattern}\b/i, pattern.length, valence.to_f]
162
+ end
163
+ words_or_phrases.sort {|b, a| a[1] <=> b[1] }
164
+ end
165
+ end
166
+
167
+ def compute_sentiment(original_text)
168
+ text = original_text.downcase.gsub(/'/, '').gsub(/[^a-z0-9]/, ' ').gsub(/\s+/, ' ').strip
169
+ count = 0
170
+ valence_sum = 0
171
+ afinn.each do |pattern, length, valence|
172
+ while text =~ pattern
173
+ text.sub! pattern, ''
174
+ valence_sum += valence
175
+ count += 1
176
+ end
177
+ end
178
+ if count > 0
179
+ [valence_sum / count.to_f, count]
180
+ else
181
+ [0, 0]
182
+ end
183
+ end
184
+
81
185
  def replay_from(filename, &block)
82
- File.open(filename, "r") do |file|
83
- until file.eof?
84
- line = file.readline
186
+ # If a retweet mode is being used, we read the file backwards using the Elif gem.
187
+ opener = options[:retweet_mode] ? Elif : File
188
+
189
+ opener.open(filename, "r") do |file|
190
+ file.each do |line|
85
191
  next if line =~ /\A------SEP.RATOR------\Z/i
86
192
  handle_status JSON.parse(line), &block
87
193
  end
88
194
  end
195
+ puts "Last status seen at #{@last_status_seen_at}." if options[:analyze_gaps] && @last_status_seen_at
196
+ end
197
+
198
+ def analyze_gaps(status, min_gap_size_in_minutes)
199
+ time = Time.parse(status['created_at'])
200
+ if !@last_status_seen_at
201
+ puts "First status seen at #{time}."
202
+ else
203
+ gap_length = (time - @last_status_seen_at) / 60
204
+ if gap_length > min_gap_size_in_minutes
205
+ puts "Gap of #{gap_length.to_i} minutes from #{@last_status_seen_at} to #{time}."
206
+ end
207
+ end
208
+ @last_status_seen_at = time
89
209
  end
90
210
 
91
211
  def sample_fields(status)
@@ -142,4 +262,4 @@ module TwitterToCsv
142
262
  true
143
263
  end
144
264
  end
145
- end
265
+ end
@@ -17,25 +17,33 @@ module TwitterToCsv
17
17
  end
18
18
 
19
19
    # Connects to the Twitter streaming API and feeds each decoded item to
    # handle_status, reconnecting forever.  Uses /statuses/filter.json with a
    # track parameter when filter phrases are present, /statuses/sample.json
    # otherwise.
    #
    # NOTE(review): the block is forwarded to handle_status as a positional
    # argument (not with &) — confirm this class's handle_status expects the
    # proc object itself.
    def run(&block)
      while true
        # Each EventMachine::run blocks until stop_event_loop fires; the outer
        # loop then rebuilds the connection from scratch.
        EventMachine::run do
          stream = Twitter::JSONStream.connect(
            :path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.join(",")}" if filter && filter.length > 0}",
            :auth => "#{username}:#{password}",
            :ssl => true
          )

          stream.each_item do |item|
            handle_status JSON.parse(item), block
          end

          stream.on_error do |message|
            STDERR.puts " --> Twitter error: #{message} <--"
          end

          stream.on_no_data do |message|
            # Stalled connection: tear down the reactor so the outer loop reconnects.
            STDERR.puts " --> Got no data for awhile; trying to reconnect."
            EventMachine::stop_event_loop
          end

          stream.on_max_reconnects do |timeout, retries|
            # The stream gave up on its own internal retries; restart the reactor.
            STDERR.puts " --> Oops, tried too many times! <--"
            EventMachine::stop_event_loop
          end
        end
        puts " --> Reconnecting..."
      end
    end
41
49
 
@@ -1,3 +1,3 @@
1
1
module TwitterToCsv
  # Gem version.  Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -1,5 +1,6 @@
1
1
  # encoding: utf-8
2
2
  require 'spec_helper'
3
+ require 'time'
3
4
 
4
5
  describe TwitterToCsv::CsvBuilder do
5
6
  describe "#handle_status" do
@@ -15,6 +16,23 @@ describe TwitterToCsv::CsvBuilder do
15
16
  string_io.rewind
16
17
  string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
17
18
  end
19
+
20
    # Statuses dated before :start or at/after :end must be dropped regardless
    # of arrival order (:start is inclusive, :end exclusive — status "5",
    # timestamped exactly at :end, is excluded).
    it "honors start_time and end_time" do
      string_io = StringIO.new
      csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
                                                 :start => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
                                                 :end => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))

      # Order shouldn't matter
      csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
      csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
      csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
      csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
      csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
      csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
      string_io.rewind
      string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
    end
18
36
  end
19
37
 
20
38
  describe "log_csv_header" do
@@ -33,10 +51,23 @@ describe TwitterToCsv::CsvBuilder do
33
51
  string_io.rewind
34
52
  string_io.read.should == '"something","url_1","url_2"' + "\n"
35
53
  end
54
+
55
+ it "includes columns for the retweet_counts_at entries, if present" do
56
+ string_io = StringIO.new
57
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io,
58
+ :fields => %w[something],
59
+ :retweet_mode => :rollup,
60
+ :retweet_threshold => 1,
61
+ :retweet_window => 4,
62
+ :retweet_counts_at => [0.5, 24, 48])
63
+ csv_builder.log_csv_header
64
+ string_io.rewind
65
+ string_io.read.should == '"something","retweets_at_0.5_hours","retweets_at_24_hours","retweets_at_48_hours"' + "\n"
66
+ end
36
67
  end
37
68
 
38
69
  describe "logging to a CSV" do
39
- it "outputs the requested fields when requested in dot-notation" do
70
+ it "outputs the requested fields when specified in dot-notation" do
40
71
  string_io = StringIO.new
41
72
  csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something something_else.a something_else.c.d])
42
73
  csv_builder.handle_status({
@@ -54,20 +85,233 @@ describe TwitterToCsv::CsvBuilder do
54
85
  string_io.read.should == "\"hello\",\"b\",\"foo\"\n"
55
86
  end
56
87
 
57
- it "can extract URLs" do
88
+ it "can extract URLs, hashtags, and user mentions" do
58
89
  string_io = StringIO.new
59
- csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2)
90
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2, :hashtag_columns => 2, :user_mention_columns => 1)
60
91
  csv_builder.handle_status({
61
- 'something' => "hello",
62
- 'text' => 'this is http://a.com/url and http://a.com/nother'
92
+ 'something' => "hello1",
93
+ "entities" => {
94
+ "hashtags" => [
95
+ { "text" => "AHashTag" },
96
+ { "text" => "AnotherHashTag" },
97
+ { "text" => "AThirdHashTag" }
98
+ ],
99
+ "user_mentions" => [
100
+ { "screen_name" => "ScreenNameOne" },
101
+ { "screen_name" => "ScreenNameTwo" },
102
+ { "screen_name" => "ScreenNameThree" }
103
+ ],
104
+ "urls" => [
105
+ { "url" => "http://t.co/1231" },
106
+ { "url" => "http://t.co/1232", "expanded_url" => "http://a.real.url2" },
107
+ { "url" => "http://t.co/1233", "expanded_url" => "http://a.real.url3" }
108
+ ]
109
+ },
110
+ 'text' => 'some text'
111
+
63
112
  })
64
113
  csv_builder.handle_status({
65
- 'something' => "hello",
66
- 'text' => 'this is http://a.com/url/again'
114
+ 'something' => "hello2",
115
+ "entities" => {
116
+ "hashtags" => [],
117
+ "user_mentions" => [],
118
+ "urls" => []
119
+ },
120
+ 'text' => 'this is another status'
121
+ })
122
+ string_io.rewind
123
+ string_io.read.should == "\"hello1\",\"http://t.co/1231\",\"http://a.real.url2\",\"AHashTag\",\"AnotherHashTag\",\"ScreenNameOne\"\n" +
124
+ "\"hello2\",\"\",\"\",\"\",\"\",\"\"\n"
125
+ end
126
+
127
+ it "can compute the average sentiment" do
128
+ string_io = StringIO.new
129
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :compute_sentiment => true)
130
+ csv_builder.handle_status({
131
+ 'something' => "hello1",
132
+ 'text' => 'i love cheese'
133
+
134
+ })
135
+ csv_builder.handle_status({
136
+ 'something' => "hello2",
137
+ 'text' => 'i love cheese and like bread'
138
+ })
139
+ csv_builder.handle_status({
140
+ 'something' => "hello3",
141
+ 'text' => 'some kind of once-in-a-lifetime cool-fest in the right direction or the right-direction or the son_of a bitch' # it tries both hyphenated and non-hyphenated, and does phrases
67
142
  })
68
143
  string_io.rewind
69
- string_io.read.should == "\"hello\",\"http://a.com/url\",\"http://a.com/nother\"\n" +
70
- "\"hello\",\"http://a.com/url/again\",\"\"\n"
144
+ string_io.read.should == "\"hello1\",\"3.0\",\"1\"\n" +
145
+ "\"hello2\",\"#{(3 + 2) / 2.0}\",\"2\"\n" +
146
+ "\"hello3\",\"#{(0 + 3 + 1 + 3 + 3 + -5) / 6.0}\",\"6\"\n"
147
+ end
148
+
149
+ it "can compute word count" do
150
+ string_io = StringIO.new
151
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :compute_word_count => true)
152
+ csv_builder.handle_status({
153
+ 'something' => "hello1",
154
+ 'text' => 'i love cheese'
155
+
156
+ })
157
+ csv_builder.handle_status({
158
+ 'something' => "hello2",
159
+ 'text' => 'foo_bar baz9bing'
160
+ })
161
+ string_io.rewind
162
+ string_io.read.should == "\"hello1\",\"3\"\n" +
163
+ "\"hello2\",\"2\"\n"
164
+ end
165
+ end
166
+
167
+ describe "retweet handling" do
168
+ def play_data(builder)
169
+ days = 60 * 60 * 24
170
+ now = Time.now
171
+
172
+ builder.handle_status({
173
+ 'created_at' => now,
174
+ 'retweeted_status' => {
175
+ 'id' => 3,
176
+ 'created_at' => now - 1 * days,
177
+ 'retweet_count' => 1
178
+ },
179
+ 'text' => 'RT not enough time has passed'
180
+ })
181
+
182
+ builder.handle_status({
183
+ 'id' => 3,
184
+ 'created_at' => now - 1 * days,
185
+ 'text' => 'not enough time has passed',
186
+ 'retweet_count' => 0
187
+ })
188
+
189
+ builder.handle_status({
190
+ 'created_at' => now - 1 * days,
191
+ 'retweeted_status' => {
192
+ 'id' => 2,
193
+ 'created_at' => now - 4 * days,
194
+ 'retweet_count' => 3
195
+ },
196
+ 'text' => 'RT 2 retweets'
197
+ })
198
+
199
+ builder.handle_status({
200
+ 'created_at' => now - 2 * days,
201
+ 'retweeted_status' => {
202
+ 'id' => 4,
203
+ 'created_at' => now - 5 * days,
204
+ 'retweet_count' => 1
205
+ },
206
+ 'text' => 'RT 1 retweet'
207
+ })
208
+
209
+ builder.handle_status({
210
+ 'created_at' => now - 3 * days,
211
+ 'retweeted_status' => {
212
+ 'id' => 2,
213
+ 'created_at' => now - 4 * days,
214
+ 'retweet_count' => 2
215
+ },
216
+ 'text' => 'RT 2 retweets'
217
+ })
218
+
219
+ builder.handle_status({
220
+ 'created_at' => now - 3.99 * days,
221
+ 'retweeted_status' => {
222
+ 'id' => 2,
223
+ 'created_at' => now - 4 * days,
224
+ 'retweet_count' => 1
225
+ },
226
+ 'text' => 'RT 2 retweets'
227
+ })
228
+
229
+ builder.handle_status({
230
+ 'id' => 2,
231
+ 'created_at' => now - 4 * days,
232
+ 'text' => '2 retweets',
233
+ 'retweet_count' => 0
234
+ })
235
+
236
+ builder.handle_status({
237
+ 'id' => 4,
238
+ 'created_at' => now - 5 * days,
239
+ 'text' => '1 retweet',
240
+ 'retweet_count' => 0
241
+ })
242
+
243
+ builder.handle_status({
244
+ 'id' => 5,
245
+ 'created_at' => now - 5.1 * days,
246
+ 'text' => 'no retweets',
247
+ 'retweet_count' => 0
248
+ })
249
+ end
250
+
251
+ it "skips statuses with fewer than :retweet_threshold retweets and ignores statues that haven't been seen for retweet_window yet" do
252
+ string_io = StringIO.new
253
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
254
+ :retweet_threshold => 2,
255
+ :retweet_window => 2,
256
+ :csv => string_io,
257
+ :fields => %w[id retweet_count])
258
+ play_data builder
259
+ string_io.rewind
260
+ string_io.read.should == "\"2\",\"2\"\n"
261
+
262
+ string_io = StringIO.new
263
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
264
+ :retweet_threshold => 1,
265
+ :retweet_window => 3,
266
+ :csv => string_io,
267
+ :fields => %w[id retweet_count])
268
+ play_data builder
269
+ string_io.rewind
270
+ string_io.read.should == "\"2\",\"3\"\n" + "\"4\",\"1\"\n"
271
+
272
+ string_io = StringIO.new
273
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
274
+ :retweet_threshold => 1,
275
+ :retweet_window => 20,
276
+ :csv => string_io,
277
+ :fields => %w[id retweet_count])
278
+ play_data builder
279
+ string_io.rewind
280
+ string_io.read.should == ""
281
+
282
+ string_io = StringIO.new
283
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
284
+ :retweet_threshold => 1,
285
+ :retweet_window => nil,
286
+ :csv => string_io,
287
+ :fields => %w[id retweet_count])
288
+ play_data builder
289
+ string_io.rewind
290
+ string_io.read.should == "\"3\",\"1\"\n\"2\",\"3\"\n\"4\",\"1\"\n"
291
+
292
+ string_io = StringIO.new
293
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
294
+ :retweet_threshold => 0,
295
+ :retweet_window => nil,
296
+ :csv => string_io,
297
+ :fields => %w[id retweet_count])
298
+ play_data builder
299
+ string_io.rewind
300
+ string_io.read.should == "\"3\",\"1\"\n\"2\",\"3\"\n\"4\",\"1\"\n\"5\",\"0\"\n"
301
+ end
302
+
303
+ it "logs at the hourly marks requested in retweet_counts_at" do
304
+ string_io = StringIO.new
305
+ builder = TwitterToCsv::CsvBuilder.new(:retweet_mode => :rollup,
306
+ :retweet_threshold => 1,
307
+ :retweet_window => 4,
308
+ :retweet_counts_at => [0.5, 23, 24, 48, 73, 1000],
309
+ :csv => string_io,
310
+ :fields => %w[id retweet_count])
311
+ play_data builder
312
+ string_io.rewind
313
+ string_io.read.should == "\"2\",\"3\",\"1\",\"1\",\"2\",\"2\",\"3\",\"3\"\n" +
314
+ "\"4\",\"1\",\"0\",\"0\",\"0\",\"0\",\"1\",\"1\"\n"
71
315
  end
72
316
  end
73
317
  end