twitter_to_csv 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +7 -0
- data/README.markdown +10 -2
- data/bin/twitter_to_csv +8 -0
- data/lib/twitter_to_csv/csv_builder.rb +39 -14
- data/lib/twitter_to_csv/twitter_watcher.rb +4 -2
- data/lib/twitter_to_csv/version.rb +1 -1
- data/spec/csv_builder_spec.rb +36 -4
- metadata +7 -4
data/LICENSE.txt
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
Copyright (c) 2012 Andrew Cantino, Iteration Labs, LLC
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
4
|
+
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
6
|
+
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
CHANGED
@@ -49,7 +49,12 @@ separate words unless the whole thing has a single known valence.
|
|
49
49
|
|
50
50
|
Once you have a recorded Twitter stream, you can roll up retweets in various ways. Here is an example that collapses retweets into the `retweet_count` field of the original tweet, only outputs tweets with at least 1 retweet, ignores retweets that happened more than 7 days after the original tweet, and outputs retweet count columns at half an hour, 2 hours, and 2 days after the original tweet:
|
51
51
|
|
52
|
-
twitter_to_csv --replay-from-file out.json -c out.csv
|
52
|
+
twitter_to_csv --replay-from-file out.json -c out.csv \
|
53
|
+
--retweet-mode rollup \
|
54
|
+
--retweet-threshold 1 \
|
55
|
+
--retweet-window 7 \
|
56
|
+
--retweet-counts-at 0.5,2,48 \
|
57
|
+
--fields retweet_count,text
|
53
58
|
|
54
59
|
Note that all of the retweet features require you to `--replay-from-file` because they parse the stream backwards. They will not function correctly from the stream directly.
|
55
60
|
|
@@ -57,7 +62,10 @@ Note that all of the retweet features require you to `--replay-from-file` becaus
|
|
57
62
|
|
58
63
|
To select a specific window of time in a pre-recorded stream by `created_at`, pass in `--start` and `--end`, for example:
|
59
64
|
|
60
|
-
twitter_to_csv --replay-from-file out.json
|
65
|
+
twitter_to_csv --replay-from-file out.json \
|
66
|
+
--start "Mon Mar 07 07:42:22 +0000 2011" \
|
67
|
+
--end "Tue Mar 08 07:42:22 +0000 2011" \
|
68
|
+
...
|
61
69
|
|
62
70
|
## Mind the Gap
|
63
71
|
|
data/bin/twitter_to_csv
CHANGED
@@ -36,6 +36,10 @@ parser = OptionParser.new do |opts|
|
|
36
36
|
options[:fields] = fields.split(/\s*,\s*/)
|
37
37
|
end
|
38
38
|
|
39
|
+
opts.on("--date-fields FIELD_NAMES", "Break these fields into separate numerical columns for weekday, day, month, year, hour, minute, and second.") do |date_fields|
|
40
|
+
options[:date_fields] = date_fields.split(/\s*,\s*/)
|
41
|
+
end
|
42
|
+
|
39
43
|
opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
|
40
44
|
options[:require_english] = e
|
41
45
|
end
|
@@ -76,6 +80,10 @@ parser = OptionParser.new do |opts|
|
|
76
80
|
options[:compute_word_count] = compute_word_count
|
77
81
|
end
|
78
82
|
|
83
|
+
opts.on("--normalize-source", "Return just the domain name from the Tweet source (i.e., tweetdeck, facebook)") do |normalize_source|
|
84
|
+
options[:normalize_source] = normalize_source
|
85
|
+
end
|
86
|
+
|
79
87
|
opts.on("--start TIME", "Ignore tweets with a created_at earlier than TIME") do |start_time|
|
80
88
|
options[:start_time] = Time.parse(start_time)
|
81
89
|
end
|
@@ -37,10 +37,10 @@ module TwitterToCsv
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def within_time_window?(status)
|
40
|
-
if options[:
|
40
|
+
if options[:start_time] || options[:end_time]
|
41
41
|
created_at = status['created_at'].is_a?(Time) ? status['created_at'] : Time.parse(status['created_at'])
|
42
|
-
return false if options[:
|
43
|
-
return false if options[:
|
42
|
+
return false if options[:start_time] && created_at < options[:start_time]
|
43
|
+
return false if options[:end_time] && created_at >= options[:end_time]
|
44
44
|
end
|
45
45
|
true
|
46
46
|
end
|
@@ -56,7 +56,6 @@ module TwitterToCsv
|
|
56
56
|
@retweet_counts[status['retweeted_status']['id']] ||= 0
|
57
57
|
@retweet_counts[status['retweeted_status']['id']] = status['retweeted_status']['retweet_count'] if status['retweeted_status']['retweet_count'] > @retweet_counts[status['retweeted_status']['id']]
|
58
58
|
|
59
|
-
|
60
59
|
if options[:retweet_counts_at]
|
61
60
|
@retweet_hour_counts[status['retweeted_status']['id']] ||= options[:retweet_counts_at].map { 0 }
|
62
61
|
options[:retweet_counts_at].each.with_index do |hour_mark, index|
|
@@ -72,7 +71,14 @@ module TwitterToCsv
|
|
72
71
|
if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
|
73
72
|
if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
|
74
73
|
status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
|
75
|
-
|
74
|
+
if options[:retweet_counts_at]
|
75
|
+
retweet_hour_data = @retweet_hour_counts.delete(status['id'])
|
76
|
+
if !retweet_hour_data
|
77
|
+
puts "Encountered missing retweet_data for tweet##{status['id']}, possibly due to a repeating id or a deleted tweet."
|
78
|
+
return false
|
79
|
+
end
|
80
|
+
status['_retweet_hour_counts'] = retweet_hour_data
|
81
|
+
end
|
76
82
|
true
|
77
83
|
else
|
78
84
|
false
|
@@ -104,6 +110,14 @@ module TwitterToCsv
|
|
104
110
|
header_labels += ["average_sentiment", "sentiment_words"] if options[:compute_sentiment]
|
105
111
|
header_labels << "word_count" if options[:compute_word_count]
|
106
112
|
|
113
|
+
header_labels << "normalized_source" if options[:normalize_source]
|
114
|
+
|
115
|
+
(options[:date_fields] || []).each do |date_field|
|
116
|
+
%w[week_day day month year hour minute second].each do |value|
|
117
|
+
header_labels << "#{date_field}_#{value}"
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
107
121
|
options[:retweet_counts_at].each { |hours| header_labels << "retweets_at_#{hours}_hours" } if options[:retweet_counts_at]
|
108
122
|
|
109
123
|
options[:url_columns].times { |i| header_labels << "url_#{i+1}" } if options[:url_columns] && options[:url_columns] > 0
|
@@ -132,6 +146,22 @@ module TwitterToCsv
|
|
132
146
|
|
133
147
|
row << status["text"].split(/\s+/).length if options[:compute_word_count]
|
134
148
|
|
149
|
+
row << status["source"].gsub(/<[^>]+>/, '').strip if options[:normalize_source]
|
150
|
+
|
151
|
+
(options[:date_fields] || []).each do |date_field|
|
152
|
+
time = Time.parse(date_field.split(".").inject(status) { |memo, segment|
|
153
|
+
memo && memo[segment]
|
154
|
+
}.to_s)
|
155
|
+
|
156
|
+
row << time.strftime("%w") # week_day
|
157
|
+
row << time.strftime("%-d") # day
|
158
|
+
row << time.strftime("%-m") # month
|
159
|
+
row << time.strftime("%Y") # year
|
160
|
+
row << time.strftime("%-H") # hour
|
161
|
+
row << time.strftime("%M") # minute
|
162
|
+
row << time.strftime("%S") # second
|
163
|
+
end
|
164
|
+
|
135
165
|
row += status["_retweet_hour_counts"] if options[:retweet_counts_at]
|
136
166
|
|
137
167
|
if options[:url_columns] && options[:url_columns] > 0
|
@@ -244,15 +274,10 @@ module TwitterToCsv
|
|
244
274
|
return false
|
245
275
|
end
|
246
276
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
end
|
251
|
-
|
252
|
-
unless status['user']['lang'] == "en"
|
253
|
-
STDERR.puts "Skipping \"#{status['text']}\" due to lang of #{status['user']['lang']}." if options[:verbose]
|
254
|
-
return false
|
255
|
-
end
|
277
|
+
#unless status['user']['lang'] == "en"
|
278
|
+
# STDERR.puts "Skipping \"#{status['text']}\" due to lang of #{status['user']['lang']}." if options[:verbose]
|
279
|
+
# return false
|
280
|
+
#end
|
256
281
|
|
257
282
|
unless UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
|
258
283
|
STDERR.puts "Skipping \"#{status['text']}\" due to UnsupervisedLanguageDetection guessing non-English" if options[:verbose]
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
|
1
3
|
module TwitterToCsv
|
2
4
|
class TwitterWatcher
|
3
5
|
attr_accessor :username, :password, :filter, :fetch_errors
|
@@ -20,7 +22,7 @@ module TwitterToCsv
|
|
20
22
|
while true
|
21
23
|
EventMachine::run do
|
22
24
|
stream = Twitter::JSONStream.connect(
|
23
|
-
:path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.join(",")}" if filter && filter.length > 0}",
|
25
|
+
:path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.map {|f| CGI::escape(f) }.join(",")}" if filter && filter.length > 0}",
|
24
26
|
:auth => "#{username}:#{password}",
|
25
27
|
:ssl => true
|
26
28
|
)
|
@@ -55,4 +57,4 @@ module TwitterToCsv
|
|
55
57
|
block.call(status)
|
56
58
|
end
|
57
59
|
end
|
58
|
-
end
|
60
|
+
end
|
data/spec/csv_builder_spec.rb
CHANGED
@@ -9,10 +9,8 @@ describe TwitterToCsv::CsvBuilder do
|
|
9
9
|
string_io = StringIO.new
|
10
10
|
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => true, :csv => string_io, :fields => %w[text])
|
11
11
|
csv_builder.handle_status('text' => "This is English", 'user' => { 'lang' => 'en' })
|
12
|
-
csv_builder.handle_status('text' => "هذه الجملة باللغة الإنجليزية.", 'user' => { 'lang' => 'en' })
|
13
12
|
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'user' => { 'lang' => 'en' })
|
14
13
|
csv_builder.handle_status('text' => "This is still English", 'user' => { 'lang' => 'en' })
|
15
|
-
csv_builder.handle_status('text' => "The lang code can lie, but we trust it for now.", 'user' => { 'lang' => 'fr' })
|
16
14
|
string_io.rewind
|
17
15
|
string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
|
18
16
|
end
|
@@ -20,8 +18,8 @@ describe TwitterToCsv::CsvBuilder do
|
|
20
18
|
it "honors start_time and end_time" do
|
21
19
|
string_io = StringIO.new
|
22
20
|
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
|
23
|
-
:
|
24
|
-
:
|
21
|
+
:start_time => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
|
22
|
+
:end_time => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
|
25
23
|
|
26
24
|
# Order shouldn't matter
|
27
25
|
csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
|
@@ -52,6 +50,14 @@ describe TwitterToCsv::CsvBuilder do
|
|
52
50
|
string_io.read.should == '"something","url_1","url_2"' + "\n"
|
53
51
|
end
|
54
52
|
|
53
|
+
it "includes date fields if requested" do
|
54
|
+
string_io = StringIO.new
|
55
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :date_fields => %w[created_at])
|
56
|
+
csv_builder.log_csv_header
|
57
|
+
string_io.rewind
|
58
|
+
string_io.read.should == '"something","created_at_week_day","created_at_day","created_at_month","created_at_year","created_at_hour","created_at_minute","created_at_second"' + "\n"
|
59
|
+
end
|
60
|
+
|
55
61
|
it "includes columns for the retweet_counts_at entries, if present" do
|
56
62
|
string_io = StringIO.new
|
57
63
|
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io,
|
@@ -162,6 +168,32 @@ describe TwitterToCsv::CsvBuilder do
|
|
162
168
|
string_io.read.should == "\"hello1\",\"3\"\n" +
|
163
169
|
"\"hello2\",\"2\"\n"
|
164
170
|
end
|
171
|
+
|
172
|
+
it "can return date fields" do
|
173
|
+
string_io = StringIO.new
|
174
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :date_fields => %w[created_at])
|
175
|
+
csv_builder.handle_status({
|
176
|
+
'something' => "hello1",
|
177
|
+
'text' => 'i love cheese',
|
178
|
+
'created_at' => "2012-06-29 13:12:09 -0700"
|
179
|
+
|
180
|
+
})
|
181
|
+
string_io.rewind
|
182
|
+
string_io.read.should == "\"hello1\",\"5\",\"29\",\"6\",\"2012\",\"13\",\"12\",\"09\"\n"
|
183
|
+
end
|
184
|
+
|
185
|
+
it "can return a normalized source" do
|
186
|
+
string_io = StringIO.new
|
187
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :normalize_source => true)
|
188
|
+
csv_builder.handle_status({
|
189
|
+
'something' => "hello1",
|
190
|
+
'text' => 'i love cheese',
|
191
|
+
'source' => "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>"
|
192
|
+
|
193
|
+
})
|
194
|
+
string_io.rewind
|
195
|
+
string_io.read.should == "\"hello1\",\"Twitter for Android\"\n"
|
196
|
+
end
|
165
197
|
end
|
166
198
|
|
167
199
|
describe "retweet handling" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_to_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07
|
12
|
+
date: 2012-08-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -103,6 +103,7 @@ files:
|
|
103
103
|
- .rspec
|
104
104
|
- .rvmrc
|
105
105
|
- Gemfile
|
106
|
+
- LICENSE.txt
|
106
107
|
- README.markdown
|
107
108
|
- Rakefile
|
108
109
|
- bin/twitter_to_csv
|
@@ -136,9 +137,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
136
137
|
version: '0'
|
137
138
|
requirements: []
|
138
139
|
rubyforge_project: twitter_to_csv
|
139
|
-
rubygems_version: 1.8.
|
140
|
+
rubygems_version: 1.8.21
|
140
141
|
signing_key:
|
141
142
|
specification_version: 3
|
142
143
|
summary: Dump the Twitter streaming API to a CSV or JSON file and then filter, handle
|
143
144
|
retweets, apply sentiment analysis, and more.
|
144
|
-
test_files:
|
145
|
+
test_files:
|
146
|
+
- spec/csv_builder_spec.rb
|
147
|
+
- spec/spec_helper.rb
|