twitter_to_csv 0.1.2 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +4 -0
- data/Gemfile +3 -1
- data/README.markdown +316 -153
- data/bin/twitter_to_csv +45 -10
- data/lib/twitter_to_csv.rb +1 -0
- data/lib/twitter_to_csv/bool_word_field_parser.rb +74 -0
- data/lib/twitter_to_csv/csv_builder.rb +50 -28
- data/lib/twitter_to_csv/twitter_watcher.rb +12 -5
- data/lib/twitter_to_csv/version.rb +1 -1
- data/spec/bool_word_field_parser_spec.rb +57 -0
- data/spec/csv_builder_spec.rb +104 -32
- data/twitter_to_csv.gemspec +1 -1
- metadata +14 -21
- data/.rvmrc +0 -1
data/bin/twitter_to_csv
CHANGED
@@ -5,20 +5,32 @@ require 'optparse'
|
|
5
5
|
require 'time'
|
6
6
|
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
|
7
7
|
|
8
|
-
options = { :fields => %w[created_at text] }
|
8
|
+
options = { :fields => %w[created_at text], :bool_word_fields => [] }
|
9
9
|
parser = OptionParser.new do |opts|
|
10
10
|
opts.banner = "Usage: #{File.basename($0)} [options]"
|
11
|
+
|
11
12
|
opts.separator ""
|
12
|
-
opts.separator "
|
13
|
+
opts.separator "These four fields are required. Please see the README to learn how to get them for your Twitter account."
|
14
|
+
|
15
|
+
opts.on("--api-key KEY", "Twitter API key") do |api_key|
|
16
|
+
options[:api_key] = api_key
|
17
|
+
end
|
13
18
|
|
14
|
-
opts.on("-
|
15
|
-
options[:
|
19
|
+
opts.on("--api-secret SECRET", "Twitter API secret") do |api_secret|
|
20
|
+
options[:api_secret] = api_secret
|
16
21
|
end
|
17
22
|
|
18
|
-
opts.on("-
|
19
|
-
options[:
|
23
|
+
opts.on("--access-token TOKEN", "Twitter access token") do |access_token|
|
24
|
+
options[:access_token] = access_token
|
20
25
|
end
|
21
26
|
|
27
|
+
opts.on("--access-token-secret SECRET", "Twitter access token secret") do |access_token_secret|
|
28
|
+
options[:access_token_secret] = access_token_secret
|
29
|
+
end
|
30
|
+
|
31
|
+
opts.separator ""
|
32
|
+
opts.separator "General settings:"
|
33
|
+
|
22
34
|
opts.on("-c", "--csv FILE", "The CSV file to append to, or - for STDOUT") do |csv|
|
23
35
|
options[:csv_appending] = File.exists?(csv)
|
24
36
|
options[:csv] = csv == "-" ? STDOUT : File.open(csv, 'a')
|
@@ -40,8 +52,12 @@ parser = OptionParser.new do |opts|
|
|
40
52
|
options[:date_fields] = date_fields.split(/\s*,\s*/)
|
41
53
|
end
|
42
54
|
|
43
|
-
opts.on("-e", "--require-english
|
44
|
-
|
55
|
+
opts.on("-e", "--require-english [STRATEGY]",
|
56
|
+
"Attempt to filter out non-English tweets. This will have both false positives and false negatives.",
|
57
|
+
"The strategy can be either 'uld' to use the UnsupervisedLanguageDetection Ruby gem,",
|
58
|
+
"'lang' to use Twitter's guessed 'lang' attribute, or 'both' to only remove tweets that",
|
59
|
+
"both Twitter and ULD think are non-English. This is most conservative and is the default.") do |e|
|
60
|
+
options[:require_english] = (e || "both").downcase.to_sym
|
45
61
|
end
|
46
62
|
|
47
63
|
opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
|
@@ -84,6 +100,25 @@ parser = OptionParser.new do |opts|
|
|
84
100
|
options[:normalize_source] = normalize_source
|
85
101
|
end
|
86
102
|
|
103
|
+
opts.on("--remove-quotes", "This option strips all double quotes from the output to help some CSV parsers.") do |remove_quotes|
|
104
|
+
options[:remove_quotes] = remove_quotes
|
105
|
+
end
|
106
|
+
|
107
|
+
opts.on("--prefix-ids", "Prefix any field ending in _id or _id_str with 'id' to force parsing as a string in some programs.") do |prefix_ids|
|
108
|
+
options[:prefix_ids] = prefix_ids
|
109
|
+
end
|
110
|
+
|
111
|
+
opts.on("-w", "--bool-word-field \"NAME:WORD AND WORD AND WORD\"",
|
112
|
+
"Create a named CSV column that is true when the word expression matches, false otherwise.",
|
113
|
+
"Word expressions are boolean expressions where neighboring words must occur sequentially",
|
114
|
+
"and you can use parentheses, AND, and OR to test for occurrence relationships. Examples:",
|
115
|
+
" keyword_any:tanning booth OR tanning booths OR tanningbooth",
|
116
|
+
" keyword_both:tanning AND booth",
|
117
|
+
" keyword_complex:tanning AND (booth OR bed)",
|
118
|
+
"This option can be used multiple times.") do |bool_word_field|
|
119
|
+
options[:bool_word_fields] << TwitterToCsv::BoolWordFieldParser.parse(bool_word_field)
|
120
|
+
end
|
121
|
+
|
87
122
|
opts.on("--start TIME", "Ignore tweets with a created_at earlier than TIME") do |start_time|
|
88
123
|
options[:start_time] = Time.parse(start_time)
|
89
124
|
end
|
@@ -125,8 +160,8 @@ end
|
|
125
160
|
|
126
161
|
parser.parse!
|
127
162
|
|
128
|
-
unless (options[:
|
129
|
-
STDERR.puts "Error: Twitter
|
163
|
+
unless (options[:api_key] && options[:api_secret] && options[:access_token] && options[:access_token_secret]) || options[:replay_from_file]
|
164
|
+
STDERR.puts "Error: The four Twitter credential fields are required unless you're replaying from a file.\n\n"
|
130
165
|
STDERR.puts parser
|
131
166
|
exit 1
|
132
167
|
end
|
data/lib/twitter_to_csv.rb
CHANGED
@@ -7,6 +7,7 @@ require 'twitter/json_stream'
|
|
7
7
|
require 'em-http-request'
|
8
8
|
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "twitter_watcher"))
|
9
9
|
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "csv_builder"))
|
10
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "bool_word_field_parser"))
|
10
11
|
require 'unsupervised-language-detection'
|
11
12
|
|
12
13
|
module TwitterToCsv
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module TwitterToCsv
|
4
|
+
class InvalidLogicError < StandardError; end
|
5
|
+
|
6
|
+
class BoolWordFieldParser
|
7
|
+
TOKEN_SEPARATOR = /[^a-zA-Z0-9-]+/
|
8
|
+
|
9
|
+
def self.parse(string)
|
10
|
+
parts = string.split(":")
|
11
|
+
name = parts.shift
|
12
|
+
tokens = parts.join(":").gsub(/\)/, " ) ").gsub(/\(/, " ( ").split(/\s+/).reject {|s| s.length == 0 }
|
13
|
+
struct = []
|
14
|
+
descend_parse(struct, tokens)
|
15
|
+
{ :name => name, :logic => struct }
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.descend_parse(struct, tokens)
|
19
|
+
while tokens.length > 0
|
20
|
+
token = tokens.shift
|
21
|
+
if token == ")"
|
22
|
+
return
|
23
|
+
elsif token == "("
|
24
|
+
if struct.length > 0
|
25
|
+
sub_struct = []
|
26
|
+
struct << sub_struct
|
27
|
+
descend_parse(sub_struct, tokens)
|
28
|
+
end
|
29
|
+
elsif %w[AND OR].include?(token)
|
30
|
+
sub_struct = []
|
31
|
+
struct << :and if token == "AND"
|
32
|
+
struct << :or if token == "OR"
|
33
|
+
struct << sub_struct
|
34
|
+
descend_parse(sub_struct, tokens)
|
35
|
+
else
|
36
|
+
if struct[0]
|
37
|
+
struct[0] += " " + token.downcase
|
38
|
+
else
|
39
|
+
struct << token.downcase
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.check(pattern, text)
|
46
|
+
logic = pattern[:logic]
|
47
|
+
tokens = text.downcase.split(TOKEN_SEPARATOR).reject {|t| t.length == 0 }.join(" ")
|
48
|
+
!!descend_check(logic, tokens)
|
49
|
+
end
|
50
|
+
|
51
|
+
def self.descend_check(logic, tokens)
|
52
|
+
if logic.is_a?(String)
|
53
|
+
# See if the token(s) are present.
|
54
|
+
tokens =~ /\b#{Regexp::escape logic}\b/
|
55
|
+
elsif logic.length == 1
|
56
|
+
# Recurse further.
|
57
|
+
descend_check logic.first, tokens
|
58
|
+
elsif logic.length == 3
|
59
|
+
# Apply the given logical operation.
|
60
|
+
first = descend_check(logic.first, tokens)
|
61
|
+
last = descend_check(logic.last, tokens)
|
62
|
+
if logic[1] == :and
|
63
|
+
first && last
|
64
|
+
elsif logic[1] == :or
|
65
|
+
first || last
|
66
|
+
else
|
67
|
+
raise InvalidLogicError.new("Unknown operation: #{logic[1]}")
|
68
|
+
end
|
69
|
+
else
|
70
|
+
raise InvalidLogicError.new("Invalid expression length of #{logic.length}")
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -25,7 +25,7 @@ module TwitterToCsv
|
|
25
25
|
handle_status status
|
26
26
|
end
|
27
27
|
rescue SignalException, SystemExit
|
28
|
-
EventMachine::stop_event_loop
|
28
|
+
EventMachine::stop_event_loop if EventMachine::reactor_running?
|
29
29
|
exit
|
30
30
|
rescue StandardError => e
|
31
31
|
STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
|
@@ -70,13 +70,9 @@ module TwitterToCsv
|
|
70
70
|
# This is an original status.
|
71
71
|
if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
|
72
72
|
if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
|
73
|
-
status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
|
73
|
+
status['retweet_count'] = @retweet_counts[status['id']] || 0 # if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
|
74
74
|
if options[:retweet_counts_at]
|
75
|
-
retweet_hour_data = @retweet_hour_counts.delete(status['id'])
|
76
|
-
if !retweet_hour_data
|
77
|
-
puts "Encountered missing retweet_data for tweet##{status['id']}, possibly due to a repeating id or a deleted tweet."
|
78
|
-
return false
|
79
|
-
end
|
75
|
+
retweet_hour_data = @retweet_hour_counts.delete(status['id']) || options[:retweet_counts_at].map { 0 }
|
80
76
|
status['_retweet_hour_counts'] = retweet_hour_data
|
81
77
|
end
|
82
78
|
true
|
@@ -90,8 +86,10 @@ module TwitterToCsv
|
|
90
86
|
end
|
91
87
|
|
92
88
|
def handle_status(status, &block)
|
93
|
-
if
|
94
|
-
|
89
|
+
if status.has_key?('delete')
|
90
|
+
STDERR.puts "Skipping Tweet with delete." if options[:verbose]
|
91
|
+
elsif within_time_window?(status)
|
92
|
+
if (options[:require_english] && is_english?(status, options[:require_english])) || !options[:require_english]
|
95
93
|
if options[:retweet_mode] != :rollup || display_rolledup_status?(status)
|
96
94
|
log_json(status) if options[:json]
|
97
95
|
log_csv(status) if options[:csv]
|
@@ -124,6 +122,10 @@ module TwitterToCsv
|
|
124
122
|
options[:hashtag_columns].times { |i| header_labels << "hash_tag_#{i+1}" } if options[:hashtag_columns] && options[:url_columns] > 0
|
125
123
|
options[:user_mention_columns].times { |i| header_labels << "user_mention_#{i+1}" } if options[:user_mention_columns] && options[:user_mention_columns] > 0
|
126
124
|
|
125
|
+
(options[:bool_word_fields] || []).each do |pattern|
|
126
|
+
header_labels << pattern[:name]
|
127
|
+
end
|
128
|
+
|
127
129
|
options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
|
128
130
|
end
|
129
131
|
|
@@ -137,9 +139,19 @@ module TwitterToCsv
|
|
137
139
|
|
138
140
|
def output_row(status)
|
139
141
|
row = options[:fields].map do |field|
|
140
|
-
field.split(".").inject(status) { |memo, segment|
|
142
|
+
value = field.split(".").inject(status) { |memo, segment|
|
141
143
|
memo && memo[segment]
|
142
144
|
}.to_s
|
145
|
+
|
146
|
+
if options[:prefix_ids]
|
147
|
+
value = "id" + value if value.length > 0 && (field =~ /\Aid_str|id\Z/ || field =~ /_id|_id_str\Z/)
|
148
|
+
end
|
149
|
+
|
150
|
+
if options[:remove_quotes]
|
151
|
+
value = value.gsub(/\"/, '')
|
152
|
+
end
|
153
|
+
|
154
|
+
value
|
143
155
|
end
|
144
156
|
|
145
157
|
row += compute_sentiment(status["text"]) if options[:compute_sentiment]
|
@@ -151,7 +163,7 @@ module TwitterToCsv
|
|
151
163
|
(options[:date_fields] || []).each do |date_field|
|
152
164
|
time = Time.parse(date_field.split(".").inject(status) { |memo, segment|
|
153
165
|
memo && memo[segment]
|
154
|
-
}.to_s)
|
166
|
+
}.to_s).utc
|
155
167
|
|
156
168
|
row << time.strftime("%w") # week_day
|
157
169
|
row << time.strftime("%-d") # day
|
@@ -179,6 +191,10 @@ module TwitterToCsv
|
|
179
191
|
options[:user_mention_columns].times { |i| row << users[i].to_s }
|
180
192
|
end
|
181
193
|
|
194
|
+
(options[:bool_word_fields] || []).each do |pattern|
|
195
|
+
row << (!!TwitterToCsv::BoolWordFieldParser.check(pattern, status["text"])).to_s
|
196
|
+
end
|
197
|
+
|
182
198
|
row
|
183
199
|
end
|
184
200
|
|
@@ -239,24 +255,30 @@ module TwitterToCsv
|
|
239
255
|
end
|
240
256
|
|
241
257
|
def sample_fields(status)
|
242
|
-
extract_fields
|
258
|
+
extract_fields status, sampled_fields
|
243
259
|
@num_samples += 1
|
244
260
|
if @num_samples > options[:sample_fields]
|
245
261
|
puts "Sampled fields from Twitter:"
|
246
262
|
sampled_fields.each do |field, count|
|
247
263
|
puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
|
248
264
|
end
|
249
|
-
exit
|
265
|
+
exit 0
|
250
266
|
end
|
251
267
|
end
|
252
268
|
|
253
|
-
def extract_fields(object, fields, current_path =
|
269
|
+
def extract_fields(object, fields, current_path = "")
|
254
270
|
if object.is_a?(Hash)
|
255
271
|
object.each do |k, v|
|
256
|
-
extract_fields v, fields, current_path +
|
272
|
+
extract_fields v, fields, current_path + "." + k.to_s
|
273
|
+
end
|
274
|
+
elsif object.is_a?(Array)
|
275
|
+
local_fields = {}
|
276
|
+
object.each do |v|
|
277
|
+
extract_fields v, local_fields, current_path + "[]"
|
257
278
|
end
|
279
|
+
local_fields.keys.each { |key| fields[key] ||= 0 ; fields[key] += 1 }
|
258
280
|
else
|
259
|
-
path = current_path
|
281
|
+
path = current_path[1..-1]
|
260
282
|
fields[path] ||= 0
|
261
283
|
fields[path] += 1
|
262
284
|
end
|
@@ -268,19 +290,19 @@ module TwitterToCsv
|
|
268
290
|
options[:json].flush
|
269
291
|
end
|
270
292
|
|
271
|
-
def is_english?(status)
|
272
|
-
|
273
|
-
|
274
|
-
return false
|
293
|
+
def is_english?(status, strategy)
|
294
|
+
unless strategy == :twitter
|
295
|
+
status['uld'] = !!UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
|
275
296
|
end
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
297
|
+
|
298
|
+
if strategy == :both && status['lang'] != 'en' && !status['uld']
|
299
|
+
STDERR.puts "Skipping \"#{status['text']}\" because both Twitter (#{status['lang']}) and UnsupervisedLanguageDetection think it is not English." if options[:verbose]
|
300
|
+
return false
|
301
|
+
elsif strategy == :uld && !status['uld']
|
302
|
+
STDERR.puts "Skipping \"#{status['text']}\" because UnsupervisedLanguageDetection thinks it is not English." if options[:verbose]
|
303
|
+
return false
|
304
|
+
elsif strategy == :twitter && status['lang'] != 'en'
|
305
|
+
STDERR.puts "Skipping \"#{status['text']}\" because Twitter (#{status['lang']}) thinks it is not English." if options[:verbose]
|
284
306
|
return false
|
285
307
|
end
|
286
308
|
|
@@ -2,11 +2,13 @@ require 'cgi'
|
|
2
2
|
|
3
3
|
module TwitterToCsv
|
4
4
|
class TwitterWatcher
|
5
|
-
attr_accessor :
|
5
|
+
attr_accessor :api_key, :api_secret, :access_token, :access_token_secret, :filter, :fetch_errors
|
6
6
|
|
7
7
|
def initialize(options)
|
8
|
-
@
|
9
|
-
@
|
8
|
+
@api_key = options[:api_key]
|
9
|
+
@api_secret = options[:api_secret]
|
10
|
+
@access_token = options[:access_token]
|
11
|
+
@access_token_secret = options[:access_token_secret]
|
10
12
|
@filter = options[:filter]
|
11
13
|
@fetch_errors = 0
|
12
14
|
end
|
@@ -23,8 +25,13 @@ module TwitterToCsv
|
|
23
25
|
EventMachine::run do
|
24
26
|
stream = Twitter::JSONStream.connect(
|
25
27
|
:path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.map {|f| CGI::escape(f) }.join(",")}" if filter && filter.length > 0}",
|
26
|
-
:
|
27
|
-
:
|
28
|
+
:ssl => true,
|
29
|
+
:oauth => {
|
30
|
+
:consumer_key => api_key,
|
31
|
+
:consumer_secret => api_secret,
|
32
|
+
:access_key => access_token,
|
33
|
+
:access_secret => access_token_secret
|
34
|
+
}
|
28
35
|
)
|
29
36
|
|
30
37
|
stream.each_item do |item|
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'time'
|
4
|
+
|
5
|
+
describe TwitterToCsv::BoolWordFieldParser do
|
6
|
+
describe "#parse" do
|
7
|
+
it "parses name:string AND string AND string... syntax" do
|
8
|
+
TwitterToCsv::BoolWordFieldParser.parse("something:string1 string2 AND string3 AND string4").should == {
|
9
|
+
:name => "something",
|
10
|
+
:logic => ["string1 string2", :and, ["string3", :and, ["string4"]]]
|
11
|
+
}
|
12
|
+
end
|
13
|
+
|
14
|
+
it "parses name:string OR string OR string... syntax" do
|
15
|
+
TwitterToCsv::BoolWordFieldParser.parse("something:string1 string2 OR string3 OR string4").should == {
|
16
|
+
:name => "something",
|
17
|
+
:logic => ["string1 string2", :or, ["string3", :or, ["string4"]]]
|
18
|
+
}
|
19
|
+
end
|
20
|
+
|
21
|
+
it "parses parens" do
|
22
|
+
TwitterToCsv::BoolWordFieldParser.parse("something_else:string1 STRING2 OR ( string3 AND (string4 OR string5 ))").should == {
|
23
|
+
:name => "something_else",
|
24
|
+
:logic => ["string1 string2", :or, ["string3", :and, ["string4", :or, ["string5"]]]]
|
25
|
+
}
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
describe "#check" do
|
30
|
+
it "returns true when an expression matches some text, false when it doesn't" do
|
31
|
+
pattern = TwitterToCsv::BoolWordFieldParser.parse("something_else:string1 string2 OR (string3 AND (string4 OR string5))")
|
32
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2").should be_true
|
33
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "string2 string1").should be_false
|
34
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "string1").should be_false
|
35
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "string2").should be_false
|
36
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "string3 string4").should be_true
|
37
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "string4 string3").should be_true
|
38
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "string5 string3").should be_true
|
39
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string3 string5 baz").should be_true
|
40
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string5 baz").should be_false
|
41
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string3 string4 string5 baz").should be_true
|
42
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string3 string5 baz string4").should be_true
|
43
|
+
TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2 string3 string4").should be_true
|
44
|
+
end
|
45
|
+
|
46
|
+
it "raises errors when the input is un-evaluable" do
|
47
|
+
pattern = TwitterToCsv::BoolWordFieldParser.parse("something_else:string1 string2 OR (string3 AND OR string5))")
|
48
|
+
lambda { TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
|
49
|
+
|
50
|
+
pattern = TwitterToCsv::BoolWordFieldParser.parse("hello (")
|
51
|
+
lambda { TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
|
52
|
+
|
53
|
+
pattern = TwitterToCsv::BoolWordFieldParser.parse("hello ()")
|
54
|
+
lambda { TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
data/spec/csv_builder_spec.rb
CHANGED
@@ -4,35 +4,73 @@ require 'time'
|
|
4
4
|
|
5
5
|
describe TwitterToCsv::CsvBuilder do
|
6
6
|
describe "#handle_status" do
|
7
|
-
describe "
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
7
|
+
describe "English language detection" do
|
8
|
+
describe "with the :uld strategy" do
|
9
|
+
it "uses the UnsupervisedLanguageDetection library to skip non-English tweets" do
|
10
|
+
string_io = StringIO.new
|
11
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text])
|
12
|
+
csv_builder.handle_status('text' => "This is English")
|
13
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
|
14
|
+
csv_builder.handle_status('text' => "This is still English")
|
15
|
+
string_io.rewind
|
16
|
+
string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
|
17
|
+
end
|
18
|
+
|
19
|
+
it "makes a new 'uld' variable available on the tweet" do
|
20
|
+
string_io = StringIO.new
|
21
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text uld])
|
22
|
+
csv_builder.handle_status('text' => "This is English")
|
23
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
|
24
|
+
csv_builder.handle_status('text' => "This is still English")
|
25
|
+
string_io.rewind
|
26
|
+
string_io.read.should == "\"This is English\",\"true\"\n\"This is still English\",\"true\"\n"
|
27
|
+
end
|
16
28
|
end
|
17
29
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
30
|
+
describe "with the :twitter strategy" do
|
31
|
+
it "uses Twitter's lang field to skip non-English tweets" do
|
32
|
+
string_io = StringIO.new
|
33
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :twitter, :csv => string_io, :fields => %w[text])
|
34
|
+
csv_builder.handle_status('text' => "This is English", 'lang' => 'en')
|
35
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'fr')
|
36
|
+
csv_builder.handle_status('text' => "This is still English", 'lang' => 'en')
|
37
|
+
string_io.rewind
|
38
|
+
string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
describe "with the :both strategy" do
|
43
|
+
it "returns tweets unless both ULD and Twitter agree that the Tweet is non-English" do
|
44
|
+
string_io = StringIO.new
|
45
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :both, :csv => string_io, :fields => %w[text])
|
46
|
+
csv_builder.handle_status('text' => "This is English", 'lang' => 'en') # agree
|
47
|
+
csv_builder.handle_status('text' => "This is English 2", 'lang' => 'fr') # disagree
|
48
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'en') # disagree
|
49
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles 2.", 'lang' => 'fr') # agree
|
50
|
+
csv_builder.handle_status('text' => "This is still English", 'lang' => 'en') # agree
|
51
|
+
string_io.rewind
|
52
|
+
string_io.read.should == "\"This is English\"\n\"This is English 2\"\n\"Esta frase se encuentra en Ingles.\"\n\"This is still English\"\n"
|
53
|
+
end
|
33
54
|
end
|
34
55
|
end
|
35
56
|
|
57
|
+
it "honors start_time and end_time" do
|
58
|
+
string_io = StringIO.new
|
59
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
|
60
|
+
:start_time => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
|
61
|
+
:end_time => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
|
62
|
+
|
63
|
+
# Order shouldn't matter
|
64
|
+
csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
|
65
|
+
csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
|
66
|
+
csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
|
67
|
+
csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
|
68
|
+
csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
|
69
|
+
csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
|
70
|
+
string_io.rewind
|
71
|
+
string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
|
72
|
+
end
|
73
|
+
|
36
74
|
describe "log_csv_header" do
|
37
75
|
it "outputs the fields as header labels" do
|
38
76
|
string_io = StringIO.new
|
@@ -169,17 +207,42 @@ describe TwitterToCsv::CsvBuilder do
|
|
169
207
|
"\"hello2\",\"2\"\n"
|
170
208
|
end
|
171
209
|
|
172
|
-
it "can
|
210
|
+
it "can extract boolean word fields" do
|
211
|
+
string_io = StringIO.new
|
212
|
+
patterns = [
|
213
|
+
TwitterToCsv::BoolWordFieldParser.parse("field1:hello AND world"),
|
214
|
+
TwitterToCsv::BoolWordFieldParser.parse("field2:hello"),
|
215
|
+
TwitterToCsv::BoolWordFieldParser.parse("field3:string OR text"),
|
216
|
+
TwitterToCsv::BoolWordFieldParser.parse("field3:hello this")
|
217
|
+
]
|
218
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :bool_word_fields => patterns)
|
219
|
+
csv_builder.handle_status({
|
220
|
+
'something' => "hello1",
|
221
|
+
'text' => 'hello this is a string'
|
222
|
+
|
223
|
+
})
|
224
|
+
csv_builder.handle_status({
|
225
|
+
'something' => "hello2",
|
226
|
+
'text' => 'hello world this is some text'
|
227
|
+
})
|
228
|
+
string_io.rewind
|
229
|
+
string_io.read.should == "\"hello1\",\"false\",\"true\",\"true\",\"true\"\n" +
|
230
|
+
"\"hello2\",\"true\",\"true\",\"true\",\"false\"\n"
|
231
|
+
end
|
232
|
+
|
233
|
+
it "can return date fields and convert them to UTC" do
|
173
234
|
string_io = StringIO.new
|
174
235
|
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :date_fields => %w[created_at])
|
175
236
|
csv_builder.handle_status({
|
176
237
|
'something' => "hello1",
|
177
238
|
'text' => 'i love cheese',
|
178
239
|
'created_at' => "2012-06-29 13:12:09 -0700"
|
179
|
-
|
180
240
|
})
|
181
241
|
string_io.rewind
|
182
|
-
|
242
|
+
time = Time.parse("2012-06-29 13:12:09 -0700").utc
|
243
|
+
string_io.read.should == '"' + ["hello1", time.strftime("%w"), time.strftime("%-d"),
|
244
|
+
time.strftime("%-m"), time.strftime("%Y"), time.strftime("%-H"),
|
245
|
+
time.strftime("%M"), time.strftime("%S")].join('","') + "\"\n"
|
183
246
|
end
|
184
247
|
|
185
248
|
it "can return a normalized source" do
|
@@ -349,24 +412,33 @@ describe TwitterToCsv::CsvBuilder do
|
|
349
412
|
end
|
350
413
|
|
351
414
|
describe "#extract_fields" do
|
352
|
-
it "finds all the paths through a
|
415
|
+
it "finds all the paths through a structure" do
|
353
416
|
obj = {
|
354
417
|
:a => :b,
|
355
418
|
:b => "c",
|
356
419
|
:d => {
|
357
420
|
:e => :f,
|
358
|
-
:g =>
|
421
|
+
:g => [
|
422
|
+
{
|
359
423
|
:h => :i,
|
360
424
|
:j => {
|
361
|
-
|
425
|
+
:k => "l"
|
362
426
|
}
|
363
|
-
|
427
|
+
},
|
428
|
+
{
|
429
|
+
:h => :i,
|
430
|
+
:j => {
|
431
|
+
:m => "n"
|
432
|
+
},
|
433
|
+
:hi => 2
|
434
|
+
}
|
435
|
+
],
|
364
436
|
:m => "n"
|
365
437
|
}
|
366
438
|
}
|
367
439
|
fields = { "a" => 1 }
|
368
440
|
TwitterToCsv::CsvBuilder.new.extract_fields(obj, fields)
|
369
|
-
fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g.h" => 1, "d.g.j.k" => 1, "d.m" => 1 }
|
441
|
+
fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g[].h" => 1, "d.g[].j.k" => 1, "d.g[].j.m" => 1, "d.g[].hi" => 1, "d.m" => 1 }
|
370
442
|
end
|
371
443
|
end
|
372
444
|
end
|