twitter_to_csv 0.1.2 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +4 -0
- data/Gemfile +3 -1
- data/README.markdown +316 -153
- data/bin/twitter_to_csv +45 -10
- data/lib/twitter_to_csv.rb +1 -0
- data/lib/twitter_to_csv/bool_word_field_parser.rb +74 -0
- data/lib/twitter_to_csv/csv_builder.rb +50 -28
- data/lib/twitter_to_csv/twitter_watcher.rb +12 -5
- data/lib/twitter_to_csv/version.rb +1 -1
- data/spec/bool_word_field_parser_spec.rb +57 -0
- data/spec/csv_builder_spec.rb +104 -32
- data/twitter_to_csv.gemspec +1 -1
- metadata +14 -21
- data/.rvmrc +0 -1
data/bin/twitter_to_csv
CHANGED
@@ -5,20 +5,32 @@ require 'optparse'
|
|
5
5
|
require 'time'
|
6
6
|
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
|
7
7
|
|
8
|
-
options = { :fields => %w[created_at text] }
|
8
|
+
options = { :fields => %w[created_at text], :bool_word_fields => [] }
|
9
9
|
parser = OptionParser.new do |opts|
|
10
10
|
opts.banner = "Usage: #{File.basename($0)} [options]"
|
11
|
+
|
11
12
|
opts.separator ""
|
12
|
-
opts.separator "
|
13
|
+
opts.separator "These four fields are required. Please see the README to learn how to get them for your Twitter account."
|
14
|
+
|
15
|
+
opts.on("--api-key KEY", "Twitter API key") do |api_key|
|
16
|
+
options[:api_key] = api_key
|
17
|
+
end
|
13
18
|
|
14
|
-
opts.on("-
|
15
|
-
options[:
|
19
|
+
opts.on("--api-secret SECRET", "Twitter API secret") do |api_secret|
|
20
|
+
options[:api_secret] = api_secret
|
16
21
|
end
|
17
22
|
|
18
|
-
opts.on("-
|
19
|
-
options[:
|
23
|
+
opts.on("--access-token TOKEN", "Twitter access token") do |access_token|
|
24
|
+
options[:access_token] = access_token
|
20
25
|
end
|
21
26
|
|
27
|
+
opts.on("--access-token-secret SECRET", "Twitter access token secret") do |access_token_secret|
|
28
|
+
options[:access_token_secret] = access_token_secret
|
29
|
+
end
|
30
|
+
|
31
|
+
opts.separator ""
|
32
|
+
opts.separator "General settings:"
|
33
|
+
|
22
34
|
opts.on("-c", "--csv FILE", "The CSV file to append to, or - for STDOUT") do |csv|
|
23
35
|
options[:csv_appending] = File.exists?(csv)
|
24
36
|
options[:csv] = csv == "-" ? STDOUT : File.open(csv, 'a')
|
@@ -40,8 +52,12 @@ parser = OptionParser.new do |opts|
|
|
40
52
|
options[:date_fields] = date_fields.split(/\s*,\s*/)
|
41
53
|
end
|
42
54
|
|
43
|
-
opts.on("-e", "--require-english
|
44
|
-
|
55
|
+
opts.on("-e", "--require-english [STRATEGY]",
|
56
|
+
"Attempt to filter out non-English tweets. This will have both false positives and false negatives.",
|
57
|
+
"The strategy can be either 'uld' to use the UnsupervisedLanguageDetection Ruby gem,",
|
58
|
+
"'lang' to use Twitter's guessed 'lang' attribute, or 'both' to only remove tweets that",
|
59
|
+
"both Twitter and ULD think are non-English. This is most conservative and is the default.") do |e|
|
60
|
+
options[:require_english] = (e || "both").downcase.to_sym
|
45
61
|
end
|
46
62
|
|
47
63
|
opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
|
@@ -84,6 +100,25 @@ parser = OptionParser.new do |opts|
|
|
84
100
|
options[:normalize_source] = normalize_source
|
85
101
|
end
|
86
102
|
|
103
|
+
opts.on("--remove-quotes", "This option strips all double quotes from the output to help some CSV parsers.") do |remove_quotes|
|
104
|
+
options[:remove_quotes] = remove_quotes
|
105
|
+
end
|
106
|
+
|
107
|
+
opts.on("--prefix-ids", "Prefix any field ending in _id or _id_str with 'id' to force parsing as a string in some programs.") do |prefix_ids|
|
108
|
+
options[:prefix_ids] = prefix_ids
|
109
|
+
end
|
110
|
+
|
111
|
+
opts.on("-w", "--bool-word-field \"NAME:WORD AND WORD AND WORD\"",
|
112
|
+
"Create a named CSV column that is true when the word expression matches, false otherwise.",
|
113
|
+
"Word expressions are boolean expressions where neighboring words must occur sequentially",
|
114
|
+
"and you can use parentheses, AND, and OR to test for occurrence relationships. Examples:",
|
115
|
+
" keyword_any:tanning booth OR tanning booths OR tanningbooth",
|
116
|
+
" keyword_both:tanning AND booth",
|
117
|
+
" keyword_complex:tanning AND (booth OR bed)",
|
118
|
+
"This option can be used multiple times.") do |bool_word_field|
|
119
|
+
options[:bool_word_fields] << TwitterToCsv::BoolWordFieldParser.parse(bool_word_field)
|
120
|
+
end
|
121
|
+
|
87
122
|
opts.on("--start TIME", "Ignore tweets with a created_at earlier than TIME") do |start_time|
|
88
123
|
options[:start_time] = Time.parse(start_time)
|
89
124
|
end
|
@@ -125,8 +160,8 @@ end
|
|
125
160
|
|
126
161
|
parser.parse!
|
127
162
|
|
128
|
-
unless (options[:
|
129
|
-
STDERR.puts "Error: Twitter
|
163
|
+
unless (options[:api_key] && options[:api_secret] && options[:access_token] && options[:access_token_secret]) || options[:replay_from_file]
|
164
|
+
STDERR.puts "Error: The four Twitter credential fields are required unless you're replaying from a file.\n\n"
|
130
165
|
STDERR.puts parser
|
131
166
|
exit 1
|
132
167
|
end
|
data/lib/twitter_to_csv.rb
CHANGED
@@ -7,6 +7,7 @@ require 'twitter/json_stream'
|
|
7
7
|
require 'em-http-request'
|
8
8
|
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "twitter_watcher"))
|
9
9
|
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "csv_builder"))
|
10
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "bool_word_field_parser"))
|
10
11
|
require 'unsupervised-language-detection'
|
11
12
|
|
12
13
|
module TwitterToCsv
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module TwitterToCsv
  # Raised by BoolWordFieldParser.check when a parsed expression cannot be
  # evaluated: a level that does not have exactly one or three elements, an
  # unknown operator, or a node that is neither a phrase nor a nested level.
  class InvalidLogicError < StandardError; end

  # Parses and evaluates "NAME:WORD AND WORD OR (WORD ...)" expressions.
  #
  # A parsed expression is a nested Array. Each level is either
  #   [phrase]                      - a single lowercase phrase, or
  #   [operand, :and | :or, level]  - an operand combined with the next level,
  # where a phrase is one or more neighboring words joined by single spaces.
  #
  # Parsing never validates the expression; malformed input is reported by
  # .check as an InvalidLogicError.
  #
  # Limitation: a parenthesized group is only recognized as the right-hand
  # side of an operator. "(a OR b) AND c" parses into a five-element level,
  # which .check rejects with InvalidLogicError.
  class BoolWordFieldParser
    # Runs of anything other than ASCII letters, digits and hyphens separate
    # the words of the text being checked.
    TOKEN_SEPARATOR = /[^a-zA-Z0-9-]+/

    # Splits "NAME:EXPRESSION" at the first colon and parses the expression.
    #
    # string - the raw option value, e.g. "keyword_both:tanning AND booth".
    #
    # Returns { :name => String or nil, :logic => Array }. Without a colon the
    # whole string becomes the name and :logic is empty.
    def self.parse(string)
      parts = string.split(":")
      name = parts.shift
      expression = parts.join(":")

      # Pad parentheses with spaces so they become tokens of their own.
      tokens = expression.gsub(")", " ) ").gsub("(", " ( ").split(/\s+/).reject(&:empty?)

      struct = []
      descend_parse(struct, tokens)
      { :name => name, :logic => struct }
    end

    # Consumes tokens (destructively) and appends the parsed level to struct.
    #
    # struct - the Array receiving this level's elements.
    # tokens - the remaining tokens; shared with, and shifted by, every
    #          recursive call.
    #
    # Returns nothing of interest.
    def self.descend_parse(struct, tokens)
      until tokens.empty?
        token = tokens.shift

        case token
        when ")"
          # Closes the current level, whether it was opened by a parenthesis
          # or by an operator.
          return
        when "("
          # A parenthesis opening an empty level adds no nesting; the level
          # created by the preceding operator already plays that role.
          next if struct.empty?

          sub_struct = []
          struct << sub_struct
          descend_parse(sub_struct, tokens)
        when "AND", "OR"
          # Everything after an operator belongs to a new, nested level.
          sub_struct = []
          struct << token.downcase.to_sym << sub_struct
          descend_parse(sub_struct, tokens)
        else
          word = token.downcase
          if struct.first.is_a?(String)
            # Neighboring words extend the phrase that starts this level.
            struct[0] += " " + word
          else
            # Either the level is empty, or it starts with an operator
            # (malformed input). Appending keeps parsing total; .check then
            # reports the malformed level instead of this method failing
            # with NoMethodError on Symbol#+.
            struct << word
          end
        end
      end
    end

    # Tests whether text satisfies a parsed pattern.
    #
    # pattern - a Hash as returned by .parse.
    # text    - the text to test; nil is treated as an empty string.
    #
    # Returns true or false.
    # Raises InvalidLogicError if the pattern's logic is malformed.
    def self.check(pattern, text)
      # Normalize the text to lowercase words separated by single spaces so
      # multi-word phrases can be matched with a plain regexp.
      normalized = text.to_s.downcase.split(TOKEN_SEPARATOR).reject(&:empty?).join(" ")
      !!descend_check(pattern[:logic], normalized)
    end

    # Recursively evaluates one node of a parsed expression.
    #
    # logic  - a phrase (String) or a nested level (Array).
    # tokens - the normalized text produced by .check.
    #
    # Returns a truthy value when the node matches, a falsy value otherwise.
    # Raises InvalidLogicError for any malformed node.
    def self.descend_check(logic, tokens)
      # NOTE(review): \b treats "-" as a boundary although TOKEN_SEPARATOR
      # keeps hyphens inside words, so "booth" matches "tanning-booth".
      return tokens =~ /\b#{Regexp.escape(logic)}\b/ if logic.is_a?(String)

      unless logic.is_a?(Array)
        raise InvalidLogicError, "Invalid expression node: #{logic.inspect}"
      end

      case logic.length
      when 1
        descend_check(logic.first, tokens)
      when 3
        # Both sides are evaluated before combining (no short-circuit) so a
        # malformed branch is always reported, even when the other branch
        # already decides the result.
        left = descend_check(logic.first, tokens)
        right = descend_check(logic.last, tokens)

        case logic[1]
        when :and then left && right
        when :or then left || right
        else raise InvalidLogicError, "Unknown operation: #{logic[1]}"
        end
      else
        raise InvalidLogicError, "Invalid expression length of #{logic.length}"
      end
    end
  end
end
|
@@ -25,7 +25,7 @@ module TwitterToCsv
|
|
25
25
|
handle_status status
|
26
26
|
end
|
27
27
|
rescue SignalException, SystemExit
|
28
|
-
EventMachine::stop_event_loop
|
28
|
+
EventMachine::stop_event_loop if EventMachine::reactor_running?
|
29
29
|
exit
|
30
30
|
rescue StandardError => e
|
31
31
|
STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
|
@@ -70,13 +70,9 @@ module TwitterToCsv
|
|
70
70
|
# This is an original status.
|
71
71
|
if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
|
72
72
|
if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
|
73
|
-
status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
|
73
|
+
status['retweet_count'] = @retweet_counts[status['id']] || 0 # if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
|
74
74
|
if options[:retweet_counts_at]
|
75
|
-
retweet_hour_data = @retweet_hour_counts.delete(status['id'])
|
76
|
-
if !retweet_hour_data
|
77
|
-
puts "Encountered missing retweet_data for tweet##{status['id']}, possibly due to a repeating id or a deleted tweet."
|
78
|
-
return false
|
79
|
-
end
|
75
|
+
retweet_hour_data = @retweet_hour_counts.delete(status['id']) || options[:retweet_counts_at].map { 0 }
|
80
76
|
status['_retweet_hour_counts'] = retweet_hour_data
|
81
77
|
end
|
82
78
|
true
|
@@ -90,8 +86,10 @@ module TwitterToCsv
|
|
90
86
|
end
|
91
87
|
|
92
88
|
def handle_status(status, &block)
|
93
|
-
if
|
94
|
-
|
89
|
+
if status.has_key?('delete')
|
90
|
+
STDERR.puts "Skipping Tweet with delete." if options[:verbose]
|
91
|
+
elsif within_time_window?(status)
|
92
|
+
if (options[:require_english] && is_english?(status, options[:require_english])) || !options[:require_english]
|
95
93
|
if options[:retweet_mode] != :rollup || display_rolledup_status?(status)
|
96
94
|
log_json(status) if options[:json]
|
97
95
|
log_csv(status) if options[:csv]
|
@@ -124,6 +122,10 @@ module TwitterToCsv
|
|
124
122
|
options[:hashtag_columns].times { |i| header_labels << "hash_tag_#{i+1}" } if options[:hashtag_columns] && options[:url_columns] > 0
|
125
123
|
options[:user_mention_columns].times { |i| header_labels << "user_mention_#{i+1}" } if options[:user_mention_columns] && options[:user_mention_columns] > 0
|
126
124
|
|
125
|
+
(options[:bool_word_fields] || []).each do |pattern|
|
126
|
+
header_labels << pattern[:name]
|
127
|
+
end
|
128
|
+
|
127
129
|
options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
|
128
130
|
end
|
129
131
|
|
@@ -137,9 +139,19 @@ module TwitterToCsv
|
|
137
139
|
|
138
140
|
def output_row(status)
|
139
141
|
row = options[:fields].map do |field|
|
140
|
-
field.split(".").inject(status) { |memo, segment|
|
142
|
+
value = field.split(".").inject(status) { |memo, segment|
|
141
143
|
memo && memo[segment]
|
142
144
|
}.to_s
|
145
|
+
|
146
|
+
if options[:prefix_ids]
|
147
|
+
value = "id" + value if value.length > 0 && (field =~ /\Aid_str|id\Z/ || field =~ /_id|_id_str\Z/)
|
148
|
+
end
|
149
|
+
|
150
|
+
if options[:remove_quotes]
|
151
|
+
value = value.gsub(/\"/, '')
|
152
|
+
end
|
153
|
+
|
154
|
+
value
|
143
155
|
end
|
144
156
|
|
145
157
|
row += compute_sentiment(status["text"]) if options[:compute_sentiment]
|
@@ -151,7 +163,7 @@ module TwitterToCsv
|
|
151
163
|
(options[:date_fields] || []).each do |date_field|
|
152
164
|
time = Time.parse(date_field.split(".").inject(status) { |memo, segment|
|
153
165
|
memo && memo[segment]
|
154
|
-
}.to_s)
|
166
|
+
}.to_s).utc
|
155
167
|
|
156
168
|
row << time.strftime("%w") # week_day
|
157
169
|
row << time.strftime("%-d") # day
|
@@ -179,6 +191,10 @@ module TwitterToCsv
|
|
179
191
|
options[:user_mention_columns].times { |i| row << users[i].to_s }
|
180
192
|
end
|
181
193
|
|
194
|
+
(options[:bool_word_fields] || []).each do |pattern|
|
195
|
+
row << (!!TwitterToCsv::BoolWordFieldParser.check(pattern, status["text"])).to_s
|
196
|
+
end
|
197
|
+
|
182
198
|
row
|
183
199
|
end
|
184
200
|
|
@@ -239,24 +255,30 @@ module TwitterToCsv
|
|
239
255
|
end
|
240
256
|
|
241
257
|
def sample_fields(status)
|
242
|
-
extract_fields
|
258
|
+
extract_fields status, sampled_fields
|
243
259
|
@num_samples += 1
|
244
260
|
if @num_samples > options[:sample_fields]
|
245
261
|
puts "Sampled fields from Twitter:"
|
246
262
|
sampled_fields.each do |field, count|
|
247
263
|
puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
|
248
264
|
end
|
249
|
-
exit
|
265
|
+
exit 0
|
250
266
|
end
|
251
267
|
end
|
252
268
|
|
253
|
-
def extract_fields(object, fields, current_path =
|
269
|
+
def extract_fields(object, fields, current_path = "")
|
254
270
|
if object.is_a?(Hash)
|
255
271
|
object.each do |k, v|
|
256
|
-
extract_fields v, fields, current_path +
|
272
|
+
extract_fields v, fields, current_path + "." + k.to_s
|
273
|
+
end
|
274
|
+
elsif object.is_a?(Array)
|
275
|
+
local_fields = {}
|
276
|
+
object.each do |v|
|
277
|
+
extract_fields v, local_fields, current_path + "[]"
|
257
278
|
end
|
279
|
+
local_fields.keys.each { |key| fields[key] ||= 0 ; fields[key] += 1 }
|
258
280
|
else
|
259
|
-
path = current_path
|
281
|
+
path = current_path[1..-1]
|
260
282
|
fields[path] ||= 0
|
261
283
|
fields[path] += 1
|
262
284
|
end
|
@@ -268,19 +290,19 @@ module TwitterToCsv
|
|
268
290
|
options[:json].flush
|
269
291
|
end
|
270
292
|
|
271
|
-
def is_english?(status)
|
272
|
-
|
273
|
-
|
274
|
-
return false
|
293
|
+
def is_english?(status, strategy)
|
294
|
+
unless strategy == :twitter
|
295
|
+
status['uld'] = !!UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
|
275
296
|
end
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
297
|
+
|
298
|
+
if strategy == :both && status['lang'] != 'en' && !status['uld']
|
299
|
+
STDERR.puts "Skipping \"#{status['text']}\" because both Twitter (#{status['lang']}) and UnsupervisedLanguageDetection think it is not English." if options[:verbose]
|
300
|
+
return false
|
301
|
+
elsif strategy == :uld && !status['uld']
|
302
|
+
STDERR.puts "Skipping \"#{status['text']}\" because UnsupervisedLanguageDetection thinks it is not English." if options[:verbose]
|
303
|
+
return false
|
304
|
+
elsif strategy == :twitter && status['lang'] != 'en'
|
305
|
+
STDERR.puts "Skipping \"#{status['text']}\" because Twitter (#{status['lang']}) thinks it is not English." if options[:verbose]
|
284
306
|
return false
|
285
307
|
end
|
286
308
|
|
@@ -2,11 +2,13 @@ require 'cgi'
|
|
2
2
|
|
3
3
|
module TwitterToCsv
|
4
4
|
class TwitterWatcher
|
5
|
-
attr_accessor :
|
5
|
+
attr_accessor :api_key, :api_secret, :access_token, :access_token_secret, :filter, :fetch_errors
|
6
6
|
|
7
7
|
def initialize(options)
|
8
|
-
@
|
9
|
-
@
|
8
|
+
@api_key = options[:api_key]
|
9
|
+
@api_secret = options[:api_secret]
|
10
|
+
@access_token = options[:access_token]
|
11
|
+
@access_token_secret = options[:access_token_secret]
|
10
12
|
@filter = options[:filter]
|
11
13
|
@fetch_errors = 0
|
12
14
|
end
|
@@ -23,8 +25,13 @@ module TwitterToCsv
|
|
23
25
|
EventMachine::run do
|
24
26
|
stream = Twitter::JSONStream.connect(
|
25
27
|
:path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.map {|f| CGI::escape(f) }.join(",")}" if filter && filter.length > 0}",
|
26
|
-
:
|
27
|
-
:
|
28
|
+
:ssl => true,
|
29
|
+
:oauth => {
|
30
|
+
:consumer_key => api_key,
|
31
|
+
:consumer_secret => api_secret,
|
32
|
+
:access_key => access_token,
|
33
|
+
:access_secret => access_token_secret
|
34
|
+
}
|
28
35
|
)
|
29
36
|
|
30
37
|
stream.each_item do |item|
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'time'
|
4
|
+
|
5
|
+
describe TwitterToCsv::BoolWordFieldParser do
  # Shorthand for the class under test; captured by the example blocks below.
  parser = TwitterToCsv::BoolWordFieldParser

  describe "#parse" do
    it "parses name:string AND string AND string... syntax" do
      expected = {
        :name => "something",
        :logic => ["string1 string2", :and, ["string3", :and, ["string4"]]]
      }
      parser.parse("something:string1 string2 AND string3 AND string4").should == expected
    end

    it "parses name:string OR string OR string... syntax" do
      expected = {
        :name => "something",
        :logic => ["string1 string2", :or, ["string3", :or, ["string4"]]]
      }
      parser.parse("something:string1 string2 OR string3 OR string4").should == expected
    end

    it "parses parens" do
      expected = {
        :name => "something_else",
        :logic => ["string1 string2", :or, ["string3", :and, ["string4", :or, ["string5"]]]]
      }
      parser.parse("something_else:string1 STRING2 OR ( string3 AND (string4 OR string5 ))").should == expected
    end
  end

  describe "#check" do
    it "returns true when an expression matches some text, false when it doesn't" do
      pattern = parser.parse("something_else:string1 string2 OR (string3 AND (string4 OR string5))")

      # Each pair is [text, whether the pattern is expected to match it].
      cases = [
        ["string1 string2", true],
        ["string2 string1", false],
        ["string1", false],
        ["string2", false],
        ["string3 string4", true],
        ["string4 string3", true],
        ["string5 string3", true],
        ["foo bar string3 string5 baz", true],
        ["foo bar string5 baz", false],
        ["foo bar string3 string4 string5 baz", true],
        ["foo bar string3 string5 baz string4", true],
        ["string1 string2 string3 string4", true]
      ]

      cases.each do |text, should_match|
        result = parser.check(pattern, text)
        if should_match
          result.should be_true
        else
          result.should be_false
        end
      end
    end

    it "raises errors when the input is un-evaluable" do
      # Parsing succeeds for all of these; evaluation is what must fail.
      unevaluable = [
        "something_else:string1 string2 OR (string3 AND OR string5))",
        "hello (",
        "hello ()"
      ]

      unevaluable.each do |expression|
        pattern = parser.parse(expression)
        lambda { parser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
      end
    end
  end
end
|
data/spec/csv_builder_spec.rb
CHANGED
@@ -4,35 +4,73 @@ require 'time'
|
|
4
4
|
|
5
5
|
describe TwitterToCsv::CsvBuilder do
|
6
6
|
describe "#handle_status" do
|
7
|
-
describe "
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
7
|
+
describe "English language detection" do
|
8
|
+
describe "with the :uld strategy" do
|
9
|
+
it "uses the UnsupervisedLanguageDetection library to skip non-English tweets" do
|
10
|
+
string_io = StringIO.new
|
11
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text])
|
12
|
+
csv_builder.handle_status('text' => "This is English")
|
13
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
|
14
|
+
csv_builder.handle_status('text' => "This is still English")
|
15
|
+
string_io.rewind
|
16
|
+
string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
|
17
|
+
end
|
18
|
+
|
19
|
+
it "makes a new 'uld' variable available on the tweet" do
|
20
|
+
string_io = StringIO.new
|
21
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text uld])
|
22
|
+
csv_builder.handle_status('text' => "This is English")
|
23
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
|
24
|
+
csv_builder.handle_status('text' => "This is still English")
|
25
|
+
string_io.rewind
|
26
|
+
string_io.read.should == "\"This is English\",\"true\"\n\"This is still English\",\"true\"\n"
|
27
|
+
end
|
16
28
|
end
|
17
29
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
30
|
+
describe "with the :twitter strategy" do
|
31
|
+
it "uses Twitter's lang field to skip non-English tweets" do
|
32
|
+
string_io = StringIO.new
|
33
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :twitter, :csv => string_io, :fields => %w[text])
|
34
|
+
csv_builder.handle_status('text' => "This is English", 'lang' => 'en')
|
35
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'fr')
|
36
|
+
csv_builder.handle_status('text' => "This is still English", 'lang' => 'en')
|
37
|
+
string_io.rewind
|
38
|
+
string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
describe "with the :both strategy" do
|
43
|
+
it "returns tweets unless both ULD and Twitter agree that the Tweet is non-English" do
|
44
|
+
string_io = StringIO.new
|
45
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :both, :csv => string_io, :fields => %w[text])
|
46
|
+
csv_builder.handle_status('text' => "This is English", 'lang' => 'en') # agree
|
47
|
+
csv_builder.handle_status('text' => "This is English 2", 'lang' => 'fr') # disagree
|
48
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'en') # disagree
|
49
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles 2.", 'lang' => 'fr') # agree
|
50
|
+
csv_builder.handle_status('text' => "This is still English", 'lang' => 'en') # agree
|
51
|
+
string_io.rewind
|
52
|
+
string_io.read.should == "\"This is English\"\n\"This is English 2\"\n\"Esta frase se encuentra en Ingles.\"\n\"This is still English\"\n"
|
53
|
+
end
|
33
54
|
end
|
34
55
|
end
|
35
56
|
|
57
|
+
it "honors start_time and end_time" do
|
58
|
+
string_io = StringIO.new
|
59
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
|
60
|
+
:start_time => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
|
61
|
+
:end_time => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
|
62
|
+
|
63
|
+
# Order shouldn't matter
|
64
|
+
csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
|
65
|
+
csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
|
66
|
+
csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
|
67
|
+
csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
|
68
|
+
csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
|
69
|
+
csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
|
70
|
+
string_io.rewind
|
71
|
+
string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
|
72
|
+
end
|
73
|
+
|
36
74
|
describe "log_csv_header" do
|
37
75
|
it "outputs the fields as header labels" do
|
38
76
|
string_io = StringIO.new
|
@@ -169,17 +207,42 @@ describe TwitterToCsv::CsvBuilder do
|
|
169
207
|
"\"hello2\",\"2\"\n"
|
170
208
|
end
|
171
209
|
|
172
|
-
it "can
|
210
|
+
it "can extract boolean word fields" do
|
211
|
+
string_io = StringIO.new
|
212
|
+
patterns = [
|
213
|
+
TwitterToCsv::BoolWordFieldParser.parse("field1:hello AND world"),
|
214
|
+
TwitterToCsv::BoolWordFieldParser.parse("field2:hello"),
|
215
|
+
TwitterToCsv::BoolWordFieldParser.parse("field3:string OR text"),
|
216
|
+
TwitterToCsv::BoolWordFieldParser.parse("field3:hello this")
|
217
|
+
]
|
218
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :bool_word_fields => patterns)
|
219
|
+
csv_builder.handle_status({
|
220
|
+
'something' => "hello1",
|
221
|
+
'text' => 'hello this is a string'
|
222
|
+
|
223
|
+
})
|
224
|
+
csv_builder.handle_status({
|
225
|
+
'something' => "hello2",
|
226
|
+
'text' => 'hello world this is some text'
|
227
|
+
})
|
228
|
+
string_io.rewind
|
229
|
+
string_io.read.should == "\"hello1\",\"false\",\"true\",\"true\",\"true\"\n" +
|
230
|
+
"\"hello2\",\"true\",\"true\",\"true\",\"false\"\n"
|
231
|
+
end
|
232
|
+
|
233
|
+
it "can return date fields and convert them to UTC" do
|
173
234
|
string_io = StringIO.new
|
174
235
|
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :date_fields => %w[created_at])
|
175
236
|
csv_builder.handle_status({
|
176
237
|
'something' => "hello1",
|
177
238
|
'text' => 'i love cheese',
|
178
239
|
'created_at' => "2012-06-29 13:12:09 -0700"
|
179
|
-
|
180
240
|
})
|
181
241
|
string_io.rewind
|
182
|
-
|
242
|
+
time = Time.parse("2012-06-29 13:12:09 -0700").utc
|
243
|
+
string_io.read.should == '"' + ["hello1", time.strftime("%w"), time.strftime("%-d"),
|
244
|
+
time.strftime("%-m"), time.strftime("%Y"), time.strftime("%-H"),
|
245
|
+
time.strftime("%M"), time.strftime("%S")].join('","') + "\"\n"
|
183
246
|
end
|
184
247
|
|
185
248
|
it "can return a normalized source" do
|
@@ -349,24 +412,33 @@ describe TwitterToCsv::CsvBuilder do
|
|
349
412
|
end
|
350
413
|
|
351
414
|
describe "#extract_fields" do
|
352
|
-
it "finds all the paths through a
|
415
|
+
it "finds all the paths through a structure" do
|
353
416
|
obj = {
|
354
417
|
:a => :b,
|
355
418
|
:b => "c",
|
356
419
|
:d => {
|
357
420
|
:e => :f,
|
358
|
-
:g =>
|
421
|
+
:g => [
|
422
|
+
{
|
359
423
|
:h => :i,
|
360
424
|
:j => {
|
361
|
-
|
425
|
+
:k => "l"
|
362
426
|
}
|
363
|
-
|
427
|
+
},
|
428
|
+
{
|
429
|
+
:h => :i,
|
430
|
+
:j => {
|
431
|
+
:m => "n"
|
432
|
+
},
|
433
|
+
:hi => 2
|
434
|
+
}
|
435
|
+
],
|
364
436
|
:m => "n"
|
365
437
|
}
|
366
438
|
}
|
367
439
|
fields = { "a" => 1 }
|
368
440
|
TwitterToCsv::CsvBuilder.new.extract_fields(obj, fields)
|
369
|
-
fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g.h" => 1, "d.g.j.k" => 1, "d.m" => 1 }
|
441
|
+
fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g[].h" => 1, "d.g[].j.k" => 1, "d.g[].j.m" => 1, "d.g[].hi" => 1, "d.m" => 1 }
|
370
442
|
end
|
371
443
|
end
|
372
444
|
end
|