twitter_to_csv 0.1.2 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,20 +5,32 @@ require 'optparse'
5
5
  require 'time'
6
6
  require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
7
7
 
8
- options = { :fields => %w[created_at text] }
8
+ options = { :fields => %w[created_at text], :bool_word_fields => [] }
9
9
  parser = OptionParser.new do |opts|
10
10
  opts.banner = "Usage: #{File.basename($0)} [options]"
11
+
11
12
  opts.separator ""
12
- opts.separator "Specific options:"
13
+ opts.separator "These four fields are required. Please see the README to learn how to get them for your Twitter account."
14
+
15
+ opts.on("--api-key KEY", "Twitter API key") do |api_key|
16
+ options[:api_key] = api_key
17
+ end
13
18
 
14
- opts.on("-u", "--username USERNAME", "Twitter username") do |username|
15
- options[:username] = username
19
+ opts.on("--api-secret SECRET", "Twitter API secret") do |api_secret|
20
+ options[:api_secret] = api_secret
16
21
  end
17
22
 
18
- opts.on("-p", "--password PASSWORD", "Twitter password") do |password|
19
- options[:password] = password
23
+ opts.on("--access-token TOKEN", "Twitter access token") do |access_token|
24
+ options[:access_token] = access_token
20
25
  end
21
26
 
27
+ opts.on("--access-token-secret SECRET", "Twitter access token secret") do |access_token_secret|
28
+ options[:access_token_secret] = access_token_secret
29
+ end
30
+
31
+ opts.separator ""
32
+ opts.separator "General settings:"
33
+
22
34
  opts.on("-c", "--csv FILE", "The CSV file to append to, or - for STDOUT") do |csv|
23
35
  options[:csv_appending] = File.exists?(csv)
24
36
  options[:csv] = csv == "-" ? STDOUT : File.open(csv, 'a')
@@ -40,8 +52,12 @@ parser = OptionParser.new do |opts|
40
52
  options[:date_fields] = date_fields.split(/\s*,\s*/)
41
53
  end
42
54
 
43
- opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
44
- options[:require_english] = e
55
+ opts.on("-e", "--require-english [STRATEGY]",
56
+ "Attempt to filter out non-English tweets. This will have both false positives and false negatives.",
57
+ "The strategy can be either 'uld' to use the UnsupervisedLanguageDetection Ruby gem,",
58
+ "'lang' to use Twitter's guessed 'lang' attribute, or 'both' to only remove tweets that",
59
+ "both Twitter and ULD think are non-English. This is most conservative and is the default.") do |e|
60
+ options[:require_english] = (e || "both").downcase.to_sym
45
61
  end
46
62
 
47
63
  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
@@ -84,6 +100,25 @@ parser = OptionParser.new do |opts|
84
100
  options[:normalize_source] = normalize_source
85
101
  end
86
102
 
103
+ opts.on("--remove-quotes", "This option strips all double quotes from the output to help some CSV parsers.") do |remove_quotes|
104
+ options[:remove_quotes] = remove_quotes
105
+ end
106
+
107
+ opts.on("--prefix-ids", "Prefix any field ending in _id or _id_str with 'id' to force parsing as a string in some programs.") do |prefix_ids|
108
+ options[:prefix_ids] = prefix_ids
109
+ end
110
+
111
+ opts.on("-w", "--bool-word-field \"NAME:WORD AND WORD AND WORD\"",
112
+ "Create a named CSV column that is true when the word expression matches, false otherwise.",
113
+ "Word expressions are boolean expressions where neighboring words must occur sequentially",
114
+ "and you can use parentheses, AND, and OR to test for occurrence relationships. Examples:",
115
+ " keyword_any:tanning booth OR tanning booths OR tanningbooth",
116
+ " keyword_both:tanning AND booth",
117
+ " keyword_complex:tanning AND (booth OR bed)",
118
+ "This option can be used multiple times.") do |bool_word_field|
119
+ options[:bool_word_fields] << TwitterToCsv::BoolWordFieldParser.parse(bool_word_field)
120
+ end
121
+
87
122
  opts.on("--start TIME", "Ignore tweets with a created_at earlier than TIME") do |start_time|
88
123
  options[:start_time] = Time.parse(start_time)
89
124
  end
@@ -125,8 +160,8 @@ end
125
160
 
126
161
  parser.parse!
127
162
 
128
- unless (options[:username] && options[:password]) || options[:replay_from_file]
129
- STDERR.puts "Error: Twitter username and password are required fields unless you're replaying from a file.\n\n"
163
+ unless (options[:api_key] && options[:api_secret] && options[:access_token] && options[:access_token_secret]) || options[:replay_from_file]
164
+ STDERR.puts "Error: The four Twitter credential fields are required unless you're replaying from a file.\n\n"
130
165
  STDERR.puts parser
131
166
  exit 1
132
167
  end
@@ -7,6 +7,7 @@ require 'twitter/json_stream'
7
7
  require 'em-http-request'
8
8
  require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "twitter_watcher"))
9
9
  require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "csv_builder"))
10
+ require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "bool_word_field_parser"))
10
11
  require 'unsupervised-language-detection'
11
12
 
12
13
  module TwitterToCsv
@@ -0,0 +1,74 @@
1
+ # encoding: UTF-8
2
+
3
module TwitterToCsv
  # Raised by BoolWordFieldParser.check / .descend_check when a parsed word
  # expression cannot be evaluated (missing operand, missing operator, unknown
  # operation, empty expression, ...).
  class InvalidLogicError < StandardError; end

  # Parses and evaluates "NAME:WORD AND WORD OR (WORD WORD)" style boolean word
  # expressions.
  #
  # Parsed logic is a nested array:
  #   * a String is a phrase (one or more lower-cased words that must occur
  #     sequentially in the text);
  #   * [operand]                is a single operand;
  #   * [operand, :and, rest]    requires both sides to match;
  #   * [operand, :or, rest]     requires either side to match.
  # Operators are right-associative ("a AND b OR c" is "a AND (b OR c)") and
  # must be written in upper case; anything else is treated as a word.
  # Parenthesised groups may appear anywhere an operand may appear.
  class BoolWordFieldParser
    # Everything that is not an ASCII letter, digit or hyphen separates words
    # in the text being checked.
    TOKEN_SEPARATOR = /[^a-zA-Z0-9-]+/

    # Operator tokens and the symbols stored in the parsed logic.
    OPERATORS = { "AND" => :and, "OR" => :or }.freeze

    # Parses a "NAME:EXPRESSION" string.
    #
    # Everything before the first ":" becomes the name; the remainder is the
    # expression. Parsing never raises on a malformed expression: it yields a
    # structure that makes .check raise InvalidLogicError instead.
    #
    # @param string [String] e.g. "keyword_complex:tanning AND (booth OR bed)"
    # @return [Hash] { :name => String (nil for an empty string), :logic => Array }
    def self.parse(string)
      parts = string.split(":")
      name = parts.shift
      # Pad parentheses with spaces so they become tokens of their own.
      tokens = parts.join(":").gsub(/([()])/, ' \1 ').split(/\s+/).reject { |s| s.empty? }
      struct = []
      descend_parse(struct, tokens)
      { :name => name, :logic => struct }
    end

    # Parses one expression from the front of +tokens+ and appends its elements
    # to +struct+. +tokens+ is consumed in place; parsing stops at the end of
    # the tokens or at an unmatched ")" (any tokens after it are left untouched).
    #
    # @param struct [Array] array that receives the parsed expression
    # @param tokens [Array<String>] remaining tokens (mutated)
    # @return [nil]
    def self.descend_parse(struct, tokens)
      struct.concat(parse_expression(tokens))
      nil
    end

    # Evaluates a parsed pattern against a piece of text.
    #
    # The text is lower-cased and split on TOKEN_SEPARATOR before matching.
    # A nil text is treated as an empty string.
    #
    # @param pattern [Hash] result of .parse
    # @param text [String, nil] text to test
    # @return [Boolean]
    # @raise [InvalidLogicError] when the pattern's logic is un-evaluable
    def self.check(pattern, text)
      logic = pattern[:logic]
      tokens = text.to_s.downcase.split(TOKEN_SEPARATOR).reject { |t| t.empty? }.join(" ")
      !!descend_check(logic, tokens)
    end

    # Recursively evaluates +logic+ against the normalized, space-joined text.
    #
    # Both operands of an operator are always evaluated (no short-circuit), so
    # a malformed sub-expression raises even when the other side would already
    # decide the result.
    #
    # @param logic [String, Array] phrase or nested expression
    # @param tokens [String] normalized text
    # @return [Object] truthy when the expression matches, falsy otherwise
    # @raise [InvalidLogicError] for anything that is not a phrase, a
    #   one-element array or an [operand, :and/:or, operand] triple
    def self.descend_check(logic, tokens)
      # A phrase matches when its words occur sequentially on word boundaries.
      return tokens =~ /\b#{Regexp.escape(logic)}\b/ if logic.is_a?(String)

      unless logic.is_a?(Array)
        raise InvalidLogicError, "Invalid expression: #{logic.inspect}"
      end

      case logic.length
      when 1
        descend_check(logic.first, tokens)
      when 3
        first = descend_check(logic.first, tokens)
        last = descend_check(logic.last, tokens)
        case logic[1]
        when :and then first && last
        when :or then first || last
        else raise InvalidLogicError, "Unknown operation: #{logic[1]}"
        end
      else
        raise InvalidLogicError, "Invalid expression length of #{logic.length}"
      end
    end

    # expression := operand [ (AND | OR) expression ]
    #
    # Returns an Array. Malformed input is kept parseable on purpose: a missing
    # operand is simply omitted and two operands without an operator are stored
    # side by side, both of which give an array length that .descend_check
    # rejects with InvalidLogicError.
    def self.parse_expression(tokens)
      node = []
      operand = parse_operand(tokens)
      node << operand unless operand.nil?

      upcoming = tokens.first
      if OPERATORS.key?(upcoming)
        node << OPERATORS[tokens.shift]
        node << parse_expression(tokens)
      elsif !upcoming.nil? && upcoming != ")"
        # A word or "(" directly after an operand: no operator in between.
        node << parse_expression(tokens)
      end

      # A lone parenthesised group is returned as-is rather than wrapped again.
      node.length == 1 && node.first.is_a?(Array) ? node.first : node
    end

    # operand := "(" expression ")" | WORD+
    #
    # Returns a phrase String, a group Array, or nil when no operand starts at
    # the front of +tokens+. A missing closing parenthesis is tolerated.
    def self.parse_operand(tokens)
      if tokens.first == "("
        tokens.shift
        group = parse_expression(tokens)
        tokens.shift if tokens.first == ")"
        group
      else
        words = []
        words << tokens.shift.downcase while word_token?(tokens.first)
        words.empty? ? nil : words.join(" ")
      end
    end

    # True when +token+ is a plain word (not nil, a parenthesis or an operator).
    def self.word_token?(token)
      !token.nil? && token != "(" && token != ")" && !OPERATORS.key?(token)
    end

    private_class_method :parse_expression, :parse_operand, :word_token?
  end
end
@@ -25,7 +25,7 @@ module TwitterToCsv
25
25
  handle_status status
26
26
  end
27
27
  rescue SignalException, SystemExit
28
- EventMachine::stop_event_loop
28
+ EventMachine::stop_event_loop if EventMachine::reactor_running?
29
29
  exit
30
30
  rescue StandardError => e
31
31
  STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
@@ -70,13 +70,9 @@ module TwitterToCsv
70
70
  # This is an original status.
71
71
  if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
72
72
  if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
73
- status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
73
+ status['retweet_count'] = @retweet_counts[status['id']] || 0 # if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
74
74
  if options[:retweet_counts_at]
75
- retweet_hour_data = @retweet_hour_counts.delete(status['id'])
76
- if !retweet_hour_data
77
- puts "Encountered missing retweet_data for tweet##{status['id']}, possibly due to a repeating id or a deleted tweet."
78
- return false
79
- end
75
+ retweet_hour_data = @retweet_hour_counts.delete(status['id']) || options[:retweet_counts_at].map { 0 }
80
76
  status['_retweet_hour_counts'] = retweet_hour_data
81
77
  end
82
78
  true
@@ -90,8 +86,10 @@ module TwitterToCsv
90
86
  end
91
87
 
92
88
  def handle_status(status, &block)
93
- if within_time_window?(status)
94
- if (options[:require_english] && is_english?(status)) || !options[:require_english]
89
+ if status.has_key?('delete')
90
+ STDERR.puts "Skipping Tweet with delete." if options[:verbose]
91
+ elsif within_time_window?(status)
92
+ if (options[:require_english] && is_english?(status, options[:require_english])) || !options[:require_english]
95
93
  if options[:retweet_mode] != :rollup || display_rolledup_status?(status)
96
94
  log_json(status) if options[:json]
97
95
  log_csv(status) if options[:csv]
@@ -124,6 +122,10 @@ module TwitterToCsv
124
122
  options[:hashtag_columns].times { |i| header_labels << "hash_tag_#{i+1}" } if options[:hashtag_columns] && options[:url_columns] > 0
125
123
  options[:user_mention_columns].times { |i| header_labels << "user_mention_#{i+1}" } if options[:user_mention_columns] && options[:user_mention_columns] > 0
126
124
 
125
+ (options[:bool_word_fields] || []).each do |pattern|
126
+ header_labels << pattern[:name]
127
+ end
128
+
127
129
  options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
128
130
  end
129
131
 
@@ -137,9 +139,19 @@ module TwitterToCsv
137
139
 
138
140
  def output_row(status)
139
141
  row = options[:fields].map do |field|
140
- field.split(".").inject(status) { |memo, segment|
142
+ value = field.split(".").inject(status) { |memo, segment|
141
143
  memo && memo[segment]
142
144
  }.to_s
145
+
146
+ if options[:prefix_ids]
147
+ value = "id" + value if value.length > 0 && (field =~ /\Aid_str|id\Z/ || field =~ /_id|_id_str\Z/)
148
+ end
149
+
150
+ if options[:remove_quotes]
151
+ value = value.gsub(/\"/, '')
152
+ end
153
+
154
+ value
143
155
  end
144
156
 
145
157
  row += compute_sentiment(status["text"]) if options[:compute_sentiment]
@@ -151,7 +163,7 @@ module TwitterToCsv
151
163
  (options[:date_fields] || []).each do |date_field|
152
164
  time = Time.parse(date_field.split(".").inject(status) { |memo, segment|
153
165
  memo && memo[segment]
154
- }.to_s)
166
+ }.to_s).utc
155
167
 
156
168
  row << time.strftime("%w") # week_day
157
169
  row << time.strftime("%-d") # day
@@ -179,6 +191,10 @@ module TwitterToCsv
179
191
  options[:user_mention_columns].times { |i| row << users[i].to_s }
180
192
  end
181
193
 
194
+ (options[:bool_word_fields] || []).each do |pattern|
195
+ row << (!!TwitterToCsv::BoolWordFieldParser.check(pattern, status["text"])).to_s
196
+ end
197
+
182
198
  row
183
199
  end
184
200
 
@@ -239,24 +255,30 @@ module TwitterToCsv
239
255
  end
240
256
 
241
257
  def sample_fields(status)
242
- extract_fields(status, sampled_fields)
258
+ extract_fields status, sampled_fields
243
259
  @num_samples += 1
244
260
  if @num_samples > options[:sample_fields]
245
261
  puts "Sampled fields from Twitter:"
246
262
  sampled_fields.each do |field, count|
247
263
  puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
248
264
  end
249
- exit 1
265
+ exit 0
250
266
  end
251
267
  end
252
268
 
253
- def extract_fields(object, fields, current_path = [])
269
+ def extract_fields(object, fields, current_path = "")
254
270
  if object.is_a?(Hash)
255
271
  object.each do |k, v|
256
- extract_fields v, fields, current_path + [k]
272
+ extract_fields v, fields, current_path + "." + k.to_s
273
+ end
274
+ elsif object.is_a?(Array)
275
+ local_fields = {}
276
+ object.each do |v|
277
+ extract_fields v, local_fields, current_path + "[]"
257
278
  end
279
+ local_fields.keys.each { |key| fields[key] ||= 0 ; fields[key] += 1 }
258
280
  else
259
- path = current_path.join(".")
281
+ path = current_path[1..-1]
260
282
  fields[path] ||= 0
261
283
  fields[path] += 1
262
284
  end
@@ -268,19 +290,19 @@ module TwitterToCsv
268
290
  options[:json].flush
269
291
  end
270
292
 
271
- def is_english?(status)
272
- if status.has_key?('delete')
273
- STDERR.puts "Skipping Tweet with delete." if options[:verbose]
274
- return false
293
+ def is_english?(status, strategy)
294
+ unless strategy == :twitter
295
+ status['uld'] = !!UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
275
296
  end
276
-
277
- #unless status['user']['lang'] == "en"
278
- # STDERR.puts "Skipping \"#{status['text']}\" due to lang of #{status['user']['lang']}." if options[:verbose]
279
- # return false
280
- #end
281
-
282
- unless UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
283
- STDERR.puts "Skipping \"#{status['text']}\" due to UnsupervisedLanguageDetection guessing non-English" if options[:verbose]
297
+
298
+ if strategy == :both && status['lang'] != 'en' && !status['uld']
299
+ STDERR.puts "Skipping \"#{status['text']}\" because both Twitter (#{status['lang']}) and UnsupervisedLanguageDetection think it is not English." if options[:verbose]
300
+ return false
301
+ elsif strategy == :uld && !status['uld']
302
+ STDERR.puts "Skipping \"#{status['text']}\" because UnsupervisedLanguageDetection thinks it is not English." if options[:verbose]
303
+ return false
304
+ elsif strategy == :twitter && status['lang'] != 'en'
305
+ STDERR.puts "Skipping \"#{status['text']}\" because Twitter (#{status['lang']}) thinks it is not English." if options[:verbose]
284
306
  return false
285
307
  end
286
308
 
@@ -2,11 +2,13 @@ require 'cgi'
2
2
 
3
3
  module TwitterToCsv
4
4
  class TwitterWatcher
5
- attr_accessor :username, :password, :filter, :fetch_errors
5
+ attr_accessor :api_key, :api_secret, :access_token, :access_token_secret, :filter, :fetch_errors
6
6
 
7
7
  def initialize(options)
8
- @username = options[:username]
9
- @password = options[:password]
8
+ @api_key = options[:api_key]
9
+ @api_secret = options[:api_secret]
10
+ @access_token = options[:access_token]
11
+ @access_token_secret = options[:access_token_secret]
10
12
  @filter = options[:filter]
11
13
  @fetch_errors = 0
12
14
  end
@@ -23,8 +25,13 @@ module TwitterToCsv
23
25
  EventMachine::run do
24
26
  stream = Twitter::JSONStream.connect(
25
27
  :path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.map {|f| CGI::escape(f) }.join(",")}" if filter && filter.length > 0}",
26
- :auth => "#{username}:#{password}",
27
- :ssl => true
28
+ :ssl => true,
29
+ :oauth => {
30
+ :consumer_key => api_key,
31
+ :consumer_secret => api_secret,
32
+ :access_key => access_token,
33
+ :access_secret => access_token_secret
34
+ }
28
35
  )
29
36
 
30
37
  stream.each_item do |item|
@@ -1,3 +1,3 @@
1
1
  module TwitterToCsv
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.5"
3
3
  end
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+ require 'time'
4
+
5
describe TwitterToCsv::BoolWordFieldParser do
  # The class under test, so the examples stay short.
  let(:parser) { TwitterToCsv::BoolWordFieldParser }

  describe "#parse" do
    it "parses name:string AND string AND string... syntax" do
      expected = {
        :name => "something",
        :logic => ["string1 string2", :and, ["string3", :and, ["string4"]]]
      }
      parser.parse("something:string1 string2 AND string3 AND string4").should == expected
    end

    it "parses name:string OR string OR string... syntax" do
      expected = {
        :name => "something",
        :logic => ["string1 string2", :or, ["string3", :or, ["string4"]]]
      }
      parser.parse("something:string1 string2 OR string3 OR string4").should == expected
    end

    it "parses parens" do
      expected = {
        :name => "something_else",
        :logic => ["string1 string2", :or, ["string3", :and, ["string4", :or, ["string5"]]]]
      }
      parser.parse("something_else:string1 STRING2 OR ( string3 AND (string4 OR string5 ))").should == expected
    end
  end

  describe "#check" do
    it "returns true when an expression matches some text, false when it doesn't" do
      pattern = parser.parse("something_else:string1 string2 OR (string3 AND (string4 OR string5))")

      # Texts that satisfy the expression.
      matching = [
        "string1 string2",
        "string3 string4",
        "string4 string3",
        "string5 string3",
        "foo bar string3 string5 baz",
        "foo bar string3 string4 string5 baz",
        "foo bar string3 string5 baz string4",
        "string1 string2 string3 string4"
      ]
      # Texts that do not (wrong word order, or only part of a required pair).
      non_matching = [
        "string2 string1",
        "string1",
        "string2",
        "foo bar string5 baz"
      ]

      matching.each do |text|
        parser.check(pattern, text).should be_true
      end
      non_matching.each do |text|
        parser.check(pattern, text).should be_false
      end
    end

    it "raises errors when the input is un-evaluable" do
      # The last two inputs contain no ':' so the whole string becomes the
      # name and the parsed logic is empty.
      malformed_inputs = [
        "something_else:string1 string2 OR (string3 AND OR string5))",
        "hello (",
        "hello ()"
      ]

      malformed_inputs.each do |input|
        pattern = parser.parse(input)
        lambda { parser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
      end
    end
  end
end
@@ -4,35 +4,73 @@ require 'time'
4
4
 
5
5
  describe TwitterToCsv::CsvBuilder do
6
6
  describe "#handle_status" do
7
- describe "when :english is set" do
8
- it "skips non-English tweets" do
9
- string_io = StringIO.new
10
- csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => true, :csv => string_io, :fields => %w[text])
11
- csv_builder.handle_status('text' => "This is English", 'user' => { 'lang' => 'en' })
12
- csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'user' => { 'lang' => 'en' })
13
- csv_builder.handle_status('text' => "This is still English", 'user' => { 'lang' => 'en' })
14
- string_io.rewind
15
- string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
7
+ describe "English language detection" do
8
+ describe "with the :uld strategy" do
9
+ it "uses the UnsupervisedLanguageDetection library to skip non-English tweets" do
10
+ string_io = StringIO.new
11
+ csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text])
12
+ csv_builder.handle_status('text' => "This is English")
13
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
14
+ csv_builder.handle_status('text' => "This is still English")
15
+ string_io.rewind
16
+ string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
17
+ end
18
+
19
+ it "makes a new 'uld' variable available on the tweet" do
20
+ string_io = StringIO.new
21
+ csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text uld])
22
+ csv_builder.handle_status('text' => "This is English")
23
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
24
+ csv_builder.handle_status('text' => "This is still English")
25
+ string_io.rewind
26
+ string_io.read.should == "\"This is English\",\"true\"\n\"This is still English\",\"true\"\n"
27
+ end
16
28
  end
17
29
 
18
- it "honors start_time and end_time" do
19
- string_io = StringIO.new
20
- csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
21
- :start_time => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
22
- :end_time => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
23
-
24
- # Order shouldn't matter
25
- csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
26
- csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
27
- csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
28
- csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
29
- csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
30
- csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
31
- string_io.rewind
32
- string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
30
+ describe "with the :twitter strategy" do
31
+ it "uses Twitter's lang field to skip non-English tweets" do
32
+ string_io = StringIO.new
33
+ csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :twitter, :csv => string_io, :fields => %w[text])
34
+ csv_builder.handle_status('text' => "This is English", 'lang' => 'en')
35
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'fr')
36
+ csv_builder.handle_status('text' => "This is still English", 'lang' => 'en')
37
+ string_io.rewind
38
+ string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
39
+ end
40
+ end
41
+
42
+ describe "with the :both strategy" do
43
+ it "returns tweets unless both ULD and Twitter agree that the Tweet is non-English" do
44
+ string_io = StringIO.new
45
+ csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :both, :csv => string_io, :fields => %w[text])
46
+ csv_builder.handle_status('text' => "This is English", 'lang' => 'en') # agree
47
+ csv_builder.handle_status('text' => "This is English 2", 'lang' => 'fr') # disagree
48
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'en') # disagree
49
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles 2.", 'lang' => 'fr') # agree
50
+ csv_builder.handle_status('text' => "This is still English", 'lang' => 'en') # agree
51
+ string_io.rewind
52
+ string_io.read.should == "\"This is English\"\n\"This is English 2\"\n\"Esta frase se encuentra en Ingles.\"\n\"This is still English\"\n"
53
+ end
33
54
  end
34
55
  end
35
56
 
57
+ it "honors start_time and end_time" do
58
+ string_io = StringIO.new
59
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
60
+ :start_time => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
61
+ :end_time => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
62
+
63
+ # Order shouldn't matter
64
+ csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
65
+ csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
66
+ csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
67
+ csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
68
+ csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
69
+ csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
70
+ string_io.rewind
71
+ string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
72
+ end
73
+
36
74
  describe "log_csv_header" do
37
75
  it "outputs the fields as header labels" do
38
76
  string_io = StringIO.new
@@ -169,17 +207,42 @@ describe TwitterToCsv::CsvBuilder do
169
207
  "\"hello2\",\"2\"\n"
170
208
  end
171
209
 
172
- it "can return date fields" do
210
+ it "can extract boolean word fields" do
211
+ string_io = StringIO.new
212
+ patterns = [
213
+ TwitterToCsv::BoolWordFieldParser.parse("field1:hello AND world"),
214
+ TwitterToCsv::BoolWordFieldParser.parse("field2:hello"),
215
+ TwitterToCsv::BoolWordFieldParser.parse("field3:string OR text"),
216
+ TwitterToCsv::BoolWordFieldParser.parse("field3:hello this")
217
+ ]
218
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :bool_word_fields => patterns)
219
+ csv_builder.handle_status({
220
+ 'something' => "hello1",
221
+ 'text' => 'hello this is a string'
222
+
223
+ })
224
+ csv_builder.handle_status({
225
+ 'something' => "hello2",
226
+ 'text' => 'hello world this is some text'
227
+ })
228
+ string_io.rewind
229
+ string_io.read.should == "\"hello1\",\"false\",\"true\",\"true\",\"true\"\n" +
230
+ "\"hello2\",\"true\",\"true\",\"true\",\"false\"\n"
231
+ end
232
+
233
+ it "can return date fields and convert them to UTC" do
173
234
  string_io = StringIO.new
174
235
  csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :date_fields => %w[created_at])
175
236
  csv_builder.handle_status({
176
237
  'something' => "hello1",
177
238
  'text' => 'i love cheese',
178
239
  'created_at' => "2012-06-29 13:12:09 -0700"
179
-
180
240
  })
181
241
  string_io.rewind
182
- string_io.read.should == "\"hello1\",\"5\",\"29\",\"6\",\"2012\",\"13\",\"12\",\"09\"\n"
242
+ time = Time.parse("2012-06-29 13:12:09 -0700").utc
243
+ string_io.read.should == '"' + ["hello1", time.strftime("%w"), time.strftime("%-d"),
244
+ time.strftime("%-m"), time.strftime("%Y"), time.strftime("%-H"),
245
+ time.strftime("%M"), time.strftime("%S")].join('","') + "\"\n"
183
246
  end
184
247
 
185
248
  it "can return a normalized source" do
@@ -349,24 +412,33 @@ describe TwitterToCsv::CsvBuilder do
349
412
  end
350
413
 
351
414
  describe "#extract_fields" do
352
- it "finds all the paths through a hash" do
415
+ it "finds all the paths through a structure" do
353
416
  obj = {
354
417
  :a => :b,
355
418
  :b => "c",
356
419
  :d => {
357
420
  :e => :f,
358
- :g => {
421
+ :g => [
422
+ {
359
423
  :h => :i,
360
424
  :j => {
361
- :k => "l"
425
+ :k => "l"
362
426
  }
363
- },
427
+ },
428
+ {
429
+ :h => :i,
430
+ :j => {
431
+ :m => "n"
432
+ },
433
+ :hi => 2
434
+ }
435
+ ],
364
436
  :m => "n"
365
437
  }
366
438
  }
367
439
  fields = { "a" => 1 }
368
440
  TwitterToCsv::CsvBuilder.new.extract_fields(obj, fields)
369
- fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g.h" => 1, "d.g.j.k" => 1, "d.m" => 1 }
441
+ fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g[].h" => 1, "d.g[].j.k" => 1, "d.g[].j.m" => 1, "d.g[].hi" => 1, "d.m" => 1 }
370
442
  end
371
443
  end
372
444
  end