twitter_to_csv 0.1.2 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,20 +5,32 @@ require 'optparse'
5
5
  require 'time'
6
6
  require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
7
7
 
8
- options = { :fields => %w[created_at text] }
8
+ options = { :fields => %w[created_at text], :bool_word_fields => [] }
9
9
  parser = OptionParser.new do |opts|
10
10
  opts.banner = "Usage: #{File.basename($0)} [options]"
11
+
11
12
  opts.separator ""
12
- opts.separator "Specific options:"
13
+ opts.separator "These four fields are required. Please see the README to learn how to get them for your Twitter account."
14
+
15
+ opts.on("--api-key KEY", "Twitter API key") do |api_key|
16
+ options[:api_key] = api_key
17
+ end
13
18
 
14
- opts.on("-u", "--username USERNAME", "Twitter username") do |username|
15
- options[:username] = username
19
+ opts.on("--api-secret SECRET", "Twitter API secret") do |api_secret|
20
+ options[:api_secret] = api_secret
16
21
  end
17
22
 
18
- opts.on("-p", "--password PASSWORD", "Twitter password") do |password|
19
- options[:password] = password
23
+ opts.on("--access-token TOKEN", "Twitter access token") do |access_token|
24
+ options[:access_token] = access_token
20
25
  end
21
26
 
27
+ opts.on("--access-token-secret SECRET", "Twitter access token secret") do |access_token_secret|
28
+ options[:access_token_secret] = access_token_secret
29
+ end
30
+
31
+ opts.separator ""
32
+ opts.separator "General settings:"
33
+
22
34
  opts.on("-c", "--csv FILE", "The CSV file to append to, or - for STDOUT") do |csv|
23
35
  options[:csv_appending] = File.exists?(csv)
24
36
  options[:csv] = csv == "-" ? STDOUT : File.open(csv, 'a')
@@ -40,8 +52,12 @@ parser = OptionParser.new do |opts|
40
52
  options[:date_fields] = date_fields.split(/\s*,\s*/)
41
53
  end
42
54
 
43
- opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
44
- options[:require_english] = e
55
# -e / --require-english [STRATEGY]
# Selects how non-English tweets are filtered. CsvBuilder#is_english? only
# acts on the :uld, :twitter and :both strategies, so the help text names
# exactly those values.
opts.on("-e", "--require-english [STRATEGY]",
        "Attempt to filter out non-English tweets. This will have both false positives and false negatives.",
        "The strategy can be either 'uld' to use the UnsupervisedLanguageDetection Ruby gem,",
        "'twitter' to use Twitter's guessed 'lang' attribute, or 'both' to only remove tweets that",
        "both Twitter and ULD think are non-English. This is most conservative and is the default.") do |e|
  # The flag's argument is optional; a bare -e means "both".
  strategy = (e || "both").downcase.to_sym
  # 'lang' was the spelling previously shown in the help text but was never
  # recognised by the filter; accept it as an alias so it no longer silently
  # disables filtering.
  options[:require_english] = strategy == :lang ? :twitter : strategy
end
46
62
 
47
63
  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
@@ -84,6 +100,25 @@ parser = OptionParser.new do |opts|
84
100
  options[:normalize_source] = normalize_source
85
101
  end
86
102
 
103
+ opts.on("--remove-quotes", "This option strips all double quotes from the output to help some CSV parsers.") do |remove_quotes|
104
+ options[:remove_quotes] = remove_quotes
105
+ end
106
+
107
+ opts.on("--prefix-ids", "Prefix any field ending in _id or _id_str with 'id' to force parsing as a string in some programs.") do |prefix_ids|
108
+ options[:prefix_ids] = prefix_ids
109
+ end
110
+
111
+ opts.on("-w", "--bool-word-field \"NAME:WORD AND WORD AND WORD\"",
112
+ "Create a named CSV column that is true when the word expression matches, false otherwise.",
113
+ "Word expressions are boolean expressions where neighboring words must occur sequentially",
114
+ "and you can use parentheses, AND, and OR to test for occurrence relationships. Examples:",
115
+ " keyword_any:tanning booth OR tanning booths OR tanningbooth",
116
+ " keyword_both:tanning AND booth",
117
+ " keyword_complex:tanning AND (booth OR bed)",
118
+ "This option can be used multiple times.") do |bool_word_field|
119
+ options[:bool_word_fields] << TwitterToCsv::BoolWordFieldParser.parse(bool_word_field)
120
+ end
121
+
87
122
  opts.on("--start TIME", "Ignore tweets with a created_at earlier than TIME") do |start_time|
88
123
  options[:start_time] = Time.parse(start_time)
89
124
  end
@@ -125,8 +160,8 @@ end
125
160
 
126
161
  parser.parse!
127
162
 
128
- unless (options[:username] && options[:password]) || options[:replay_from_file]
129
- STDERR.puts "Error: Twitter username and password are required fields unless you're replaying from a file.\n\n"
163
+ unless (options[:api_key] && options[:api_secret] && options[:access_token] && options[:access_token_secret]) || options[:replay_from_file]
164
+ STDERR.puts "Error: The four Twitter credential fields are required unless you're replaying from a file.\n\n"
130
165
  STDERR.puts parser
131
166
  exit 1
132
167
  end
@@ -7,6 +7,7 @@ require 'twitter/json_stream'
7
7
  require 'em-http-request'
8
8
  require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "twitter_watcher"))
9
9
  require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "csv_builder"))
10
+ require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "bool_word_field_parser"))
10
11
  require 'unsupervised-language-detection'
11
12
 
12
13
  module TwitterToCsv
@@ -0,0 +1,74 @@
# encoding: UTF-8

module TwitterToCsv
  # Raised when a word expression is malformed and cannot be parsed or evaluated.
  class InvalidLogicError < StandardError; end

  # Parses and evaluates boolean word expressions such as
  # "keyword_complex:tanning AND (booth OR bed)".
  #
  # A parsed expression is a nested Array. Each level is either
  #   [phrase]                           - a run of neighboring words, or
  #   [phrase, :and | :or, sub_level]    - a phrase combined with the rest.
  class BoolWordFieldParser
    # Anything that is not a letter, digit or hyphen separates words.
    TOKEN_SEPARATOR = /[^a-zA-Z0-9-]+/

    # Splits "NAME:EXPRESSION" and parses the expression.
    #
    # string - a String of the form "name:word AND (word OR word)".
    #
    # Returns a Hash with :name (the text before the first colon) and :logic
    # (the nested Array described on the class).
    # Raises InvalidLogicError when a word directly follows a dangling operator
    # at the same level (e.g. "x:OR a) b"). Other malformed expressions are
    # reported later, by .check.
    def self.parse(string)
      name, *rest = string.split(":")
      expression = rest.join(":")
      # Pad parentheses with spaces so they become tokens of their own.
      tokens = expression.gsub(/([()])/, ' \1 ').split(/\s+/).reject(&:empty?)
      struct = []
      descend_parse(struct, tokens)
      { :name => name, :logic => struct }
    end

    # Consumes tokens into struct, recursing for operators and parentheses.
    #
    # struct - the Array for the current nesting level (mutated in place).
    # tokens - the remaining token Strings (consumed destructively).
    #
    # Returns nil.
    def self.descend_parse(struct, tokens)
      until tokens.empty?
        token = tokens.shift
        case token
        when ")"
          return
        when "("
          # An opening parenthesis only starts a nested level when something
          # already precedes it at this level; otherwise it is dropped and the
          # following tokens are read into the current level.
          unless struct.empty?
            sub_struct = []
            struct << sub_struct
            descend_parse(sub_struct, tokens)
          end
        when "AND", "OR"
          # Everything after the operator belongs to a new nested level.
          sub_struct = []
          struct << token.downcase.to_sym << sub_struct
          descend_parse(sub_struct, tokens)
        else
          append_word(struct, token)
        end
      end
    end

    # Tests whether text satisfies a parsed pattern.
    #
    # pattern - the Hash returned by .parse.
    # text    - the String to test; nil is treated as an empty String.
    #
    # Returns true or false.
    # Raises InvalidLogicError when the pattern's logic cannot be evaluated.
    def self.check(pattern, text)
      !!descend_check(pattern[:logic], normalize(text.to_s))
    end

    # Evaluates one node of a parsed expression.
    #
    # logic  - a phrase String or a nested Array of length 1 or 3.
    # tokens - the normalized text as a single space-joined String.
    #
    # Returns a truthy value when the node matches, a falsy value otherwise.
    # Raises InvalidLogicError for any node that is not a String or an Array
    # of length 1 or 3, and for unknown operators.
    def self.descend_check(logic, tokens)
      # A phrase matches when its words occur in sequence on word boundaries.
      return tokens =~ /\b#{Regexp.escape(logic)}\b/ if logic.is_a?(String)

      # An operator Symbol (or anything else) in an operand position means the
      # expression was malformed; report it as a logic error.
      unless logic.is_a?(Array)
        raise InvalidLogicError, "Unexpected #{logic.inspect} where an expression was expected"
      end

      case logic.length
      when 1
        descend_check(logic.first, tokens)
      when 3
        # Both sides are evaluated before combining, so a malformed right-hand
        # side raises even when the left-hand side alone would decide the result.
        first = descend_check(logic.first, tokens)
        last = descend_check(logic.last, tokens)
        case logic[1]
        when :and then first && last
        when :or then first || last
        else raise InvalidLogicError, "Unknown operation: #{logic[1]}"
        end
      else
        raise InvalidLogicError, "Invalid expression length of #{logic.length}"
      end
    end

    # Lowercases text and reduces it to words joined by single spaces, using
    # TOKEN_SEPARATOR to find the word boundaries.
    def self.normalize(text)
      text.downcase.split(TOKEN_SEPARATOR).reject(&:empty?).join(" ")
    end

    # Adds a word to the phrase at the start of struct, creating it if needed.
    # The word is normalized like the text in .check so that punctuation in a
    # pattern (e.g. "#ruby") can still match; a token made only of separator
    # characters is kept as its lowercased self.
    def self.append_word(struct, token)
      word = normalize(token)
      word = token.downcase if word.empty?

      if struct.empty?
        struct << word
      elsif struct[0].is_a?(String)
        struct[0] += " " + word
      else
        # struct starts with an operator Symbol, so there is no phrase to extend.
        raise InvalidLogicError, "Unexpected word #{token.inspect} following an incomplete expression"
      end
    end

    private_class_method :normalize, :append_word
  end
end
@@ -25,7 +25,7 @@ module TwitterToCsv
25
25
  handle_status status
26
26
  end
27
27
  rescue SignalException, SystemExit
28
- EventMachine::stop_event_loop
28
+ EventMachine::stop_event_loop if EventMachine::reactor_running?
29
29
  exit
30
30
  rescue StandardError => e
31
31
  STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
@@ -70,13 +70,9 @@ module TwitterToCsv
70
70
  # This is an original status.
71
71
  if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
72
72
  if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
73
- status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
73
+ status['retweet_count'] = @retweet_counts[status['id']] || 0 # if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
74
74
  if options[:retweet_counts_at]
75
- retweet_hour_data = @retweet_hour_counts.delete(status['id'])
76
- if !retweet_hour_data
77
- puts "Encountered missing retweet_data for tweet##{status['id']}, possibly due to a repeating id or a deleted tweet."
78
- return false
79
- end
75
+ retweet_hour_data = @retweet_hour_counts.delete(status['id']) || options[:retweet_counts_at].map { 0 }
80
76
  status['_retweet_hour_counts'] = retweet_hour_data
81
77
  end
82
78
  true
@@ -90,8 +86,10 @@ module TwitterToCsv
90
86
  end
91
87
 
92
88
  def handle_status(status, &block)
93
- if within_time_window?(status)
94
- if (options[:require_english] && is_english?(status)) || !options[:require_english]
89
+ if status.has_key?('delete')
90
+ STDERR.puts "Skipping Tweet with delete." if options[:verbose]
91
+ elsif within_time_window?(status)
92
+ if (options[:require_english] && is_english?(status, options[:require_english])) || !options[:require_english]
95
93
  if options[:retweet_mode] != :rollup || display_rolledup_status?(status)
96
94
  log_json(status) if options[:json]
97
95
  log_csv(status) if options[:csv]
@@ -124,6 +122,10 @@ module TwitterToCsv
124
122
  options[:hashtag_columns].times { |i| header_labels << "hash_tag_#{i+1}" } if options[:hashtag_columns] && options[:url_columns] > 0
125
123
  options[:user_mention_columns].times { |i| header_labels << "user_mention_#{i+1}" } if options[:user_mention_columns] && options[:user_mention_columns] > 0
126
124
 
125
+ (options[:bool_word_fields] || []).each do |pattern|
126
+ header_labels << pattern[:name]
127
+ end
128
+
127
129
  options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
128
130
  end
129
131
 
@@ -137,9 +139,19 @@ module TwitterToCsv
137
139
 
138
140
  def output_row(status)
139
141
  row = options[:fields].map do |field|
140
- field.split(".").inject(status) { |memo, segment|
142
+ value = field.split(".").inject(status) { |memo, segment|
141
143
  memo && memo[segment]
142
144
  }.to_s
145
+
146
if options[:prefix_ids]
  # Prefix only fields whose final name is "id" or "id_str", either on its own
  # or after a "_" or "." (e.g. "id", "user.id", "in_reply_to_status_id_str").
  # The alternatives are grouped so both anchors apply to every alternative;
  # ungrouped, a pattern like /_id|_id_str\Z/ matches "_id" anywhere in the name.
  value = "id" + value if value.length > 0 && field =~ /(?:\A|[._])id(?:_str)?\z/
end
149
+
150
+ if options[:remove_quotes]
151
+ value = value.gsub(/\"/, '')
152
+ end
153
+
154
+ value
143
155
  end
144
156
 
145
157
  row += compute_sentiment(status["text"]) if options[:compute_sentiment]
@@ -151,7 +163,7 @@ module TwitterToCsv
151
163
  (options[:date_fields] || []).each do |date_field|
152
164
  time = Time.parse(date_field.split(".").inject(status) { |memo, segment|
153
165
  memo && memo[segment]
154
- }.to_s)
166
+ }.to_s).utc
155
167
 
156
168
  row << time.strftime("%w") # week_day
157
169
  row << time.strftime("%-d") # day
@@ -179,6 +191,10 @@ module TwitterToCsv
179
191
  options[:user_mention_columns].times { |i| row << users[i].to_s }
180
192
  end
181
193
 
194
+ (options[:bool_word_fields] || []).each do |pattern|
195
+ row << (!!TwitterToCsv::BoolWordFieldParser.check(pattern, status["text"])).to_s
196
+ end
197
+
182
198
  row
183
199
  end
184
200
 
@@ -239,24 +255,30 @@ module TwitterToCsv
239
255
  end
240
256
 
241
257
  def sample_fields(status)
242
- extract_fields(status, sampled_fields)
258
+ extract_fields status, sampled_fields
243
259
  @num_samples += 1
244
260
  if @num_samples > options[:sample_fields]
245
261
  puts "Sampled fields from Twitter:"
246
262
  sampled_fields.each do |field, count|
247
263
  puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
248
264
  end
249
- exit 1
265
+ exit 0
250
266
  end
251
267
  end
252
268
 
253
- def extract_fields(object, fields, current_path = [])
269
+ def extract_fields(object, fields, current_path = "")
254
270
  if object.is_a?(Hash)
255
271
  object.each do |k, v|
256
- extract_fields v, fields, current_path + [k]
272
+ extract_fields v, fields, current_path + "." + k.to_s
273
+ end
274
+ elsif object.is_a?(Array)
275
+ local_fields = {}
276
+ object.each do |v|
277
+ extract_fields v, local_fields, current_path + "[]"
257
278
  end
279
+ local_fields.keys.each { |key| fields[key] ||= 0 ; fields[key] += 1 }
258
280
  else
259
- path = current_path.join(".")
281
+ path = current_path[1..-1]
260
282
  fields[path] ||= 0
261
283
  fields[path] += 1
262
284
  end
@@ -268,19 +290,19 @@ module TwitterToCsv
268
290
  options[:json].flush
269
291
  end
270
292
 
271
- def is_english?(status)
272
- if status.has_key?('delete')
273
- STDERR.puts "Skipping Tweet with delete." if options[:verbose]
274
- return false
293
+ def is_english?(status, strategy)
294
+ unless strategy == :twitter
295
+ status['uld'] = !!UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
275
296
  end
276
-
277
- #unless status['user']['lang'] == "en"
278
- # STDERR.puts "Skipping \"#{status['text']}\" due to lang of #{status['user']['lang']}." if options[:verbose]
279
- # return false
280
- #end
281
-
282
- unless UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
283
- STDERR.puts "Skipping \"#{status['text']}\" due to UnsupervisedLanguageDetection guessing non-English" if options[:verbose]
297
+
298
+ if strategy == :both && status['lang'] != 'en' && !status['uld']
299
+ STDERR.puts "Skipping \"#{status['text']}\" because both Twitter (#{status['lang']}) and UnsupervisedLanguageDetection think it is not English." if options[:verbose]
300
+ return false
301
+ elsif strategy == :uld && !status['uld']
302
+ STDERR.puts "Skipping \"#{status['text']}\" because UnsupervisedLanguageDetection thinks it is not English." if options[:verbose]
303
+ return false
304
+ elsif strategy == :twitter && status['lang'] != 'en'
305
+ STDERR.puts "Skipping \"#{status['text']}\" because Twitter (#{status['lang']}) thinks it is not English." if options[:verbose]
284
306
  return false
285
307
  end
286
308
 
@@ -2,11 +2,13 @@ require 'cgi'
2
2
 
3
3
  module TwitterToCsv
4
4
  class TwitterWatcher
5
- attr_accessor :username, :password, :filter, :fetch_errors
5
+ attr_accessor :api_key, :api_secret, :access_token, :access_token_secret, :filter, :fetch_errors
6
6
 
7
7
  def initialize(options)
8
- @username = options[:username]
9
- @password = options[:password]
8
+ @api_key = options[:api_key]
9
+ @api_secret = options[:api_secret]
10
+ @access_token = options[:access_token]
11
+ @access_token_secret = options[:access_token_secret]
10
12
  @filter = options[:filter]
11
13
  @fetch_errors = 0
12
14
  end
@@ -23,8 +25,13 @@ module TwitterToCsv
23
25
  EventMachine::run do
24
26
  stream = Twitter::JSONStream.connect(
25
27
  :path => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.map {|f| CGI::escape(f) }.join(",")}" if filter && filter.length > 0}",
26
- :auth => "#{username}:#{password}",
27
- :ssl => true
28
+ :ssl => true,
29
+ :oauth => {
30
+ :consumer_key => api_key,
31
+ :consumer_secret => api_secret,
32
+ :access_key => access_token,
33
+ :access_secret => access_token_secret
34
+ }
28
35
  )
29
36
 
30
37
  stream.each_item do |item|
@@ -1,3 +1,3 @@
1
1
  module TwitterToCsv
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.5"
3
3
  end
@@ -0,0 +1,57 @@
# encoding: utf-8
require 'spec_helper'
require 'time'

describe TwitterToCsv::BoolWordFieldParser do
  # Every method under test is defined on the class itself.
  let(:parser) { TwitterToCsv::BoolWordFieldParser }

  describe "#parse" do
    it "parses name:string AND string AND string... syntax" do
      expected = {
        :name => "something",
        :logic => ["string1 string2", :and, ["string3", :and, ["string4"]]]
      }
      parser.parse("something:string1 string2 AND string3 AND string4").should == expected
    end

    it "parses name:string OR string OR string... syntax" do
      expected = {
        :name => "something",
        :logic => ["string1 string2", :or, ["string3", :or, ["string4"]]]
      }
      parser.parse("something:string1 string2 OR string3 OR string4").should == expected
    end

    it "parses parens" do
      expected = {
        :name => "something_else",
        :logic => ["string1 string2", :or, ["string3", :and, ["string4", :or, ["string5"]]]]
      }
      parser.parse("something_else:string1 STRING2 OR ( string3 AND (string4 OR string5 ))").should == expected
    end
  end

  describe "#check" do
    it "returns true when an expression matches some text, false when it doesn't" do
      pattern = parser.parse("something_else:string1 string2 OR (string3 AND (string4 OR string5))")

      # Each pair is [text, whether the pattern is expected to match it].
      expectations = [
        ["string1 string2", true],
        ["string2 string1", false],
        ["string1", false],
        ["string2", false],
        ["string3 string4", true],
        ["string4 string3", true],
        ["string5 string3", true],
        ["foo bar string3 string5 baz", true],
        ["foo bar string5 baz", false],
        ["foo bar string3 string4 string5 baz", true],
        ["foo bar string3 string5 baz string4", true],
        ["string1 string2 string3 string4", true]
      ]

      expectations.each do |text, matches|
        parser.check(pattern, text).should(matches ? be_true : be_false)
      end
    end

    it "raises errors when the input is un-evaluable" do
      malformed = [
        "something_else:string1 string2 OR (string3 AND OR string5))",
        "hello (",
        "hello ()"
      ]

      malformed.each do |expression|
        pattern = parser.parse(expression)
        lambda { parser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
      end
    end
  end
end
@@ -4,35 +4,73 @@ require 'time'
4
4
 
5
5
  describe TwitterToCsv::CsvBuilder do
6
6
  describe "#handle_status" do
7
- describe "when :english is set" do
8
- it "skips non-English tweets" do
9
- string_io = StringIO.new
10
- csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => true, :csv => string_io, :fields => %w[text])
11
- csv_builder.handle_status('text' => "This is English", 'user' => { 'lang' => 'en' })
12
- csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'user' => { 'lang' => 'en' })
13
- csv_builder.handle_status('text' => "This is still English", 'user' => { 'lang' => 'en' })
14
- string_io.rewind
15
- string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
7
+ describe "English language detection" do
8
+ describe "with the :uld strategy" do
9
+ it "uses the UnsupervisedLanguageDetection library to skip non-English tweets" do
10
+ string_io = StringIO.new
11
+ csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text])
12
+ csv_builder.handle_status('text' => "This is English")
13
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
14
+ csv_builder.handle_status('text' => "This is still English")
15
+ string_io.rewind
16
+ string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
17
+ end
18
+
19
+ it "makes a new 'uld' variable available on the tweet" do
20
+ string_io = StringIO.new
21
+ csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text uld])
22
+ csv_builder.handle_status('text' => "This is English")
23
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
24
+ csv_builder.handle_status('text' => "This is still English")
25
+ string_io.rewind
26
+ string_io.read.should == "\"This is English\",\"true\"\n\"This is still English\",\"true\"\n"
27
+ end
16
28
  end
17
29
 
18
- it "honors start_time and end_time" do
19
- string_io = StringIO.new
20
- csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
21
- :start_time => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
22
- :end_time => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
23
-
24
- # Order shouldn't matter
25
- csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
26
- csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
27
- csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
28
- csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
29
- csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
30
- csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
31
- string_io.rewind
32
- string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
30
+ describe "with the :twitter strategy" do
31
+ it "uses Twitter's lang field to skip non-English tweets" do
32
+ string_io = StringIO.new
33
+ csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :twitter, :csv => string_io, :fields => %w[text])
34
+ csv_builder.handle_status('text' => "This is English", 'lang' => 'en')
35
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'fr')
36
+ csv_builder.handle_status('text' => "This is still English", 'lang' => 'en')
37
+ string_io.rewind
38
+ string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
39
+ end
40
+ end
41
+
42
+ describe "with the :both strategy" do
43
+ it "returns tweets unless both ULD and Twitter agree that the Tweet is non-English" do
44
+ string_io = StringIO.new
45
+ csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :both, :csv => string_io, :fields => %w[text])
46
+ csv_builder.handle_status('text' => "This is English", 'lang' => 'en') # agree
47
+ csv_builder.handle_status('text' => "This is English 2", 'lang' => 'fr') # disagree
48
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'en') # disagree
49
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles 2.", 'lang' => 'fr') # agree
50
+ csv_builder.handle_status('text' => "This is still English", 'lang' => 'en') # agree
51
+ string_io.rewind
52
+ string_io.read.should == "\"This is English\"\n\"This is English 2\"\n\"Esta frase se encuentra en Ingles.\"\n\"This is still English\"\n"
53
+ end
33
54
  end
34
55
  end
35
56
 
57
+ it "honors start_time and end_time" do
58
+ string_io = StringIO.new
59
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
60
+ :start_time => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
61
+ :end_time => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
62
+
63
+ # Order shouldn't matter
64
+ csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
65
+ csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
66
+ csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
67
+ csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
68
+ csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
69
+ csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
70
+ string_io.rewind
71
+ string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
72
+ end
73
+
36
74
  describe "log_csv_header" do
37
75
  it "outputs the fields as header labels" do
38
76
  string_io = StringIO.new
@@ -169,17 +207,42 @@ describe TwitterToCsv::CsvBuilder do
169
207
  "\"hello2\",\"2\"\n"
170
208
  end
171
209
 
172
- it "can return date fields" do
210
+ it "can extract boolean word fields" do
211
+ string_io = StringIO.new
212
+ patterns = [
213
+ TwitterToCsv::BoolWordFieldParser.parse("field1:hello AND world"),
214
+ TwitterToCsv::BoolWordFieldParser.parse("field2:hello"),
215
+ TwitterToCsv::BoolWordFieldParser.parse("field3:string OR text"),
216
+ TwitterToCsv::BoolWordFieldParser.parse("field3:hello this")
217
+ ]
218
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :bool_word_fields => patterns)
219
+ csv_builder.handle_status({
220
+ 'something' => "hello1",
221
+ 'text' => 'hello this is a string'
222
+
223
+ })
224
+ csv_builder.handle_status({
225
+ 'something' => "hello2",
226
+ 'text' => 'hello world this is some text'
227
+ })
228
+ string_io.rewind
229
+ string_io.read.should == "\"hello1\",\"false\",\"true\",\"true\",\"true\"\n" +
230
+ "\"hello2\",\"true\",\"true\",\"true\",\"false\"\n"
231
+ end
232
+
233
+ it "can return date fields and convert them to UTC" do
173
234
  string_io = StringIO.new
174
235
  csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :date_fields => %w[created_at])
175
236
  csv_builder.handle_status({
176
237
  'something' => "hello1",
177
238
  'text' => 'i love cheese',
178
239
  'created_at' => "2012-06-29 13:12:09 -0700"
179
-
180
240
  })
181
241
  string_io.rewind
182
- string_io.read.should == "\"hello1\",\"5\",\"29\",\"6\",\"2012\",\"13\",\"12\",\"09\"\n"
242
+ time = Time.parse("2012-06-29 13:12:09 -0700").utc
243
+ string_io.read.should == '"' + ["hello1", time.strftime("%w"), time.strftime("%-d"),
244
+ time.strftime("%-m"), time.strftime("%Y"), time.strftime("%-H"),
245
+ time.strftime("%M"), time.strftime("%S")].join('","') + "\"\n"
183
246
  end
184
247
 
185
248
  it "can return a normalized source" do
@@ -349,24 +412,33 @@ describe TwitterToCsv::CsvBuilder do
349
412
  end
350
413
 
351
414
  describe "#extract_fields" do
352
- it "finds all the paths through a hash" do
415
+ it "finds all the paths through a structure" do
353
416
  obj = {
354
417
  :a => :b,
355
418
  :b => "c",
356
419
  :d => {
357
420
  :e => :f,
358
- :g => {
421
+ :g => [
422
+ {
359
423
  :h => :i,
360
424
  :j => {
361
- :k => "l"
425
+ :k => "l"
362
426
  }
363
- },
427
+ },
428
+ {
429
+ :h => :i,
430
+ :j => {
431
+ :m => "n"
432
+ },
433
+ :hi => 2
434
+ }
435
+ ],
364
436
  :m => "n"
365
437
  }
366
438
  }
367
439
  fields = { "a" => 1 }
368
440
  TwitterToCsv::CsvBuilder.new.extract_fields(obj, fields)
369
- fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g.h" => 1, "d.g.j.k" => 1, "d.m" => 1 }
441
+ fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g[].h" => 1, "d.g[].j.k" => 1, "d.g[].j.m" => 1, "d.g[].hi" => 1, "d.m" => 1 }
370
442
  end
371
443
  end
372
444
  end