RubyGems - twitter_to_csv - Versions diffs - 0.1.2 → 0.1.5 - Mend

twitter_to_csv 0.1.2 → 0.1.5

This diff shows the differences between the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +15 -0
data/.gitignore +1 -0
data/.ruby-gemset +1 -0
data/.ruby-version +1 -0
data/.travis.yml +4 -0
data/Gemfile +3 -1
data/README.markdown +316 -153
data/bin/twitter_to_csv +45 -10
data/lib/twitter_to_csv.rb +1 -0
data/lib/twitter_to_csv/bool_word_field_parser.rb +74 -0
data/lib/twitter_to_csv/csv_builder.rb +50 -28
data/lib/twitter_to_csv/twitter_watcher.rb +12 -5
data/lib/twitter_to_csv/version.rb +1 -1
data/spec/bool_word_field_parser_spec.rb +57 -0
data/spec/csv_builder_spec.rb +104 -32
data/twitter_to_csv.gemspec +1 -1
metadata +14 -21
data/.rvmrc +0 -1

data/bin/twitter_to_csv CHANGED

@@ -5,20 +5,32 @@ require 'optparse'
 require 'time'
 require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
-options = { :fields => %w[created_at text] }
+options = { :fields => %w[created_at text], :bool_word_fields => [] }
 parser = OptionParser.new do |opts|
   opts.banner = "Usage: #{File.basename($0)} [options]"
   opts.separator ""
-  opts.separator "Specific options:"
+  opts.separator "These four fields are required.  Please see the README to learn how to get them for your Twitter account."
+  opts.on("--api-key KEY", "Twitter API key") do |api_key|
+    options[:api_key] = api_key
+  end
-  opts.on("-u", "--username USERNAME", "Twitter username") do |username|
-    options[:username] = username
+  opts.on("--api-secret SECRET", "Twitter API secret") do |api_secret|
+    options[:api_secret] = api_secret
   end
-  opts.on("-p", "--password PASSWORD", "Twitter password") do |password|
-    options[:password] = password
+  opts.on("--access-token TOKEN", "Twitter access token") do |access_token|
+    options[:access_token] = access_token
   end
+  opts.on("--access-token-secret SECRET", "Twitter access token secret") do |access_token_secret|
+    options[:access_token_secret] = access_token_secret
+  end
+  opts.separator ""
+  opts.separator "General settings:"
   opts.on("-c", "--csv FILE", "The CSV file to append to, or - for STDOUT") do |csv|
     options[:csv_appending] = File.exists?(csv)
     options[:csv] = csv == "-" ? STDOUT : File.open(csv, 'a')
@@ -40,8 +52,12 @@ parser = OptionParser.new do |opts|
     options[:date_fields] = date_fields.split(/\s*,\s*/)
   end
-  opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
-    options[:require_english] = e
+  opts.on("-e", "--require-english [STRATEGY]",
+                "Attempt to filter out non-English tweets. This will have both false positives and false negatives.",
+                "The strategy can be either 'uld' to use the UnsupervisedLanguageDetection Ruby gem,",
+                "'lang' to use Twitter's guessed 'lang' attribute, or 'both' to only remove tweets that",
+                "both Twitter and ULD think are non-English.  This is most conservative and is the default.") do |e|
+    options[:require_english] = (e || "both").downcase.to_sym
   end
   opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
@@ -84,6 +100,25 @@ parser = OptionParser.new do |opts|
     options[:normalize_source] = normalize_source
   end
+  opts.on("--remove-quotes", "This option strips all double quotes from the output to help some CSV parsers.") do |remove_quotes|
+    options[:remove_quotes] = remove_quotes
+  end
+  opts.on("--prefix-ids", "Prefix any field ending in _id or _id_str with 'id' to force parsing as a string in some programs.") do |prefix_ids|
+    options[:prefix_ids] = prefix_ids
+  end
+  opts.on("-w", "--bool-word-field \"NAME:WORD AND WORD AND WORD\"",
+          "Create a named CSV column that is true when the word expression matches, false otherwise.",
+          "Word expressions are boolean expressions where neighboring words must occur sequentially",
+          "and you can use parentheses, AND, and OR to test for occurrence relationships.  Examples:",
+          "  keyword_any:tanning booth OR tanning booths OR tanningbooth",
+          "  keyword_both:tanning AND booth",
+          "  keyword_complex:tanning AND (booth OR bed)",
+          "This option can be used multiple times.") do |bool_word_field|
+    options[:bool_word_fields] << TwitterToCsv::BoolWordFieldParser.parse(bool_word_field)
+  end
   opts.on("--start TIME", "Ignore tweets with a created_at earlier than TIME") do |start_time|
     options[:start_time] = Time.parse(start_time)
   end
@@ -125,8 +160,8 @@ end
 parser.parse!
-unless (options[:username] && options[:password]) || options[:replay_from_file]
-  STDERR.puts "Error: Twitter username and password are required fields unless you're replaying from a file.\n\n"
+unless (options[:api_key] && options[:api_secret] && options[:access_token] && options[:access_token_secret]) || options[:replay_from_file]
+  STDERR.puts "Error: The four Twitter credential fields are required unless you're replaying from a file.\n\n"
   STDERR.puts parser
   exit 1
 end

data/lib/twitter_to_csv.rb CHANGED

@@ -7,6 +7,7 @@ require 'twitter/json_stream'
 require 'em-http-request'
 require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "twitter_watcher"))
 require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "csv_builder"))
+require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "bool_word_field_parser"))
 require 'unsupervised-language-detection'
 module TwitterToCsv

data/lib/twitter_to_csv/bool_word_field_parser.rb ADDED

@@ -0,0 +1,74 @@
+# encoding: UTF-8
+module TwitterToCsv
+  class InvalidLogicError < StandardError; end
+  class BoolWordFieldParser
+    TOKEN_SEPARATOR = /[^a-zA-Z0-9-]+/
+    def self.parse(string)
+      parts = string.split(":")
+      name = parts.shift
+      tokens = parts.join(":").gsub(/\)/, " ) ").gsub(/\(/, " ( ").split(/\s+/).reject {|s| s.length == 0 }
+      struct = []
+      descend_parse(struct, tokens)
+      { :name => name, :logic => struct }
+    end
+    def self.descend_parse(struct, tokens)
+      while tokens.length > 0
+        token = tokens.shift
+        if token == ")"
+          return
+        elsif token == "("
+          if struct.length > 0
+            sub_struct = []
+            struct << sub_struct
+            descend_parse(sub_struct, tokens)
+          end
+        elsif %w[AND OR].include?(token)
+          sub_struct = []
+          struct << :and if token == "AND"
+          struct << :or if token == "OR"
+          struct << sub_struct
+          descend_parse(sub_struct, tokens)
+        else
+          if struct[0]
+            struct[0] += " " + token.downcase
+          else
+            struct << token.downcase
+          end
+        end
+      end
+    end
+    def self.check(pattern, text)
+      logic = pattern[:logic]
+      tokens = text.downcase.split(TOKEN_SEPARATOR).reject {|t| t.length == 0 }.join(" ")
+      !!descend_check(logic, tokens)
+    end
+    def self.descend_check(logic, tokens)
+      if logic.is_a?(String)
+        # See if the token(s) are present.
+        tokens =~ /\b#{Regexp::escape logic}\b/
+      elsif logic.length == 1
+        # Recurse further.
+        descend_check logic.first, tokens
+      elsif logic.length == 3
+        # Apply the given logical operation.
+        first = descend_check(logic.first, tokens)
+        last = descend_check(logic.last, tokens)
+        if logic[1] == :and
+          first && last
+        elsif logic[1] == :or
+          first || last
+        else
+          raise InvalidLogicError.new("Unknown operation: #{logic[1]}")
+        end
+      else
+        raise InvalidLogicError.new("Invalid expression length of #{logic.length}")
+      end
+    end
+  end
+end

data/lib/twitter_to_csv/csv_builder.rb CHANGED

@@ -25,7 +25,7 @@ module TwitterToCsv
             handle_status status
           end
         rescue SignalException, SystemExit
-          EventMachine::stop_event_loop
+          EventMachine::stop_event_loop if EventMachine::reactor_running?
           exit
         rescue StandardError => e
           STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
@@ -70,13 +70,9 @@ module TwitterToCsv
         # This is an original status.
         if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
           if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
-            status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
+            status['retweet_count'] = @retweet_counts[status['id']] || 0 # if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
             if options[:retweet_counts_at]
-              retweet_hour_data = @retweet_hour_counts.delete(status['id'])
-              if !retweet_hour_data
-                puts "Encountered missing retweet_data for tweet##{status['id']}, possibly due to a repeating id or a deleted tweet."
-                return false
-              end
+              retweet_hour_data = @retweet_hour_counts.delete(status['id']) || options[:retweet_counts_at].map { 0 }
               status['_retweet_hour_counts'] = retweet_hour_data
             end
             true
@@ -90,8 +86,10 @@ module TwitterToCsv
     end
     def handle_status(status, &block)
-      if within_time_window?(status)
-        if (options[:require_english] && is_english?(status)) || !options[:require_english]
+      if status.has_key?('delete')
+        STDERR.puts "Skipping Tweet with delete." if options[:verbose]
+      elsif within_time_window?(status)
+        if (options[:require_english] && is_english?(status, options[:require_english])) || !options[:require_english]
           if options[:retweet_mode] != :rollup || display_rolledup_status?(status)
             log_json(status) if options[:json]
             log_csv(status) if options[:csv]
@@ -124,6 +122,10 @@ module TwitterToCsv
       options[:hashtag_columns].times { |i| header_labels << "hash_tag_#{i+1}" } if options[:hashtag_columns] && options[:url_columns] > 0
       options[:user_mention_columns].times { |i| header_labels << "user_mention_#{i+1}" } if options[:user_mention_columns] && options[:user_mention_columns] > 0
+      (options[:bool_word_fields] || []).each do |pattern|
+        header_labels << pattern[:name]
+      end
       options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
     end
@@ -137,9 +139,19 @@ module TwitterToCsv
     def output_row(status)
       row = options[:fields].map do |field|
-        field.split(".").inject(status) { |memo, segment|
+        value = field.split(".").inject(status) { |memo, segment|
           memo && memo[segment]
         }.to_s
+        if options[:prefix_ids]
+          value = "id" + value if value.length > 0 && (field =~ /\Aid_str|id\Z/ || field =~ /_id|_id_str\Z/)
+        end
+        if options[:remove_quotes]
+          value = value.gsub(/\"/, '')
+        end
+        value
       end
       row += compute_sentiment(status["text"]) if options[:compute_sentiment]
@@ -151,7 +163,7 @@ module TwitterToCsv
       (options[:date_fields] || []).each do |date_field|
         time = Time.parse(date_field.split(".").inject(status) { |memo, segment|
           memo && memo[segment]
-        }.to_s)
+        }.to_s).utc
         row << time.strftime("%w") # week_day
         row << time.strftime("%-d") # day
@@ -179,6 +191,10 @@ module TwitterToCsv
         options[:user_mention_columns].times { |i| row << users[i].to_s }
       end
+      (options[:bool_word_fields] || []).each do |pattern|
+        row << (!!TwitterToCsv::BoolWordFieldParser.check(pattern, status["text"])).to_s
+      end
       row
     end
@@ -239,24 +255,30 @@ module TwitterToCsv
     end
     def sample_fields(status)
-      extract_fields(status, sampled_fields)
+      extract_fields status, sampled_fields
       @num_samples += 1
       if @num_samples > options[:sample_fields]
         puts "Sampled fields from Twitter:"
         sampled_fields.each do |field, count|
           puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
         end
-        exit 1
+        exit 0
       end
     end
-    def extract_fields(object, fields, current_path = [])
+    def extract_fields(object, fields, current_path = "")
       if object.is_a?(Hash)
         object.each do |k, v|
-          extract_fields v, fields, current_path + [k]
+          extract_fields v, fields, current_path + "." + k.to_s
+        end
+      elsif object.is_a?(Array)
+        local_fields = {}
+        object.each do |v|
+          extract_fields v, local_fields, current_path + "[]"
         end
+        local_fields.keys.each { |key| fields[key] ||= 0 ; fields[key] += 1 }
       else
-        path = current_path.join(".")
+        path = current_path[1..-1]
         fields[path] ||= 0
         fields[path] += 1
       end
@@ -268,19 +290,19 @@ module TwitterToCsv
       options[:json].flush
     end
-    def is_english?(status)
-      if status.has_key?('delete')
-        STDERR.puts "Skipping Tweet with delete." if options[:verbose]
-        return false
+    def is_english?(status, strategy)
+      unless strategy == :twitter
+        status['uld'] = !!UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
       end
-      #unless status['user']['lang'] == "en"
-      #  STDERR.puts "Skipping \"#{status['text']}\" due to lang of #{status['user']['lang']}." if options[:verbose]
-      #  return false
-      #end
-      unless UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
-        STDERR.puts "Skipping \"#{status['text']}\" due to UnsupervisedLanguageDetection guessing non-English" if options[:verbose]
+      if strategy == :both && status['lang'] != 'en' && !status['uld']
+        STDERR.puts "Skipping \"#{status['text']}\" because both Twitter (#{status['lang']}) and UnsupervisedLanguageDetection think it is not English." if options[:verbose]
+        return false
+      elsif strategy == :uld && !status['uld']
+        STDERR.puts "Skipping \"#{status['text']}\" because UnsupervisedLanguageDetection thinks it is not English." if options[:verbose]
+        return false
+      elsif strategy == :twitter && status['lang'] != 'en'
+        STDERR.puts "Skipping \"#{status['text']}\" because Twitter (#{status['lang']}) thinks it is not English." if options[:verbose]
         return false
       end

data/lib/twitter_to_csv/twitter_watcher.rb CHANGED

@@ -2,11 +2,13 @@ require 'cgi'
 module TwitterToCsv
   class TwitterWatcher
-    attr_accessor :username, :password, :filter, :fetch_errors
+    attr_accessor :api_key, :api_secret, :access_token, :access_token_secret, :filter, :fetch_errors
     def initialize(options)
-      @username = options[:username]
-      @password = options[:password]
+      @api_key = options[:api_key]
+      @api_secret = options[:api_secret]
+      @access_token = options[:access_token]
+      @access_token_secret = options[:access_token_secret]
       @filter = options[:filter]
       @fetch_errors = 0
     end
@@ -23,8 +25,13 @@ module TwitterToCsv
         EventMachine::run do
           stream = Twitter::JSONStream.connect(
             :path    => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.map {|f| CGI::escape(f) }.join(",")}" if filter && filter.length > 0}",
-            :auth    => "#{username}:#{password}",
-            :ssl     => true
+            :ssl     => true,
+            :oauth   => {
+              :consumer_key    => api_key,
+              :consumer_secret => api_secret,
+              :access_key      => access_token,
+              :access_secret   => access_token_secret
+            }
           )
           stream.each_item do |item|

data/lib/twitter_to_csv/version.rb CHANGED

@@ -1,3 +1,3 @@
 module TwitterToCsv
-  VERSION = "0.1.2"
+  VERSION = "0.1.5"
 end

data/spec/bool_word_field_parser_spec.rb ADDED

@@ -0,0 +1,57 @@
+# encoding: utf-8
+require 'spec_helper'
+require 'time'
+describe TwitterToCsv::BoolWordFieldParser do
+  describe "#parse" do
+    it "parses name:string AND string AND string... syntax" do
+      TwitterToCsv::BoolWordFieldParser.parse("something:string1 string2 AND string3 AND string4").should == {
+          :name => "something",
+          :logic => ["string1 string2", :and, ["string3", :and, ["string4"]]]
+      }
+    end
+    it "parses name:string OR string OR string... syntax" do
+      TwitterToCsv::BoolWordFieldParser.parse("something:string1 string2 OR string3 OR string4").should == {
+          :name => "something",
+          :logic => ["string1 string2", :or, ["string3", :or, ["string4"]]]
+      }
+    end
+    it "parses parens" do
+      TwitterToCsv::BoolWordFieldParser.parse("something_else:string1   STRING2 OR ( string3 AND (string4 OR string5 ))").should == {
+          :name => "something_else",
+          :logic => ["string1 string2", :or, ["string3", :and, ["string4", :or, ["string5"]]]]
+      }
+    end
+  end
+  describe "#check" do
+    it "returns true when an expression matches some text, false when it doesn't" do
+      pattern = TwitterToCsv::BoolWordFieldParser.parse("something_else:string1 string2 OR (string3 AND (string4 OR string5))")
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2").should be_true
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "string2 string1").should be_false
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "string1").should be_false
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "string2").should be_false
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "string3 string4").should be_true
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "string4 string3").should be_true
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "string5 string3").should be_true
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string3 string5 baz").should be_true
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string5 baz").should be_false
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string3 string4 string5 baz").should be_true
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string3 string5 baz string4").should be_true
+      TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2 string3 string4").should be_true
+    end
+    it "raises errors when the input is un-evaluable" do
+      pattern = TwitterToCsv::BoolWordFieldParser.parse("something_else:string1 string2 OR (string3 AND OR string5))")
+      lambda { TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
+      pattern = TwitterToCsv::BoolWordFieldParser.parse("hello (")
+      lambda { TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
+      pattern = TwitterToCsv::BoolWordFieldParser.parse("hello ()")
+      lambda { TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
+    end
+  end
+end

data/spec/csv_builder_spec.rb CHANGED

@@ -4,35 +4,73 @@ require 'time'
 describe TwitterToCsv::CsvBuilder do
   describe "#handle_status" do
-    describe "when :english is set" do
-      it "skips non-English tweets" do
-        string_io = StringIO.new
-        csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => true, :csv => string_io, :fields => %w[text])
-        csv_builder.handle_status('text' => "This is English", 'user' =>  { 'lang' => 'en' })
-        csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'user' =>  { 'lang' => 'en' })
-        csv_builder.handle_status('text' => "This is still English", 'user' =>  { 'lang' => 'en' })
-        string_io.rewind
-        string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
+    describe "English language detection" do
+      describe "with the :uld strategy" do
+        it "uses the UnsupervisedLanguageDetection library to skip non-English tweets" do
+          string_io = StringIO.new
+          csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text])
+          csv_builder.handle_status('text' => "This is English")
+          csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
+          csv_builder.handle_status('text' => "This is still English")
+          string_io.rewind
+          string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
+        end
+        it "makes a new 'uld' variable available on the tweet" do
+          string_io = StringIO.new
+          csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text uld])
+          csv_builder.handle_status('text' => "This is English")
+          csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
+          csv_builder.handle_status('text' => "This is still English")
+          string_io.rewind
+          string_io.read.should == "\"This is English\",\"true\"\n\"This is still English\",\"true\"\n"
+        end
       end
-      it "honors start_time and end_time" do
-        string_io = StringIO.new
-        csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
-                                                   :start_time => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
-                                                   :end_time   => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
-        # Order shouldn't matter
-        csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
-        csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
-        csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
-        csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
-        csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
-        csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
-        string_io.rewind
-        string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
+      describe "with the :twitter strategy" do
+        it "uses Twitter's lang field to skip non-English tweets" do
+          string_io = StringIO.new
+          csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :twitter, :csv => string_io, :fields => %w[text])
+          csv_builder.handle_status('text' => "This is English", 'lang' => 'en')
+          csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'fr')
+          csv_builder.handle_status('text' => "This is still English", 'lang' => 'en')
+          string_io.rewind
+          string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
+        end
+      end
+      describe "with the :both strategy" do
+        it "returns tweets unless both ULD and Twitter agree that the Tweet is non-English" do
+          string_io = StringIO.new
+          csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :both, :csv => string_io, :fields => %w[text])
+          csv_builder.handle_status('text' => "This is English", 'lang' => 'en') # agree
+          csv_builder.handle_status('text' => "This is English 2", 'lang' => 'fr') # disagree
+          csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'en') # disagree
+          csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles 2.", 'lang' => 'fr') # agree
+          csv_builder.handle_status('text' => "This is still English", 'lang' => 'en') # agree
+          string_io.rewind
+          string_io.read.should == "\"This is English\"\n\"This is English 2\"\n\"Esta frase se encuentra en Ingles.\"\n\"This is still English\"\n"
+        end
       end
     end
+    it "honors start_time and end_time" do
+      string_io = StringIO.new
+      csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
+                                                 :start_time => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
+                                                 :end_time   => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
+      # Order shouldn't matter
+      csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
+      csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
+      csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
+      csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
+      csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
+      csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
+      string_io.rewind
+      string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
+    end
     describe "log_csv_header" do
       it "outputs the fields as header labels" do
         string_io = StringIO.new
@@ -169,17 +207,42 @@ describe TwitterToCsv::CsvBuilder do
                                  "\"hello2\",\"2\"\n"
       end
-      it "can return date fields" do
+      it "can extract boolean word fields" do
+        string_io = StringIO.new
+        patterns = [
+            TwitterToCsv::BoolWordFieldParser.parse("field1:hello AND world"),
+            TwitterToCsv::BoolWordFieldParser.parse("field2:hello"),
+            TwitterToCsv::BoolWordFieldParser.parse("field3:string OR text"),
+            TwitterToCsv::BoolWordFieldParser.parse("field3:hello this")
+        ]
+        csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :bool_word_fields => patterns)
+        csv_builder.handle_status({
+            'something' => "hello1",
+            'text' => 'hello this is a string'
+        })
+        csv_builder.handle_status({
+            'something' => "hello2",
+            'text' => 'hello world this is some text'
+        })
+        string_io.rewind
+        string_io.read.should == "\"hello1\",\"false\",\"true\",\"true\",\"true\"\n" +
+                                 "\"hello2\",\"true\",\"true\",\"true\",\"false\"\n"
+      end
+      it "can return date fields and convert them to UTC" do
         string_io = StringIO.new
         csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :date_fields => %w[created_at])
         csv_builder.handle_status({
             'something' => "hello1",
             'text' => 'i love cheese',
             'created_at' => "2012-06-29 13:12:09 -0700"
         })
         string_io.rewind
-        string_io.read.should == "\"hello1\",\"5\",\"29\",\"6\",\"2012\",\"13\",\"12\",\"09\"\n"
+        time = Time.parse("2012-06-29 13:12:09 -0700").utc
+        string_io.read.should == '"' + ["hello1", time.strftime("%w"), time.strftime("%-d"),
+                                        time.strftime("%-m"), time.strftime("%Y"), time.strftime("%-H"),
+                                        time.strftime("%M"), time.strftime("%S")].join('","') + "\"\n"
       end
       it "can return a normalized source" do
@@ -349,24 +412,33 @@ describe TwitterToCsv::CsvBuilder do
   end
   describe "#extract_fields" do
-    it "finds all the paths through a hash" do
+    it "finds all the paths through a structure" do
       obj = {
           :a => :b,
           :b => "c",
           :d => {
               :e => :f,
-              :g => {
+              :g => [
+                {
                   :h => :i,
                   :j => {
-                      :k => "l"
+                    :k => "l"
                   }
-              },
+                },
+                {
+                  :h => :i,
+                  :j => {
+                    :m => "n"
+                  },
+                  :hi => 2
+                }
+              ],
               :m => "n"
           }
       }
       fields = { "a" => 1 }
       TwitterToCsv::CsvBuilder.new.extract_fields(obj, fields)
-      fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g.h" => 1, "d.g.j.k" => 1, "d.m" => 1 }
+      fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g[].h" => 1, "d.g[].j.k" => 1, "d.g[].j.m" => 1, "d.g[].hi" => 1, "d.m" => 1 }
     end
   end
 end