twitter_to_csv 0.1.2 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +4 -0
- data/Gemfile +3 -1
- data/README.markdown +316 -153
- data/bin/twitter_to_csv +45 -10
- data/lib/twitter_to_csv.rb +1 -0
- data/lib/twitter_to_csv/bool_word_field_parser.rb +74 -0
- data/lib/twitter_to_csv/csv_builder.rb +50 -28
- data/lib/twitter_to_csv/twitter_watcher.rb +12 -5
- data/lib/twitter_to_csv/version.rb +1 -1
- data/spec/bool_word_field_parser_spec.rb +57 -0
- data/spec/csv_builder_spec.rb +104 -32
- data/twitter_to_csv.gemspec +1 -1
- metadata +14 -21
- data/.rvmrc +0 -1
    
        data/bin/twitter_to_csv
    CHANGED
    
    | @@ -5,20 +5,32 @@ require 'optparse' | |
| 5 5 | 
             
            require 'time'
         | 
| 6 6 | 
             
            require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
         | 
| 7 7 |  | 
| 8 | 
            -
            options = { :fields => %w[created_at text] }
         | 
| 8 | 
            +
            options = { :fields => %w[created_at text], :bool_word_fields => [] }
         | 
| 9 9 | 
             
            parser = OptionParser.new do |opts|
         | 
| 10 10 | 
             
              opts.banner = "Usage: #{File.basename($0)} [options]"
         | 
| 11 | 
            +
             | 
| 11 12 | 
             
              opts.separator ""
         | 
| 12 | 
            -
              opts.separator " | 
| 13 | 
            +
              opts.separator "These four fields are required.  Please see the README to learn how to get them for your Twitter account."
         | 
| 14 | 
            +
             | 
| 15 | 
            +
              opts.on("--api-key KEY", "Twitter API key") do |api_key|
         | 
| 16 | 
            +
                options[:api_key] = api_key
         | 
| 17 | 
            +
              end
         | 
| 13 18 |  | 
| 14 | 
            -
              opts.on("- | 
| 15 | 
            -
                options[: | 
| 19 | 
            +
              opts.on("--api-secret SECRET", "Twitter API secret") do |api_secret|
         | 
| 20 | 
            +
                options[:api_secret] = api_secret
         | 
| 16 21 | 
             
              end
         | 
| 17 22 |  | 
| 18 | 
            -
              opts.on("- | 
| 19 | 
            -
                options[: | 
| 23 | 
            +
              opts.on("--access-token TOKEN", "Twitter access token") do |access_token|
         | 
| 24 | 
            +
                options[:access_token] = access_token
         | 
| 20 25 | 
             
              end
         | 
| 21 26 |  | 
| 27 | 
            +
              opts.on("--access-token-secret SECRET", "Twitter access token secret") do |access_token_secret|
         | 
| 28 | 
            +
                options[:access_token_secret] = access_token_secret
         | 
| 29 | 
            +
              end
         | 
| 30 | 
            +
             | 
| 31 | 
            +
              opts.separator ""
         | 
| 32 | 
            +
              opts.separator "General settings:"
         | 
| 33 | 
            +
             | 
| 22 34 | 
             
              opts.on("-c", "--csv FILE", "The CSV file to append to, or - for STDOUT") do |csv|
         | 
| 23 35 | 
             
                options[:csv_appending] = File.exists?(csv)
         | 
| 24 36 | 
             
                options[:csv] = csv == "-" ? STDOUT : File.open(csv, 'a')
         | 
| @@ -40,8 +52,12 @@ parser = OptionParser.new do |opts| | |
| 40 52 | 
             
                options[:date_fields] = date_fields.split(/\s*,\s*/)
         | 
| 41 53 | 
             
              end
         | 
| 42 54 |  | 
| 43 | 
            -
              opts.on("-e", "--require-english | 
| 44 | 
            -
             | 
| 55 | 
            +
              opts.on("-e", "--require-english [STRATEGY]", 
         | 
| 56 | 
            +
                            "Attempt to filter out non-English tweets. This will have both false positives and false negatives.",
         | 
| 57 | 
            +
                            "The strategy can be either 'uld' to use the UnsupervisedLanguageDetection Ruby gem,",
         | 
| 58 | 
            +
                            "'lang' to use Twitter's guessed 'lang' attribute, or 'both' to only remove tweets that",
         | 
| 59 | 
            +
                            "both Twitter and ULD think are non-English.  This is most conservative and is the default.") do |e|
         | 
| 60 | 
            +
                options[:require_english] = (e || "both").downcase.to_sym
         | 
| 45 61 | 
             
              end
         | 
| 46 62 |  | 
| 47 63 | 
             
              opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
         | 
| @@ -84,6 +100,25 @@ parser = OptionParser.new do |opts| | |
| 84 100 | 
             
                options[:normalize_source] = normalize_source
         | 
| 85 101 | 
             
              end
         | 
| 86 102 |  | 
| 103 | 
            +
              opts.on("--remove-quotes", "This option strips all double quotes from the output to help some CSV parsers.") do |remove_quotes|
         | 
| 104 | 
            +
                options[:remove_quotes] = remove_quotes
         | 
| 105 | 
            +
              end
         | 
| 106 | 
            +
             | 
| 107 | 
            +
              opts.on("--prefix-ids", "Prefix any field ending in _id or _id_str with 'id' to force parsing as a string in some programs.") do |prefix_ids|
         | 
| 108 | 
            +
                options[:prefix_ids] = prefix_ids
         | 
| 109 | 
            +
              end
         | 
| 110 | 
            +
             | 
| 111 | 
            +
              opts.on("-w", "--bool-word-field \"NAME:WORD AND WORD AND WORD\"",
         | 
| 112 | 
            +
                      "Create a named CSV column that is true when the word expression matches, false otherwise.",
         | 
| 113 | 
            +
                      "Word expressions are boolean expressions where neighboring words must occur sequentially",
         | 
| 114 | 
            +
                      "and you can use parentheses, AND, and OR to test for occurrence relationships.  Examples:",
         | 
| 115 | 
            +
                      "  keyword_any:tanning booth OR tanning booths OR tanningbooth",
         | 
| 116 | 
            +
                      "  keyword_both:tanning AND booth",
         | 
| 117 | 
            +
                      "  keyword_complex:tanning AND (booth OR bed)",
         | 
| 118 | 
            +
                      "This option can be used multiple times.") do |bool_word_field|
         | 
| 119 | 
            +
                options[:bool_word_fields] << TwitterToCsv::BoolWordFieldParser.parse(bool_word_field)
         | 
| 120 | 
            +
              end
         | 
| 121 | 
            +
             | 
| 87 122 | 
             
              opts.on("--start TIME", "Ignore tweets with a created_at earlier than TIME") do |start_time|
         | 
| 88 123 | 
             
                options[:start_time] = Time.parse(start_time)
         | 
| 89 124 | 
             
              end
         | 
| @@ -125,8 +160,8 @@ end | |
| 125 160 |  | 
| 126 161 | 
             
            parser.parse!
         | 
| 127 162 |  | 
| 128 | 
            -
            unless (options[: | 
| 129 | 
            -
              STDERR.puts "Error: Twitter  | 
| 163 | 
            +
            unless (options[:api_key] && options[:api_secret] && options[:access_token] && options[:access_token_secret]) || options[:replay_from_file]
         | 
| 164 | 
            +
              STDERR.puts "Error: The four Twitter credential fields are required unless you're replaying from a file.\n\n"
         | 
| 130 165 | 
             
              STDERR.puts parser
         | 
| 131 166 | 
             
              exit 1
         | 
| 132 167 | 
             
            end
         | 
    
        data/lib/twitter_to_csv.rb
    CHANGED
    
    | @@ -7,6 +7,7 @@ require 'twitter/json_stream' | |
| 7 7 | 
             
            require 'em-http-request'
         | 
| 8 8 | 
             
            require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "twitter_watcher"))
         | 
| 9 9 | 
             
            require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "csv_builder"))
         | 
| 10 | 
            +
            require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "bool_word_field_parser"))
         | 
| 10 11 | 
             
            require 'unsupervised-language-detection'
         | 
| 11 12 |  | 
| 12 13 | 
             
            module TwitterToCsv
         | 
| @@ -0,0 +1,74 @@ | |
# encoding: UTF-8

module TwitterToCsv
  # Raised when a boolean word expression is malformed (at parse time) or when
  # a logic structure handed to the checker has an unexpected shape.
  class InvalidLogicError < StandardError; end

  # Parses "NAME:WORD EXPRESSION" strings into a nested logic structure and
  # evaluates that structure against a piece of text.
  #
  # A logic structure is an Array of one of two shapes:
  #   [operand]                        - a single operand
  #   [operand, :and | :or, structure] - a binary operation, nested to the right
  # where an operand is either a String of space-separated lowercase words that
  # must occur sequentially, or another logic structure (a parenthesized group).
  class BoolWordFieldParser
    # Runs of characters other than ASCII letters, digits and hyphens separate
    # the words of the text being checked.
    TOKEN_SEPARATOR = /[^a-zA-Z0-9-]+/

    # Operator keywords (matched case-sensitively) and their logic symbols.
    OPERATORS = { "AND" => :and, "OR" => :or }.freeze

    # Parses a "NAME:EXPRESSION" definition.
    #
    # string - text up to the first ":" is the column name; the remainder is
    #          the word expression built from words, AND, OR and parentheses.
    #
    # Returns { :name => String, :logic => Array }.
    # Raises InvalidLogicError when the expression is empty, has a dangling
    # operator, or has tokens that cannot be attached to the expression.
    def self.parse(string)
      parts = string.split(":")
      name = parts.shift
      # Pad parentheses with spaces so they always become separate tokens.
      tokens = parts.join(":").gsub(/[()]/) { |paren| " #{paren} " }.split(/\s+/).reject { |s| s.empty? }
      struct = []
      descend_parse(struct, tokens)

      # Surplus closing parentheses at the very end are tolerated; anything
      # else left over means the expression is malformed.
      tokens.shift while tokens.first == ")"
      unless tokens.empty?
        raise InvalidLogicError.new("Unexpected #{tokens.first.inspect} in word expression for #{name.inspect}")
      end

      { :name => name, :logic => struct }
    end

    # Parses one expression from the front of +tokens+ (consuming them) and
    # appends the resulting logic structure's elements to +struct+.
    #
    # Returns struct.
    def self.descend_parse(struct, tokens)
      struct.concat(parse_expression(tokens))
      struct
    end

    # expression := operand [ (AND | OR) expression ]
    #
    # Operators nest to the right without precedence, so "a AND b OR c" means
    # "a AND (b OR c)". Returns a logic structure of length 1 or 3.
    def self.parse_expression(tokens)
      operand = parse_operand(tokens)
      operator = OPERATORS[tokens.first]

      if operator.nil?
        # A lone parenthesized group already is a complete structure.
        return operand.is_a?(Array) ? operand : [operand]
      end

      tokens.shift
      # "(word) AND ..." needs no extra nesting around the single word.
      operand = operand.first if operand.is_a?(Array) && operand.length == 1
      [operand, operator, parse_expression(tokens)]
    end

    # operand := "(" expression ")" | word+
    #
    # Returns a logic structure for a group, or a String of lowercased words
    # joined by single spaces.
    def self.parse_operand(tokens)
      if tokens.first == "("
        tokens.shift
        group = parse_expression(tokens)
        # A missing ")" at the end of the input is tolerated.
        tokens.shift if tokens.first == ")"
        return group
      end

      words = []
      words << tokens.shift.downcase while word_token?(tokens.first)
      if words.empty?
        found = tokens.empty? ? "end of expression" : tokens.first.inspect
        raise InvalidLogicError.new("Expected a word or '(' but found #{found}")
      end
      words.join(" ")
    end

    # True when +token+ is a plain word rather than nil, a parenthesis or an
    # operator keyword.
    def self.word_token?(token)
      !token.nil? && token != "(" && token != ")" && !OPERATORS.key?(token)
    end

    private_class_method :parse_expression, :parse_operand, :word_token?

    # Tests whether +text+ satisfies a parsed pattern.
    #
    # pattern - Hash as returned by parse (only :logic is read).
    # text    - String to test; it is lowercased and reduced to its words
    #           joined by single spaces before matching.
    #
    # Returns true or false.
    # Raises InvalidLogicError when the logic structure is malformed.
    def self.check(pattern, text)
      logic = pattern[:logic]
      tokens = text.downcase.split(TOKEN_SEPARATOR).reject {|t| t.length == 0 }.join(" ")
      !!descend_check(logic, tokens)
    end

    # Recursively evaluates +logic+ against the normalized +tokens+ string.
    #
    # Returns a truthy value on match, a falsy value otherwise.
    # Raises InvalidLogicError for an unknown operator or an Array whose
    # length is neither 1 nor 3.
    def self.descend_check(logic, tokens)
      if logic.is_a?(String)
        # See if the token(s) are present.
        # NOTE(review): \b treats "-" as a word boundary although
        # TOKEN_SEPARATOR keeps hyphens inside words, so "well" matches within
        # "well-known" - confirm this is intended.
        tokens =~ /\b#{Regexp::escape logic}\b/
      elsif logic.length == 1
        # Recurse further.
        descend_check logic.first, tokens
      elsif logic.length == 3
        # Apply the given logical operation. Both sides are evaluated up front
        # so that a malformed operand is reported regardless of the other side.
        first = descend_check(logic.first, tokens)
        last = descend_check(logic.last, tokens)
        if logic[1] == :and
          first && last
        elsif logic[1] == :or
          first || last
        else
          raise InvalidLogicError.new("Unknown operation: #{logic[1]}")
        end
      else
        raise InvalidLogicError.new("Invalid expression length of #{logic.length}")
      end
    end
  end
end
| @@ -25,7 +25,7 @@ module TwitterToCsv | |
| 25 25 | 
             
                        handle_status status
         | 
| 26 26 | 
             
                      end
         | 
| 27 27 | 
             
                    rescue SignalException, SystemExit
         | 
| 28 | 
            -
                      EventMachine::stop_event_loop
         | 
| 28 | 
            +
                      EventMachine::stop_event_loop if EventMachine::reactor_running?
         | 
| 29 29 | 
             
                      exit
         | 
| 30 30 | 
             
                    rescue StandardError => e
         | 
| 31 31 | 
             
                      STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
         | 
| @@ -70,13 +70,9 @@ module TwitterToCsv | |
| 70 70 | 
             
                    # This is an original status.
         | 
| 71 71 | 
             
                    if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
         | 
| 72 72 | 
             
                      if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
         | 
| 73 | 
            -
                        status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
         | 
| 73 | 
            +
                        status['retweet_count'] = @retweet_counts[status['id']] || 0 # if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
         | 
| 74 74 | 
             
                        if options[:retweet_counts_at]
         | 
| 75 | 
            -
                          retweet_hour_data = @retweet_hour_counts.delete(status['id'])
         | 
| 76 | 
            -
                          if !retweet_hour_data
         | 
| 77 | 
            -
                            puts "Encountered missing retweet_data for tweet##{status['id']}, possibly due to a repeating id or a deleted tweet."
         | 
| 78 | 
            -
                            return false
         | 
| 79 | 
            -
                          end
         | 
| 75 | 
            +
                          retweet_hour_data = @retweet_hour_counts.delete(status['id']) || options[:retweet_counts_at].map { 0 }
         | 
| 80 76 | 
             
                          status['_retweet_hour_counts'] = retweet_hour_data
         | 
| 81 77 | 
             
                        end
         | 
| 82 78 | 
             
                        true
         | 
| @@ -90,8 +86,10 @@ module TwitterToCsv | |
| 90 86 | 
             
                end
         | 
| 91 87 |  | 
| 92 88 | 
             
                def handle_status(status, &block)
         | 
| 93 | 
            -
                  if  | 
| 94 | 
            -
                     | 
| 89 | 
            +
                  if status.has_key?('delete')
         | 
| 90 | 
            +
                    STDERR.puts "Skipping Tweet with delete." if options[:verbose]
         | 
| 91 | 
            +
                  elsif within_time_window?(status)
         | 
| 92 | 
            +
                    if (options[:require_english] && is_english?(status, options[:require_english])) || !options[:require_english]
         | 
| 95 93 | 
             
                      if options[:retweet_mode] != :rollup || display_rolledup_status?(status)
         | 
| 96 94 | 
             
                        log_json(status) if options[:json]
         | 
| 97 95 | 
             
                        log_csv(status) if options[:csv]
         | 
| @@ -124,6 +122,10 @@ module TwitterToCsv | |
| 124 122 | 
             
                  options[:hashtag_columns].times { |i| header_labels << "hash_tag_#{i+1}" } if options[:hashtag_columns] && options[:url_columns] > 0
         | 
| 125 123 | 
             
                  options[:user_mention_columns].times { |i| header_labels << "user_mention_#{i+1}" } if options[:user_mention_columns] && options[:user_mention_columns] > 0
         | 
| 126 124 |  | 
| 125 | 
            +
                  (options[:bool_word_fields] || []).each do |pattern|
         | 
| 126 | 
            +
                    header_labels << pattern[:name]
         | 
| 127 | 
            +
                  end
         | 
| 128 | 
            +
             | 
| 127 129 | 
             
                  options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
         | 
| 128 130 | 
             
                end
         | 
| 129 131 |  | 
| @@ -137,9 +139,19 @@ module TwitterToCsv | |
| 137 139 |  | 
| 138 140 | 
             
                def output_row(status)
         | 
| 139 141 | 
             
                  row = options[:fields].map do |field|
         | 
| 140 | 
            -
                    field.split(".").inject(status) { |memo, segment|
         | 
| 142 | 
            +
                    value = field.split(".").inject(status) { |memo, segment|
         | 
| 141 143 | 
             
                      memo && memo[segment]
         | 
| 142 144 | 
             
                    }.to_s
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                    if options[:prefix_ids]
         | 
| 147 | 
            +
                      value = "id" + value if value.length > 0 && (field =~ /\Aid_str|id\Z/ || field =~ /_id|_id_str\Z/)
         | 
| 148 | 
            +
                    end
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                    if options[:remove_quotes]
         | 
| 151 | 
            +
                      value = value.gsub(/\"/, '')
         | 
| 152 | 
            +
                    end
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                    value
         | 
| 143 155 | 
             
                  end
         | 
| 144 156 |  | 
| 145 157 | 
             
                  row += compute_sentiment(status["text"]) if options[:compute_sentiment]
         | 
| @@ -151,7 +163,7 @@ module TwitterToCsv | |
| 151 163 | 
             
                  (options[:date_fields] || []).each do |date_field|
         | 
| 152 164 | 
             
                    time = Time.parse(date_field.split(".").inject(status) { |memo, segment|
         | 
| 153 165 | 
             
                      memo && memo[segment]
         | 
| 154 | 
            -
                    }.to_s)
         | 
| 166 | 
            +
                    }.to_s).utc
         | 
| 155 167 |  | 
| 156 168 | 
             
                    row << time.strftime("%w") # week_day
         | 
| 157 169 | 
             
                    row << time.strftime("%-d") # day
         | 
| @@ -179,6 +191,10 @@ module TwitterToCsv | |
| 179 191 | 
             
                    options[:user_mention_columns].times { |i| row << users[i].to_s }
         | 
| 180 192 | 
             
                  end
         | 
| 181 193 |  | 
| 194 | 
            +
                  (options[:bool_word_fields] || []).each do |pattern|
         | 
| 195 | 
            +
                    row << (!!TwitterToCsv::BoolWordFieldParser.check(pattern, status["text"])).to_s
         | 
| 196 | 
            +
                  end
         | 
| 197 | 
            +
             | 
| 182 198 | 
             
                  row
         | 
| 183 199 | 
             
                end
         | 
| 184 200 |  | 
| @@ -239,24 +255,30 @@ module TwitterToCsv | |
| 239 255 | 
             
                end
         | 
| 240 256 |  | 
| 241 257 | 
             
                def sample_fields(status)
         | 
| 242 | 
            -
                  extract_fields | 
| 258 | 
            +
                  extract_fields status, sampled_fields
         | 
| 243 259 | 
             
                  @num_samples += 1
         | 
| 244 260 | 
             
                  if @num_samples > options[:sample_fields]
         | 
| 245 261 | 
             
                    puts "Sampled fields from Twitter:"
         | 
| 246 262 | 
             
                    sampled_fields.each do |field, count|
         | 
| 247 263 | 
             
                      puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
         | 
| 248 264 | 
             
                    end
         | 
| 249 | 
            -
                    exit  | 
| 265 | 
            +
                    exit 0
         | 
| 250 266 | 
             
                  end
         | 
| 251 267 | 
             
                end
         | 
| 252 268 |  | 
| 253 | 
            -
                def extract_fields(object, fields, current_path =  | 
| 269 | 
            +
                def extract_fields(object, fields, current_path = "")
         | 
| 254 270 | 
             
                  if object.is_a?(Hash)
         | 
| 255 271 | 
             
                    object.each do |k, v|
         | 
| 256 | 
            -
                      extract_fields v, fields, current_path +  | 
| 272 | 
            +
                      extract_fields v, fields, current_path + "." + k.to_s
         | 
| 273 | 
            +
                    end
         | 
| 274 | 
            +
                  elsif object.is_a?(Array)
         | 
| 275 | 
            +
                    local_fields = {}
         | 
| 276 | 
            +
                    object.each do |v|
         | 
| 277 | 
            +
                      extract_fields v, local_fields, current_path + "[]"
         | 
| 257 278 | 
             
                    end
         | 
| 279 | 
            +
                    local_fields.keys.each { |key| fields[key] ||= 0 ; fields[key] += 1 }
         | 
| 258 280 | 
             
                  else
         | 
| 259 | 
            -
                    path = current_path | 
| 281 | 
            +
                    path = current_path[1..-1]
         | 
| 260 282 | 
             
                    fields[path] ||= 0
         | 
| 261 283 | 
             
                    fields[path] += 1
         | 
| 262 284 | 
             
                  end
         | 
| @@ -268,19 +290,19 @@ module TwitterToCsv | |
| 268 290 | 
             
                  options[:json].flush
         | 
| 269 291 | 
             
                end
         | 
| 270 292 |  | 
| 271 | 
            -
                def is_english?(status)
         | 
| 272 | 
            -
                   | 
| 273 | 
            -
                     | 
| 274 | 
            -
                    return false
         | 
| 293 | 
            +
                def is_english?(status, strategy)
         | 
| 294 | 
            +
                  unless strategy == :twitter
         | 
| 295 | 
            +
                    status['uld'] = !!UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
         | 
| 275 296 | 
             
                  end
         | 
| 276 | 
            -
             | 
| 277 | 
            -
                   | 
| 278 | 
            -
             | 
| 279 | 
            -
             | 
| 280 | 
            -
                   | 
| 281 | 
            -
             | 
| 282 | 
            -
             | 
| 283 | 
            -
             | 
| 297 | 
            +
                  
         | 
| 298 | 
            +
                  if strategy == :both && status['lang'] != 'en' && !status['uld']
         | 
| 299 | 
            +
                    STDERR.puts "Skipping \"#{status['text']}\" because both Twitter (#{status['lang']}) and UnsupervisedLanguageDetection think it is not English." if options[:verbose]
         | 
| 300 | 
            +
                    return false
         | 
| 301 | 
            +
                  elsif strategy == :uld && !status['uld']
         | 
| 302 | 
            +
                    STDERR.puts "Skipping \"#{status['text']}\" because UnsupervisedLanguageDetection thinks it is not English." if options[:verbose]
         | 
| 303 | 
            +
                    return false
         | 
| 304 | 
            +
                  elsif strategy == :twitter && status['lang'] != 'en'
         | 
| 305 | 
            +
                    STDERR.puts "Skipping \"#{status['text']}\" because Twitter (#{status['lang']}) thinks it is not English." if options[:verbose]
         | 
| 284 306 | 
             
                    return false
         | 
| 285 307 | 
             
                  end
         | 
| 286 308 |  | 
| @@ -2,11 +2,13 @@ require 'cgi' | |
| 2 2 |  | 
| 3 3 | 
             
            module TwitterToCsv
         | 
| 4 4 | 
             
              class TwitterWatcher
         | 
| 5 | 
            -
                attr_accessor : | 
| 5 | 
            +
                attr_accessor :api_key, :api_secret, :access_token, :access_token_secret, :filter, :fetch_errors
         | 
| 6 6 |  | 
| 7 7 | 
             
                def initialize(options)
         | 
| 8 | 
            -
                  @ | 
| 9 | 
            -
                  @ | 
| 8 | 
            +
                  @api_key = options[:api_key]
         | 
| 9 | 
            +
                  @api_secret = options[:api_secret]
         | 
| 10 | 
            +
                  @access_token = options[:access_token]
         | 
| 11 | 
            +
                  @access_token_secret = options[:access_token_secret]
         | 
| 10 12 | 
             
                  @filter = options[:filter]
         | 
| 11 13 | 
             
                  @fetch_errors = 0
         | 
| 12 14 | 
             
                end
         | 
| @@ -23,8 +25,13 @@ module TwitterToCsv | |
| 23 25 | 
             
                    EventMachine::run do
         | 
| 24 26 | 
             
                      stream = Twitter::JSONStream.connect(
         | 
| 25 27 | 
             
                        :path    => "/1/statuses/#{(filter && filter.length > 0) ? 'filter' : 'sample'}.json#{"?track=#{filter.map {|f| CGI::escape(f) }.join(",")}" if filter && filter.length > 0}",
         | 
| 26 | 
            -
                        : | 
| 27 | 
            -
                        : | 
| 28 | 
            +
                        :ssl     => true,
         | 
| 29 | 
            +
                        :oauth   => {
         | 
| 30 | 
            +
                          :consumer_key    => api_key,
         | 
| 31 | 
            +
                          :consumer_secret => api_secret,
         | 
| 32 | 
            +
                          :access_key      => access_token,
         | 
| 33 | 
            +
                          :access_secret   => access_token_secret
         | 
| 34 | 
            +
                        }
         | 
| 28 35 | 
             
                      )
         | 
| 29 36 |  | 
| 30 37 | 
             
                      stream.each_item do |item|
         | 
| @@ -0,0 +1,57 @@ | |
| 1 | 
            +
            # encoding: utf-8
         | 
| 2 | 
            +
            require 'spec_helper'
         | 
| 3 | 
            +
            require 'time'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            describe TwitterToCsv::BoolWordFieldParser do
         | 
| 6 | 
            +
              describe "#parse" do
         | 
| 7 | 
            +
                it "parses name:string AND string AND string... syntax" do
         | 
| 8 | 
            +
                  TwitterToCsv::BoolWordFieldParser.parse("something:string1 string2 AND string3 AND string4").should == {
         | 
| 9 | 
            +
                      :name => "something",
         | 
| 10 | 
            +
                      :logic => ["string1 string2", :and, ["string3", :and, ["string4"]]]
         | 
| 11 | 
            +
                  }
         | 
| 12 | 
            +
                end
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                it "parses name:string OR string OR string... syntax" do
         | 
| 15 | 
            +
                  TwitterToCsv::BoolWordFieldParser.parse("something:string1 string2 OR string3 OR string4").should == {
         | 
| 16 | 
            +
                      :name => "something",
         | 
| 17 | 
            +
                      :logic => ["string1 string2", :or, ["string3", :or, ["string4"]]]
         | 
| 18 | 
            +
                  }
         | 
| 19 | 
            +
                end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                it "parses parens" do
         | 
| 22 | 
            +
                  TwitterToCsv::BoolWordFieldParser.parse("something_else:string1   STRING2 OR ( string3 AND (string4 OR string5 ))").should == {
         | 
| 23 | 
            +
                      :name => "something_else",
         | 
| 24 | 
            +
                      :logic => ["string1 string2", :or, ["string3", :and, ["string4", :or, ["string5"]]]]
         | 
| 25 | 
            +
                  }
         | 
| 26 | 
            +
                end
         | 
| 27 | 
            +
              end
         | 
| 28 | 
            +
             | 
| 29 | 
            +
              describe "#check" do
         | 
| 30 | 
            +
                it "returns true when an expression matches some text, false when it doesn't" do
         | 
| 31 | 
            +
                  pattern = TwitterToCsv::BoolWordFieldParser.parse("something_else:string1 string2 OR (string3 AND (string4 OR string5))")
         | 
| 32 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2").should be_true
         | 
| 33 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "string2 string1").should be_false
         | 
| 34 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "string1").should be_false
         | 
| 35 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "string2").should be_false
         | 
| 36 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "string3 string4").should be_true
         | 
| 37 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "string4 string3").should be_true
         | 
| 38 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "string5 string3").should be_true
         | 
| 39 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string3 string5 baz").should be_true
         | 
| 40 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string5 baz").should be_false
         | 
| 41 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string3 string4 string5 baz").should be_true
         | 
| 42 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "foo bar string3 string5 baz string4").should be_true
         | 
| 43 | 
            +
                  TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2 string3 string4").should be_true
         | 
| 44 | 
            +
                end
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                it "raises errors when the input is un-evaluable" do
         | 
| 47 | 
            +
                  pattern = TwitterToCsv::BoolWordFieldParser.parse("something_else:string1 string2 OR (string3 AND OR string5))")
         | 
| 48 | 
            +
                  lambda { TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                  pattern = TwitterToCsv::BoolWordFieldParser.parse("hello (")
         | 
| 51 | 
            +
                  lambda { TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                  pattern = TwitterToCsv::BoolWordFieldParser.parse("hello ()")
         | 
| 54 | 
            +
                  lambda { TwitterToCsv::BoolWordFieldParser.check(pattern, "string1 string2") }.should raise_error(TwitterToCsv::InvalidLogicError)
         | 
| 55 | 
            +
                end
         | 
| 56 | 
            +
              end
         | 
| 57 | 
            +
            end
         | 
    
        data/spec/csv_builder_spec.rb
    CHANGED
    
    | @@ -4,35 +4,73 @@ require 'time' | |
| 4 4 |  | 
| 5 5 | 
             
            describe TwitterToCsv::CsvBuilder do
         | 
| 6 6 | 
             
              describe "#handle_status" do
         | 
| 7 | 
            -
                describe " | 
| 8 | 
            -
                   | 
| 9 | 
            -
                     | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 7 | 
            +
                describe "English language detection" do
         | 
| 8 | 
            +
                  describe "with the :uld strategy" do
         | 
| 9 | 
            +
                    it "uses the UnsupervisedLanguageDetection library to skip non-English tweets" do
         | 
| 10 | 
            +
                      string_io = StringIO.new
         | 
| 11 | 
            +
                      csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text])
         | 
| 12 | 
            +
                      csv_builder.handle_status('text' => "This is English")
         | 
| 13 | 
            +
                      csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
         | 
| 14 | 
            +
                      csv_builder.handle_status('text' => "This is still English")
         | 
| 15 | 
            +
                      string_io.rewind
         | 
| 16 | 
            +
                      string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
         | 
| 17 | 
            +
                    end
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                    it "makes a new 'uld' variable available on the tweet" do
         | 
| 20 | 
            +
                      string_io = StringIO.new
         | 
| 21 | 
            +
                      csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :uld, :csv => string_io, :fields => %w[text uld])
         | 
| 22 | 
            +
                      csv_builder.handle_status('text' => "This is English")
         | 
| 23 | 
            +
                      csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.")
         | 
| 24 | 
            +
                      csv_builder.handle_status('text' => "This is still English")
         | 
| 25 | 
            +
                      string_io.rewind
         | 
| 26 | 
            +
                      string_io.read.should == "\"This is English\",\"true\"\n\"This is still English\",\"true\"\n"
         | 
| 27 | 
            +
                    end
         | 
| 16 28 | 
             
                  end
         | 
| 17 29 |  | 
| 18 | 
            -
                   | 
| 19 | 
            -
                     | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
                     | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
                     | 
| 32 | 
            -
             | 
| 30 | 
            +
                  describe "with the :twitter strategy" do
         | 
| 31 | 
            +
                    it "uses Twitter's lang field to skip non-English tweets" do
         | 
| 32 | 
            +
                      string_io = StringIO.new
         | 
| 33 | 
            +
                      csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :twitter, :csv => string_io, :fields => %w[text])
         | 
| 34 | 
            +
                      csv_builder.handle_status('text' => "This is English", 'lang' => 'en')
         | 
| 35 | 
            +
                      csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'fr')
         | 
| 36 | 
            +
                      csv_builder.handle_status('text' => "This is still English", 'lang' => 'en')
         | 
| 37 | 
            +
                      string_io.rewind
         | 
| 38 | 
            +
                      string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
         | 
| 39 | 
            +
                    end
         | 
| 40 | 
            +
                  end
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                  describe "with the :both strategy" do
         | 
| 43 | 
            +
                    it "returns tweets unless both ULD and Twitter agree that the Tweet is non-English" do
         | 
| 44 | 
            +
                      string_io = StringIO.new
         | 
| 45 | 
            +
                      csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => :both, :csv => string_io, :fields => %w[text])
         | 
| 46 | 
            +
                      csv_builder.handle_status('text' => "This is English", 'lang' => 'en') # agree
         | 
| 47 | 
            +
                      csv_builder.handle_status('text' => "This is English 2", 'lang' => 'fr') # disagree
         | 
| 48 | 
            +
                      csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'lang' => 'en') # disagree
         | 
| 49 | 
            +
                      csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles 2.", 'lang' => 'fr') # agree
         | 
| 50 | 
            +
                      csv_builder.handle_status('text' => "This is still English", 'lang' => 'en') # agree
         | 
| 51 | 
            +
                      string_io.rewind
         | 
| 52 | 
            +
                      string_io.read.should == "\"This is English\"\n\"This is English 2\"\n\"Esta frase se encuentra en Ingles.\"\n\"This is still English\"\n"
         | 
| 53 | 
            +
                    end
         | 
| 33 54 | 
             
                  end
         | 
| 34 55 | 
             
                end
         | 
| 35 56 |  | 
| 57 | 
            +
                it "honors start_time and end_time" do
         | 
| 58 | 
            +
                  string_io = StringIO.new
         | 
| 59 | 
            +
                  csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[text],
         | 
| 60 | 
            +
                                                             :start_time => Time.parse("Mon Mar 07 07:42:22 +0000 2011"),
         | 
| 61 | 
            +
                                                             :end_time   => Time.parse("Mon Mar 08 02:00:00 +0000 2011"))
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                  # Order shouldn't matter
         | 
| 64 | 
            +
                  csv_builder.handle_status('text' => "1", 'created_at' => 'Mon Mar 07 07:41:22 +0000 2011')
         | 
| 65 | 
            +
                  csv_builder.handle_status('text' => "6", 'created_at' => 'Mon Mar 08 02:01:00 +0000 2011')
         | 
| 66 | 
            +
                  csv_builder.handle_status('text' => "2", 'created_at' => 'Mon Mar 07 07:42:22 +0000 2011')
         | 
| 67 | 
            +
                  csv_builder.handle_status('text' => "4", 'created_at' => 'Mon Mar 08 01:41:22 +0000 2011')
         | 
| 68 | 
            +
                  csv_builder.handle_status('text' => "5", 'created_at' => 'Mon Mar 08 02:00:00 +0000 2011')
         | 
| 69 | 
            +
                  csv_builder.handle_status('text' => "3", 'created_at' => 'Mon Mar 07 10:00:00 +0000 2011')
         | 
| 70 | 
            +
                  string_io.rewind
         | 
| 71 | 
            +
                  string_io.read.should == "\"2\"\n\"4\"\n\"3\"\n"
         | 
| 72 | 
            +
                end
         | 
| 73 | 
            +
             | 
| 36 74 | 
             
                describe "log_csv_header" do
         | 
| 37 75 | 
             
                  it "outputs the fields as header labels" do
         | 
| 38 76 | 
             
                    string_io = StringIO.new
         | 
| @@ -169,17 +207,42 @@ describe TwitterToCsv::CsvBuilder do | |
| 169 207 | 
             
                                             "\"hello2\",\"2\"\n"
         | 
| 170 208 | 
             
                  end
         | 
| 171 209 |  | 
| 172 | 
            -
                  it "can  | 
| 210 | 
            +
                  it "can extract boolean word fields" do
         | 
| 211 | 
            +
                    string_io = StringIO.new
         | 
| 212 | 
            +
                    patterns = [
         | 
| 213 | 
            +
                        TwitterToCsv::BoolWordFieldParser.parse("field1:hello AND world"),
         | 
| 214 | 
            +
                        TwitterToCsv::BoolWordFieldParser.parse("field2:hello"),
         | 
| 215 | 
            +
                        TwitterToCsv::BoolWordFieldParser.parse("field3:string OR text"),
         | 
| 216 | 
            +
                        TwitterToCsv::BoolWordFieldParser.parse("field3:hello this")
         | 
| 217 | 
            +
                    ]
         | 
| 218 | 
            +
                    csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :bool_word_fields => patterns)
         | 
| 219 | 
            +
                    csv_builder.handle_status({
         | 
| 220 | 
            +
                        'something' => "hello1",
         | 
| 221 | 
            +
                        'text' => 'hello this is a string'
         | 
| 222 | 
            +
             | 
| 223 | 
            +
                    })
         | 
| 224 | 
            +
                    csv_builder.handle_status({
         | 
| 225 | 
            +
                        'something' => "hello2",
         | 
| 226 | 
            +
                        'text' => 'hello world this is some text'
         | 
| 227 | 
            +
                    })
         | 
| 228 | 
            +
                    string_io.rewind
         | 
| 229 | 
            +
                    string_io.read.should == "\"hello1\",\"false\",\"true\",\"true\",\"true\"\n" +
         | 
| 230 | 
            +
                                             "\"hello2\",\"true\",\"true\",\"true\",\"false\"\n"
         | 
| 231 | 
            +
                  end
         | 
| 232 | 
            +
             | 
| 233 | 
            +
                  it "can return date fields and convert them to UTC" do
         | 
| 173 234 | 
             
                    string_io = StringIO.new
         | 
| 174 235 | 
             
                    csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :date_fields => %w[created_at])
         | 
| 175 236 | 
             
                    csv_builder.handle_status({
         | 
| 176 237 | 
             
                        'something' => "hello1",
         | 
| 177 238 | 
             
                        'text' => 'i love cheese',
         | 
| 178 239 | 
             
                        'created_at' => "2012-06-29 13:12:09 -0700"
         | 
| 179 | 
            -
             | 
| 180 240 | 
             
                    })
         | 
| 181 241 | 
             
                    string_io.rewind
         | 
| 182 | 
            -
                     | 
| 242 | 
            +
                    time = Time.parse("2012-06-29 13:12:09 -0700").utc
         | 
| 243 | 
            +
                    string_io.read.should == '"' + ["hello1", time.strftime("%w"), time.strftime("%-d"),
         | 
| 244 | 
            +
                                                    time.strftime("%-m"), time.strftime("%Y"), time.strftime("%-H"),
         | 
| 245 | 
            +
                                                    time.strftime("%M"), time.strftime("%S")].join('","') + "\"\n"
         | 
| 183 246 | 
             
                  end
         | 
| 184 247 |  | 
| 185 248 | 
             
                  it "can return a normalized source" do
         | 
| @@ -349,24 +412,33 @@ describe TwitterToCsv::CsvBuilder do | |
| 349 412 | 
             
              end
         | 
| 350 413 |  | 
| 351 414 | 
             
              describe "#extract_fields" do
         | 
| 352 | 
            -
                it "finds all the paths through a  | 
| 415 | 
            +
                it "finds all the paths through a structure" do
         | 
| 353 416 | 
             
                  obj = {
         | 
| 354 417 | 
             
                      :a => :b,
         | 
| 355 418 | 
             
                      :b => "c",
         | 
| 356 419 | 
             
                      :d => {
         | 
| 357 420 | 
             
                          :e => :f,
         | 
| 358 | 
            -
                          :g =>  | 
| 421 | 
            +
                          :g => [
         | 
| 422 | 
            +
                            {
         | 
| 359 423 | 
             
                              :h => :i,
         | 
| 360 424 | 
             
                              :j => {
         | 
| 361 | 
            -
             | 
| 425 | 
            +
                                :k => "l"
         | 
| 362 426 | 
             
                              }
         | 
| 363 | 
            -
             | 
| 427 | 
            +
                            },
         | 
| 428 | 
            +
                            {
         | 
| 429 | 
            +
                              :h => :i,
         | 
| 430 | 
            +
                              :j => {
         | 
| 431 | 
            +
                                :m => "n"
         | 
| 432 | 
            +
                              },
         | 
| 433 | 
            +
                              :hi => 2
         | 
| 434 | 
            +
                            }
         | 
| 435 | 
            +
                          ],
         | 
| 364 436 | 
             
                          :m => "n"
         | 
| 365 437 | 
             
                      }
         | 
| 366 438 | 
             
                  }
         | 
| 367 439 | 
             
                  fields = { "a" => 1 }
         | 
| 368 440 | 
             
                  TwitterToCsv::CsvBuilder.new.extract_fields(obj, fields)
         | 
| 369 | 
            -
                  fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g.h" => 1, "d.g.j.k" => 1, "d.m" => 1 }
         | 
| 441 | 
            +
                  fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g[].h" => 1, "d.g[].j.k" => 1, "d.g[].j.m" => 1, "d.g[].hi" => 1, "d.m" => 1 }
         | 
| 370 442 | 
             
                end
         | 
| 371 443 | 
             
              end
         | 
| 372 444 | 
             
            end
         |