twitter_to_csv 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .idea
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use 1.9.2@twitter_to_csv --create
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in twitter_to_csv.gemspec
4
+ gemspec
5
+
data/README.markdown ADDED
@@ -0,0 +1,9 @@
1
+ # Twitter To CSV
2
+
3
+ ## Usage
4
+
5
+ twitter_to_csv --username <your twitter username> --password <your twitter password> \
6
+ --json hi.json --filter zit,zits,pimple,pimples,acne
7
+
8
+
9
+ Verbose output and actual CSV export are coming soon :)
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,60 @@
#!/usr/bin/env ruby
# Command-line entry point for twitter_to_csv.
#
# Parses the command-line switches into an options hash, verifies that Twitter
# credentials were given, and hands the options to TwitterToCsv::CsvBuilder.
require 'rubygems'
require 'open-uri'
require 'optparse'
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))

# CSV output defaults to standard output unless --csv names a file.
options = { :csv => STDOUT }

parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} [options]"
  opts.separator ""
  opts.separator "Specific options:"

  opts.on("-u", "--username USERNAME", "Twitter username") do |username|
    options[:username] = username
  end

  opts.on("-p", "--password PASSWORD", "Twitter password") do |password|
    options[:password] = password
  end

  # The bracketed arguments below are optional, so OptionParser yields nil when
  # a switch is given without a value. Each handler ignores that case instead
  # of crashing on File.open(nil) / nil.split.
  opts.on("-c", "--csv [FILE]", "The CSV file to write to") do |csv|
    options[:csv] = File.open(csv, 'a') if csv
  end

  opts.on("-j", "--json [FILE]", "The file to log all Twitter JSON to") do |json|
    options[:json] = File.open(json, 'a') if json
  end

  opts.on("-f", "--filter [KEYWORDS]", "Keywords to ask Twitter to filter on") do |filter|
    # Comma-separated list; surrounding whitespace and empty entries are dropped.
    options[:filter] = filter.to_s.strip.split(/\s*,\s*/).reject { |keyword| keyword.empty? }
  end

  opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
    options[:require_english] = e
  end

  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
    options[:verbose] = v
  end

  opts.on_tail("-h", "--help", "Show this message") do
    STDERR.puts opts
    exit
  end

  opts.on_tail("--version", "Show version") do
    STDERR.puts "twitter_to_csv version #{TwitterToCsv::VERSION}"
    exit
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  # Unknown switches or missing mandatory arguments: show usage, not a backtrace.
  STDERR.puts "Error: #{e.message}\n\n"
  STDERR.puts parser
  exit 1
end

unless options[:username] && options[:password]
  STDERR.puts "Error: Twitter username and password are required fields.\n\n"
  STDERR.puts parser
  exit 1
end

TwitterToCsv::CsvBuilder.new(options).run
@@ -0,0 +1,13 @@
1
+ require 'rubygems'
2
+ require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "version"))
3
+ require 'fastercsv'
4
+ require 'pp'
5
+ require 'json'
6
+ require 'twitter/json_stream'
7
+ require 'em-http-request'
8
+ require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "twitter_watcher"))
9
+ require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "csv_builder"))
10
+ require 'unsupervised-language-detection'
11
+
12
# Root namespace of the gem. It is left empty in this file; the files required
# above (version, twitter_watcher, csv_builder) reopen it to add VERSION,
# TwitterWatcher and CsvBuilder.
+ module TwitterToCsv
13
+ end
@@ -0,0 +1,53 @@
module TwitterToCsv
  # Receives statuses from a TwitterWatcher, optionally filters out
  # non-English ones, and appends the survivors to the JSON log.
  #
  # Recognised option keys (all read through #options):
  #   :require_english - when truthy, only statuses passing #is_english? are kept
  #   :json            - IO-like object (puts/flush) receiving one JSON document
  #                      per status, each followed by a separator line
  #   :verbose         - when truthy, progress messages are written to STDERR
  class CsvBuilder
    # Seconds to wait before reconnecting after an error in #run.
    RETRY_DELAY = 120

    # Line written after every JSON document. The spelling is kept exactly as
    # is, since existing log files and any consumer splitting on it use it.
    JSON_SEPARATOR = "------SEPERATOR------"

    attr_accessor :options

    def initialize(options = {})
      @options = options
    end

    # Heuristically decides whether a status hash is an English tweet.
    #
    # Returns false for delete notices, messages without text, text with any
    # non-ASCII character, users whose 'lang' is not "en" (or who are missing),
    # and text the language detector rejects. Returns true otherwise. With
    # :verbose set, the reason for each rejection is written to STDERR.
    def is_english?(status)
      return skip("Skipping Tweet with delete.") if status.has_key?('delete')

      text = status['text']
      # Without this guard a nil text would reach the language detector.
      return skip("Skipping message without text.") unless text.is_a?(String)

      if text =~ /[^[:ascii:]]/
        return skip("Skipping \"#{text}\" due to non-ascii text.")
      end

      # A status without a 'user' hash has no declared language: treat it as
      # non-English rather than raising NoMethodError on nil.
      user = status['user']
      lang = user.is_a?(Hash) ? user['lang'] : nil
      unless lang == "en"
        return skip("Skipping \"#{text}\" due to lang of #{lang}.")
      end

      unless UnsupervisedLanguageDetection.is_english_tweet?(text)
        return skip("Skipping \"#{text}\" due to UnsupervisedLanguageDetection guessing non-English")
      end

      true
    end

    # Streams statuses until the watcher stops. Any StandardError is reported
    # on STDERR, followed by a RETRY_DELAY pause and a fresh connection; the
    # retry is deliberately unbounded so a long-running capture survives
    # transient failures.
    def run
      TwitterWatcher.new(options).run { |status| record(status) }
    rescue StandardError => e
      # backtrace can be nil for exceptions that were never raised normally.
      STDERR.puts "\nException #{e.message}:\n#{Array(e.backtrace).join("\n")}\n\n"
      STDERR.puts "Waiting for a couple of minutes..."
      sleep RETRY_DELAY
      retry
    end

    private

    # Logs one status to the JSON sink (if configured), honouring the
    # :require_english filter.
    def record(status)
      return if options[:require_english] && !is_english?(status)

      json_log = options[:json]
      if json_log
        json_log.puts JSON.dump(status)
        json_log.puts JSON_SEPARATOR
        json_log.flush # keep the log current while the stream stays open
      end
      STDERR.puts "Logging: #{status['text']}" if options[:verbose]
    end

    # Prints +message+ when verbose and returns false, so callers can write
    # `return skip(...)`.
    def skip(message)
      STDERR.puts message if options[:verbose]
      false
    end
  end
end
@@ -0,0 +1,49 @@
require 'cgi'

module TwitterToCsv
  # Opens a Twitter streaming connection inside an EventMachine loop and
  # passes each received tweet to a caller-supplied block.
  #
  # Options read by #initialize: :username, :password, and :filter (a list of
  # track keywords; when nil or empty the sample stream is used instead).
  class TwitterWatcher
    attr_accessor :username, :password, :filter, :fetch_errors

    def initialize(options)
      @username = options[:username]
      @password = options[:password]
      @filter = options[:filter]
      @fetch_errors = 0
    end

    # Prints "<str>..." to STDERR, runs the given block, then prints "done.".
    def progress(str)
      STDERR.print "#{str}..."
      STDERR.flush
      yield
      STDERR.puts "done."
    end

    # Starts the EventMachine loop and streams statuses to +block+ until the
    # stream gives up reconnecting, at which point the loop is stopped.
    def run(&block)
      EventMachine::run do
        stream = Twitter::JSONStream.connect(
          :path => stream_path,
          :auth => "#{username}:#{password}",
          :ssl => true
        )

        stream.each_item do |item|
          handle_status JSON.parse(item), block
        end

        stream.on_error do |message|
          STDERR.puts " --> Twitter error: #{message} <--"
        end

        stream.on_max_reconnects do |_timeout, _retries|
          STDERR.puts " --> Oops, tried too many times! <--"
          EventMachine::stop_event_loop
        end
      end
    end

    # Normalises one parsed stream message and passes it to +block+.
    #
    # Nil messages, delete notices and messages without a 'text' string are
    # ignored. The last case covers non-tweet messages that the stream may
    # interleave with tweets; previously they raised NoMethodError on
    # nil.gsub. For tweets, &lt;/&gt; are unescaped and tabs/newlines become
    # spaces; status['text'] is updated in place.
    def handle_status(status, block)
      return unless status
      return if status.has_key?('delete')

      text = status['text']
      return unless text.is_a?(String)

      status['text'] = text.gsub(/&lt;/, "<").gsub(/&gt;/, ">").gsub(/[\t\n\r]/, ' ')
      block.call(status)
    end

    private

    # Request path for the stream: the filter endpoint with a track list when
    # keywords are configured, otherwise the sample endpoint.
    def stream_path
      keywords = Array(filter)
      return "/1/statuses/sample.json" if keywords.empty?

      # Percent-encode every keyword so spaces and symbols cannot corrupt the
      # request line. CGI.escape encodes a space as "+", which is rewritten to
      # %20 (a literal "+" has already become %2B by then).
      track = keywords.map { |keyword| CGI.escape(keyword.to_s).gsub('+', '%20') }.join(",")
      "/1/statuses/filter.json?track=#{track}"
    end
  end
end
@@ -0,0 +1,3 @@
module TwitterToCsv
  # Gem version string; frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,26 @@
# -*- encoding: utf-8 -*-
# Gem specification for twitter_to_csv. The lib directory is appended to the
# load path first so the version constant can be required from it.
$LOAD_PATH.push File.expand_path("../lib", __FILE__)
require "twitter_to_csv/version"

Gem::Specification.new do |spec|
  # Identity and authorship.
  spec.name        = "twitter_to_csv"
  spec.version     = TwitterToCsv::VERSION
  spec.authors     = ["Andrew Cantino"]
  spec.email       = ["andrew@iterationlabs.com"]
  spec.homepage    = ""
  spec.summary     = "Dump the Twitter streaming API to a CSV or JSON file"
  spec.description = ""

  spec.rubyforge_project = "twitter_to_csv"

  # Packaged files are whatever git currently tracks; executables are the
  # basenames of the tracked files under bin/.
  tracked     = `git ls-files`.split("\n")
  test_paths  = `git ls-files -- {test,spec,features}/*`.split("\n")
  bin_scripts = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }

  spec.files         = tracked
  spec.test_files    = test_paths
  spec.executables   = bin_scripts
  spec.require_paths = ["lib"]

  # spec.add_development_dependency "rspec"
  ['fastercsv', 'twitter-stream', 'em-http-request', 'unsupervised-language-detection'].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twitter_to_csv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Andrew Cantino
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-02-17 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: fastercsv
16
+ requirement: &83775030 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *83775030
25
+ - !ruby/object:Gem::Dependency
26
+ name: twitter-stream
27
+ requirement: &83774820 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *83774820
36
+ - !ruby/object:Gem::Dependency
37
+ name: em-http-request
38
+ requirement: &83774610 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *83774610
47
+ - !ruby/object:Gem::Dependency
48
+ name: unsupervised-language-detection
49
+ requirement: &83774400 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *83774400
58
+ description: ''
59
+ email:
60
+ - andrew@iterationlabs.com
61
+ executables:
62
+ - twitter_to_csv
63
+ extensions: []
64
+ extra_rdoc_files: []
65
+ files:
66
+ - .gitignore
67
+ - .rvmrc
68
+ - Gemfile
69
+ - README.markdown
70
+ - Rakefile
71
+ - bin/twitter_to_csv
72
+ - lib/twitter_to_csv.rb
73
+ - lib/twitter_to_csv/csv_builder.rb
74
+ - lib/twitter_to_csv/twitter_watcher.rb
75
+ - lib/twitter_to_csv/version.rb
76
+ - twitter_to_csv.gemspec
77
+ homepage: ''
78
+ licenses: []
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ none: false
91
+ requirements:
92
+ - - ! '>='
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ requirements: []
96
+ rubyforge_project: twitter_to_csv
97
+ rubygems_version: 1.8.16
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: Dump the Twitter streaming API to a CSV or JSON file
101
+ test_files: []