twitter_to_csv 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .idea
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use 1.9.2@twitter_to_csv --create
data/Gemfile ADDED
@@ -0,0 +1,5 @@
# Fetch gems over HTTPS so the index and downloads cannot be altered in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in twitter_to_csv.gemspec
gemspec

data/README.markdown ADDED
@@ -0,0 +1,9 @@
1
+ # Twitter To CSV
2
+
3
+ ## Usage
4
+
5
+ twitter_to_csv --username <your twitter username> --password <your twitter password> \
6
+ --json hi.json --filter zit,zits,pimple,pimples,acne
7
+
8
+
9
+ Verbosity and actually outputting to a CSV coming soon :)
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,60 @@
#!/usr/bin/env ruby
# Command-line entry point for twitter_to_csv.
#
# Parses the switches declared below into an options hash, insists on a
# Twitter username and password, then hands the hash to
# TwitterToCsv::CsvBuilder#run.
require 'rubygems'
require 'open-uri'
require 'optparse'
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))

# The FILE/KEYWORDS arguments are declared optional ("[FILE]"), so OptionParser
# passes nil when a switch is given without a value. Report that as an ordinary
# parse error instead of letting File.open(nil) or nil.split raise.
require_value = lambda do |value|
  raise OptionParser::MissingArgument if value.nil? || value.empty?
  value
end

# :csv defaults to standard output when --csv is not given.
options = { :csv => STDOUT }
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} [options]"
  opts.separator ""
  opts.separator "Specific options:"

  opts.on("-u", "--username USERNAME", "Twitter username") do |username|
    options[:username] = username
  end

  opts.on("-p", "--password PASSWORD", "Twitter password") do |password|
    options[:password] = password
  end

  opts.on("-c", "--csv [FILE]", "The CSV file to write to") do |csv|
    # Opened in append mode so an existing file is extended, not truncated.
    options[:csv] = File.open(require_value.call(csv), 'a')
  end

  opts.on("-j", "--json [FILE]", "The file to log all Twitter JSON to") do |json|
    options[:json] = File.open(require_value.call(json), 'a')
  end

  opts.on("-f", "--filter [KEYWORDS]", "Keywords to ask Twitter to filter on") do |filter|
    # Comma-separated list; surrounding whitespace and empty entries
    # (e.g. "a,,b" or " a , b ") are dropped.
    keywords = require_value.call(filter).strip.split(/\s*,\s*/)
    options[:filter] = keywords.reject { |keyword| keyword.empty? }
  end

  opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
    options[:require_english] = e
  end

  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
    options[:verbose] = v
  end

  opts.on_tail("-h", "--help", "Show this message") do
    STDERR.puts opts
    exit
  end

  opts.on_tail("--version", "Show version") do
    STDERR.puts "twitter_to_csv version #{TwitterToCsv::VERSION}"
    exit
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError, SystemCallError => e
  # Unknown switches, missing values and files that cannot be opened get a
  # short message plus the usage text rather than a stack trace.
  STDERR.puts "Error: #{e.message}\n\n"
  STDERR.puts parser
  exit 1
end

unless options[:username] && options[:password]
  STDERR.puts "Error: Twitter username and password are required fields.\n\n"
  STDERR.puts parser
  exit 1
end

TwitterToCsv::CsvBuilder.new(options).run
@@ -0,0 +1,13 @@
1
+ require 'rubygems'
2
+ require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "version"))
3
+ require 'fastercsv'
4
+ require 'pp'
5
+ require 'json'
6
+ require 'twitter/json_stream'
7
+ require 'em-http-request'
8
+ require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "twitter_watcher"))
9
+ require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "csv_builder"))
10
+ require 'unsupervised-language-detection'
11
+
12
+ module TwitterToCsv
13
+ end
@@ -0,0 +1,53 @@
module TwitterToCsv
  # Receives statuses from TwitterWatcher, optionally drops the ones that do
  # not look English, and appends the rest to the JSON log.
  class CsvBuilder
    attr_accessor :options

    # options - Hash of settings. This class reads :verbose, :require_english
    #           and :json (an object responding to puts and flush); the whole
    #           hash is also passed to TwitterWatcher.new.
    def initialize(options = {})
      @options = options
    end

    # Decides whether a status should be treated as an English tweet.
    #
    # status - Hash parsed from the stream's JSON.
    #
    # Returns false for delete notices, statuses without a String 'text',
    # text containing non-ASCII characters, authors whose 'lang' is missing
    # or not "en", and text UnsupervisedLanguageDetection rejects. Returns
    # true otherwise. Each rejection is reported on STDERR when
    # options[:verbose] is set.
    def is_english?(status)
      if status.has_key?('delete')
        note "Skipping Tweet with delete."
        return false
      end

      text = status['text']
      unless text.is_a?(String)
        note "Skipping status without text."
        return false
      end

      if text =~ /[^[:ascii:]]/
        note "Skipping \"#{text}\" due to non-ascii text."
        return false
      end

      # A status with no 'user' hash used to raise NoMethodError here, which
      # sent #run into its two-minute sleep-and-retry; treat it as non-English.
      user = status['user']
      lang = user.is_a?(Hash) ? user['lang'] : nil
      unless lang == "en"
        note "Skipping \"#{text}\" due to lang of #{lang}."
        return false
      end

      unless UnsupervisedLanguageDetection.is_english_tweet?(text)
        note "Skipping \"#{text}\" due to UnsupervisedLanguageDetection guessing non-English"
        return false
      end

      true
    end

    # Streams statuses from a new TwitterWatcher and logs each one that passes
    # the optional English filter.
    #
    # Any StandardError is printed with its backtrace, after which the method
    # sleeps for 120 seconds and starts over. The retry is intentionally
    # unbounded, matching the original behaviour.
    def run
      TwitterWatcher.new(options).run do |status|
        next if options[:require_english] && !is_english?(status)
        log_status(status)
      end
    rescue StandardError => e
      STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
      STDERR.puts "Waiting for a couple of minutes..."
      sleep 120
      retry
    end

    private

    # Appends one status to options[:json] (when configured) as a JSON line
    # followed by a separator line, flushing so the log stays current.
    def log_status(status)
      json_log = options[:json]
      if json_log
        json_log.puts JSON.dump(status)
        json_log.puts "------SEPERATOR------"
        json_log.flush
      end
      note "Logging: #{status['text']}"
    end

    # Writes a diagnostic line to STDERR only when options[:verbose] is set.
    def note(message)
      STDERR.puts message if options[:verbose]
    end
  end
end
@@ -0,0 +1,49 @@
require 'uri'

module TwitterToCsv
  # Connects to Twitter's streaming API through Twitter::JSONStream inside an
  # EventMachine loop and hands each tweet to a caller-supplied block.
  class TwitterWatcher
    attr_accessor :username, :password, :filter, :fetch_errors

    # options - Hash; reads :username, :password and :filter (an Array of
    #           track keywords, or nil).
    def initialize(options)
      @username = options[:username]
      @password = options[:password]
      @filter = options[:filter]
      @fetch_errors = 0
    end

    # Prints "<str>..." to STDERR, runs the given block, then prints "done.".
    def progress(str)
      STDERR.print "#{str}..."
      STDERR.flush
      yield
      STDERR.puts "done."
    end

    # Builds the request path for the stream.
    #
    # Returns the sample endpoint when no keywords are configured, otherwise
    # the filter endpoint with a comma-separated track list. Each keyword is
    # form-encoded so characters such as '#', '&' or spaces cannot corrupt the
    # query string; plain alphanumeric keywords come out unchanged.
    def stream_path
      keywords = Array(filter)
      return "/1/statuses/sample.json" if keywords.empty?

      track = keywords.map { |keyword| URI.encode_www_form_component(keyword.to_s) }.join(",")
      "/1/statuses/filter.json?track=#{track}"
    end

    # Opens the stream and blocks inside the EventMachine loop, passing every
    # parsed item through #handle_status to the given block. Stream errors are
    # reported on STDERR; the loop is stopped once the stream gives up
    # reconnecting.
    def run(&block)
      EventMachine::run do
        stream = Twitter::JSONStream.connect(
          :path => stream_path,
          :auth => "#{username}:#{password}",
          :ssl  => true
        )

        stream.each_item do |item|
          handle_status JSON.parse(item), block
        end

        stream.on_error do |message|
          STDERR.puts " --> Twitter error: #{message} <--"
        end

        stream.on_max_reconnects do |_timeout, _retries|
          STDERR.puts " --> Oops, tried too many times! <--"
          EventMachine::stop_event_loop
        end
      end
    end

    # Normalises one parsed stream message and forwards it to the callback.
    #
    # status - parsed JSON value, normally a Hash; may be nil.
    # block  - callable invoked with the status Hash.
    #
    # Messages that are not Hashes, delete notices, and messages without a
    # String 'text' are ignored (the latter used to raise NoMethodError on
    # nil). For the rest, 'text' is rewritten in place: "&lt;"/"&gt;" become
    # "<"/">" and tabs/newlines/carriage returns become single spaces.
    def handle_status(status, block)
      return unless status.is_a?(Hash)
      return if status.has_key?('delete')

      text = status['text']
      return unless text.is_a?(String)

      status['text'] = text.gsub(/&lt;/, "<").gsub(/&gt;/, ">").gsub(/[\t\n\r]/, ' ')
      block.call(status)
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module TwitterToCsv
2
+ VERSION = "0.0.1" # Release version of this gem; read by the gemspec and printed by the executable's --version switch.
3
+ end
@@ -0,0 +1,26 @@
# -*- encoding: utf-8 -*-
# Gem specification for twitter_to_csv. The packaged file lists are taken from
# `git ls-files`, so this must be evaluated inside the git checkout.
$LOAD_PATH.push(File.expand_path("../lib", __FILE__))
require "twitter_to_csv/version"

Gem::Specification.new do |spec|
  spec.name        = "twitter_to_csv"
  spec.version     = TwitterToCsv::VERSION
  spec.authors     = ["Andrew Cantino"]
  spec.email       = ["andrew@iterationlabs.com"]
  spec.homepage    = ""
  spec.summary     = "Dump the Twitter streaming API to a CSV or JSON file"
  spec.description = ""

  spec.rubyforge_project = "twitter_to_csv"

  # Everything tracked by git ships in the gem; executables are the basenames
  # of the tracked files under bin/.
  tracked_files    = `git ls-files`.split("\n")
  tracked_tests    = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_binaries = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.executables   = tracked_binaries.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # spec.add_development_dependency "rspec"
  %w[fastercsv twitter-stream em-http-request unsupervised-language-detection].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twitter_to_csv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Andrew Cantino
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-02-17 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: fastercsv
16
+ requirement: &83775030 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *83775030
25
+ - !ruby/object:Gem::Dependency
26
+ name: twitter-stream
27
+ requirement: &83774820 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *83774820
36
+ - !ruby/object:Gem::Dependency
37
+ name: em-http-request
38
+ requirement: &83774610 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *83774610
47
+ - !ruby/object:Gem::Dependency
48
+ name: unsupervised-language-detection
49
+ requirement: &83774400 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *83774400
58
+ description: ''
59
+ email:
60
+ - andrew@iterationlabs.com
61
+ executables:
62
+ - twitter_to_csv
63
+ extensions: []
64
+ extra_rdoc_files: []
65
+ files:
66
+ - .gitignore
67
+ - .rvmrc
68
+ - Gemfile
69
+ - README.markdown
70
+ - Rakefile
71
+ - bin/twitter_to_csv
72
+ - lib/twitter_to_csv.rb
73
+ - lib/twitter_to_csv/csv_builder.rb
74
+ - lib/twitter_to_csv/twitter_watcher.rb
75
+ - lib/twitter_to_csv/version.rb
76
+ - twitter_to_csv.gemspec
77
+ homepage: ''
78
+ licenses: []
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ none: false
91
+ requirements:
92
+ - - ! '>='
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ requirements: []
96
+ rubyforge_project: twitter_to_csv
97
+ rubygems_version: 1.8.16
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: Dump the Twitter streaming API to a CSV or JSON file
101
+ test_files: []