twitter_to_csv 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/.rvmrc +1 -0
- data/Gemfile +5 -0
- data/README.markdown +9 -0
- data/Rakefile +1 -0
- data/bin/twitter_to_csv +60 -0
- data/lib/twitter_to_csv.rb +13 -0
- data/lib/twitter_to_csv/csv_builder.rb +53 -0
- data/lib/twitter_to_csv/twitter_watcher.rb +49 -0
- data/lib/twitter_to_csv/version.rb +3 -0
- data/twitter_to_csv.gemspec +26 -0
- metadata +101 -0
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use 1.9.2@twitter_to_csv --create
|
data/Gemfile
ADDED
data/README.markdown
ADDED
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/twitter_to_csv
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'optparse'
|
5
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
|
6
|
+
|
7
|
+
# Command-line entry point: parses the options below, checks that Twitter
# credentials were supplied, and hands everything to TwitterToCsv::CsvBuilder.
#
# options[:csv] defaults to STDOUT; --csv and --json replace the defaults with
# file handles opened in append mode.
options = { :csv => STDOUT }

parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} [options]"
  opts.separator ""
  opts.separator "Specific options:"

  opts.on("-u", "--username USERNAME", "Twitter username") do |username|
    options[:username] = username
  end

  opts.on("-p", "--password PASSWORD", "Twitter password") do |password|
    options[:password] = password
  end

  # FILE and KEYWORDS are declared as required arguments: the handlers below
  # cannot work without a value (File.open(nil) and nil.split both raise), so
  # an omitted value is reported by OptionParser instead of crashing later.
  opts.on("-c", "--csv FILE", "The CSV file to write to") do |csv|
    options[:csv] = File.open(csv, 'a')
  end

  opts.on("-j", "--json FILE", "The file to log all Twitter JSON to") do |json|
    options[:json] = File.open(json, 'a')
  end

  opts.on("-f", "--filter KEYWORDS", "Keywords to ask Twitter to filter on") do |filter|
    # Comma-separated list; whitespace around the commas is ignored.
    options[:filter] = filter.split(/\s*,\s*/)
  end

  opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
    options[:require_english] = e
  end

  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
    options[:verbose] = v
  end

  opts.on_tail("-h", "--help", "Show this message") do
    STDERR.puts opts
    exit
  end

  opts.on_tail("--version", "Show version") do
    STDERR.puts "twitter_to_csv version #{TwitterToCsv::VERSION}"
    exit
  end
end

# Report bad or incomplete switches the same way as the credential check
# below (message, usage, exit status 1) rather than with a stack trace.
begin
  parser.parse!
rescue OptionParser::ParseError => e
  STDERR.puts "Error: #{e.message}\n\n"
  STDERR.puts parser
  exit 1
end

unless options[:username] && options[:password]
  STDERR.puts "Error: Twitter username and password are required fields.\n\n"
  STDERR.puts parser
  exit 1
end

TwitterToCsv::CsvBuilder.new(options).run
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "version"))
|
3
|
+
require 'fastercsv'
|
4
|
+
require 'pp'
|
5
|
+
require 'json'
|
6
|
+
require 'twitter/json_stream'
|
7
|
+
require 'em-http-request'
|
8
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "twitter_watcher"))
|
9
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "csv_builder"))
|
10
|
+
require 'unsupervised-language-detection'
|
11
|
+
|
12
|
+
# Top-level namespace for the twitter_to_csv gem. It is intentionally empty
# here; CsvBuilder, TwitterWatcher and VERSION are defined under it by the
# files required above.
module TwitterToCsv
|
13
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module TwitterToCsv
  # Drives a TwitterWatcher and records each accepted status.
  #
  # Recognised options (all read from the hash passed to #initialize):
  #   :require_english - when truthy, statuses failing #is_english? are dropped
  #   :json            - an IO-like object; accepted statuses are appended as JSON
  #   :verbose         - when truthy, progress/skip messages go to STDERR
  # The remaining options are passed through to TwitterWatcher.
  #
  # NOTE(review): nothing in this class writes to options[:csv] yet.
  class CsvBuilder
    attr_accessor :options

    # @param options [Hash] see the class comment for the keys that are read
    def initialize(options = {})
      @options = options
    end

    # Heuristically decides whether a status is an English tweet.
    #
    # A status is rejected if it is a delete notice, if its text contains any
    # non-ASCII character, if the author's declared language is not "en", or
    # if UnsupervisedLanguageDetection does not classify the text as English.
    # The reason is printed to STDERR when options[:verbose] is set.
    #
    # @param status [Hash] a parsed status with string keys
    # @return [Boolean]
    def is_english?(status)
      if status.has_key?('delete')
        STDERR.puts "Skipping Tweet with delete." if options[:verbose]
        return false
      end

      if status['text'] =~ /[^[:ascii:]]/
        STDERR.puts "Skipping \"#{status['text']}\" due to non-ascii text." if options[:verbose]
        return false
      end

      # Not every streamed message carries a user hash; treat a missing user
      # as "language unknown" (rejected) instead of raising NoMethodError.
      user = status['user']
      lang = user && user['lang']
      unless lang == "en"
        STDERR.puts "Skipping \"#{status['text']}\" due to lang of #{lang}." if options[:verbose]
        return false
      end

      unless UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
        STDERR.puts "Skipping \"#{status['text']}\" due to UnsupervisedLanguageDetection guessing non-English" if options[:verbose]
        return false
      end

      true
    end

    # Streams statuses until the watcher stops, logging each accepted one.
    #
    # Any StandardError raised while streaming is printed with its backtrace,
    # after which the method sleeps for 120 seconds and starts over. The retry
    # is deliberately unbounded so that the process keeps collecting.
    def run
      begin
        TwitterWatcher.new(options).run do |status|
          # Same filter as before: everything passes unless English is
          # required and the status fails the check.
          next if options[:require_english] && !is_english?(status)

          if options[:json]
            options[:json].puts JSON.dump(status) #JSON.pretty_generate(status)
            options[:json].puts "------SEPERATOR------"
            # Flush per status so the log stays current if the process dies.
            options[:json].flush
          end
          STDERR.puts "Logging: #{status['text']}" if options[:verbose]
        end
      rescue StandardError => e
        STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
        STDERR.puts "Waiting for a couple of minutes..."
        sleep 120
        retry
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module TwitterToCsv
  # Opens a Twitter::JSONStream connection inside an EventMachine loop and
  # passes every parsed, cleaned-up status to a caller-supplied block.
  class TwitterWatcher
    attr_accessor :username, :password, :filter, :fetch_errors

    # @param options [Hash] reads :username, :password and :filter
    #   (:filter is expected to be an array of keywords, or nil)
    def initialize(options)
      @username = options[:username]
      @password = options[:password]
      @filter = options[:filter]
      @fetch_errors = 0
    end

    # Prints "<str>..." to STDERR, runs the given block, then prints "done.".
    def progress(str)
      STDERR.print "#{str}..."
      STDERR.flush
      yield
      STDERR.puts "done."
    end

    # Starts the event loop and streams until the maximum number of
    # reconnects is reached, at which point the loop is stopped.
    #
    # @yieldparam status [Hash] each parsed item that is not a delete notice
    def run(&block)
      EventMachine::run do
        stream = Twitter::JSONStream.connect(
          :path => stream_path,
          :auth => "#{username}:#{password}",
          :ssl  => true
        )

        stream.each_item do |item|
          handle_status JSON.parse(item), block
        end

        stream.on_error do |message|
          STDERR.puts " --> Twitter error: #{message} <--"
        end

        stream.on_max_reconnects do |timeout, retries|
          STDERR.puts " --> Oops, tried too many times! <--"
          EventMachine::stop_event_loop
        end
      end
    end

    # Normalises one parsed stream item and forwards it to +block+.
    #
    # nil items and delete notices are dropped. For items that carry text,
    # the &lt; and &gt; entities are decoded back to angle brackets and tabs
    # and newlines are replaced by spaces, so the text stays on one line.
    # Items without a 'text' key are forwarded unchanged rather than raising.
    #
    # NOTE(review): &amp; is left encoded, as before - confirm whether
    # downstream consumers expect it decoded as well.
    #
    # @param status [Hash, nil] parsed JSON item with string keys
    # @param block [#call] receiver for the cleaned status
    def handle_status(status, block)
      return unless status
      return if status.has_key?('delete')

      text = status['text']
      if text
        status['text'] = text.gsub(/&lt;/, "<").gsub(/&gt;/, ">").gsub(/[\t\n\r]/, ' ')
      end

      block.call(status)
    end

    private

    # Request path: the filter endpoint with a track list when keywords were
    # given, otherwise the sample endpoint.
    # NOTE(review): keywords are interpolated as-is (no URL escaping), which
    # matches the previous behaviour; callers must pass URL-safe keywords.
    def stream_path
      if filter && filter.length > 0
        "/1/statuses/filter.json?track=#{filter.join(",")}"
      else
        "/1/statuses/sample.json"
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "twitter_to_csv/version"
|
4
|
+
|
5
|
+
# Packaging definition for the twitter_to_csv gem.
# NOTE(review): homepage and description are empty strings; fill them in
# before publishing.
Gem::Specification.new do |spec|
  spec.name        = "twitter_to_csv"
  spec.version     = TwitterToCsv::VERSION
  spec.authors     = ["Andrew Cantino"]
  spec.email       = ["andrew@iterationlabs.com"]
  spec.homepage    = ""
  spec.summary     = %q{Dump the Twitter streaming API to a CSV or JSON file}
  spec.description = %q{}

  spec.rubyforge_project = "twitter_to_csv"

  # File lists are taken from what git tracks, so the gem must be built from
  # inside a git checkout.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # spec.add_development_dependency "rspec"
  %w[fastercsv twitter-stream em-http-request unsupervised-language-detection].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end
end
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twitter_to_csv
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Andrew Cantino
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-17 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: fastercsv
|
16
|
+
requirement: &83775030 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *83775030
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: twitter-stream
|
27
|
+
requirement: &83774820 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *83774820
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: em-http-request
|
38
|
+
requirement: &83774610 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *83774610
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: unsupervised-language-detection
|
49
|
+
requirement: &83774400 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *83774400
|
58
|
+
description: ''
|
59
|
+
email:
|
60
|
+
- andrew@iterationlabs.com
|
61
|
+
executables:
|
62
|
+
- twitter_to_csv
|
63
|
+
extensions: []
|
64
|
+
extra_rdoc_files: []
|
65
|
+
files:
|
66
|
+
- .gitignore
|
67
|
+
- .rvmrc
|
68
|
+
- Gemfile
|
69
|
+
- README.markdown
|
70
|
+
- Rakefile
|
71
|
+
- bin/twitter_to_csv
|
72
|
+
- lib/twitter_to_csv.rb
|
73
|
+
- lib/twitter_to_csv/csv_builder.rb
|
74
|
+
- lib/twitter_to_csv/twitter_watcher.rb
|
75
|
+
- lib/twitter_to_csv/version.rb
|
76
|
+
- twitter_to_csv.gemspec
|
77
|
+
homepage: ''
|
78
|
+
licenses: []
|
79
|
+
post_install_message:
|
80
|
+
rdoc_options: []
|
81
|
+
require_paths:
|
82
|
+
- lib
|
83
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ! '>='
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
+
none: false
|
91
|
+
requirements:
|
92
|
+
- - ! '>='
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
requirements: []
|
96
|
+
rubyforge_project: twitter_to_csv
|
97
|
+
rubygems_version: 1.8.16
|
98
|
+
signing_key:
|
99
|
+
specification_version: 3
|
100
|
+
summary: Dump the Twitter streaming API to a CSV or JSON file
|
101
|
+
test_files: []
|