twitter_to_csv 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/.rvmrc +1 -0
- data/Gemfile +5 -0
- data/README.markdown +9 -0
- data/Rakefile +1 -0
- data/bin/twitter_to_csv +60 -0
- data/lib/twitter_to_csv.rb +13 -0
- data/lib/twitter_to_csv/csv_builder.rb +53 -0
- data/lib/twitter_to_csv/twitter_watcher.rb +49 -0
- data/lib/twitter_to_csv/version.rb +3 -0
- data/twitter_to_csv.gemspec +26 -0
- metadata +101 -0
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use 1.9.2@twitter_to_csv --create
|
data/Gemfile
ADDED
data/README.markdown
ADDED
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/twitter_to_csv
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'optparse'
|
5
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
|
6
|
+
|
7
|
+
# Command-line entry point: parses the options below into a Hash and hands it
# to TwitterToCsv::CsvBuilder. CSV output defaults to standard output.
#
# NOTE(review): a password given on the command line is visible to other local
# users through the process list; consider an environment variable or prompt.
options = { :csv => STDOUT }

parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} [options]"
  opts.separator ""
  opts.separator "Specific options:"

  opts.on("-u", "--username USERNAME", "Twitter username") do |username|
    options[:username] = username
  end

  opts.on("-p", "--password PASSWORD", "Twitter password") do |password|
    options[:password] = password
  end

  # FILE and KEYWORDS are mandatory arguments. When they were declared optional
  # ("[FILE]"), passing the bare flag yielded nil and crashed in File.open /
  # String#split instead of producing a usage error.
  opts.on("-c", "--csv FILE", "The CSV file to write to") do |csv|
    options[:csv] = File.open(csv, 'a')
  end

  opts.on("-j", "--json FILE", "The file to log all Twitter JSON to") do |json|
    options[:json] = File.open(json, 'a')
  end

  opts.on("-f", "--filter KEYWORDS", "Keywords to ask Twitter to filter on") do |filter|
    # Comma-separated list; surrounding whitespace and empty entries are dropped.
    options[:filter] = filter.strip.split(/\s*,\s*/).reject { |keyword| keyword.empty? }
  end

  opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
    options[:require_english] = e
  end

  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
    options[:verbose] = v
  end

  opts.on_tail("-h", "--help", "Show this message") do
    STDERR.puts opts
    exit
  end

  opts.on_tail("--version", "Show version") do
    STDERR.puts "twitter_to_csv version #{TwitterToCsv::VERSION}"
    exit
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError, SystemCallError => e
  # Unknown flags, missing arguments and unopenable output files all end up
  # here; report them with the usage text rather than a backtrace.
  STDERR.puts "Error: #{e.message}\n\n"
  STDERR.puts parser
  exit 1
end

unless options[:username] && options[:password]
  STDERR.puts "Error: Twitter username and password are required fields.\n\n"
  STDERR.puts parser
  exit 1
end

TwitterToCsv::CsvBuilder.new(options).run
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "version"))
|
3
|
+
require 'fastercsv'
|
4
|
+
require 'pp'
|
5
|
+
require 'json'
|
6
|
+
require 'twitter/json_stream'
|
7
|
+
require 'em-http-request'
|
8
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "twitter_watcher"))
|
9
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "csv_builder"))
|
10
|
+
require 'unsupervised-language-detection'
|
11
|
+
|
12
|
+
# Root namespace of the gem; the files required above reopen it to add their
# own classes and constants.
module TwitterToCsv; end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module TwitterToCsv
  # Consumes statuses yielded by TwitterWatcher, optionally drops those that do
  # not look English, and appends the survivors to the JSON log.
  #
  # NOTE(review): nothing in this class writes to options[:csv] yet; only the
  # JSON log and verbose STDERR output are produced.
  class CsvBuilder
    # Seconds to wait before reconnecting after any error.
    RETRY_DELAY_SECONDS = 120

    attr_accessor :options

    # @param options [Hash] settings read here: :require_english, :json (an
    #   IO-like object responding to puts/flush) and :verbose; the whole hash
    #   is also forwarded to TwitterWatcher.
    def initialize(options = {})
      @options = options
    end

    # Heuristically decides whether a status is an English tweet.
    #
    # Checks, in order: not a delete notice, has text, text is pure ASCII, the
    # author's profile language is "en", and the language detector agrees.
    # Messages lacking 'text' or 'user' are rejected rather than raising, so a
    # non-tweet stream message cannot be mistaken for a connection failure by
    # #run.
    #
    # @param status [Hash] parsed stream message
    # @return [Boolean]
    def is_english?(status)
      if status.has_key?('delete')
        verbose "Skipping Tweet with delete."
        return false
      end

      text = status['text']
      unless text.is_a?(String)
        verbose "Skipping status without text."
        return false
      end

      if text =~ /[^[:ascii:]]/
        verbose "Skipping \"#{text}\" due to non-ascii text."
        return false
      end

      user = status['user']
      lang = user && user['lang']
      unless lang == "en"
        verbose "Skipping \"#{text}\" due to lang of #{lang}."
        return false
      end

      unless UnsupervisedLanguageDetection.is_english_tweet?(text)
        verbose "Skipping \"#{text}\" due to UnsupervisedLanguageDetection guessing non-English"
        return false
      end

      true
    end

    # Streams statuses until the process is stopped. Any StandardError is
    # reported, followed by a pause and a fresh connection attempt; the retry
    # is deliberately unbounded because this is meant to run unattended.
    def run
      TwitterWatcher.new(options).run do |status|
        next if options[:require_english] && !is_english?(status)

        write_json(status) if options[:json]
        verbose "Logging: #{status['text']}"
      end
    rescue StandardError => e
      STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
      STDERR.puts "Waiting for a couple of minutes..."
      sleep RETRY_DELAY_SECONDS
      retry
    end

    private

    # Prints a diagnostic line to STDERR when the :verbose option is set.
    def verbose(message)
      STDERR.puts message if options[:verbose]
    end

    # Appends one status plus a separator line to the JSON log and flushes it.
    def write_json(status)
      log = options[:json]
      log.puts JSON.dump(status)
      log.puts "------SEPERATOR------" # spelling kept: existing logs use it
      log.flush
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'uri'

module TwitterToCsv
  # Connects to the Twitter streaming API through Twitter::JSONStream inside an
  # EventMachine loop and passes each usable status to the caller's block.
  class TwitterWatcher
    attr_accessor :username, :password, :filter, :fetch_errors

    # @param options [Hash] reads :username, :password and :filter (a list of
    #   track keywords, or nil/empty for the sample stream)
    def initialize(options)
      @username = options[:username]
      @password = options[:password]
      @filter = options[:filter]
      @fetch_errors = 0
    end

    # Prints "<str>..." to STDERR, runs the block, then prints "done.".
    def progress(str)
      STDERR.print "#{str}..."
      STDERR.flush
      yield
      STDERR.puts "done."
    end

    # Opens the stream and blocks inside the EventMachine loop. Each received
    # item is JSON-parsed and routed through #handle_status. The loop is
    # stopped when the stream reports it has exhausted its reconnect attempts.
    #
    # @yieldparam status [Hash] a parsed tweet with normalised 'text'
    def run(&block)
      EventMachine::run do
        stream = Twitter::JSONStream.connect(
          :path => stream_path,
          :auth => "#{username}:#{password}",
          :ssl => true
        )

        stream.each_item do |item|
          handle_status JSON.parse(item), block
        end

        stream.on_error do |message|
          STDERR.puts " --> Twitter error: #{message} <--"
        end

        stream.on_max_reconnects do |_timeout, _retries|
          STDERR.puts " --> Oops, tried too many times! <--"
          EventMachine::stop_event_loop
        end
      end
    end

    # Normalises one parsed stream message and forwards it to +block+.
    #
    # Delete notices and messages without a string 'text' are dropped; calling
    # gsub on a missing text used to raise inside the event loop. The
    # replacements as previously written substituted "<" with "<" and ">" with
    # ">", which had no effect; the patterns now match the &lt; / &gt; entities
    # so the angle brackets are actually restored. Tabs and line breaks become
    # spaces so the text stays on one line.
    #
    # @param status [Hash, nil] parsed stream message
    # @param block [#call] receiver for accepted statuses
    def handle_status(status, block)
      return unless status
      return if status.has_key?('delete')

      text = status['text']
      return unless text.is_a?(String)

      status['text'] = text.gsub(/&lt;/, "<").gsub(/&gt;/, ">").gsub(/[\t\n\r]/, ' ')
      block.call(status)
    end

    private

    # Request path: the sample stream when no keywords are configured,
    # otherwise the filter stream with each keyword form-encoded so characters
    # such as '#' or spaces cannot corrupt the query string.
    def stream_path
      keywords = Array(filter)
      return "/1/statuses/sample.json" if keywords.empty?

      track = keywords.map { |keyword| URI.encode_www_form_component(keyword) }.join(",")
      "/1/statuses/filter.json?track=#{track}"
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "twitter_to_csv/version"
|
4
|
+
|
5
|
+
# Gem packaging description. File lists are taken from what git tracks, so the
# gem must be built from inside a git checkout.
Gem::Specification.new do |gem|
  # Identity
  gem.name              = "twitter_to_csv"
  gem.version           = TwitterToCsv::VERSION
  gem.authors           = ["Andrew Cantino"]
  gem.email             = ["andrew@iterationlabs.com"]
  gem.homepage          = ""
  gem.summary           = %q{Dump the Twitter streaming API to a CSV or JSON file}
  gem.description       = %q{}
  gem.rubyforge_project = "twitter_to_csv"

  # Contents
  tracked_files  = `git ls-files`.split("\n")
  tracked_tests  = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_binary = `git ls-files -- bin/*`.split("\n")

  gem.files         = tracked_files
  gem.test_files    = tracked_tests
  gem.executables   = tracked_binary.map { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  # Dependencies
  # gem.add_development_dependency "rspec"
  ['fastercsv', 'twitter-stream', 'em-http-request', 'unsupervised-language-detection'].each do |dependency|
    gem.add_runtime_dependency dependency
  end
end
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twitter_to_csv
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Andrew Cantino
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-17 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: fastercsv
|
16
|
+
requirement: &83775030 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *83775030
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: twitter-stream
|
27
|
+
requirement: &83774820 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *83774820
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: em-http-request
|
38
|
+
requirement: &83774610 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *83774610
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: unsupervised-language-detection
|
49
|
+
requirement: &83774400 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *83774400
|
58
|
+
description: ''
|
59
|
+
email:
|
60
|
+
- andrew@iterationlabs.com
|
61
|
+
executables:
|
62
|
+
- twitter_to_csv
|
63
|
+
extensions: []
|
64
|
+
extra_rdoc_files: []
|
65
|
+
files:
|
66
|
+
- .gitignore
|
67
|
+
- .rvmrc
|
68
|
+
- Gemfile
|
69
|
+
- README.markdown
|
70
|
+
- Rakefile
|
71
|
+
- bin/twitter_to_csv
|
72
|
+
- lib/twitter_to_csv.rb
|
73
|
+
- lib/twitter_to_csv/csv_builder.rb
|
74
|
+
- lib/twitter_to_csv/twitter_watcher.rb
|
75
|
+
- lib/twitter_to_csv/version.rb
|
76
|
+
- twitter_to_csv.gemspec
|
77
|
+
homepage: ''
|
78
|
+
licenses: []
|
79
|
+
post_install_message:
|
80
|
+
rdoc_options: []
|
81
|
+
require_paths:
|
82
|
+
- lib
|
83
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ! '>='
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
+
none: false
|
91
|
+
requirements:
|
92
|
+
- - ! '>='
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
requirements: []
|
96
|
+
rubyforge_project: twitter_to_csv
|
97
|
+
rubygems_version: 1.8.16
|
98
|
+
signing_key:
|
99
|
+
specification_version: 3
|
100
|
+
summary: Dump the Twitter streaming API to a CSV or JSON file
|
101
|
+
test_files: []
|