twitter_to_csv 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/README.markdown +6 -3
- data/Rakefile +4 -0
- data/bin/twitter_to_csv +17 -4
- data/lib/twitter_to_csv.rb +1 -1
- data/lib/twitter_to_csv/csv_builder.rb +87 -21
- data/lib/twitter_to_csv/version.rb +1 -1
- data/spec/csv_builder_spec.rb +62 -0
- data/spec/spec_helper.rb +7 -0
- data/twitter_to_csv.gemspec +1 -2
- metadata +15 -12
data/.gitignore
CHANGED
data/.rspec
ADDED
data/README.markdown
CHANGED
@@ -3,7 +3,10 @@
|
|
3
3
|
## Usage
|
4
4
|
|
5
5
|
twitter_to_csv --username <your twitter username> --password <your twitter password> \
|
6
|
-
--json hi.json --filter zit,zits,pimple,pimples,acne
|
6
|
+
--json hi.json --filter zit,zits,pimple,pimples,acne \
|
7
|
+
--csv out.csv --fields text,
|
8
|
+
--fields text,retweeted_status.id,retweeted_status.favorited,...
|
9
|
+
|
10
|
+
Use `--sample-fields 1000` to output the occurrence count of different Twitter fields.
|
7
11
|
|
8
|
-
|
9
|
-
Verbosity and actually outputting to a CSV coming soon :)
|
12
|
+
You can also `--replay-from-file` if you have a JSON output file and you want to run it back through the exporter.
|
data/Rakefile
CHANGED
data/bin/twitter_to_csv
CHANGED
@@ -4,7 +4,7 @@ require 'open-uri'
|
|
4
4
|
require 'optparse'
|
5
5
|
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
|
6
6
|
|
7
|
-
options = { :csv => STDOUT }
|
7
|
+
options = { :csv => STDOUT, :fields => %w[text user.lang] }
|
8
8
|
parser = OptionParser.new do |opts|
|
9
9
|
opts.banner = "Usage: #{File.basename($0)} [options]"
|
10
10
|
opts.separator ""
|
@@ -18,11 +18,12 @@ parser = OptionParser.new do |opts|
|
|
18
18
|
options[:password] = password
|
19
19
|
end
|
20
20
|
|
21
|
-
opts.on("-c", "--csv [FILE]", "The CSV file to
|
21
|
+
opts.on("-c", "--csv [FILE]", "The CSV file to append to") do |csv|
|
22
|
+
options[:csv_appending] = File.exists?(csv)
|
22
23
|
options[:csv] = File.open(csv, 'a')
|
23
24
|
end
|
24
25
|
|
25
|
-
opts.on("-j", "--json [FILE]", "The file to
|
26
|
+
opts.on("-j", "--json [FILE]", "The JSON file to append to") do |json|
|
26
27
|
options[:json] = File.open(json, 'a')
|
27
28
|
end
|
28
29
|
|
@@ -30,6 +31,10 @@ parser = OptionParser.new do |opts|
|
|
30
31
|
options[:filter] = filter.split(/\s*,\s*/)
|
31
32
|
end
|
32
33
|
|
34
|
+
opts.on("-x", "--fields [FIELDS]", "Fields to include in the CSV") do |fields|
|
35
|
+
options[:fields] = fields.split(/\s*,\s*/)
|
36
|
+
end
|
37
|
+
|
33
38
|
opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
|
34
39
|
options[:require_english] = e
|
35
40
|
end
|
@@ -38,6 +43,14 @@ parser = OptionParser.new do |opts|
|
|
38
43
|
options[:verbose] = v
|
39
44
|
end
|
40
45
|
|
46
|
+
opts.on_tail("", "--sample-fields [NUMBER_OF_SAMPLES]", "Sample fields from Twitter, display them, and then exit.") do |samples|
|
47
|
+
options[:sample_fields] = samples && samples.to_i
|
48
|
+
end
|
49
|
+
|
50
|
+
opts.on_tail("", "--replay-from-file FILENAME", "Replay status from a JSON dump file") do |replay_file|
|
51
|
+
options[:replay_from_file] = replay_file
|
52
|
+
end
|
53
|
+
|
41
54
|
opts.on_tail("-h", "--help", "Show this message") do
|
42
55
|
STDERR.puts opts
|
43
56
|
exit
|
@@ -51,7 +64,7 @@ end
|
|
51
64
|
|
52
65
|
parser.parse!
|
53
66
|
|
54
|
-
unless options[:username] && options[:password]
|
67
|
+
unless (options[:username] && options[:password]) || options[:replay_from_file]
|
55
68
|
STDERR.puts "Error: Twitter username and password are required fields.\n\n"
|
56
69
|
STDERR.puts parser
|
57
70
|
exit 1
|
data/lib/twitter_to_csv.rb
CHANGED
data/lib/twitter_to_csv/csv_builder.rb
CHANGED
@@ -1,9 +1,95 @@
|
|
1
|
+
require 'pp'
|
2
|
+
|
1
3
|
module TwitterToCsv
|
2
4
|
class CsvBuilder
|
3
|
-
attr_accessor :options
|
5
|
+
attr_accessor :options, :sampled_fields
|
4
6
|
|
5
7
|
def initialize(options = {})
|
6
8
|
@options = options
|
9
|
+
@sampled_fields = {}
|
10
|
+
@num_samples = 0
|
11
|
+
end
|
12
|
+
|
13
|
+
def run
|
14
|
+
log_csv_header if options[:csv] && !options[:csv_appending]
|
15
|
+
if options[:replay_from_file]
|
16
|
+
replay_from options[:replay_from_file]
|
17
|
+
else
|
18
|
+
begin
|
19
|
+
TwitterWatcher.new(options).run do |status|
|
20
|
+
handle_status status
|
21
|
+
end
|
22
|
+
rescue SignalException, SystemExit
|
23
|
+
exit
|
24
|
+
rescue StandardError => e
|
25
|
+
STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
|
26
|
+
STDERR.puts "Waiting for a couple of minutes..."
|
27
|
+
sleep 120
|
28
|
+
retry
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def handle_status(status)
|
34
|
+
if (options[:require_english] && is_english?(status)) || !options[:require_english]
|
35
|
+
log_json(status) if options[:json]
|
36
|
+
log_csv(status) if options[:csv]
|
37
|
+
sample_fields(status) if options[:sample_fields]
|
38
|
+
STDERR.puts "Logging: #{status['text']}" if options[:verbose]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def log_csv_header
|
43
|
+
options[:csv].puts options[:fields].to_csv(:encoding => 'UTF-8', :force_quotes => true)
|
44
|
+
end
|
45
|
+
|
46
|
+
def log_csv(status)
|
47
|
+
csv_row = options[:fields].map do |field|
|
48
|
+
field.split(".").inject(status) { |memo, segment|
|
49
|
+
memo && memo[segment]
|
50
|
+
}.to_s
|
51
|
+
end.to_csv(:encoding => 'UTF-8', :force_quotes => true)
|
52
|
+
options[:csv].puts csv_row
|
53
|
+
end
|
54
|
+
|
55
|
+
def replay_from(filename)
|
56
|
+
File.open(filename, "r") do |file|
|
57
|
+
until file.eof?
|
58
|
+
line = file.readline
|
59
|
+
next if line =~ /\A------SEP.RATOR------\Z/i
|
60
|
+
handle_status JSON.parse(line)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def sample_fields(status)
|
66
|
+
extract_fields(status, sampled_fields)
|
67
|
+
@num_samples += 1
|
68
|
+
if @num_samples > options[:sample_fields]
|
69
|
+
puts "Sampled fields from Twitter:"
|
70
|
+
sampled_fields.each do |field, count|
|
71
|
+
puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
|
72
|
+
end
|
73
|
+
exit 1
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
def extract_fields(object, fields, current_path = [])
|
78
|
+
if object.is_a?(Hash)
|
79
|
+
object.each do |k, v|
|
80
|
+
extract_fields v, fields, current_path + [k]
|
81
|
+
end
|
82
|
+
else
|
83
|
+
path = current_path.join(".")
|
84
|
+
fields[path] ||= 0
|
85
|
+
fields[path] += 1
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def log_json(status)
|
90
|
+
options[:json].puts JSON.dump(status) #JSON.pretty_generate(status)
|
91
|
+
options[:json].puts "------SEPARATOR------"
|
92
|
+
options[:json].flush
|
7
93
|
end
|
8
94
|
|
9
95
|
def is_english?(status)
|
@@ -29,25 +115,5 @@ module TwitterToCsv
|
|
29
115
|
|
30
116
|
true
|
31
117
|
end
|
32
|
-
|
33
|
-
def run
|
34
|
-
begin
|
35
|
-
TwitterWatcher.new(options).run do |status|
|
36
|
-
if (options[:require_english] && is_english?(status)) || !options[:require_english]
|
37
|
-
if options[:json]
|
38
|
-
options[:json].puts JSON.dump(status) #JSON.pretty_generate(status)
|
39
|
-
options[:json].puts "------SEPERATOR------"
|
40
|
-
options[:json].flush
|
41
|
-
end
|
42
|
-
STDERR.puts "Logging: #{status['text']}" if options[:verbose]
|
43
|
-
end
|
44
|
-
end
|
45
|
-
rescue StandardError => e
|
46
|
-
STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
|
47
|
-
STDERR.puts "Waiting for a couple of minutes..."
|
48
|
-
sleep 120
|
49
|
-
retry
|
50
|
-
end
|
51
|
-
end
|
52
118
|
end
|
53
119
|
end
|
data/lib/twitter_to_csv/version.rb
CHANGED
data/spec/csv_builder_spec.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe TwitterToCsv::CsvBuilder do
|
5
|
+
describe "#handle_status" do
|
6
|
+
describe "when :english is set" do
|
7
|
+
it "skips non-English tweets" do
|
8
|
+
string_io = StringIO.new
|
9
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => true, :csv => string_io, :fields => %w[text])
|
10
|
+
csv_builder.handle_status('text' => "This is English", 'user' => { 'lang' => 'en' })
|
11
|
+
csv_builder.handle_status('text' => "هذه الجملة باللغة الإنجليزية.", 'user' => { 'lang' => 'en' })
|
12
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'user' => { 'lang' => 'en' })
|
13
|
+
csv_builder.handle_status('text' => "This is still English", 'user' => { 'lang' => 'en' })
|
14
|
+
csv_builder.handle_status('text' => "The lang code can lie, but we trust it for now.", 'user' => { 'lang' => 'fr' })
|
15
|
+
string_io.rewind
|
16
|
+
string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "logging to a CSV" do
|
21
|
+
it "outputs the requested fields when requested in dot-notation" do
|
22
|
+
string_io = StringIO.new
|
23
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something something_else.a something_else.c.d])
|
24
|
+
csv_builder.handle_status({
|
25
|
+
'something' => "hello",
|
26
|
+
'something_else' => {
|
27
|
+
'a' => 'b',
|
28
|
+
'c' => {
|
29
|
+
'd' => "foo",
|
30
|
+
'e' => 'bar'
|
31
|
+
},
|
32
|
+
'blah' => 'hi'
|
33
|
+
}
|
34
|
+
})
|
35
|
+
string_io.rewind
|
36
|
+
string_io.read.should == "\"hello\",\"b\",\"foo\"\n"
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
describe "#extract_fields" do
|
42
|
+
it "finds all the paths through a hash" do
|
43
|
+
obj = {
|
44
|
+
:a => :b,
|
45
|
+
:b => "c",
|
46
|
+
:d => {
|
47
|
+
:e => :f,
|
48
|
+
:g => {
|
49
|
+
:h => :i,
|
50
|
+
:j => {
|
51
|
+
:k => "l"
|
52
|
+
}
|
53
|
+
},
|
54
|
+
:m => "n"
|
55
|
+
}
|
56
|
+
}
|
57
|
+
fields = { "a" => 1 }
|
58
|
+
TwitterToCsv::CsvBuilder.new.extract_fields(obj, fields)
|
59
|
+
fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g.h" => 1, "d.g.j.k" => 1, "d.m" => 1 }
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
data/spec/spec_helper.rb
ADDED
data/twitter_to_csv.gemspec
CHANGED
@@ -18,8 +18,7 @@ Gem::Specification.new do |s|
|
|
18
18
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
19
|
s.require_paths = ["lib"]
|
20
20
|
|
21
|
-
|
22
|
-
s.add_runtime_dependency 'fastercsv'
|
21
|
+
s.add_development_dependency "rspec"
|
23
22
|
s.add_runtime_dependency 'twitter-stream'
|
24
23
|
s.add_runtime_dependency 'em-http-request'
|
25
24
|
s.add_runtime_dependency 'unsupervised-language-detection'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_to_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,22 +9,22 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
16
|
-
requirement: &
|
15
|
+
name: rspec
|
16
|
+
requirement: &71632880 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
20
20
|
- !ruby/object:Gem::Version
|
21
21
|
version: '0'
|
22
|
-
type: :
|
22
|
+
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *71632880
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: twitter-stream
|
27
|
-
requirement: &
|
27
|
+
requirement: &71632670 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *71632670
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: em-http-request
|
38
|
-
requirement: &
|
38
|
+
requirement: &71632460 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *71632460
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: unsupervised-language-detection
|
49
|
-
requirement: &
|
49
|
+
requirement: &71632250 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *71632250
|
58
58
|
description: ''
|
59
59
|
email:
|
60
60
|
- andrew@iterationlabs.com
|
@@ -64,6 +64,7 @@ extensions: []
|
|
64
64
|
extra_rdoc_files: []
|
65
65
|
files:
|
66
66
|
- .gitignore
|
67
|
+
- .rspec
|
67
68
|
- .rvmrc
|
68
69
|
- Gemfile
|
69
70
|
- README.markdown
|
@@ -73,6 +74,8 @@ files:
|
|
73
74
|
- lib/twitter_to_csv/csv_builder.rb
|
74
75
|
- lib/twitter_to_csv/twitter_watcher.rb
|
75
76
|
- lib/twitter_to_csv/version.rb
|
77
|
+
- spec/csv_builder_spec.rb
|
78
|
+
- spec/spec_helper.rb
|
76
79
|
- twitter_to_csv.gemspec
|
77
80
|
homepage: ''
|
78
81
|
licenses: []
|