twitter_to_csv 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/README.markdown +6 -3
- data/Rakefile +4 -0
- data/bin/twitter_to_csv +17 -4
- data/lib/twitter_to_csv.rb +1 -1
- data/lib/twitter_to_csv/csv_builder.rb +87 -21
- data/lib/twitter_to_csv/version.rb +1 -1
- data/spec/csv_builder_spec.rb +62 -0
- data/spec/spec_helper.rb +7 -0
- data/twitter_to_csv.gemspec +1 -2
- metadata +15 -12
data/.gitignore
CHANGED
data/.rspec
ADDED
data/README.markdown
CHANGED
|
@@ -3,7 +3,10 @@
|
|
|
3
3
|
## Usage
|
|
4
4
|
|
|
5
5
|
twitter_to_csv --username <your twitter username> --password <your twitter password> \
|
|
6
|
-
--json hi.json --filter zit,zits,pimple,pimples,acne
|
|
6
|
+
--json hi.json --filter zit,zits,pimple,pimples,acne \
|
|
7
|
+
--csv out.csv --fields text,
|
|
8
|
+
--fields text,retweeted_status.id,retweeted_status.favorited,...
|
|
9
|
+
|
|
10
|
+
Use `--sample-fields 1000` to output the occurrence count of different Twitter fields.
|
|
7
11
|
|
|
8
|
-
|
|
9
|
-
Verbosity and actually outputting to a CSV coming soon :)
|
|
12
|
+
You can also use `--replay-from-file FILENAME` if you have a JSON output file and you want to run it back through the exporter.
|
data/Rakefile
CHANGED
data/bin/twitter_to_csv
CHANGED
|
@@ -4,7 +4,7 @@ require 'open-uri'
|
|
|
4
4
|
require 'optparse'
|
|
5
5
|
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
|
|
6
6
|
|
|
7
|
-
options = { :csv => STDOUT }
|
|
7
|
+
options = { :csv => STDOUT, :fields => %w[text user.lang] }
|
|
8
8
|
parser = OptionParser.new do |opts|
|
|
9
9
|
opts.banner = "Usage: #{File.basename($0)} [options]"
|
|
10
10
|
opts.separator ""
|
|
@@ -18,11 +18,12 @@ parser = OptionParser.new do |opts|
|
|
|
18
18
|
options[:password] = password
|
|
19
19
|
end
|
|
20
20
|
|
|
21
|
-
opts.on("-c", "--csv [FILE]", "The CSV file to
|
|
21
|
+
opts.on("-c", "--csv [FILE]", "The CSV file to append to") do |csv|
|
|
22
|
+
options[:csv_appending] = File.exists?(csv)
|
|
22
23
|
options[:csv] = File.open(csv, 'a')
|
|
23
24
|
end
|
|
24
25
|
|
|
25
|
-
opts.on("-j", "--json [FILE]", "The file to
|
|
26
|
+
opts.on("-j", "--json [FILE]", "The JSON file to append to") do |json|
|
|
26
27
|
options[:json] = File.open(json, 'a')
|
|
27
28
|
end
|
|
28
29
|
|
|
@@ -30,6 +31,10 @@ parser = OptionParser.new do |opts|
|
|
|
30
31
|
options[:filter] = filter.split(/\s*,\s*/)
|
|
31
32
|
end
|
|
32
33
|
|
|
34
|
+
opts.on("-x", "--fields [FIELDS]", "Fields to include in the CSV") do |fields|
|
|
35
|
+
options[:fields] = fields.split(/\s*,\s*/)
|
|
36
|
+
end
|
|
37
|
+
|
|
33
38
|
opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
|
|
34
39
|
options[:require_english] = e
|
|
35
40
|
end
|
|
@@ -38,6 +43,14 @@ parser = OptionParser.new do |opts|
|
|
|
38
43
|
options[:verbose] = v
|
|
39
44
|
end
|
|
40
45
|
|
|
46
|
+
opts.on_tail("", "--sample-fields [NUMBER_OF_SAMPLES]", "Sample fields from Twitter, display them, and then exit.") do |samples|
|
|
47
|
+
options[:sample_fields] = samples && samples.to_i
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
opts.on_tail("", "--replay-from-file FILENAME", "Replay status from a JSON dump file") do |replay_file|
|
|
51
|
+
options[:replay_from_file] = replay_file
|
|
52
|
+
end
|
|
53
|
+
|
|
41
54
|
opts.on_tail("-h", "--help", "Show this message") do
|
|
42
55
|
STDERR.puts opts
|
|
43
56
|
exit
|
|
@@ -51,7 +64,7 @@ end
|
|
|
51
64
|
|
|
52
65
|
parser.parse!
|
|
53
66
|
|
|
54
|
-
unless options[:username] && options[:password]
|
|
67
|
+
unless (options[:username] && options[:password]) || options[:replay_from_file]
|
|
55
68
|
STDERR.puts "Error: Twitter username and password are required fields.\n\n"
|
|
56
69
|
STDERR.puts parser
|
|
57
70
|
exit 1
|
data/lib/twitter_to_csv.rb
CHANGED
|
@@ -1,9 +1,95 @@
|
|
|
1
|
+
require 'pp'
|
|
2
|
+
|
|
1
3
|
module TwitterToCsv
|
|
2
4
|
class CsvBuilder
|
|
3
|
-
attr_accessor :options
|
|
5
|
+
attr_accessor :options, :sampled_fields
|
|
4
6
|
|
|
5
7
|
def initialize(options = {})
|
|
6
8
|
@options = options
|
|
9
|
+
@sampled_fields = {}
|
|
10
|
+
@num_samples = 0
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def run
|
|
14
|
+
log_csv_header if options[:csv] && !options[:csv_appending]
|
|
15
|
+
if options[:replay_from_file]
|
|
16
|
+
replay_from options[:replay_from_file]
|
|
17
|
+
else
|
|
18
|
+
begin
|
|
19
|
+
TwitterWatcher.new(options).run do |status|
|
|
20
|
+
handle_status status
|
|
21
|
+
end
|
|
22
|
+
rescue SignalException, SystemExit
|
|
23
|
+
exit
|
|
24
|
+
rescue StandardError => e
|
|
25
|
+
STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
|
|
26
|
+
STDERR.puts "Waiting for a couple of minutes..."
|
|
27
|
+
sleep 120
|
|
28
|
+
retry
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def handle_status(status)
|
|
34
|
+
if (options[:require_english] && is_english?(status)) || !options[:require_english]
|
|
35
|
+
log_json(status) if options[:json]
|
|
36
|
+
log_csv(status) if options[:csv]
|
|
37
|
+
sample_fields(status) if options[:sample_fields]
|
|
38
|
+
STDERR.puts "Logging: #{status['text']}" if options[:verbose]
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def log_csv_header
|
|
43
|
+
options[:csv].puts options[:fields].to_csv(:encoding => 'UTF-8', :force_quotes => true)
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def log_csv(status)
|
|
47
|
+
csv_row = options[:fields].map do |field|
|
|
48
|
+
field.split(".").inject(status) { |memo, segment|
|
|
49
|
+
memo && memo[segment]
|
|
50
|
+
}.to_s
|
|
51
|
+
end.to_csv(:encoding => 'UTF-8', :force_quotes => true)
|
|
52
|
+
options[:csv].puts csv_row
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def replay_from(filename)
|
|
56
|
+
File.open(filename, "r") do |file|
|
|
57
|
+
until file.eof?
|
|
58
|
+
line = file.readline
|
|
59
|
+
next if line =~ /\A------SEP.RATOR------\Z/i
|
|
60
|
+
handle_status JSON.parse(line)
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def sample_fields(status)
|
|
66
|
+
extract_fields(status, sampled_fields)
|
|
67
|
+
@num_samples += 1
|
|
68
|
+
if @num_samples > options[:sample_fields]
|
|
69
|
+
puts "Sampled fields from Twitter:"
|
|
70
|
+
sampled_fields.each do |field, count|
|
|
71
|
+
puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
|
|
72
|
+
end
|
|
73
|
+
exit 1
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def extract_fields(object, fields, current_path = [])
|
|
78
|
+
if object.is_a?(Hash)
|
|
79
|
+
object.each do |k, v|
|
|
80
|
+
extract_fields v, fields, current_path + [k]
|
|
81
|
+
end
|
|
82
|
+
else
|
|
83
|
+
path = current_path.join(".")
|
|
84
|
+
fields[path] ||= 0
|
|
85
|
+
fields[path] += 1
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
def log_json(status)
|
|
90
|
+
options[:json].puts JSON.dump(status) #JSON.pretty_generate(status)
|
|
91
|
+
options[:json].puts "------SEPARATOR------"
|
|
92
|
+
options[:json].flush
|
|
7
93
|
end
|
|
8
94
|
|
|
9
95
|
def is_english?(status)
|
|
@@ -29,25 +115,5 @@ module TwitterToCsv
|
|
|
29
115
|
|
|
30
116
|
true
|
|
31
117
|
end
|
|
32
|
-
|
|
33
|
-
def run
|
|
34
|
-
begin
|
|
35
|
-
TwitterWatcher.new(options).run do |status|
|
|
36
|
-
if (options[:require_english] && is_english?(status)) || !options[:require_english]
|
|
37
|
-
if options[:json]
|
|
38
|
-
options[:json].puts JSON.dump(status) #JSON.pretty_generate(status)
|
|
39
|
-
options[:json].puts "------SEPERATOR------"
|
|
40
|
-
options[:json].flush
|
|
41
|
-
end
|
|
42
|
-
STDERR.puts "Logging: #{status['text']}" if options[:verbose]
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
rescue StandardError => e
|
|
46
|
-
STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
|
|
47
|
-
STDERR.puts "Waiting for a couple of minutes..."
|
|
48
|
-
sleep 120
|
|
49
|
-
retry
|
|
50
|
-
end
|
|
51
|
-
end
|
|
52
118
|
end
|
|
53
119
|
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
require 'spec_helper'
|
|
3
|
+
|
|
4
|
+
describe TwitterToCsv::CsvBuilder do
|
|
5
|
+
describe "#handle_status" do
|
|
6
|
+
describe "when :english is set" do
|
|
7
|
+
it "skips non-English tweets" do
|
|
8
|
+
string_io = StringIO.new
|
|
9
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => true, :csv => string_io, :fields => %w[text])
|
|
10
|
+
csv_builder.handle_status('text' => "This is English", 'user' => { 'lang' => 'en' })
|
|
11
|
+
csv_builder.handle_status('text' => "هذه الجملة باللغة الإنجليزية.", 'user' => { 'lang' => 'en' })
|
|
12
|
+
csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'user' => { 'lang' => 'en' })
|
|
13
|
+
csv_builder.handle_status('text' => "This is still English", 'user' => { 'lang' => 'en' })
|
|
14
|
+
csv_builder.handle_status('text' => "The lang code can lie, but we trust it for now.", 'user' => { 'lang' => 'fr' })
|
|
15
|
+
string_io.rewind
|
|
16
|
+
string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
describe "logging to a CSV" do
|
|
21
|
+
it "outputs the requested fields when requested in dot-notation" do
|
|
22
|
+
string_io = StringIO.new
|
|
23
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something something_else.a something_else.c.d])
|
|
24
|
+
csv_builder.handle_status({
|
|
25
|
+
'something' => "hello",
|
|
26
|
+
'something_else' => {
|
|
27
|
+
'a' => 'b',
|
|
28
|
+
'c' => {
|
|
29
|
+
'd' => "foo",
|
|
30
|
+
'e' => 'bar'
|
|
31
|
+
},
|
|
32
|
+
'blah' => 'hi'
|
|
33
|
+
}
|
|
34
|
+
})
|
|
35
|
+
string_io.rewind
|
|
36
|
+
string_io.read.should == "\"hello\",\"b\",\"foo\"\n"
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
describe "#extract_fields" do
|
|
42
|
+
it "finds all the paths through a hash" do
|
|
43
|
+
obj = {
|
|
44
|
+
:a => :b,
|
|
45
|
+
:b => "c",
|
|
46
|
+
:d => {
|
|
47
|
+
:e => :f,
|
|
48
|
+
:g => {
|
|
49
|
+
:h => :i,
|
|
50
|
+
:j => {
|
|
51
|
+
:k => "l"
|
|
52
|
+
}
|
|
53
|
+
},
|
|
54
|
+
:m => "n"
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
fields = { "a" => 1 }
|
|
58
|
+
TwitterToCsv::CsvBuilder.new.extract_fields(obj, fields)
|
|
59
|
+
fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g.h" => 1, "d.g.j.k" => 1, "d.m" => 1 }
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
data/spec/spec_helper.rb
ADDED
data/twitter_to_csv.gemspec
CHANGED
|
@@ -18,8 +18,7 @@ Gem::Specification.new do |s|
|
|
|
18
18
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
|
19
19
|
s.require_paths = ["lib"]
|
|
20
20
|
|
|
21
|
-
|
|
22
|
-
s.add_runtime_dependency 'fastercsv'
|
|
21
|
+
s.add_development_dependency "rspec"
|
|
23
22
|
s.add_runtime_dependency 'twitter-stream'
|
|
24
23
|
s.add_runtime_dependency 'em-http-request'
|
|
25
24
|
s.add_runtime_dependency 'unsupervised-language-detection'
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: twitter_to_csv
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.2
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,22 +9,22 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2012-02-
|
|
12
|
+
date: 2012-02-21 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
|
-
name:
|
|
16
|
-
requirement: &
|
|
15
|
+
name: rspec
|
|
16
|
+
requirement: &71632880 !ruby/object:Gem::Requirement
|
|
17
17
|
none: false
|
|
18
18
|
requirements:
|
|
19
19
|
- - ! '>='
|
|
20
20
|
- !ruby/object:Gem::Version
|
|
21
21
|
version: '0'
|
|
22
|
-
type: :
|
|
22
|
+
type: :development
|
|
23
23
|
prerelease: false
|
|
24
|
-
version_requirements: *
|
|
24
|
+
version_requirements: *71632880
|
|
25
25
|
- !ruby/object:Gem::Dependency
|
|
26
26
|
name: twitter-stream
|
|
27
|
-
requirement: &
|
|
27
|
+
requirement: &71632670 !ruby/object:Gem::Requirement
|
|
28
28
|
none: false
|
|
29
29
|
requirements:
|
|
30
30
|
- - ! '>='
|
|
@@ -32,10 +32,10 @@ dependencies:
|
|
|
32
32
|
version: '0'
|
|
33
33
|
type: :runtime
|
|
34
34
|
prerelease: false
|
|
35
|
-
version_requirements: *
|
|
35
|
+
version_requirements: *71632670
|
|
36
36
|
- !ruby/object:Gem::Dependency
|
|
37
37
|
name: em-http-request
|
|
38
|
-
requirement: &
|
|
38
|
+
requirement: &71632460 !ruby/object:Gem::Requirement
|
|
39
39
|
none: false
|
|
40
40
|
requirements:
|
|
41
41
|
- - ! '>='
|
|
@@ -43,10 +43,10 @@ dependencies:
|
|
|
43
43
|
version: '0'
|
|
44
44
|
type: :runtime
|
|
45
45
|
prerelease: false
|
|
46
|
-
version_requirements: *
|
|
46
|
+
version_requirements: *71632460
|
|
47
47
|
- !ruby/object:Gem::Dependency
|
|
48
48
|
name: unsupervised-language-detection
|
|
49
|
-
requirement: &
|
|
49
|
+
requirement: &71632250 !ruby/object:Gem::Requirement
|
|
50
50
|
none: false
|
|
51
51
|
requirements:
|
|
52
52
|
- - ! '>='
|
|
@@ -54,7 +54,7 @@ dependencies:
|
|
|
54
54
|
version: '0'
|
|
55
55
|
type: :runtime
|
|
56
56
|
prerelease: false
|
|
57
|
-
version_requirements: *
|
|
57
|
+
version_requirements: *71632250
|
|
58
58
|
description: ''
|
|
59
59
|
email:
|
|
60
60
|
- andrew@iterationlabs.com
|
|
@@ -64,6 +64,7 @@ extensions: []
|
|
|
64
64
|
extra_rdoc_files: []
|
|
65
65
|
files:
|
|
66
66
|
- .gitignore
|
|
67
|
+
- .rspec
|
|
67
68
|
- .rvmrc
|
|
68
69
|
- Gemfile
|
|
69
70
|
- README.markdown
|
|
@@ -73,6 +74,8 @@ files:
|
|
|
73
74
|
- lib/twitter_to_csv/csv_builder.rb
|
|
74
75
|
- lib/twitter_to_csv/twitter_watcher.rb
|
|
75
76
|
- lib/twitter_to_csv/version.rb
|
|
77
|
+
- spec/csv_builder_spec.rb
|
|
78
|
+
- spec/spec_helper.rb
|
|
76
79
|
- twitter_to_csv.gemspec
|
|
77
80
|
homepage: ''
|
|
78
81
|
licenses: []
|