twitter_to_csv 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -3,3 +3,4 @@
3
3
  Gemfile.lock
4
4
  pkg/*
5
5
  .idea
6
+ data
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/README.markdown CHANGED
@@ -3,7 +3,10 @@
3
3
  ## Usage
4
4
 
5
5
  twitter_to_csv --username <your twitter username> --password <your twitter password> \
6
- --json hi.json --filter zit,zits,pimple,pimples,acne
6
+ --json hi.json --filter zit,zits,pimple,pimples,acne \
7
+ --csv out.csv \
8
+ --fields text,retweeted_status.id,retweeted_status.favorited,...
9
+
10
+ Use `--sample-fields 1000` to output the occurrence count of different Twitter fields.
7
11
 
8
-
9
- Verbosity and actually outputting to a CSV coming soon :)
12
+ You can also pass `--replay-from-file FILENAME` if you have a JSON output file and you want to run it back through the exporter.
data/Rakefile CHANGED
@@ -1 +1,5 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new('spec')
5
+ task :default => :spec
data/bin/twitter_to_csv CHANGED
@@ -4,7 +4,7 @@ require 'open-uri'
4
4
  require 'optparse'
5
5
  require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
6
6
 
7
- options = { :csv => STDOUT }
7
+ options = { :csv => STDOUT, :fields => %w[text user.lang] }
8
8
  parser = OptionParser.new do |opts|
9
9
  opts.banner = "Usage: #{File.basename($0)} [options]"
10
10
  opts.separator ""
@@ -18,11 +18,12 @@ parser = OptionParser.new do |opts|
18
18
  options[:password] = password
19
19
  end
20
20
 
21
- opts.on("-c", "--csv [FILE]", "The CSV file to write to") do |csv|
21
+ opts.on("-c", "--csv [FILE]", "The CSV file to append to") do |csv|
22
+ options[:csv_appending] = File.exists?(csv)
22
23
  options[:csv] = File.open(csv, 'a')
23
24
  end
24
25
 
25
- opts.on("-j", "--json [FILE]", "The file to log all Twitter JSON to") do |json|
26
+ opts.on("-j", "--json [FILE]", "The JSON file to append to") do |json|
26
27
  options[:json] = File.open(json, 'a')
27
28
  end
28
29
 
@@ -30,6 +31,10 @@ parser = OptionParser.new do |opts|
30
31
  options[:filter] = filter.split(/\s*,\s*/)
31
32
  end
32
33
 
34
+ opts.on("-x", "--fields [FIELDS]", "Fields to include in the CSV") do |fields|
35
+ options[:fields] = fields.split(/\s*,\s*/)
36
+ end
37
+
33
38
  opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
34
39
  options[:require_english] = e
35
40
  end
@@ -38,6 +43,14 @@ parser = OptionParser.new do |opts|
38
43
  options[:verbose] = v
39
44
  end
40
45
 
46
+ opts.on_tail("", "--sample-fields [NUMBER_OF_SAMPLES]", "Sample fields from Twitter, display them, and then exit.") do |samples|
47
+ options[:sample_fields] = samples && samples.to_i
48
+ end
49
+
50
+ opts.on_tail("", "--replay-from-file FILENAME", "Replay status from a JSON dump file") do |replay_file|
51
+ options[:replay_from_file] = replay_file
52
+ end
53
+
41
54
  opts.on_tail("-h", "--help", "Show this message") do
42
55
  STDERR.puts opts
43
56
  exit
@@ -51,7 +64,7 @@ end
51
64
 
52
65
  parser.parse!
53
66
 
54
- unless options[:username] && options[:password]
67
+ unless (options[:username] && options[:password]) || options[:replay_from_file]
55
68
  STDERR.puts "Error: Twitter username and password are required fields.\n\n"
56
69
  STDERR.puts parser
57
70
  exit 1
@@ -1,6 +1,6 @@
1
1
  require 'rubygems'
2
2
  require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "version"))
3
- require 'fastercsv'
3
+ require 'csv'
4
4
  require 'pp'
5
5
  require 'json'
6
6
  require 'twitter/json_stream'
@@ -1,9 +1,95 @@
1
+ require 'pp'
2
+
1
3
  module TwitterToCsv
2
4
  class CsvBuilder
3
- attr_accessor :options
5
+ attr_accessor :options, :sampled_fields
4
6
 
5
7
  def initialize(options = {})
6
8
  @options = options
9
+ @sampled_fields = {}
10
+ @num_samples = 0
11
+ end
12
+
13
+ def run
14
+ log_csv_header if options[:csv] && !options[:csv_appending]
15
+ if options[:replay_from_file]
16
+ replay_from options[:replay_from_file]
17
+ else
18
+ begin
19
+ TwitterWatcher.new(options).run do |status|
20
+ handle_status status
21
+ end
22
+ rescue SignalException, SystemExit
23
+ exit
24
+ rescue StandardError => e
25
+ STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
26
+ STDERR.puts "Waiting for a couple of minutes..."
27
+ sleep 120
28
+ retry
29
+ end
30
+ end
31
+ end
32
+
33
+ def handle_status(status)
34
+ if (options[:require_english] && is_english?(status)) || !options[:require_english]
35
+ log_json(status) if options[:json]
36
+ log_csv(status) if options[:csv]
37
+ sample_fields(status) if options[:sample_fields]
38
+ STDERR.puts "Logging: #{status['text']}" if options[:verbose]
39
+ end
40
+ end
41
+
42
+ def log_csv_header
43
+ options[:csv].puts options[:fields].to_csv(:encoding => 'UTF-8', :force_quotes => true)
44
+ end
45
+
46
+ def log_csv(status)
47
+ csv_row = options[:fields].map do |field|
48
+ field.split(".").inject(status) { |memo, segment|
49
+ memo && memo[segment]
50
+ }.to_s
51
+ end.to_csv(:encoding => 'UTF-8', :force_quotes => true)
52
+ options[:csv].puts csv_row
53
+ end
54
+
55
+ def replay_from(filename)
56
+ File.open(filename, "r") do |file|
57
+ until file.eof?
58
+ line = file.readline
59
+ next if line =~ /\A------SEP.RATOR------\Z/i
60
+ handle_status JSON.parse(line)
61
+ end
62
+ end
63
+ end
64
+
65
+ def sample_fields(status)
66
+ extract_fields(status, sampled_fields)
67
+ @num_samples += 1
68
+ if @num_samples > options[:sample_fields]
69
+ puts "Sampled fields from Twitter:"
70
+ sampled_fields.each do |field, count|
71
+ puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
72
+ end
73
+ exit 1
74
+ end
75
+ end
76
+
77
+ def extract_fields(object, fields, current_path = [])
78
+ if object.is_a?(Hash)
79
+ object.each do |k, v|
80
+ extract_fields v, fields, current_path + [k]
81
+ end
82
+ else
83
+ path = current_path.join(".")
84
+ fields[path] ||= 0
85
+ fields[path] += 1
86
+ end
87
+ end
88
+
89
+ def log_json(status)
90
+ options[:json].puts JSON.dump(status) #JSON.pretty_generate(status)
91
+ options[:json].puts "------SEPARATOR------"
92
+ options[:json].flush
7
93
  end
8
94
 
9
95
  def is_english?(status)
@@ -29,25 +115,5 @@ module TwitterToCsv
29
115
 
30
116
  true
31
117
  end
32
-
33
- def run
34
- begin
35
- TwitterWatcher.new(options).run do |status|
36
- if (options[:require_english] && is_english?(status)) || !options[:require_english]
37
- if options[:json]
38
- options[:json].puts JSON.dump(status) #JSON.pretty_generate(status)
39
- options[:json].puts "------SEPERATOR------"
40
- options[:json].flush
41
- end
42
- STDERR.puts "Logging: #{status['text']}" if options[:verbose]
43
- end
44
- end
45
- rescue StandardError => e
46
- STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
47
- STDERR.puts "Waiting for a couple of minutes..."
48
- sleep 120
49
- retry
50
- end
51
- end
52
118
  end
53
119
  end
@@ -1,3 +1,3 @@
1
1
  module TwitterToCsv
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -0,0 +1,62 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe TwitterToCsv::CsvBuilder do
5
+ describe "#handle_status" do
6
+ describe "when :english is set" do
7
+ it "skips non-English tweets" do
8
+ string_io = StringIO.new
9
+ csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => true, :csv => string_io, :fields => %w[text])
10
+ csv_builder.handle_status('text' => "This is English", 'user' => { 'lang' => 'en' })
11
+ csv_builder.handle_status('text' => "هذه الجملة باللغة الإنجليزية.", 'user' => { 'lang' => 'en' })
12
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'user' => { 'lang' => 'en' })
13
+ csv_builder.handle_status('text' => "This is still English", 'user' => { 'lang' => 'en' })
14
+ csv_builder.handle_status('text' => "The lang code can lie, but we trust it for now.", 'user' => { 'lang' => 'fr' })
15
+ string_io.rewind
16
+ string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
17
+ end
18
+ end
19
+
20
+ describe "logging to a CSV" do
21
+ it "outputs the requested fields when requested in dot-notation" do
22
+ string_io = StringIO.new
23
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something something_else.a something_else.c.d])
24
+ csv_builder.handle_status({
25
+ 'something' => "hello",
26
+ 'something_else' => {
27
+ 'a' => 'b',
28
+ 'c' => {
29
+ 'd' => "foo",
30
+ 'e' => 'bar'
31
+ },
32
+ 'blah' => 'hi'
33
+ }
34
+ })
35
+ string_io.rewind
36
+ string_io.read.should == "\"hello\",\"b\",\"foo\"\n"
37
+ end
38
+ end
39
+ end
40
+
41
+ describe "#extract_fields" do
42
+ it "finds all the paths through a hash" do
43
+ obj = {
44
+ :a => :b,
45
+ :b => "c",
46
+ :d => {
47
+ :e => :f,
48
+ :g => {
49
+ :h => :i,
50
+ :j => {
51
+ :k => "l"
52
+ }
53
+ },
54
+ :m => "n"
55
+ }
56
+ }
57
+ fields = { "a" => 1 }
58
+ TwitterToCsv::CsvBuilder.new.extract_fields(obj, fields)
59
+ fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g.h" => 1, "d.g.j.k" => 1, "d.m" => 1 }
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ require 'twitter_to_csv'
4
+
5
+ RSpec.configure do |config|
6
+ # some (optional) config here
7
+ end
@@ -18,8 +18,7 @@ Gem::Specification.new do |s|
18
18
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
19
  s.require_paths = ["lib"]
20
20
 
21
- # s.add_development_dependency "rspec"
22
- s.add_runtime_dependency 'fastercsv'
21
+ s.add_development_dependency "rspec"
23
22
  s.add_runtime_dependency 'twitter-stream'
24
23
  s.add_runtime_dependency 'em-http-request'
25
24
  s.add_runtime_dependency 'unsupervised-language-detection'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter_to_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,22 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-17 00:00:00.000000000Z
12
+ date: 2012-02-21 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: fastercsv
16
- requirement: &83775030 !ruby/object:Gem::Requirement
15
+ name: rspec
16
+ requirement: &71632880 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
20
20
  - !ruby/object:Gem::Version
21
21
  version: '0'
22
- type: :runtime
22
+ type: :development
23
23
  prerelease: false
24
- version_requirements: *83775030
24
+ version_requirements: *71632880
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: twitter-stream
27
- requirement: &83774820 !ruby/object:Gem::Requirement
27
+ requirement: &71632670 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *83774820
35
+ version_requirements: *71632670
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: em-http-request
38
- requirement: &83774610 !ruby/object:Gem::Requirement
38
+ requirement: &71632460 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *83774610
46
+ version_requirements: *71632460
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: unsupervised-language-detection
49
- requirement: &83774400 !ruby/object:Gem::Requirement
49
+ requirement: &71632250 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *83774400
57
+ version_requirements: *71632250
58
58
  description: ''
59
59
  email:
60
60
  - andrew@iterationlabs.com
@@ -64,6 +64,7 @@ extensions: []
64
64
  extra_rdoc_files: []
65
65
  files:
66
66
  - .gitignore
67
+ - .rspec
67
68
  - .rvmrc
68
69
  - Gemfile
69
70
  - README.markdown
@@ -73,6 +74,8 @@ files:
73
74
  - lib/twitter_to_csv/csv_builder.rb
74
75
  - lib/twitter_to_csv/twitter_watcher.rb
75
76
  - lib/twitter_to_csv/version.rb
77
+ - spec/csv_builder_spec.rb
78
+ - spec/spec_helper.rb
76
79
  - twitter_to_csv.gemspec
77
80
  homepage: ''
78
81
  licenses: []