twitter_to_csv 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -3,3 +3,4 @@
3
3
  Gemfile.lock
4
4
  pkg/*
5
5
  .idea
6
+ data
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/README.markdown CHANGED
@@ -3,7 +3,10 @@
3
3
  ## Usage
4
4
 
5
5
  twitter_to_csv --username <your twitter username> --password <your twitter password> \
6
- --json hi.json --filter zit,zits,pimple,pimples,acne
6
+ --json hi.json --filter zit,zits,pimple,pimples,acne \
7
+ --csv out.csv \
8
+ --fields text,retweeted_status.id,retweeted_status.favorited,...
9
+
10
+ Use `--sample-fields 1000` to output the occurrence count of different Twitter fields.
7
11
 
8
-
9
- Verbosity and actually outputting to a CSV coming soon :)
12
+ You can also `--replay-from-file` if you have a JSON output file and you want to run it back through the exporter.
data/Rakefile CHANGED
@@ -1 +1,5 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new('spec')
5
+ task :default => :spec
data/bin/twitter_to_csv CHANGED
@@ -4,7 +4,7 @@ require 'open-uri'
4
4
  require 'optparse'
5
5
  require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
6
6
 
7
- options = { :csv => STDOUT }
7
+ options = { :csv => STDOUT, :fields => %w[text user.lang] }
8
8
  parser = OptionParser.new do |opts|
9
9
  opts.banner = "Usage: #{File.basename($0)} [options]"
10
10
  opts.separator ""
@@ -18,11 +18,12 @@ parser = OptionParser.new do |opts|
18
18
  options[:password] = password
19
19
  end
20
20
 
21
- opts.on("-c", "--csv [FILE]", "The CSV file to write to") do |csv|
21
+ opts.on("-c", "--csv [FILE]", "The CSV file to append to") do |csv|
22
+ options[:csv_appending] = File.exists?(csv)
22
23
  options[:csv] = File.open(csv, 'a')
23
24
  end
24
25
 
25
- opts.on("-j", "--json [FILE]", "The file to log all Twitter JSON to") do |json|
26
+ opts.on("-j", "--json [FILE]", "The JSON file to append to") do |json|
26
27
  options[:json] = File.open(json, 'a')
27
28
  end
28
29
 
@@ -30,6 +31,10 @@ parser = OptionParser.new do |opts|
30
31
  options[:filter] = filter.split(/\s*,\s*/)
31
32
  end
32
33
 
34
+ opts.on("-x", "--fields [FIELDS]", "Fields to include in the CSV") do |fields|
35
+ options[:fields] = fields.split(/\s*,\s*/)
36
+ end
37
+
33
38
  opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.", "This will have both false positives and false negatives.") do |e|
34
39
  options[:require_english] = e
35
40
  end
@@ -38,6 +43,14 @@ parser = OptionParser.new do |opts|
38
43
  options[:verbose] = v
39
44
  end
40
45
 
46
+ opts.on_tail("", "--sample-fields [NUMBER_OF_SAMPLES]", "Sample fields from Twitter, display them, and then exit.") do |samples|
47
+ options[:sample_fields] = samples && samples.to_i
48
+ end
49
+
50
+ opts.on_tail("", "--replay-from-file FILENAME", "Replay status from a JSON dump file") do |replay_file|
51
+ options[:replay_from_file] = replay_file
52
+ end
53
+
41
54
  opts.on_tail("-h", "--help", "Show this message") do
42
55
  STDERR.puts opts
43
56
  exit
@@ -51,7 +64,7 @@ end
51
64
 
52
65
  parser.parse!
53
66
 
54
- unless options[:username] && options[:password]
67
+ unless (options[:username] && options[:password]) || options[:replay_from_file]
55
68
  STDERR.puts "Error: Twitter username and password are required fields.\n\n"
56
69
  STDERR.puts parser
57
70
  exit 1
@@ -1,6 +1,6 @@
1
1
  require 'rubygems'
2
2
  require File.expand_path(File.join(File.dirname(__FILE__), "twitter_to_csv", "version"))
3
- require 'fastercsv'
3
+ require 'csv'
4
4
  require 'pp'
5
5
  require 'json'
6
6
  require 'twitter/json_stream'
@@ -1,9 +1,95 @@
1
+ require 'pp'
2
+
1
3
  module TwitterToCsv
2
4
  class CsvBuilder
3
- attr_accessor :options
5
+ attr_accessor :options, :sampled_fields
4
6
 
5
7
  def initialize(options = {})
6
8
  @options = options
9
+ @sampled_fields = {}
10
+ @num_samples = 0
11
+ end
12
+
13
+ def run
14
+ log_csv_header if options[:csv] && !options[:csv_appending]
15
+ if options[:replay_from_file]
16
+ replay_from options[:replay_from_file]
17
+ else
18
+ begin
19
+ TwitterWatcher.new(options).run do |status|
20
+ handle_status status
21
+ end
22
+ rescue SignalException, SystemExit
23
+ exit
24
+ rescue StandardError => e
25
+ STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
26
+ STDERR.puts "Waiting for a couple of minutes..."
27
+ sleep 120
28
+ retry
29
+ end
30
+ end
31
+ end
32
+
33
+ def handle_status(status)
34
+ if (options[:require_english] && is_english?(status)) || !options[:require_english]
35
+ log_json(status) if options[:json]
36
+ log_csv(status) if options[:csv]
37
+ sample_fields(status) if options[:sample_fields]
38
+ STDERR.puts "Logging: #{status['text']}" if options[:verbose]
39
+ end
40
+ end
41
+
42
+ def log_csv_header
43
+ options[:csv].puts options[:fields].to_csv(:encoding => 'UTF-8', :force_quotes => true)
44
+ end
45
+
46
+ def log_csv(status)
47
+ csv_row = options[:fields].map do |field|
48
+ field.split(".").inject(status) { |memo, segment|
49
+ memo && memo[segment]
50
+ }.to_s
51
+ end.to_csv(:encoding => 'UTF-8', :force_quotes => true)
52
+ options[:csv].puts csv_row
53
+ end
54
+
55
+ def replay_from(filename)
56
+ File.open(filename, "r") do |file|
57
+ until file.eof?
58
+ line = file.readline
59
+ next if line =~ /\A------SEP.RATOR------\Z/i
60
+ handle_status JSON.parse(line)
61
+ end
62
+ end
63
+ end
64
+
65
+ def sample_fields(status)
66
+ extract_fields(status, sampled_fields)
67
+ @num_samples += 1
68
+ if @num_samples > options[:sample_fields]
69
+ puts "Sampled fields from Twitter:"
70
+ sampled_fields.each do |field, count|
71
+ puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
72
+ end
73
+ exit 1
74
+ end
75
+ end
76
+
77
+ def extract_fields(object, fields, current_path = [])
78
+ if object.is_a?(Hash)
79
+ object.each do |k, v|
80
+ extract_fields v, fields, current_path + [k]
81
+ end
82
+ else
83
+ path = current_path.join(".")
84
+ fields[path] ||= 0
85
+ fields[path] += 1
86
+ end
87
+ end
88
+
89
+ def log_json(status)
90
+ options[:json].puts JSON.dump(status) #JSON.pretty_generate(status)
91
+ options[:json].puts "------SEPARATOR------"
92
+ options[:json].flush
7
93
  end
8
94
 
9
95
  def is_english?(status)
@@ -29,25 +115,5 @@ module TwitterToCsv
29
115
 
30
116
  true
31
117
  end
32
-
33
- def run
34
- begin
35
- TwitterWatcher.new(options).run do |status|
36
- if (options[:require_english] && is_english?(status)) || !options[:require_english]
37
- if options[:json]
38
- options[:json].puts JSON.dump(status) #JSON.pretty_generate(status)
39
- options[:json].puts "------SEPERATOR------"
40
- options[:json].flush
41
- end
42
- STDERR.puts "Logging: #{status['text']}" if options[:verbose]
43
- end
44
- end
45
- rescue StandardError => e
46
- STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
47
- STDERR.puts "Waiting for a couple of minutes..."
48
- sleep 120
49
- retry
50
- end
51
- end
52
118
  end
53
119
  end
@@ -1,3 +1,3 @@
1
1
  module TwitterToCsv
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -0,0 +1,62 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe TwitterToCsv::CsvBuilder do
5
+ describe "#handle_status" do
6
+ describe "when :english is set" do
7
+ it "skips non-English tweets" do
8
+ string_io = StringIO.new
9
+ csv_builder = TwitterToCsv::CsvBuilder.new(:require_english => true, :csv => string_io, :fields => %w[text])
10
+ csv_builder.handle_status('text' => "This is English", 'user' => { 'lang' => 'en' })
11
+ csv_builder.handle_status('text' => "هذه الجملة باللغة الإنجليزية.", 'user' => { 'lang' => 'en' })
12
+ csv_builder.handle_status('text' => "Esta frase se encuentra en Ingles.", 'user' => { 'lang' => 'en' })
13
+ csv_builder.handle_status('text' => "This is still English", 'user' => { 'lang' => 'en' })
14
+ csv_builder.handle_status('text' => "The lang code can lie, but we trust it for now.", 'user' => { 'lang' => 'fr' })
15
+ string_io.rewind
16
+ string_io.read.should == "\"This is English\"\n\"This is still English\"\n"
17
+ end
18
+ end
19
+
20
+ describe "logging to a CSV" do
21
+ it "outputs the requested fields when requested in dot-notation" do
22
+ string_io = StringIO.new
23
+ csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something something_else.a something_else.c.d])
24
+ csv_builder.handle_status({
25
+ 'something' => "hello",
26
+ 'something_else' => {
27
+ 'a' => 'b',
28
+ 'c' => {
29
+ 'd' => "foo",
30
+ 'e' => 'bar'
31
+ },
32
+ 'blah' => 'hi'
33
+ }
34
+ })
35
+ string_io.rewind
36
+ string_io.read.should == "\"hello\",\"b\",\"foo\"\n"
37
+ end
38
+ end
39
+ end
40
+
41
+ describe "#extract_fields" do
42
+ it "finds all the paths through a hash" do
43
+ obj = {
44
+ :a => :b,
45
+ :b => "c",
46
+ :d => {
47
+ :e => :f,
48
+ :g => {
49
+ :h => :i,
50
+ :j => {
51
+ :k => "l"
52
+ }
53
+ },
54
+ :m => "n"
55
+ }
56
+ }
57
+ fields = { "a" => 1 }
58
+ TwitterToCsv::CsvBuilder.new.extract_fields(obj, fields)
59
+ fields.should == { "a" => 2, "b" => 1, "d.e" => 1, "d.g.h" => 1, "d.g.j.k" => 1, "d.m" => 1 }
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ require 'twitter_to_csv'
4
+
5
+ RSpec.configure do |config|
6
+ # some (optional) config here
7
+ end
@@ -18,8 +18,7 @@ Gem::Specification.new do |s|
18
18
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
19
  s.require_paths = ["lib"]
20
20
 
21
- # s.add_development_dependency "rspec"
22
- s.add_runtime_dependency 'fastercsv'
21
+ s.add_development_dependency "rspec"
23
22
  s.add_runtime_dependency 'twitter-stream'
24
23
  s.add_runtime_dependency 'em-http-request'
25
24
  s.add_runtime_dependency 'unsupervised-language-detection'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter_to_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,22 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-17 00:00:00.000000000Z
12
+ date: 2012-02-21 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: fastercsv
16
- requirement: &83775030 !ruby/object:Gem::Requirement
15
+ name: rspec
16
+ requirement: &71632880 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
20
20
  - !ruby/object:Gem::Version
21
21
  version: '0'
22
- type: :runtime
22
+ type: :development
23
23
  prerelease: false
24
- version_requirements: *83775030
24
+ version_requirements: *71632880
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: twitter-stream
27
- requirement: &83774820 !ruby/object:Gem::Requirement
27
+ requirement: &71632670 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *83774820
35
+ version_requirements: *71632670
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: em-http-request
38
- requirement: &83774610 !ruby/object:Gem::Requirement
38
+ requirement: &71632460 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *83774610
46
+ version_requirements: *71632460
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: unsupervised-language-detection
49
- requirement: &83774400 !ruby/object:Gem::Requirement
49
+ requirement: &71632250 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *83774400
57
+ version_requirements: *71632250
58
58
  description: ''
59
59
  email:
60
60
  - andrew@iterationlabs.com
@@ -64,6 +64,7 @@ extensions: []
64
64
  extra_rdoc_files: []
65
65
  files:
66
66
  - .gitignore
67
+ - .rspec
67
68
  - .rvmrc
68
69
  - Gemfile
69
70
  - README.markdown
@@ -73,6 +74,8 @@ files:
73
74
  - lib/twitter_to_csv/csv_builder.rb
74
75
  - lib/twitter_to_csv/twitter_watcher.rb
75
76
  - lib/twitter_to_csv/version.rb
77
+ - spec/csv_builder_spec.rb
78
+ - spec/spec_helper.rb
76
79
  - twitter_to_csv.gemspec
77
80
  homepage: ''
78
81
  licenses: []