twitter_to_csv 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +177 -7
- data/bin/twitter_to_csv +14 -10
- data/lib/twitter_to_csv/csv_builder.rb +20 -3
- data/lib/twitter_to_csv/version.rb +1 -1
- data/spec/csv_builder_spec.rb +34 -0
- metadata +14 -12
data/README.markdown
CHANGED
@@ -1,12 +1,182 @@
|
|
1
1
|
# Twitter To CSV
|
2
2
|
|
3
|
+
A tool for exporting the Twitter stream into a CSV file.
|
4
|
+
|
5
|
+
(sudo) gem install twitter_to_csv
|
6
|
+
|
3
7
|
## Usage
|
4
8
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
You might start by first running the script for a while to dump the Twitter stream into a JSON file:
|
10
|
+
|
11
|
+
twitter_to_csv --username <your twitter username> --password <your twitter password> \
|
12
|
+
--json out.json --filter your,keywords,of,interest
|
13
|
+
|
14
|
+
Then, later, you could export to CSV:
|
15
|
+
|
16
|
+
twitter_to_csv --replay-from-file out.json --csv out.csv \
|
17
|
+
--fields text,created_at,user.name,retweeted_status.id,retweeted_status.favorited,...
|
18
|
+
|
19
|
+
Alternatively, you can always stream directly to CSV:
|
20
|
+
|
21
|
+
twitter_to_csv --username <your twitter username> --password <your twitter password> \
|
22
|
+
--filter your,keywords,of,interest --csv out.csv \
|
23
|
+
--fields text,created_at,user.name,retweeted_status.id,retweeted_status.favorited,...
|
24
|
+
|
25
|
+
## Requiring English
|
26
|
+
|
27
|
+
You may want to limit to Tweets that appear to be written in English.
|
28
|
+
|
29
|
+
twitter_to_csv --username <your twitter username> --password <your twitter password> \
|
30
|
+
--require-english --fields ...
|
31
|
+
|
32
|
+
This filter isn't perfect and will have both false positives and false negatives, but it works pretty well.
|
33
|
+
|
34
|
+
## URLs
|
35
|
+
|
36
|
+
You can extract URLs from the tweet into their own columns by including `--url-columns 3`, for example, to get up to 3 extracted URLs in their own columns.
|
37
|
+
|
38
|
+
## Field names
|
39
|
+
|
40
|
+
Use `--sample-fields 1000` to output the occurrence count of different Twitter fields, like so:
|
41
|
+
|
42
|
+
twitter_to_csv --username <your twitter username> --password <your twitter password> --sample-fields 1000
|
43
|
+
|
44
|
+
Here's a partial list:
|
11
45
|
|
12
|
-
|
46
|
+
in_reply_to_screen_name
|
47
|
+
favorited
|
48
|
+
text
|
49
|
+
entities.urls
|
50
|
+
entities.user_mentions
|
51
|
+
entities.hashtags
|
52
|
+
in_reply_to_user_id
|
53
|
+
contributors
|
54
|
+
place
|
55
|
+
coordinates
|
56
|
+
source
|
57
|
+
geo
|
58
|
+
retweeted
|
59
|
+
retweet_count
|
60
|
+
in_reply_to_status_id
|
61
|
+
in_reply_to_status_id_str
|
62
|
+
id_str
|
63
|
+
user.default_profile_image
|
64
|
+
user.verified
|
65
|
+
user.notifications
|
66
|
+
user.profile_sidebar_border_color
|
67
|
+
user.screen_name
|
68
|
+
user.lang
|
69
|
+
user.favourites_count
|
70
|
+
user.contributors_enabled
|
71
|
+
user.profile_use_background_image
|
72
|
+
user.friends_count
|
73
|
+
user.location
|
74
|
+
user.profile_text_color
|
75
|
+
user.followers_count
|
76
|
+
user.profile_image_url
|
77
|
+
user.description
|
78
|
+
user.statuses_count
|
79
|
+
user.following
|
80
|
+
user.profile_background_image_url
|
81
|
+
user.show_all_inline_media
|
82
|
+
user.listed_count
|
83
|
+
user.profile_link_color
|
84
|
+
user.is_translator
|
85
|
+
user.default_profile
|
86
|
+
user.time_zone
|
87
|
+
user.profile_background_color
|
88
|
+
user.protected
|
89
|
+
user.id_str
|
90
|
+
user.geo_enabled
|
91
|
+
user.profile_background_tile
|
92
|
+
user.name
|
93
|
+
user.profile_background_image_url_https
|
94
|
+
user.created_at
|
95
|
+
user.profile_sidebar_fill_color
|
96
|
+
user.id
|
97
|
+
user.follow_request_sent
|
98
|
+
user.utc_offset
|
99
|
+
user.url
|
100
|
+
user.profile_image_url_https
|
101
|
+
truncated
|
102
|
+
id
|
103
|
+
created_at
|
104
|
+
in_reply_to_user_id_str
|
105
|
+
retweeted_status.in_reply_to_screen_name
|
106
|
+
retweeted_status.favorited
|
107
|
+
retweeted_status.text
|
108
|
+
retweeted_status.entities.urls
|
109
|
+
retweeted_status.entities.user_mentions
|
110
|
+
retweeted_status.entities.hashtags
|
111
|
+
retweeted_status.in_reply_to_user_id
|
112
|
+
retweeted_status.contributors
|
113
|
+
retweeted_status.place
|
114
|
+
retweeted_status.coordinates
|
115
|
+
retweeted_status.source
|
116
|
+
retweeted_status.geo
|
117
|
+
retweeted_status.retweeted
|
118
|
+
retweeted_status.retweet_count
|
119
|
+
retweeted_status.in_reply_to_status_id
|
120
|
+
retweeted_status.in_reply_to_status_id_str
|
121
|
+
retweeted_status.id_str
|
122
|
+
retweeted_status.user.default_profile_image
|
123
|
+
retweeted_status.user.verified
|
124
|
+
retweeted_status.user.notifications
|
125
|
+
retweeted_status.user.profile_sidebar_border_color
|
126
|
+
retweeted_status.user.screen_name
|
127
|
+
retweeted_status.user.lang
|
128
|
+
retweeted_status.user.favourites_count
|
129
|
+
retweeted_status.user.contributors_enabled
|
130
|
+
retweeted_status.user.profile_use_background_image
|
131
|
+
retweeted_status.user.friends_count
|
132
|
+
retweeted_status.user.location
|
133
|
+
retweeted_status.user.profile_text_color
|
134
|
+
retweeted_status.user.followers_count
|
135
|
+
retweeted_status.user.profile_image_url
|
136
|
+
retweeted_status.user.description
|
137
|
+
retweeted_status.user.statuses_count
|
138
|
+
retweeted_status.user.following
|
139
|
+
retweeted_status.user.profile_background_image_url
|
140
|
+
retweeted_status.user.show_all_inline_media
|
141
|
+
retweeted_status.user.listed_count
|
142
|
+
retweeted_status.user.profile_link_color
|
143
|
+
retweeted_status.user.is_translator
|
144
|
+
retweeted_status.user.default_profile
|
145
|
+
retweeted_status.user.time_zone
|
146
|
+
retweeted_status.user.profile_background_color
|
147
|
+
retweeted_status.user.protected
|
148
|
+
retweeted_status.user.id_str
|
149
|
+
retweeted_status.user.geo_enabled
|
150
|
+
retweeted_status.user.profile_background_tile
|
151
|
+
retweeted_status.user.name
|
152
|
+
retweeted_status.user.profile_background_image_url_https
|
153
|
+
retweeted_status.user.created_at
|
154
|
+
retweeted_status.user.profile_sidebar_fill_color
|
155
|
+
retweeted_status.user.id
|
156
|
+
retweeted_status.user.follow_request_sent
|
157
|
+
retweeted_status.user.utc_offset
|
158
|
+
retweeted_status.user.url
|
159
|
+
retweeted_status.user.profile_image_url_https
|
160
|
+
retweeted_status.truncated
|
161
|
+
retweeted_status.id
|
162
|
+
retweeted_status.created_at
|
163
|
+
retweeted_status.in_reply_to_user_id_str
|
164
|
+
possibly_sensitive
|
165
|
+
possibly_sensitive_editable
|
166
|
+
retweeted_status.possibly_sensitive
|
167
|
+
retweeted_status.possibly_sensitive_editable
|
168
|
+
place.country_code
|
169
|
+
place.place_type
|
170
|
+
place.country
|
171
|
+
place.bounding_box.type
|
172
|
+
place.bounding_box.coordinates
|
173
|
+
place.full_name
|
174
|
+
place.name
|
175
|
+
place.id
|
176
|
+
place.url
|
177
|
+
coordinates.type
|
178
|
+
coordinates.coordinates
|
179
|
+
geo.type
|
180
|
+
geo.coordinates
|
181
|
+
retweeted_status.entities.media
|
182
|
+
entities.media
|
data/bin/twitter_to_csv
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
require 'rubygems'
|
3
3
|
require 'open-uri'
|
4
4
|
require 'optparse'
|
5
5
|
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
|
6
6
|
|
7
|
-
options = { :csv => STDOUT, :fields => %w[text
|
7
|
+
options = { :csv => STDOUT, :fields => %w[text] }
|
8
8
|
parser = OptionParser.new do |opts|
|
9
9
|
opts.banner = "Usage: #{File.basename($0)} [options]"
|
10
10
|
opts.separator ""
|
@@ -18,20 +18,20 @@ parser = OptionParser.new do |opts|
|
|
18
18
|
options[:password] = password
|
19
19
|
end
|
20
20
|
|
21
|
-
opts.on("-c", "--csv
|
21
|
+
opts.on("-c", "--csv FILE", "The CSV file to append to") do |csv|
|
22
22
|
options[:csv_appending] = File.exists?(csv)
|
23
23
|
options[:csv] = File.open(csv, 'a')
|
24
24
|
end
|
25
|
-
|
26
|
-
opts.on("-j", "--json
|
25
|
+
|
26
|
+
opts.on("-j", "--json FILE", "The JSON file to append to") do |json|
|
27
27
|
options[:json] = File.open(json, 'a')
|
28
28
|
end
|
29
29
|
|
30
|
-
opts.on("-f", "--filter
|
30
|
+
opts.on("-f", "--filter KEYWORDS", "Keywords to ask Twitter to filter on") do |filter|
|
31
31
|
options[:filter] = filter.split(/\s*,\s*/)
|
32
32
|
end
|
33
33
|
|
34
|
-
opts.on("-x", "--fields
|
34
|
+
opts.on("-x", "--fields FIELDS", "Fields to include in the CSV") do |fields|
|
35
35
|
options[:fields] = fields.split(/\s*,\s*/)
|
36
36
|
end
|
37
37
|
|
@@ -43,19 +43,23 @@ parser = OptionParser.new do |opts|
|
|
43
43
|
options[:verbose] = v
|
44
44
|
end
|
45
45
|
|
46
|
-
opts.on_tail(
|
46
|
+
opts.on_tail(nil, "--sample-fields NUMBER_OF_SAMPLES", "Record NUMBER_OF_SAMPLES tweets and then print out all","of the field names seen. Use to find out what can be passed to.") do |samples|
|
47
47
|
options[:sample_fields] = samples && samples.to_i
|
48
48
|
end
|
49
49
|
|
50
|
-
opts.on_tail("", "--replay-from-file FILENAME", "Replay
|
50
|
+
opts.on_tail("", "--replay-from-file FILENAME", "Replay tweets from a JSON dump file") do |replay_file|
|
51
51
|
options[:replay_from_file] = replay_file
|
52
52
|
end
|
53
53
|
|
54
|
+
opts.on_tail("", "--url-columns NUMBER_OF_COLUMNS", "Extract up to NUMBER_OF_COLUMNS urls from the status and include them in the CSV") do |url_columns|
|
55
|
+
options[:url_columns] = url_columns.to_i
|
56
|
+
end
|
57
|
+
|
54
58
|
opts.on_tail("-h", "--help", "Show this message") do
|
55
59
|
STDERR.puts opts
|
56
60
|
exit
|
57
61
|
end
|
58
|
-
|
62
|
+
|
59
63
|
opts.on_tail("--version", "Show version") do
|
60
64
|
STDERR.puts "twitter_to_csv version #{TwitterToCsv::VERSION}"
|
61
65
|
exit
|
@@ -1,9 +1,13 @@
|
|
1
|
+
# encoding: UTF-8
|
1
2
|
require 'pp'
|
2
3
|
|
3
4
|
module TwitterToCsv
|
4
5
|
class CsvBuilder
|
5
6
|
attr_accessor :options, :sampled_fields
|
6
7
|
|
8
|
+
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
|
9
|
+
URL_REGEX = %r"\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s\`\!\(\)\[\]\{\};:'\".,<>\?«»“”‘’]))"i
|
10
|
+
|
7
11
|
def initialize(options = {})
|
8
12
|
@options = options
|
9
13
|
@sampled_fields = {}
|
@@ -20,6 +24,7 @@ module TwitterToCsv
|
|
20
24
|
handle_status status
|
21
25
|
end
|
22
26
|
rescue SignalException, SystemExit
|
27
|
+
EventMachine::stop_event_loop
|
23
28
|
exit
|
24
29
|
rescue StandardError => e
|
25
30
|
STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
|
@@ -40,7 +45,13 @@ module TwitterToCsv
|
|
40
45
|
end
|
41
46
|
|
42
47
|
def log_csv_header
|
43
|
-
|
48
|
+
header_labels = options[:fields].dup
|
49
|
+
|
50
|
+
if options[:url_columns] && options[:url_columns] > 0
|
51
|
+
options[:url_columns].times { |i| header_labels << "url_#{i+1}" }
|
52
|
+
end
|
53
|
+
|
54
|
+
options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
|
44
55
|
end
|
45
56
|
|
46
57
|
def log_csv(status)
|
@@ -48,8 +59,14 @@ module TwitterToCsv
|
|
48
59
|
field.split(".").inject(status) { |memo, segment|
|
49
60
|
memo && memo[segment]
|
50
61
|
}.to_s
|
51
|
-
end
|
52
|
-
|
62
|
+
end
|
63
|
+
|
64
|
+
if options[:url_columns] && options[:url_columns] > 0
|
65
|
+
urls = status['text'].scan(URL_REGEX).flatten.compact
|
66
|
+
options[:url_columns].times { |i| csv_row << urls[i].to_s }
|
67
|
+
end
|
68
|
+
|
69
|
+
options[:csv].puts csv_row.to_csv(:encoding => 'UTF-8', :force_quotes => true)
|
53
70
|
end
|
54
71
|
|
55
72
|
def replay_from(filename)
|
data/spec/csv_builder_spec.rb
CHANGED
@@ -17,6 +17,24 @@ describe TwitterToCsv::CsvBuilder do
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
+
describe "log_csv_header" do
|
21
|
+
it "outputs the fields as header labels" do
|
22
|
+
string_io = StringIO.new
|
23
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something something_else.a])
|
24
|
+
csv_builder.log_csv_header
|
25
|
+
string_io.rewind
|
26
|
+
string_io.read.should == '"something","something_else.a"' + "\n"
|
27
|
+
end
|
28
|
+
|
29
|
+
it "includes urls if requested" do
|
30
|
+
string_io = StringIO.new
|
31
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2)
|
32
|
+
csv_builder.log_csv_header
|
33
|
+
string_io.rewind
|
34
|
+
string_io.read.should == '"something","url_1","url_2"' + "\n"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
20
38
|
describe "logging to a CSV" do
|
21
39
|
it "outputs the requested fields when requested in dot-notation" do
|
22
40
|
string_io = StringIO.new
|
@@ -35,6 +53,22 @@ describe TwitterToCsv::CsvBuilder do
|
|
35
53
|
string_io.rewind
|
36
54
|
string_io.read.should == "\"hello\",\"b\",\"foo\"\n"
|
37
55
|
end
|
56
|
+
|
57
|
+
it "can extract URLs" do
|
58
|
+
string_io = StringIO.new
|
59
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2)
|
60
|
+
csv_builder.handle_status({
|
61
|
+
'something' => "hello",
|
62
|
+
'text' => 'this is http://a.com/url and http://a.com/nother'
|
63
|
+
})
|
64
|
+
csv_builder.handle_status({
|
65
|
+
'something' => "hello",
|
66
|
+
'text' => 'this is http://a.com/url/again'
|
67
|
+
})
|
68
|
+
string_io.rewind
|
69
|
+
string_io.read.should == "\"hello\",\"http://a.com/url\",\"http://a.com/nother\"\n" +
|
70
|
+
"\"hello\",\"http://a.com/url/again\",\"\"\n"
|
71
|
+
end
|
38
72
|
end
|
39
73
|
end
|
40
74
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_to_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-28 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70236843171320 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70236843171320
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: twitter-stream
|
27
|
-
requirement: &
|
27
|
+
requirement: &70236843170900 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70236843170900
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: em-http-request
|
38
|
-
requirement: &
|
38
|
+
requirement: &70236843170480 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70236843170480
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: unsupervised-language-detection
|
49
|
-
requirement: &
|
49
|
+
requirement: &70236843170060 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70236843170060
|
58
58
|
description: ''
|
59
59
|
email:
|
60
60
|
- andrew@iterationlabs.com
|
@@ -97,8 +97,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
97
|
version: '0'
|
98
98
|
requirements: []
|
99
99
|
rubyforge_project: twitter_to_csv
|
100
|
-
rubygems_version: 1.8.
|
100
|
+
rubygems_version: 1.8.6
|
101
101
|
signing_key:
|
102
102
|
specification_version: 3
|
103
103
|
summary: Dump the Twitter streaming API to a CSV or JSON file
|
104
|
-
test_files:
|
104
|
+
test_files:
|
105
|
+
- spec/csv_builder_spec.rb
|
106
|
+
- spec/spec_helper.rb
|