twitter_to_csv 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +177 -7
- data/bin/twitter_to_csv +14 -10
- data/lib/twitter_to_csv/csv_builder.rb +20 -3
- data/lib/twitter_to_csv/version.rb +1 -1
- data/spec/csv_builder_spec.rb +34 -0
- metadata +14 -12
data/README.markdown
CHANGED
@@ -1,12 +1,182 @@
|
|
1
1
|
# Twitter To CSV
|
2
2
|
|
3
|
+
A tool for exporting the Twitter stream into a CSV file.
|
4
|
+
|
5
|
+
(sudo) gem install twitter_to_csv
|
6
|
+
|
3
7
|
## Usage
|
4
8
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
You might start by first running the script for a while to dump the Twitter stream into a JSON file:
|
10
|
+
|
11
|
+
twitter_to_csv --username <your twitter username> --password <your twitter password> \
|
12
|
+
--json out.json --filter your,keywords,of,interest
|
13
|
+
|
14
|
+
Then, later, you could export to CSV:
|
15
|
+
|
16
|
+
twitter_to_csv --replay-from-file out.json --csv out.csv \
|
17
|
+
--fields text,created_at,user.name,retweeted_status.id,retweeted_status.favorited,...
|
18
|
+
|
19
|
+
Alternatively, you can always stream directly to CSV:
|
20
|
+
|
21
|
+
twitter_to_csv --username <your twitter username> --password <your twitter password> \
|
22
|
+
--filter your,keywords,of,interest --csv out.csv \
|
23
|
+
--fields text,created_at,user.name,retweeted_status.id,retweeted_status.favorited,...
|
24
|
+
|
25
|
+
## Requiring English
|
26
|
+
|
27
|
+
You may want to limit to Tweets that appear to be written in English.
|
28
|
+
|
29
|
+
twitter_to_csv --username <your twitter username> --password <your twitter password> \
|
30
|
+
--require-english --fields ...
|
31
|
+
|
32
|
+
This filter isn't perfect and will have both false positives and false negatives, but it works pretty well.
|
33
|
+
|
34
|
+
## URLs
|
35
|
+
|
36
|
+
You can extract URLs from the tweet into their own columns by including `--url-columns 3`, for example, to get up to 3 extracted URLs in their own columns.
|
37
|
+
|
38
|
+
## Field names
|
39
|
+
|
40
|
+
Use `--sample-fields 1000` to output the occurrence count of different Twitter fields, like so:
|
41
|
+
|
42
|
+
twitter_to_csv --username <your twitter username> --password <your twitter password> --sample-fields 1000
|
43
|
+
|
44
|
+
Here's a partial list:
|
11
45
|
|
12
|
-
|
46
|
+
in_reply_to_screen_name
|
47
|
+
favorited
|
48
|
+
text
|
49
|
+
entities.urls
|
50
|
+
entities.user_mentions
|
51
|
+
entities.hashtags
|
52
|
+
in_reply_to_user_id
|
53
|
+
contributors
|
54
|
+
place
|
55
|
+
coordinates
|
56
|
+
source
|
57
|
+
geo
|
58
|
+
retweeted
|
59
|
+
retweet_count
|
60
|
+
in_reply_to_status_id
|
61
|
+
in_reply_to_status_id_str
|
62
|
+
id_str
|
63
|
+
user.default_profile_image
|
64
|
+
user.verified
|
65
|
+
user.notifications
|
66
|
+
user.profile_sidebar_border_color
|
67
|
+
user.screen_name
|
68
|
+
user.lang
|
69
|
+
user.favourites_count
|
70
|
+
user.contributors_enabled
|
71
|
+
user.profile_use_background_image
|
72
|
+
user.friends_count
|
73
|
+
user.location
|
74
|
+
user.profile_text_color
|
75
|
+
user.followers_count
|
76
|
+
user.profile_image_url
|
77
|
+
user.description
|
78
|
+
user.statuses_count
|
79
|
+
user.following
|
80
|
+
user.profile_background_image_url
|
81
|
+
user.show_all_inline_media
|
82
|
+
user.listed_count
|
83
|
+
user.profile_link_color
|
84
|
+
user.is_translator
|
85
|
+
user.default_profile
|
86
|
+
user.time_zone
|
87
|
+
user.profile_background_color
|
88
|
+
user.protected
|
89
|
+
user.id_str
|
90
|
+
user.geo_enabled
|
91
|
+
user.profile_background_tile
|
92
|
+
user.name
|
93
|
+
user.profile_background_image_url_https
|
94
|
+
user.created_at
|
95
|
+
user.profile_sidebar_fill_color
|
96
|
+
user.id
|
97
|
+
user.follow_request_sent
|
98
|
+
user.utc_offset
|
99
|
+
user.url
|
100
|
+
user.profile_image_url_https
|
101
|
+
truncated
|
102
|
+
id
|
103
|
+
created_at
|
104
|
+
in_reply_to_user_id_str
|
105
|
+
retweeted_status.in_reply_to_screen_name
|
106
|
+
retweeted_status.favorited
|
107
|
+
retweeted_status.text
|
108
|
+
retweeted_status.entities.urls
|
109
|
+
retweeted_status.entities.user_mentions
|
110
|
+
retweeted_status.entities.hashtags
|
111
|
+
retweeted_status.in_reply_to_user_id
|
112
|
+
retweeted_status.contributors
|
113
|
+
retweeted_status.place
|
114
|
+
retweeted_status.coordinates
|
115
|
+
retweeted_status.source
|
116
|
+
retweeted_status.geo
|
117
|
+
retweeted_status.retweeted
|
118
|
+
retweeted_status.retweet_count
|
119
|
+
retweeted_status.in_reply_to_status_id
|
120
|
+
retweeted_status.in_reply_to_status_id_str
|
121
|
+
retweeted_status.id_str
|
122
|
+
retweeted_status.user.default_profile_image
|
123
|
+
retweeted_status.user.verified
|
124
|
+
retweeted_status.user.notifications
|
125
|
+
retweeted_status.user.profile_sidebar_border_color
|
126
|
+
retweeted_status.user.screen_name
|
127
|
+
retweeted_status.user.lang
|
128
|
+
retweeted_status.user.favourites_count
|
129
|
+
retweeted_status.user.contributors_enabled
|
130
|
+
retweeted_status.user.profile_use_background_image
|
131
|
+
retweeted_status.user.friends_count
|
132
|
+
retweeted_status.user.location
|
133
|
+
retweeted_status.user.profile_text_color
|
134
|
+
retweeted_status.user.followers_count
|
135
|
+
retweeted_status.user.profile_image_url
|
136
|
+
retweeted_status.user.description
|
137
|
+
retweeted_status.user.statuses_count
|
138
|
+
retweeted_status.user.following
|
139
|
+
retweeted_status.user.profile_background_image_url
|
140
|
+
retweeted_status.user.show_all_inline_media
|
141
|
+
retweeted_status.user.listed_count
|
142
|
+
retweeted_status.user.profile_link_color
|
143
|
+
retweeted_status.user.is_translator
|
144
|
+
retweeted_status.user.default_profile
|
145
|
+
retweeted_status.user.time_zone
|
146
|
+
retweeted_status.user.profile_background_color
|
147
|
+
retweeted_status.user.protected
|
148
|
+
retweeted_status.user.id_str
|
149
|
+
retweeted_status.user.geo_enabled
|
150
|
+
retweeted_status.user.profile_background_tile
|
151
|
+
retweeted_status.user.name
|
152
|
+
retweeted_status.user.profile_background_image_url_https
|
153
|
+
retweeted_status.user.created_at
|
154
|
+
retweeted_status.user.profile_sidebar_fill_color
|
155
|
+
retweeted_status.user.id
|
156
|
+
retweeted_status.user.follow_request_sent
|
157
|
+
retweeted_status.user.utc_offset
|
158
|
+
retweeted_status.user.url
|
159
|
+
retweeted_status.user.profile_image_url_https
|
160
|
+
retweeted_status.truncated
|
161
|
+
retweeted_status.id
|
162
|
+
retweeted_status.created_at
|
163
|
+
retweeted_status.in_reply_to_user_id_str
|
164
|
+
possibly_sensitive
|
165
|
+
possibly_sensitive_editable
|
166
|
+
retweeted_status.possibly_sensitive
|
167
|
+
retweeted_status.possibly_sensitive_editable
|
168
|
+
place.country_code
|
169
|
+
place.place_type
|
170
|
+
place.country
|
171
|
+
place.bounding_box.type
|
172
|
+
place.bounding_box.coordinates
|
173
|
+
place.full_name
|
174
|
+
place.name
|
175
|
+
place.id
|
176
|
+
place.url
|
177
|
+
coordinates.type
|
178
|
+
coordinates.coordinates
|
179
|
+
geo.type
|
180
|
+
geo.coordinates
|
181
|
+
retweeted_status.entities.media
|
182
|
+
entities.media
|
data/bin/twitter_to_csv
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
require 'rubygems'
|
3
3
|
require 'open-uri'
|
4
4
|
require 'optparse'
|
5
5
|
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))
|
6
6
|
|
7
|
-
options = { :csv => STDOUT, :fields => %w[text
|
7
|
+
options = { :csv => STDOUT, :fields => %w[text] }
|
8
8
|
parser = OptionParser.new do |opts|
|
9
9
|
opts.banner = "Usage: #{File.basename($0)} [options]"
|
10
10
|
opts.separator ""
|
@@ -18,20 +18,20 @@ parser = OptionParser.new do |opts|
|
|
18
18
|
options[:password] = password
|
19
19
|
end
|
20
20
|
|
21
|
-
opts.on("-c", "--csv
|
21
|
+
opts.on("-c", "--csv FILE", "The CSV file to append to") do |csv|
|
22
22
|
options[:csv_appending] = File.exists?(csv)
|
23
23
|
options[:csv] = File.open(csv, 'a')
|
24
24
|
end
|
25
|
-
|
26
|
-
opts.on("-j", "--json
|
25
|
+
|
26
|
+
opts.on("-j", "--json FILE", "The JSON file to append to") do |json|
|
27
27
|
options[:json] = File.open(json, 'a')
|
28
28
|
end
|
29
29
|
|
30
|
-
opts.on("-f", "--filter
|
30
|
+
opts.on("-f", "--filter KEYWORDS", "Keywords to ask Twitter to filter on") do |filter|
|
31
31
|
options[:filter] = filter.split(/\s*,\s*/)
|
32
32
|
end
|
33
33
|
|
34
|
-
opts.on("-x", "--fields
|
34
|
+
opts.on("-x", "--fields FIELDS", "Fields to include in the CSV") do |fields|
|
35
35
|
options[:fields] = fields.split(/\s*,\s*/)
|
36
36
|
end
|
37
37
|
|
@@ -43,19 +43,23 @@ parser = OptionParser.new do |opts|
|
|
43
43
|
options[:verbose] = v
|
44
44
|
end
|
45
45
|
|
46
|
-
opts.on_tail(
|
46
|
+
opts.on_tail(nil, "--sample-fields NUMBER_OF_SAMPLES", "Record NUMBER_OF_SAMPLES tweets and then print out all","of the field names seen. Use to find out what can be passed to.") do |samples|
|
47
47
|
options[:sample_fields] = samples && samples.to_i
|
48
48
|
end
|
49
49
|
|
50
|
-
opts.on_tail("", "--replay-from-file FILENAME", "Replay
|
50
|
+
opts.on_tail("", "--replay-from-file FILENAME", "Replay tweets from a JSON dump file") do |replay_file|
|
51
51
|
options[:replay_from_file] = replay_file
|
52
52
|
end
|
53
53
|
|
54
|
+
opts.on_tail("", "--url-columns NUMBER_OF_COLUMNS", "Extract up to NUMBER_OF_COLUMNS urls from the status and include them in the CSV") do |url_columns|
|
55
|
+
options[:url_columns] = url_columns.to_i
|
56
|
+
end
|
57
|
+
|
54
58
|
opts.on_tail("-h", "--help", "Show this message") do
|
55
59
|
STDERR.puts opts
|
56
60
|
exit
|
57
61
|
end
|
58
|
-
|
62
|
+
|
59
63
|
opts.on_tail("--version", "Show version") do
|
60
64
|
STDERR.puts "twitter_to_csv version #{TwitterToCsv::VERSION}"
|
61
65
|
exit
|
@@ -1,9 +1,13 @@
|
|
1
|
+
# encoding: UTF-8
|
1
2
|
require 'pp'
|
2
3
|
|
3
4
|
module TwitterToCsv
|
4
5
|
class CsvBuilder
|
5
6
|
attr_accessor :options, :sampled_fields
|
6
7
|
|
8
|
+
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
|
9
|
+
URL_REGEX = %r"\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s\`\!\(\)\[\]\{\};:'\".,<>\?«»“”‘’]))"i
|
10
|
+
|
7
11
|
def initialize(options = {})
|
8
12
|
@options = options
|
9
13
|
@sampled_fields = {}
|
@@ -20,6 +24,7 @@ module TwitterToCsv
|
|
20
24
|
handle_status status
|
21
25
|
end
|
22
26
|
rescue SignalException, SystemExit
|
27
|
+
EventMachine::stop_event_loop
|
23
28
|
exit
|
24
29
|
rescue StandardError => e
|
25
30
|
STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
|
@@ -40,7 +45,13 @@ module TwitterToCsv
|
|
40
45
|
end
|
41
46
|
|
42
47
|
def log_csv_header
|
43
|
-
|
48
|
+
header_labels = options[:fields].dup
|
49
|
+
|
50
|
+
if options[:url_columns] && options[:url_columns] > 0
|
51
|
+
options[:url_columns].times { |i| header_labels << "url_#{i+1}" }
|
52
|
+
end
|
53
|
+
|
54
|
+
options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
|
44
55
|
end
|
45
56
|
|
46
57
|
def log_csv(status)
|
@@ -48,8 +59,14 @@ module TwitterToCsv
|
|
48
59
|
field.split(".").inject(status) { |memo, segment|
|
49
60
|
memo && memo[segment]
|
50
61
|
}.to_s
|
51
|
-
end
|
52
|
-
|
62
|
+
end
|
63
|
+
|
64
|
+
if options[:url_columns] && options[:url_columns] > 0
|
65
|
+
urls = status['text'].scan(URL_REGEX).flatten.compact
|
66
|
+
options[:url_columns].times { |i| csv_row << urls[i].to_s }
|
67
|
+
end
|
68
|
+
|
69
|
+
options[:csv].puts csv_row.to_csv(:encoding => 'UTF-8', :force_quotes => true)
|
53
70
|
end
|
54
71
|
|
55
72
|
def replay_from(filename)
|
data/spec/csv_builder_spec.rb
CHANGED
@@ -17,6 +17,24 @@ describe TwitterToCsv::CsvBuilder do
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
+
describe "log_csv_header" do
|
21
|
+
it "outputs the fields as header labels" do
|
22
|
+
string_io = StringIO.new
|
23
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something something_else.a])
|
24
|
+
csv_builder.log_csv_header
|
25
|
+
string_io.rewind
|
26
|
+
string_io.read.should == '"something","something_else.a"' + "\n"
|
27
|
+
end
|
28
|
+
|
29
|
+
it "includes urls if requested" do
|
30
|
+
string_io = StringIO.new
|
31
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2)
|
32
|
+
csv_builder.log_csv_header
|
33
|
+
string_io.rewind
|
34
|
+
string_io.read.should == '"something","url_1","url_2"' + "\n"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
20
38
|
describe "logging to a CSV" do
|
21
39
|
it "outputs the requested fields when requested in dot-notation" do
|
22
40
|
string_io = StringIO.new
|
@@ -35,6 +53,22 @@ describe TwitterToCsv::CsvBuilder do
|
|
35
53
|
string_io.rewind
|
36
54
|
string_io.read.should == "\"hello\",\"b\",\"foo\"\n"
|
37
55
|
end
|
56
|
+
|
57
|
+
it "can extract URLs" do
|
58
|
+
string_io = StringIO.new
|
59
|
+
csv_builder = TwitterToCsv::CsvBuilder.new(:csv => string_io, :fields => %w[something], :url_columns => 2)
|
60
|
+
csv_builder.handle_status({
|
61
|
+
'something' => "hello",
|
62
|
+
'text' => 'this is http://a.com/url and http://a.com/nother'
|
63
|
+
})
|
64
|
+
csv_builder.handle_status({
|
65
|
+
'something' => "hello",
|
66
|
+
'text' => 'this is http://a.com/url/again'
|
67
|
+
})
|
68
|
+
string_io.rewind
|
69
|
+
string_io.read.should == "\"hello\",\"http://a.com/url\",\"http://a.com/nother\"\n" +
|
70
|
+
"\"hello\",\"http://a.com/url/again\",\"\"\n"
|
71
|
+
end
|
38
72
|
end
|
39
73
|
end
|
40
74
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_to_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-28 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70236843171320 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70236843171320
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: twitter-stream
|
27
|
-
requirement: &
|
27
|
+
requirement: &70236843170900 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70236843170900
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: em-http-request
|
38
|
-
requirement: &
|
38
|
+
requirement: &70236843170480 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70236843170480
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: unsupervised-language-detection
|
49
|
-
requirement: &
|
49
|
+
requirement: &70236843170060 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70236843170060
|
58
58
|
description: ''
|
59
59
|
email:
|
60
60
|
- andrew@iterationlabs.com
|
@@ -97,8 +97,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
97
|
version: '0'
|
98
98
|
requirements: []
|
99
99
|
rubyforge_project: twitter_to_csv
|
100
|
-
rubygems_version: 1.8.
|
100
|
+
rubygems_version: 1.8.6
|
101
101
|
signing_key:
|
102
102
|
specification_version: 3
|
103
103
|
summary: Dump the Twitter streaming API to a CSV or JSON file
|
104
|
-
test_files:
|
104
|
+
test_files:
|
105
|
+
- spec/csv_builder_spec.rb
|
106
|
+
- spec/spec_helper.rb
|