twitterscraper-ruby 0.15.2 → 0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +1 -1
- data/README.md +4 -0
- data/lib/twitterscraper/cache.rb +13 -1
- data/lib/twitterscraper/cli.rb +25 -12
- data/lib/twitterscraper/client.rb +27 -1
- data/lib/twitterscraper/query.rb +94 -68
- data/lib/twitterscraper/template.rb +52 -42
- data/lib/twitterscraper/template/tweets.html.erb +112 -0
- data/lib/twitterscraper/tweet.rb +9 -0
- data/lib/twitterscraper/type.rb +4 -0
- data/lib/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 73a9e9108284fc79cf5ec6b36b6f7ad3f83f2b4f03a2bc527dc18cb4b33e83c7
|
4
|
+
data.tar.gz: c7fcfdbdd1d808780c56610be9b8717352c812759b9344d9fa87cbd430a8d8e2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1019547fe8c37a1bb5b4a9cd96a2737a14491087075ff448b48f72538758337c76ab513e153d4567454b192d30fafaa374913ae0c3548d7802e7bdd478fe4a2f
|
7
|
+
data.tar.gz: 48134e8b6858154850003da8684d3c8b7f124cab6d19e0ce76d05326dc8fef44694b32211e245509993e8b7b1afafa6d95914b05c66b9c95c54bb27d041983fe
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -98,6 +98,7 @@ end
|
|
98
98
|
"screen_name": "@name",
|
99
99
|
"name": "Name",
|
100
100
|
"user_id": 12340000,
|
101
|
+
"profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
|
101
102
|
"tweet_id": 1234000000000000,
|
102
103
|
"text": "Thanks Twitter!",
|
103
104
|
"links": [],
|
@@ -122,6 +123,7 @@ end
|
|
122
123
|
- screen_name
|
123
124
|
- name
|
124
125
|
- user_id
|
126
|
+
- profile_image_url
|
125
127
|
- tweet_id
|
126
128
|
- text
|
127
129
|
- links
|
@@ -173,6 +175,8 @@ Search operators documentation is in [Standard search operators](https://develop
|
|
173
175
|
| `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
174
176
|
| `--order` | string | Sort order of the results. | desc(default) or asc |
|
175
177
|
| `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
178
|
+
| `--threads_granularity` | string | Granularity (day or hour) used to split the date range into separate queries. | auto |
|
179
|
+
| `--chart_grouping` | string | Interval (day, hour or minute) used to group tweet counts in the HTML chart. | auto |
|
176
180
|
| `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
|
177
181
|
| `--cache` | boolean | Enable caching. | true(default) or false |
|
178
182
|
| `--format` | string | The format of the output. | json(default) or html |
|
data/lib/twitterscraper/cache.rb
CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
|
|
4
4
|
module Twitterscraper
|
5
5
|
class Cache
|
6
6
|
def initialize()
|
7
|
-
@ttl =
|
7
|
+
@ttl = 86400 * 3 # 3 day
|
8
8
|
@dir = 'cache'
|
9
9
|
Dir.mkdir(@dir) unless File.exist?(@dir)
|
10
10
|
end
|
@@ -25,6 +25,18 @@ module Twitterscraper
|
|
25
25
|
File.write(file, entry.to_json)
|
26
26
|
end
|
27
27
|
|
28
|
+
def exist?(key)
|
29
|
+
key = cache_key(key)
|
30
|
+
file = File.join(@dir, key)
|
31
|
+
File.exist?(file)
|
32
|
+
end
|
33
|
+
|
34
|
+
def delete(key)
|
35
|
+
key = cache_key(key)
|
36
|
+
file = File.join(@dir, key)
|
37
|
+
File.delete(file) if File.exist?(file)
|
38
|
+
end
|
39
|
+
|
28
40
|
def fetch(key, &block)
|
29
41
|
if (value = read(key))
|
30
42
|
value
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -24,21 +24,25 @@ module Twitterscraper
|
|
24
24
|
daily_limit: options['daily_limit'],
|
25
25
|
order: options['order'],
|
26
26
|
threads: options['threads'],
|
27
|
+
threads_granularity: options['threads_granularity'],
|
27
28
|
}
|
28
29
|
client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
|
29
30
|
tweets = client.query_tweets(options['query'], query_options)
|
30
|
-
export(tweets) unless tweets.empty?
|
31
|
+
export(options['query'], tweets) unless tweets.empty?
|
31
32
|
end
|
32
33
|
|
33
|
-
def export(tweets)
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
34
|
+
def export(name, tweets)
|
35
|
+
options['format'].split(',').map(&:strip).each do |format|
|
36
|
+
file = build_output_name(format, options)
|
37
|
+
Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
|
38
|
+
|
39
|
+
if format == 'json'
|
40
|
+
File.write(file, generate_json(tweets))
|
41
|
+
elsif format == 'html'
|
42
|
+
File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
|
43
|
+
else
|
44
|
+
puts "Invalid format #{format}"
|
45
|
+
end
|
42
46
|
end
|
43
47
|
end
|
44
48
|
|
@@ -69,6 +73,8 @@ module Twitterscraper
|
|
69
73
|
'daily_limit:',
|
70
74
|
'order:',
|
71
75
|
'threads:',
|
76
|
+
'threads_granularity:',
|
77
|
+
'chart_grouping:',
|
72
78
|
'output:',
|
73
79
|
'format:',
|
74
80
|
'cache:',
|
@@ -82,10 +88,10 @@ module Twitterscraper
|
|
82
88
|
options['lang'] ||= ''
|
83
89
|
options['limit'] = (options['limit'] || 100).to_i
|
84
90
|
options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
|
85
|
-
options['threads'] = (options['threads'] ||
|
91
|
+
options['threads'] = (options['threads'] || 10).to_i
|
92
|
+
options['threads_granularity'] ||= 'auto'
|
86
93
|
options['format'] ||= 'json'
|
87
94
|
options['order'] ||= 'desc'
|
88
|
-
options['output'] ||= "tweets.#{options['format']}"
|
89
95
|
|
90
96
|
options['cache'] = options['cache'] != 'false'
|
91
97
|
options['proxy'] = options['proxy'] != 'false'
|
@@ -93,6 +99,13 @@ module Twitterscraper
|
|
93
99
|
options
|
94
100
|
end
|
95
101
|
|
102
|
+
def build_output_name(format, options)
|
103
|
+
query = options['query'].gsub(/[ :?#&]/, '_')
|
104
|
+
date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
|
105
|
+
file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
|
106
|
+
File.join('out', file)
|
107
|
+
end
|
108
|
+
|
96
109
|
def initialize_logger
|
97
110
|
Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
|
98
111
|
end
|
@@ -2,9 +2,31 @@ module Twitterscraper
|
|
2
2
|
class Client
|
3
3
|
include Query
|
4
4
|
|
5
|
+
USER_AGENT_LIST = [
|
6
|
+
'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
|
7
|
+
'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
|
8
|
+
'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
|
9
|
+
'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
|
10
|
+
'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
|
11
|
+
]
|
12
|
+
|
5
13
|
def initialize(cache: true, proxy: true)
|
14
|
+
@request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
15
|
+
Twitterscraper.logger.info "Headers #{@request_headers}"
|
16
|
+
|
6
17
|
@cache = cache
|
7
|
-
|
18
|
+
|
19
|
+
if (@proxy = proxy)
|
20
|
+
@proxies = Proxy::Pool.new
|
21
|
+
Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
|
22
|
+
else
|
23
|
+
@proxies = []
|
24
|
+
Twitterscraper.logger.debug 'Proxy disabled'
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def request_headers
|
29
|
+
@request_headers
|
8
30
|
end
|
9
31
|
|
10
32
|
def cache_enabled?
|
@@ -14,5 +36,9 @@ module Twitterscraper
|
|
14
36
|
def proxy_enabled?
|
15
37
|
@proxy
|
16
38
|
end
|
39
|
+
|
40
|
+
def proxies
|
41
|
+
@proxies
|
42
|
+
end
|
17
43
|
end
|
18
44
|
end
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -10,14 +10,6 @@ module Twitterscraper
|
|
10
10
|
module Query
|
11
11
|
include Logger
|
12
12
|
|
13
|
-
USER_AGENT_LIST = [
|
14
|
-
'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
|
15
|
-
'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
|
16
|
-
'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
|
17
|
-
'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
|
18
|
-
'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
|
19
|
-
]
|
20
|
-
|
21
13
|
INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
|
22
14
|
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
|
23
15
|
'default&include_available_features=1&include_entities=1&' +
|
@@ -43,13 +35,13 @@ module Twitterscraper
|
|
43
35
|
end
|
44
36
|
end
|
45
37
|
|
46
|
-
def get_single_page(url,
|
38
|
+
def get_single_page(url, timeout = 6, retries = 30)
|
47
39
|
return nil if stop_requested?
|
48
|
-
|
40
|
+
if proxy_enabled?
|
49
41
|
proxy = proxies.sample
|
50
42
|
logger.info("Using proxy #{proxy}")
|
51
43
|
end
|
52
|
-
Http.get(url,
|
44
|
+
Http.get(url, request_headers, proxy, timeout)
|
53
45
|
rescue => e
|
54
46
|
logger.debug "get_single_page: #{e.inspect}"
|
55
47
|
if (retries -= 1) > 0
|
@@ -69,30 +61,33 @@ module Twitterscraper
|
|
69
61
|
else
|
70
62
|
json_resp = JSON.parse(text)
|
71
63
|
items_html = json_resp['items_html'] || ''
|
72
|
-
logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
|
73
64
|
end
|
74
65
|
|
75
66
|
[items_html, json_resp]
|
76
67
|
end
|
77
68
|
|
78
|
-
def query_single_page(query, lang, type, pos
|
69
|
+
def query_single_page(query, lang, type, pos)
|
79
70
|
logger.info "Querying #{query}"
|
80
|
-
|
71
|
+
encoded_query = ERB::Util.url_encode(query)
|
81
72
|
|
82
|
-
url = build_query_url(
|
73
|
+
url = build_query_url(encoded_query, lang, type, pos)
|
83
74
|
http_request = lambda do
|
84
|
-
logger.debug "Scraping tweets from
|
85
|
-
get_single_page(url
|
75
|
+
logger.debug "Scraping tweets from url=#{url}"
|
76
|
+
get_single_page(url)
|
86
77
|
end
|
87
78
|
|
88
79
|
if cache_enabled?
|
89
80
|
client = Cache.new
|
90
81
|
if (response = client.read(url))
|
91
|
-
logger.debug
|
82
|
+
logger.debug "Fetching tweets from cache url=#{url}"
|
92
83
|
else
|
93
84
|
response = http_request.call
|
94
85
|
client.write(url, response) unless stop_requested?
|
95
86
|
end
|
87
|
+
if @queries && query == @queries.last && pos.nil?
|
88
|
+
logger.debug "Delete a cache query=#{query}"
|
89
|
+
client.delete(url)
|
90
|
+
end
|
96
91
|
else
|
97
92
|
response = http_request.call
|
98
93
|
end
|
@@ -100,6 +95,12 @@ module Twitterscraper
|
|
100
95
|
|
101
96
|
html, json_resp = parse_single_page(response, pos.nil?)
|
102
97
|
|
98
|
+
if json_resp && json_resp['message']
|
99
|
+
logger.warn json_resp['message'] # Sorry, you are rate limited.
|
100
|
+
@stop_requested = true
|
101
|
+
Cache.new.delete(url) if cache_enabled?
|
102
|
+
end
|
103
|
+
|
103
104
|
tweets = Tweet.from_html(html)
|
104
105
|
|
105
106
|
if tweets.empty?
|
@@ -130,126 +131,151 @@ module Twitterscraper
|
|
130
131
|
if start_date && end_date
|
131
132
|
if start_date == end_date
|
132
133
|
raise Error.new('Please specify different values for :start_date and :end_date.')
|
133
|
-
elsif start_date > end_date
|
134
|
+
elsif Date.parse(start_date) > Date.parse(end_date)
|
134
135
|
raise Error.new(':start_date must occur before :end_date.')
|
135
136
|
end
|
136
137
|
end
|
137
138
|
|
138
139
|
if start_date
|
139
|
-
if start_date < OLDEST_DATE
|
140
|
+
if Date.parse(start_date) < OLDEST_DATE
|
140
141
|
raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
|
141
142
|
end
|
142
143
|
end
|
144
|
+
end
|
143
145
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
146
|
+
def build_queries(query, start_date, end_date, threads_granularity, type)
|
147
|
+
if type.search?
|
148
|
+
start_date = Date.parse(start_date) if start_date.is_a?(String)
|
149
|
+
end_date = Date.parse(end_date) if end_date.is_a?(String)
|
150
|
+
elsif type.user?
|
151
|
+
start_date = nil
|
152
|
+
end_date = nil
|
149
153
|
end
|
150
|
-
end
|
151
154
|
|
152
|
-
def build_queries(query, start_date, end_date)
|
153
155
|
if start_date && end_date
|
154
|
-
|
155
|
-
|
156
|
+
if threads_granularity == 'auto'
|
157
|
+
threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
|
158
|
+
end
|
159
|
+
|
160
|
+
if threads_granularity == 'day'
|
161
|
+
date_range = start_date.upto(end_date - 1)
|
162
|
+
queries = date_range.map { |date| query + " since:#{date}_00:00:00_UTC until:#{date + 1}_00:00:00_UTC" }
|
163
|
+
elsif threads_granularity == 'hour'
|
164
|
+
time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
|
165
|
+
end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
|
166
|
+
queries = []
|
167
|
+
|
168
|
+
while true
|
169
|
+
if time < Time.now.utc
|
170
|
+
queries << (query + " since:#{time.strftime('%Y-%m-%d_%H')}:00:00_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H')}:00:00_UTC")
|
171
|
+
end
|
172
|
+
time += 3600
|
173
|
+
break if time >= end_time
|
174
|
+
end
|
175
|
+
else
|
176
|
+
raise Error.new("Invalid :threads_granularity value=#{threads_granularity}")
|
177
|
+
end
|
178
|
+
|
179
|
+
@queries = queries
|
180
|
+
|
156
181
|
elsif start_date
|
157
|
-
[query + " since:#{start_date}"]
|
182
|
+
[query + " since:#{start_date}_00:00:00_UTC"]
|
158
183
|
elsif end_date
|
159
|
-
[query + " until:#{end_date}"]
|
184
|
+
[query + " until:#{end_date}_00:00:00_UTC"]
|
160
185
|
else
|
161
186
|
[query]
|
162
187
|
end
|
163
188
|
end
|
164
189
|
|
165
|
-
def main_loop(query, lang, type, limit, daily_limit
|
190
|
+
def main_loop(query, lang, type, limit, daily_limit)
|
166
191
|
pos = nil
|
167
|
-
|
192
|
+
tmp_tweets = []
|
168
193
|
|
169
194
|
while true
|
170
|
-
new_tweets, new_pos = query_single_page(query, lang, type, pos
|
195
|
+
new_tweets, new_pos = query_single_page(query, lang, type, pos)
|
171
196
|
unless new_tweets.empty?
|
172
|
-
|
173
|
-
|
197
|
+
tmp_tweets.concat(new_tweets)
|
198
|
+
tmp_tweets.uniq! { |t| t.tweet_id }
|
199
|
+
end
|
174
200
|
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
201
|
+
@results_counter[Parallel.worker_number] = tmp_tweets.size
|
202
|
+
total_size = @all_tweets.size + @results_counter.values.sum
|
203
|
+
logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
|
204
|
+
|
205
|
+
if !@stop_requested && total_size >= limit
|
206
|
+
logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
|
207
|
+
@stop_requested = true
|
179
208
|
end
|
180
|
-
logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
|
181
209
|
|
182
210
|
break unless new_pos
|
183
|
-
break if
|
211
|
+
break if @stop_requested
|
212
|
+
break if daily_limit && tmp_tweets.size >= daily_limit
|
184
213
|
break if @all_tweets.size >= limit
|
185
214
|
|
186
215
|
pos = new_pos
|
187
216
|
end
|
188
217
|
|
189
|
-
|
190
|
-
logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
|
191
|
-
@stop_requested = true
|
192
|
-
end
|
218
|
+
tmp_tweets
|
193
219
|
end
|
194
220
|
|
195
221
|
def stop_requested?
|
196
222
|
@stop_requested
|
197
223
|
end
|
198
224
|
|
199
|
-
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads:
|
200
|
-
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
201
|
-
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
202
|
-
queries = build_queries(query, start_date, end_date)
|
225
|
+
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
|
203
226
|
type = Type.new(type)
|
227
|
+
queries = build_queries(query, start_date, end_date, threads_granularity, type)
|
204
228
|
if threads > queries.size
|
205
|
-
logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
|
206
229
|
threads = queries.size
|
207
230
|
end
|
208
|
-
if proxy_enabled?
|
209
|
-
proxies = Proxy::Pool.new
|
210
|
-
logger.debug "Fetch #{proxies.size} proxies"
|
211
|
-
else
|
212
|
-
proxies = []
|
213
|
-
logger.debug 'Proxy disabled'
|
214
|
-
end
|
215
231
|
logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
|
216
232
|
|
217
|
-
|
218
233
|
validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
219
234
|
|
235
|
+
logger.info "The number of queries #{queries.size}"
|
220
236
|
logger.info "The number of threads #{threads}"
|
221
237
|
|
222
|
-
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
223
|
-
logger.info "Headers #{headers}"
|
224
|
-
|
225
238
|
@all_tweets = []
|
226
|
-
@mutex = Mutex.new
|
227
239
|
@stop_requested = false
|
240
|
+
@results_counter = {}
|
228
241
|
|
229
242
|
if threads > 1
|
243
|
+
@mutex = Mutex.new
|
230
244
|
Thread.abort_on_exception = true
|
231
245
|
logger.debug "Set 'Thread.abort_on_exception' to true"
|
232
246
|
|
233
247
|
Parallel.each(queries, in_threads: threads) do |query|
|
234
|
-
|
248
|
+
@results_counter[Parallel.worker_number] = 0
|
249
|
+
tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
|
250
|
+
@mutex.synchronize {
|
251
|
+
@all_tweets.concat(tmp_tweets)
|
252
|
+
@all_tweets.uniq! { |t| t.tweet_id }
|
253
|
+
}
|
254
|
+
@results_counter[Parallel.worker_number] = 0
|
255
|
+
|
235
256
|
raise Parallel::Break if stop_requested?
|
236
257
|
end
|
237
258
|
else
|
238
259
|
queries.each do |query|
|
239
|
-
main_loop(query, lang, type, limit, daily_limit
|
260
|
+
tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
|
261
|
+
@all_tweets.concat(tmp_tweets)
|
262
|
+
@all_tweets.uniq! { |t| t.tweet_id }
|
263
|
+
|
240
264
|
break if stop_requested?
|
241
265
|
end
|
242
266
|
end
|
243
267
|
|
268
|
+
logger.info "Return #{@all_tweets.size} tweets"
|
269
|
+
|
244
270
|
@all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
|
245
271
|
end
|
246
272
|
|
247
|
-
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads:
|
248
|
-
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
|
273
|
+
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
|
274
|
+
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
|
249
275
|
end
|
250
276
|
|
251
277
|
def user_timeline(screen_name, limit: 100, order: 'desc')
|
252
|
-
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
|
278
|
+
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
|
253
279
|
end
|
254
280
|
end
|
255
281
|
end
|
@@ -1,48 +1,58 @@
|
|
1
1
|
module Twitterscraper
|
2
|
-
|
3
|
-
|
2
|
+
class Template
|
3
|
+
def tweets_embedded_html(name, tweets, options)
|
4
|
+
path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
|
5
|
+
template = ERB.new(File.read(path))
|
4
6
|
|
5
|
-
|
6
|
-
|
7
|
-
|
7
|
+
tweets = tweets.sort_by { |t| t.created_at.to_i }
|
8
|
+
grouping = options['chart_grouping'] || 'auto'
|
9
|
+
|
10
|
+
template.result_with_hash(
|
11
|
+
chart_name: name,
|
12
|
+
chart_data: chart_data(tweets, grouping: grouping).to_json,
|
13
|
+
first_tweet: tweets[0],
|
14
|
+
last_tweet: tweets[-1],
|
15
|
+
tweets: tweets,
|
16
|
+
convert_limit: 30,
|
17
|
+
)
|
8
18
|
end
|
9
19
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
20
|
+
def chart_data(tweets, grouping: 'auto')
|
21
|
+
if grouping && tweets.size > 100
|
22
|
+
if grouping == 'auto'
|
23
|
+
month = 28 * 24 * 60 * 60 # 28 days
|
24
|
+
duration = tweets[-1].created_at - tweets[0].created_at
|
25
|
+
|
26
|
+
if duration > 3 * month
|
27
|
+
grouping = 'day'
|
28
|
+
elsif duration > month || tweets.size > 10000
|
29
|
+
grouping = 'hour'
|
30
|
+
else
|
31
|
+
grouping = 'minute'
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
Twitterscraper.logger.info "Chart grouping #{grouping}"
|
37
|
+
|
38
|
+
data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
|
39
|
+
t = tweet.created_at
|
40
|
+
|
41
|
+
if grouping == 'day'
|
42
|
+
time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
|
43
|
+
elsif grouping == 'hour'
|
44
|
+
time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
|
45
|
+
elsif grouping == 'minute'
|
46
|
+
time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
|
47
|
+
else
|
48
|
+
time = t
|
49
|
+
end
|
50
|
+
memo[time.to_i] += 1
|
51
|
+
end
|
52
|
+
|
53
|
+
data.sort_by { |k, _| k }.map do |timestamp, count|
|
54
|
+
[timestamp * 1000, count]
|
55
|
+
end
|
56
|
+
end
|
47
57
|
end
|
48
58
|
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html lang="ja">
|
3
|
+
<head>
|
4
|
+
<meta charset="UTF-8">
|
5
|
+
|
6
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
|
7
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
|
8
|
+
<script src="https://code.highcharts.com/stock/highstock.js"></script>
|
9
|
+
<script>
|
10
|
+
function updateTweets() {
|
11
|
+
window.twttr = (function (d, s, id) {
|
12
|
+
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
13
|
+
if (d.getElementById(id)) return t;
|
14
|
+
js = d.createElement(s);
|
15
|
+
js.id = id;
|
16
|
+
js.src = "https://platform.twitter.com/widgets.js";
|
17
|
+
fjs.parentNode.insertBefore(js, fjs);
|
18
|
+
|
19
|
+
t._e = [];
|
20
|
+
t.ready = function (f) {
|
21
|
+
t._e.push(f);
|
22
|
+
};
|
23
|
+
|
24
|
+
return t;
|
25
|
+
}(document, "script", "twitter-wjs"));
|
26
|
+
}
|
27
|
+
|
28
|
+
function drawChart() {
|
29
|
+
Highcharts.setOptions({
|
30
|
+
time: {
|
31
|
+
timezone: moment.tz.guess()
|
32
|
+
}
|
33
|
+
});
|
34
|
+
|
35
|
+
var data = <%= chart_data %>;
|
36
|
+
var config = {
|
37
|
+
title: {
|
38
|
+
text: '<%= tweets.size %> tweets of <%= chart_name %>'
|
39
|
+
},
|
40
|
+
subtitle: {
|
41
|
+
text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
|
42
|
+
},
|
43
|
+
series: [{
|
44
|
+
data: data
|
45
|
+
}],
|
46
|
+
rangeSelector: {enabled: false},
|
47
|
+
scrollbar: {enabled: false},
|
48
|
+
navigator: {enabled: false},
|
49
|
+
exporting: {enabled: false},
|
50
|
+
credits: {enabled: false}
|
51
|
+
};
|
52
|
+
|
53
|
+
Highcharts.stockChart('chart-container', config);
|
54
|
+
}
|
55
|
+
|
56
|
+
document.addEventListener("DOMContentLoaded", function () {
|
57
|
+
drawChart();
|
58
|
+
updateTweets();
|
59
|
+
});
|
60
|
+
</script>
|
61
|
+
|
62
|
+
<style type=text/css>
|
63
|
+
#chart-container {
|
64
|
+
max-width: 1200px;
|
65
|
+
height: 675px;
|
66
|
+
margin: 0 auto;
|
67
|
+
border: 1px solid rgb(204, 214, 221);
|
68
|
+
display: flex;
|
69
|
+
justify-content: center;
|
70
|
+
align-items: center;
|
71
|
+
}
|
72
|
+
.tweets-container {
|
73
|
+
max-width: 550px;
|
74
|
+
margin: 0 auto 0 auto;
|
75
|
+
}
|
76
|
+
|
77
|
+
.twitter-tweet {
|
78
|
+
margin: 15px 0 15px 0 !important;
|
79
|
+
}
|
80
|
+
</style>
|
81
|
+
</head>
|
82
|
+
<body>
|
83
|
+
<div id="chart-container"><div style="color: gray;">Loading...</div></div>
|
84
|
+
|
85
|
+
<div class="tweets-container">
|
86
|
+
<% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
|
87
|
+
<% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
|
88
|
+
<% if i < convert_limit %>
|
89
|
+
<blockquote class="twitter-tweet">
|
90
|
+
<% else %>
|
91
|
+
<div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
|
92
|
+
<% end %>
|
93
|
+
|
94
|
+
<div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
|
95
|
+
<div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
|
96
|
+
<div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
|
97
|
+
<div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
|
98
|
+
</div>
|
99
|
+
|
100
|
+
<div><%= tweet.text %></div>
|
101
|
+
<div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
|
102
|
+
|
103
|
+
<% if i < convert_limit %>
|
104
|
+
</blockquote>
|
105
|
+
<% else %>
|
106
|
+
</div>
|
107
|
+
<% end %>
|
108
|
+
<% end %>
|
109
|
+
</div>
|
110
|
+
|
111
|
+
</body>
|
112
|
+
</html>
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
|
|
6
6
|
:screen_name,
|
7
7
|
:name,
|
8
8
|
:user_id,
|
9
|
+
:profile_image_url,
|
9
10
|
:tweet_id,
|
10
11
|
:text,
|
11
12
|
:links,
|
@@ -51,6 +52,11 @@ module Twitterscraper
|
|
51
52
|
end
|
52
53
|
end
|
53
54
|
|
55
|
+
# .js-stream-item
|
56
|
+
# .js-stream-tweet{data: {screen-name:, tweet-id:}}
|
57
|
+
# .stream-item-header
|
58
|
+
# .js-tweet-text-container
|
59
|
+
# .stream-item-footer
|
54
60
|
def from_html(text)
|
55
61
|
html = Nokogiri::HTML(text)
|
56
62
|
from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
|
@@ -72,6 +78,8 @@ module Twitterscraper
|
|
72
78
|
end
|
73
79
|
|
74
80
|
inner_html = Nokogiri::HTML(html.inner_html)
|
81
|
+
|
82
|
+
profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
|
75
83
|
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
|
76
84
|
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
|
77
85
|
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
|
@@ -99,6 +107,7 @@ module Twitterscraper
|
|
99
107
|
screen_name: screen_name,
|
100
108
|
name: html.attr('data-name'),
|
101
109
|
user_id: html.attr('data-user-id').to_i,
|
110
|
+
profile_image_url: profile_image_url,
|
102
111
|
tweet_id: tweet_id,
|
103
112
|
text: text,
|
104
113
|
links: links,
|
data/lib/twitterscraper/type.rb
CHANGED
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.20.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -72,6 +72,7 @@ files:
|
|
72
72
|
- lib/twitterscraper/proxy.rb
|
73
73
|
- lib/twitterscraper/query.rb
|
74
74
|
- lib/twitterscraper/template.rb
|
75
|
+
- lib/twitterscraper/template/tweets.html.erb
|
75
76
|
- lib/twitterscraper/tweet.rb
|
76
77
|
- lib/twitterscraper/type.rb
|
77
78
|
- lib/version.rb
|