twitterscraper-ruby 0.16.0 → 0.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +3 -0
- data/lib/twitterscraper/cli.rb +11 -2
- data/lib/twitterscraper/query.rb +36 -21
- data/lib/twitterscraper/template.rb +2 -2
- data/lib/twitterscraper/template/tweets.html.erb +45 -29
- data/lib/twitterscraper/tweet.rb +9 -0
- data/lib/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ac0c10b18d836983cc6b73e25b9ed333af2f620106a07c6bc6a40058fb127895
|
4
|
+
data.tar.gz: e6fc18219d9127fb30ba57e39dc4656c0f0a3c108428d959de5bac9e7d317088
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90cbf06b606878dc36b4bba44669139c273bf03b08a777ad87036834841bcb4b052e0559813dc56e4be124442abfc5a7fc44c5c9524c74929ca02b1d287d346b
|
7
|
+
data.tar.gz: ada0b74ee42ff62964b73ad9b49358227cdaf4fc87420cf12cf65af95168ad9775615a504345ebc83d3b791e9c0d892691c55bc477eddd647b3e8934f752fb9c
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -98,6 +98,7 @@ end
|
|
98
98
|
"screen_name": "@name",
|
99
99
|
"name": "Name",
|
100
100
|
"user_id": 12340000,
|
101
|
+
"profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
|
101
102
|
"tweet_id": 1234000000000000,
|
102
103
|
"text": "Thanks Twitter!",
|
103
104
|
"links": [],
|
@@ -122,6 +123,7 @@ end
|
|
122
123
|
- screen_name
|
123
124
|
- name
|
124
125
|
- user_id
|
126
|
+
- profile_image_url
|
125
127
|
- tweet_id
|
126
128
|
- text
|
127
129
|
- links
|
@@ -173,6 +175,7 @@ Search operators documentation is in [Standard search operators](https://develop
|
|
173
175
|
| `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
174
176
|
| `--order` | string | Sort order of the results. | desc(default) or asc |
|
175
177
|
| `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
178
|
+
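| `--threads_granularity` | string | Split the date range into one query per day or one query per hour. With auto, day is used when the range spans 28 days or more, hour otherwise. | auto(default), day or hour |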
|
176
179
|
| `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
|
177
180
|
| `--cache` | boolean | Enable caching. | true(default) or false |
|
178
181
|
| `--format` | string | The format of the output. | json(default) or html |
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -24,6 +24,7 @@ module Twitterscraper
|
|
24
24
|
daily_limit: options['daily_limit'],
|
25
25
|
order: options['order'],
|
26
26
|
threads: options['threads'],
|
27
|
+
threads_granularity: options['threads_granularity'],
|
27
28
|
}
|
28
29
|
client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
|
29
30
|
tweets = client.query_tweets(options['query'], query_options)
|
@@ -69,6 +70,7 @@ module Twitterscraper
|
|
69
70
|
'daily_limit:',
|
70
71
|
'order:',
|
71
72
|
'threads:',
|
73
|
+
'threads_granularity:',
|
72
74
|
'output:',
|
73
75
|
'format:',
|
74
76
|
'cache:',
|
@@ -82,10 +84,11 @@ module Twitterscraper
|
|
82
84
|
options['lang'] ||= ''
|
83
85
|
options['limit'] = (options['limit'] || 100).to_i
|
84
86
|
options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
|
85
|
-
options['threads'] = (options['threads'] ||
|
87
|
+
options['threads'] = (options['threads'] || 10).to_i
|
88
|
+
options['threads_granularity'] ||= 'auto'
|
86
89
|
options['format'] ||= 'json'
|
87
90
|
options['order'] ||= 'desc'
|
88
|
-
options['output'] ||=
|
91
|
+
options['output'] ||= build_output_name(options)
|
89
92
|
|
90
93
|
options['cache'] = options['cache'] != 'false'
|
91
94
|
options['proxy'] = options['proxy'] != 'false'
|
@@ -93,6 +96,12 @@ module Twitterscraper
|
|
93
96
|
options
|
94
97
|
end
|
95
98
|
|
99
|
+
def build_output_name(options)
|
100
|
+
query = ERB::Util.url_encode(options['query'])
|
101
|
+
date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
|
102
|
+
[options['type'], 'tweets', date, query].compact.join('_') + '.' + options['format']
|
103
|
+
end
|
104
|
+
|
96
105
|
def initialize_logger
|
97
106
|
Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
|
98
107
|
end
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -80,14 +80,14 @@ module Twitterscraper
|
|
80
80
|
|
81
81
|
url = build_query_url(query, lang, type, pos)
|
82
82
|
http_request = lambda do
|
83
|
-
logger.debug "Scraping tweets from
|
83
|
+
logger.debug "Scraping tweets from url=#{url}"
|
84
84
|
get_single_page(url, headers, proxies)
|
85
85
|
end
|
86
86
|
|
87
87
|
if cache_enabled?
|
88
88
|
client = Cache.new
|
89
89
|
if (response = client.read(url))
|
90
|
-
logger.debug
|
90
|
+
logger.debug "Fetching tweets from cache url=#{url}"
|
91
91
|
else
|
92
92
|
response = http_request.call
|
93
93
|
client.write(url, response) unless stop_requested?
|
@@ -147,21 +147,27 @@ module Twitterscraper
|
|
147
147
|
end
|
148
148
|
end
|
149
149
|
|
150
|
-
def build_queries(query, start_date, end_date)
|
150
|
+
def build_queries(query, start_date, end_date, threads_granularity)
|
151
151
|
if start_date && end_date
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
queries = []
|
156
|
-
time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
|
157
|
-
end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
|
152
|
+
if threads_granularity == 'auto'
|
153
|
+
threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
|
154
|
+
end
|
158
155
|
|
159
|
-
|
160
|
-
|
161
|
-
|
156
|
+
if threads_granularity == 'day'
|
157
|
+
date_range = start_date.upto(end_date - 1)
|
158
|
+
queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
159
|
+
else
|
160
|
+
time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
|
161
|
+
end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
|
162
|
+
queries = []
|
163
|
+
|
164
|
+
while true
|
165
|
+
if time < Time.now.utc
|
166
|
+
queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
|
167
|
+
end
|
168
|
+
time += 3600
|
169
|
+
break if time >= end_time
|
162
170
|
end
|
163
|
-
time += 3600
|
164
|
-
break if time >= end_time
|
165
171
|
end
|
166
172
|
|
167
173
|
queries
|
@@ -209,11 +215,17 @@ module Twitterscraper
|
|
209
215
|
@stop_requested
|
210
216
|
end
|
211
217
|
|
212
|
-
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads:
|
213
|
-
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
214
|
-
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
215
|
-
queries = build_queries(query, start_date, end_date)
|
218
|
+
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
|
216
219
|
type = Type.new(type)
|
220
|
+
if type.search?
|
221
|
+
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
222
|
+
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
223
|
+
elsif type.user?
|
224
|
+
start_date = nil
|
225
|
+
end_date = nil
|
226
|
+
end
|
227
|
+
|
228
|
+
queries = build_queries(query, start_date, end_date, threads_granularity)
|
217
229
|
if threads > queries.size
|
218
230
|
threads = queries.size
|
219
231
|
end
|
@@ -228,6 +240,7 @@ module Twitterscraper
|
|
228
240
|
|
229
241
|
validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
230
242
|
|
243
|
+
logger.info "The number of queries #{queries.size}"
|
231
244
|
logger.info "The number of threads #{threads}"
|
232
245
|
|
233
246
|
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
@@ -252,15 +265,17 @@ module Twitterscraper
|
|
252
265
|
end
|
253
266
|
end
|
254
267
|
|
268
|
+
logger.info "Return #{@all_tweets.size} tweets"
|
269
|
+
|
255
270
|
@all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
|
256
271
|
end
|
257
272
|
|
258
|
-
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads:
|
259
|
-
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
|
273
|
+
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
|
274
|
+
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
|
260
275
|
end
|
261
276
|
|
262
277
|
def user_timeline(screen_name, limit: 100, order: 'desc')
|
263
|
-
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
|
278
|
+
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
|
264
279
|
end
|
265
280
|
end
|
266
281
|
end
|
data/lib/twitterscraper/template.rb
CHANGED
@@ -9,8 +9,8 @@ module Twitterscraper
|
|
9
9
|
chart_data: chart_data(tweets).to_json,
|
10
10
|
first_tweet: tweets.sort_by { |t| t.created_at.to_i }[0],
|
11
11
|
last_tweet: tweets.sort_by { |t| t.created_at.to_i }[-1],
|
12
|
-
|
13
|
-
|
12
|
+
tweets: tweets,
|
13
|
+
convert_limit: 30,
|
14
14
|
)
|
15
15
|
end
|
16
16
|
|
data/lib/twitterscraper/template/tweets.html.erb
CHANGED
@@ -1,28 +1,29 @@
|
|
1
1
|
<html>
|
2
2
|
<head>
|
3
|
-
<script>
|
4
|
-
window.twttr = (function (d, s, id) {
|
5
|
-
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
6
|
-
if (d.getElementById(id)) return t;
|
7
|
-
js = d.createElement(s);
|
8
|
-
js.id = id;
|
9
|
-
js.src = "https://platform.twitter.com/widgets.js";
|
10
|
-
fjs.parentNode.insertBefore(js, fjs);
|
11
|
-
|
12
|
-
t._e = [];
|
13
|
-
t.ready = function (f) {
|
14
|
-
t._e.push(f);
|
15
|
-
};
|
16
|
-
|
17
|
-
return t;
|
18
|
-
}(document, "script", "twitter-wjs"));
|
19
|
-
</script>
|
20
|
-
|
21
3
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
|
22
4
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
|
23
5
|
<script src="https://code.highcharts.com/stock/highstock.js"></script>
|
24
6
|
<script>
|
7
|
+
function updateTweets() {
|
8
|
+
window.twttr = (function (d, s, id) {
|
9
|
+
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
10
|
+
if (d.getElementById(id)) return t;
|
11
|
+
js = d.createElement(s);
|
12
|
+
js.id = id;
|
13
|
+
js.src = "https://platform.twitter.com/widgets.js";
|
14
|
+
fjs.parentNode.insertBefore(js, fjs);
|
15
|
+
|
16
|
+
t._e = [];
|
17
|
+
t.ready = function (f) {
|
18
|
+
t._e.push(f);
|
19
|
+
};
|
20
|
+
|
21
|
+
return t;
|
22
|
+
}(document, "script", "twitter-wjs"));
|
23
|
+
}
|
24
|
+
|
25
25
|
function drawChart() {
|
26
|
+
var data = <%= chart_data %>;
|
26
27
|
Highcharts.setOptions({
|
27
28
|
time: {
|
28
29
|
timezone: moment.tz.guess()
|
@@ -31,13 +32,13 @@
|
|
31
32
|
|
32
33
|
Highcharts.stockChart('chart', {
|
33
34
|
title: {
|
34
|
-
text: '<%=
|
35
|
+
text: '<%= tweets.size %> tweets of <%= chart_name %>'
|
35
36
|
},
|
36
37
|
subtitle: {
|
37
|
-
text: 'since:<%= first_tweet.created_at.localtime %> until:<%= last_tweet.created_at.localtime %>'
|
38
|
+
text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
|
38
39
|
},
|
39
40
|
series: [{
|
40
|
-
data:
|
41
|
+
data: data
|
41
42
|
}],
|
42
43
|
rangeSelector: {enabled: false},
|
43
44
|
scrollbar: {enabled: false},
|
@@ -49,6 +50,7 @@
|
|
49
50
|
|
50
51
|
document.addEventListener("DOMContentLoaded", function () {
|
51
52
|
drawChart();
|
53
|
+
updateTweets();
|
52
54
|
});
|
53
55
|
</script>
|
54
56
|
|
@@ -64,17 +66,31 @@
|
|
64
66
|
</style>
|
65
67
|
</head>
|
66
68
|
<body>
|
67
|
-
<div id="chart"></div>
|
69
|
+
<div id="chart" style="width: 100vw; height: 400px;"></div>
|
68
70
|
|
69
71
|
<div class="tweets-container">
|
70
|
-
<% tweets.each do |tweet| %>
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
72
|
+
<% tweets.each.with_index do |tweet, i| %>
|
73
|
+
<% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
|
74
|
+
<% if i < convert_limit %>
|
75
|
+
<blockquote class="twitter-tweet">
|
76
|
+
<% else %>
|
77
|
+
<div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
|
78
|
+
<% end %>
|
79
|
+
|
80
|
+
<div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
|
81
|
+
<div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
|
82
|
+
<div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
|
83
|
+
<div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
|
84
|
+
</div>
|
85
|
+
|
86
|
+
<div><%= tweet.text %></div>
|
87
|
+
<div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
|
75
88
|
|
76
|
-
|
77
|
-
|
89
|
+
<% if i < convert_limit %>
|
90
|
+
</blockquote>
|
91
|
+
<% else %>
|
92
|
+
</div>
|
93
|
+
<% end %>
|
78
94
|
<% end %>
|
79
95
|
</div>
|
80
96
|
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
|
|
6
6
|
:screen_name,
|
7
7
|
:name,
|
8
8
|
:user_id,
|
9
|
+
:profile_image_url,
|
9
10
|
:tweet_id,
|
10
11
|
:text,
|
11
12
|
:links,
|
@@ -51,6 +52,11 @@ module Twitterscraper
|
|
51
52
|
end
|
52
53
|
end
|
53
54
|
|
55
|
+
# .js-stream-item
|
56
|
+
# .js-stream-tweet{data: {screen-name:, tweet-id:}}
|
57
|
+
# .stream-item-header
|
58
|
+
# .js-tweet-text-container
|
59
|
+
# .stream-item-footer
|
54
60
|
def from_html(text)
|
55
61
|
html = Nokogiri::HTML(text)
|
56
62
|
from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
|
@@ -72,6 +78,8 @@ module Twitterscraper
|
|
72
78
|
end
|
73
79
|
|
74
80
|
inner_html = Nokogiri::HTML(html.inner_html)
|
81
|
+
|
82
|
+
profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
|
75
83
|
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
|
76
84
|
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
|
77
85
|
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
|
@@ -99,6 +107,7 @@ module Twitterscraper
|
|
99
107
|
screen_name: screen_name,
|
100
108
|
name: html.attr('data-name'),
|
101
109
|
user_id: html.attr('data-user-id').to_i,
|
110
|
+
profile_image_url: profile_image_url,
|
102
111
|
tweet_id: tweet_id,
|
103
112
|
text: text,
|
104
113
|
links: links,
|
data/lib/version.rb
CHANGED