twitterscraper-ruby 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +3 -0
- data/lib/twitterscraper/cli.rb +11 -2
- data/lib/twitterscraper/query.rb +36 -21
- data/lib/twitterscraper/template.rb +2 -2
- data/lib/twitterscraper/template/tweets.html.erb +45 -29
- data/lib/twitterscraper/tweet.rb +9 -0
- data/lib/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ac0c10b18d836983cc6b73e25b9ed333af2f620106a07c6bc6a40058fb127895
|
4
|
+
data.tar.gz: e6fc18219d9127fb30ba57e39dc4656c0f0a3c108428d959de5bac9e7d317088
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90cbf06b606878dc36b4bba44669139c273bf03b08a777ad87036834841bcb4b052e0559813dc56e4be124442abfc5a7fc44c5c9524c74929ca02b1d287d346b
|
7
|
+
data.tar.gz: ada0b74ee42ff62964b73ad9b49358227cdaf4fc87420cf12cf65af95168ad9775615a504345ebc83d3b791e9c0d892691c55bc477eddd647b3e8934f752fb9c
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -98,6 +98,7 @@ end
|
|
98
98
|
"screen_name": "@name",
|
99
99
|
"name": "Name",
|
100
100
|
"user_id": 12340000,
|
101
|
+
"profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
|
101
102
|
"tweet_id": 1234000000000000,
|
102
103
|
"text": "Thanks Twitter!",
|
103
104
|
"links": [],
|
@@ -122,6 +123,7 @@ end
|
|
122
123
|
- screen_name
|
123
124
|
- name
|
124
125
|
- user_id
|
126
|
+
- profile_image_url
|
125
127
|
- tweet_id
|
126
128
|
- text
|
127
129
|
- links
|
@@ -173,6 +175,7 @@ Search operators documentation is in [Standard search operators](https://develop
|
|
173
175
|
| `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
174
176
|
| `--order` | string | Sort a order of the results. | desc(default) or asc |
|
175
177
|
| `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
178
|
+
| `--threads_granularity` | string | | auto |
|
176
179
|
| `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
|
177
180
|
| `--cache` | boolean | Enable caching. | true(default) or false |
|
178
181
|
| `--format` | string | The format of the output. | json(default) or html |
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -24,6 +24,7 @@ module Twitterscraper
|
|
24
24
|
daily_limit: options['daily_limit'],
|
25
25
|
order: options['order'],
|
26
26
|
threads: options['threads'],
|
27
|
+
threads_granularity: options['threads_granularity'],
|
27
28
|
}
|
28
29
|
client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
|
29
30
|
tweets = client.query_tweets(options['query'], query_options)
|
@@ -69,6 +70,7 @@ module Twitterscraper
|
|
69
70
|
'daily_limit:',
|
70
71
|
'order:',
|
71
72
|
'threads:',
|
73
|
+
'threads_granularity:',
|
72
74
|
'output:',
|
73
75
|
'format:',
|
74
76
|
'cache:',
|
@@ -82,10 +84,11 @@ module Twitterscraper
|
|
82
84
|
options['lang'] ||= ''
|
83
85
|
options['limit'] = (options['limit'] || 100).to_i
|
84
86
|
options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
|
85
|
-
options['threads'] = (options['threads'] ||
|
87
|
+
options['threads'] = (options['threads'] || 10).to_i
|
88
|
+
options['threads_granularity'] ||= 'auto'
|
86
89
|
options['format'] ||= 'json'
|
87
90
|
options['order'] ||= 'desc'
|
88
|
-
options['output'] ||=
|
91
|
+
options['output'] ||= build_output_name(options)
|
89
92
|
|
90
93
|
options['cache'] = options['cache'] != 'false'
|
91
94
|
options['proxy'] = options['proxy'] != 'false'
|
@@ -93,6 +96,12 @@ module Twitterscraper
|
|
93
96
|
options
|
94
97
|
end
|
95
98
|
|
99
|
+
def build_output_name(options)
|
100
|
+
query = ERB::Util.url_encode(options['query'])
|
101
|
+
date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
|
102
|
+
[options['type'], 'tweets', date, query].compact.join('_') + '.' + options['format']
|
103
|
+
end
|
104
|
+
|
96
105
|
def initialize_logger
|
97
106
|
Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
|
98
107
|
end
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -80,14 +80,14 @@ module Twitterscraper
|
|
80
80
|
|
81
81
|
url = build_query_url(query, lang, type, pos)
|
82
82
|
http_request = lambda do
|
83
|
-
logger.debug "Scraping tweets from
|
83
|
+
logger.debug "Scraping tweets from url=#{url}"
|
84
84
|
get_single_page(url, headers, proxies)
|
85
85
|
end
|
86
86
|
|
87
87
|
if cache_enabled?
|
88
88
|
client = Cache.new
|
89
89
|
if (response = client.read(url))
|
90
|
-
logger.debug
|
90
|
+
logger.debug "Fetching tweets from cache url=#{url}"
|
91
91
|
else
|
92
92
|
response = http_request.call
|
93
93
|
client.write(url, response) unless stop_requested?
|
@@ -147,21 +147,27 @@ module Twitterscraper
|
|
147
147
|
end
|
148
148
|
end
|
149
149
|
|
150
|
-
def build_queries(query, start_date, end_date)
|
150
|
+
def build_queries(query, start_date, end_date, threads_granularity)
|
151
151
|
if start_date && end_date
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
queries = []
|
156
|
-
time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
|
157
|
-
end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
|
152
|
+
if threads_granularity == 'auto'
|
153
|
+
threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
|
154
|
+
end
|
158
155
|
|
159
|
-
|
160
|
-
|
161
|
-
|
156
|
+
if threads_granularity == 'day'
|
157
|
+
date_range = start_date.upto(end_date - 1)
|
158
|
+
queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
159
|
+
else
|
160
|
+
time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
|
161
|
+
end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
|
162
|
+
queries = []
|
163
|
+
|
164
|
+
while true
|
165
|
+
if time < Time.now.utc
|
166
|
+
queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
|
167
|
+
end
|
168
|
+
time += 3600
|
169
|
+
break if time >= end_time
|
162
170
|
end
|
163
|
-
time += 3600
|
164
|
-
break if time >= end_time
|
165
171
|
end
|
166
172
|
|
167
173
|
queries
|
@@ -209,11 +215,17 @@ module Twitterscraper
|
|
209
215
|
@stop_requested
|
210
216
|
end
|
211
217
|
|
212
|
-
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads:
|
213
|
-
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
214
|
-
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
215
|
-
queries = build_queries(query, start_date, end_date)
|
218
|
+
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
|
216
219
|
type = Type.new(type)
|
220
|
+
if type.search?
|
221
|
+
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
222
|
+
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
223
|
+
elsif type.user?
|
224
|
+
start_date = nil
|
225
|
+
end_date = nil
|
226
|
+
end
|
227
|
+
|
228
|
+
queries = build_queries(query, start_date, end_date, threads_granularity)
|
217
229
|
if threads > queries.size
|
218
230
|
threads = queries.size
|
219
231
|
end
|
@@ -228,6 +240,7 @@ module Twitterscraper
|
|
228
240
|
|
229
241
|
validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
230
242
|
|
243
|
+
logger.info "The number of queries #{queries.size}"
|
231
244
|
logger.info "The number of threads #{threads}"
|
232
245
|
|
233
246
|
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
@@ -252,15 +265,17 @@ module Twitterscraper
|
|
252
265
|
end
|
253
266
|
end
|
254
267
|
|
268
|
+
logger.info "Return #{@all_tweets.size} tweets"
|
269
|
+
|
255
270
|
@all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
|
256
271
|
end
|
257
272
|
|
258
|
-
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads:
|
259
|
-
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
|
273
|
+
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
|
274
|
+
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
|
260
275
|
end
|
261
276
|
|
262
277
|
def user_timeline(screen_name, limit: 100, order: 'desc')
|
263
|
-
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
|
278
|
+
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
|
264
279
|
end
|
265
280
|
end
|
266
281
|
end
|
@@ -9,8 +9,8 @@ module Twitterscraper
|
|
9
9
|
chart_data: chart_data(tweets).to_json,
|
10
10
|
first_tweet: tweets.sort_by { |t| t.created_at.to_i }[0],
|
11
11
|
last_tweet: tweets.sort_by { |t| t.created_at.to_i }[-1],
|
12
|
-
|
13
|
-
|
12
|
+
tweets: tweets,
|
13
|
+
convert_limit: 30,
|
14
14
|
)
|
15
15
|
end
|
16
16
|
|
@@ -1,28 +1,29 @@
|
|
1
1
|
<html>
|
2
2
|
<head>
|
3
|
-
<script>
|
4
|
-
window.twttr = (function (d, s, id) {
|
5
|
-
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
6
|
-
if (d.getElementById(id)) return t;
|
7
|
-
js = d.createElement(s);
|
8
|
-
js.id = id;
|
9
|
-
js.src = "https://platform.twitter.com/widgets.js";
|
10
|
-
fjs.parentNode.insertBefore(js, fjs);
|
11
|
-
|
12
|
-
t._e = [];
|
13
|
-
t.ready = function (f) {
|
14
|
-
t._e.push(f);
|
15
|
-
};
|
16
|
-
|
17
|
-
return t;
|
18
|
-
}(document, "script", "twitter-wjs"));
|
19
|
-
</script>
|
20
|
-
|
21
3
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
|
22
4
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
|
23
5
|
<script src="https://code.highcharts.com/stock/highstock.js"></script>
|
24
6
|
<script>
|
7
|
+
function updateTweets() {
|
8
|
+
window.twttr = (function (d, s, id) {
|
9
|
+
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
10
|
+
if (d.getElementById(id)) return t;
|
11
|
+
js = d.createElement(s);
|
12
|
+
js.id = id;
|
13
|
+
js.src = "https://platform.twitter.com/widgets.js";
|
14
|
+
fjs.parentNode.insertBefore(js, fjs);
|
15
|
+
|
16
|
+
t._e = [];
|
17
|
+
t.ready = function (f) {
|
18
|
+
t._e.push(f);
|
19
|
+
};
|
20
|
+
|
21
|
+
return t;
|
22
|
+
}(document, "script", "twitter-wjs"));
|
23
|
+
}
|
24
|
+
|
25
25
|
function drawChart() {
|
26
|
+
var data = <%= chart_data %>;
|
26
27
|
Highcharts.setOptions({
|
27
28
|
time: {
|
28
29
|
timezone: moment.tz.guess()
|
@@ -31,13 +32,13 @@
|
|
31
32
|
|
32
33
|
Highcharts.stockChart('chart', {
|
33
34
|
title: {
|
34
|
-
text: '<%=
|
35
|
+
text: '<%= tweets.size %> tweets of <%= chart_name %>'
|
35
36
|
},
|
36
37
|
subtitle: {
|
37
|
-
text: 'since:<%= first_tweet.created_at.localtime %> until:<%= last_tweet.created_at.localtime %>'
|
38
|
+
text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
|
38
39
|
},
|
39
40
|
series: [{
|
40
|
-
data:
|
41
|
+
data: data
|
41
42
|
}],
|
42
43
|
rangeSelector: {enabled: false},
|
43
44
|
scrollbar: {enabled: false},
|
@@ -49,6 +50,7 @@
|
|
49
50
|
|
50
51
|
document.addEventListener("DOMContentLoaded", function () {
|
51
52
|
drawChart();
|
53
|
+
updateTweets();
|
52
54
|
});
|
53
55
|
</script>
|
54
56
|
|
@@ -64,17 +66,31 @@
|
|
64
66
|
</style>
|
65
67
|
</head>
|
66
68
|
<body>
|
67
|
-
<div id="chart"></div>
|
69
|
+
<div id="chart" style="width: 100vw; height: 400px;"></div>
|
68
70
|
|
69
71
|
<div class="tweets-container">
|
70
|
-
<% tweets.each do |tweet| %>
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
72
|
+
<% tweets.each.with_index do |tweet, i| %>
|
73
|
+
<% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
|
74
|
+
<% if i < convert_limit %>
|
75
|
+
<blockquote class="twitter-tweet">
|
76
|
+
<% else %>
|
77
|
+
<div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
|
78
|
+
<% end %>
|
79
|
+
|
80
|
+
<div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
|
81
|
+
<div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
|
82
|
+
<div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
|
83
|
+
<div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
|
84
|
+
</div>
|
85
|
+
|
86
|
+
<div><%= tweet.text %></div>
|
87
|
+
<div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
|
75
88
|
|
76
|
-
|
77
|
-
|
89
|
+
<% if i < convert_limit %>
|
90
|
+
</blockquote>
|
91
|
+
<% else %>
|
92
|
+
</div>
|
93
|
+
<% end %>
|
78
94
|
<% end %>
|
79
95
|
</div>
|
80
96
|
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
|
|
6
6
|
:screen_name,
|
7
7
|
:name,
|
8
8
|
:user_id,
|
9
|
+
:profile_image_url,
|
9
10
|
:tweet_id,
|
10
11
|
:text,
|
11
12
|
:links,
|
@@ -51,6 +52,11 @@ module Twitterscraper
|
|
51
52
|
end
|
52
53
|
end
|
53
54
|
|
55
|
+
# .js-stream-item
|
56
|
+
# .js-stream-tweet{data: {screen-name:, tweet-id:}}
|
57
|
+
# .stream-item-header
|
58
|
+
# .js-tweet-text-container
|
59
|
+
# .stream-item-footer
|
54
60
|
def from_html(text)
|
55
61
|
html = Nokogiri::HTML(text)
|
56
62
|
from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
|
@@ -72,6 +78,8 @@ module Twitterscraper
|
|
72
78
|
end
|
73
79
|
|
74
80
|
inner_html = Nokogiri::HTML(html.inner_html)
|
81
|
+
|
82
|
+
profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
|
75
83
|
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
|
76
84
|
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
|
77
85
|
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
|
@@ -99,6 +107,7 @@ module Twitterscraper
|
|
99
107
|
screen_name: screen_name,
|
100
108
|
name: html.attr('data-name'),
|
101
109
|
user_id: html.attr('data-user-id').to_i,
|
110
|
+
profile_image_url: profile_image_url,
|
102
111
|
tweet_id: tweet_id,
|
103
112
|
text: text,
|
104
113
|
links: links,
|
data/lib/version.rb
CHANGED