twitterscraper-ruby 0.15.2 → 0.20.0
This diff compares the contents of two publicly released versions of a package, as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +1 -1
- data/README.md +4 -0
- data/lib/twitterscraper/cache.rb +13 -1
- data/lib/twitterscraper/cli.rb +25 -12
- data/lib/twitterscraper/client.rb +27 -1
- data/lib/twitterscraper/query.rb +94 -68
- data/lib/twitterscraper/template.rb +52 -42
- data/lib/twitterscraper/template/tweets.html.erb +112 -0
- data/lib/twitterscraper/tweet.rb +9 -0
- data/lib/twitterscraper/type.rb +4 -0
- data/lib/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 73a9e9108284fc79cf5ec6b36b6f7ad3f83f2b4f03a2bc527dc18cb4b33e83c7
+  data.tar.gz: c7fcfdbdd1d808780c56610be9b8717352c812759b9344d9fa87cbd430a8d8e2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1019547fe8c37a1bb5b4a9cd96a2737a14491087075ff448b48f72538758337c76ab513e153d4567454b192d30fafaa374913ae0c3548d7802e7bdd478fe4a2f
+  data.tar.gz: 48134e8b6858154850003da8684d3c8b7f124cab6d19e0ce76d05326dc8fef44694b32211e245509993e8b7b1afafa6d95914b05c66b9c95c54bb27d041983fe
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -98,6 +98,7 @@ end
     "screen_name": "@name",
     "name": "Name",
     "user_id": 12340000,
+    "profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
     "tweet_id": 1234000000000000,
     "text": "Thanks Twitter!",
     "links": [],
@@ -122,6 +123,7 @@ end
 - screen_name
 - name
 - user_id
+- profile_image_url
 - tweet_id
 - text
 - links
@@ -173,6 +175,8 @@ Search operators documentation is in [Standard search operators](https://develop
 | `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
 | `--order` | string | Sort a order of the results. | desc(default) or asc |
 | `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
+| `--threads_granularity` | string | day or hour | auto |
+| `--chart_grouping` | string | day, hour or minute | auto |
 | `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
 | `--cache` | boolean | Enable caching. | true(default) or false |
 | `--format` | string | The format of the output. | json(default) or html |
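The two new options hook into different stages of the pipeline: `--threads_granularity` controls how the date range is split into per-thread queries, while `--chart_grouping` only affects time bucketing in the HTML chart output. A hedged usage sketch (flag values are illustrative; command and flag names follow this README's options table):

    $ twitterscraper --type search --query ruby \
        --start_date 2020-07-01 --end_date 2020-07-10 \
        --threads_granularity hour --chart_grouping day --format html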
data/lib/twitterscraper/cache.rb
CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
 module Twitterscraper
   class Cache
     def initialize()
-      @ttl =
+      @ttl = 86400 * 3 # 3 day
       @dir = 'cache'
       Dir.mkdir(@dir) unless File.exist?(@dir)
     end
@@ -25,6 +25,18 @@ module Twitterscraper
       File.write(file, entry.to_json)
     end
 
+    def exist?(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.exist?(file)
+    end
+
+    def delete(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.delete(file) if File.exist?(file)
+    end
+
     def fetch(key, &block)
       if (value = read(key))
         value
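The new `exist?`/`delete` helpers round out the existing read/write pair. A minimal sketch of how they might be exercised (key and payload are illustrative; `cache_key` hashes the key internally, so callers pass raw URLs):

    cache = Twitterscraper::Cache.new
    url = 'https://twitter.com/search?q=ruby'   # hypothetical key
    cache.write(url, '{"items_html": ""}')      # kept for the TTL (3 days)
    cache.exist?(url)  # => true
    cache.delete(url)  # removes the cache file if present

`delete` is what `query_single_page` uses below to evict the first page of the newest query, so re-runs always refetch the most recent tweets.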
data/lib/twitterscraper/cli.rb
CHANGED
@@ -24,21 +24,25 @@ module Twitterscraper
         daily_limit: options['daily_limit'],
         order: options['order'],
         threads: options['threads'],
+        threads_granularity: options['threads_granularity'],
       }
       client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
-      export(tweets) unless tweets.empty?
+      export(options['query'], tweets) unless tweets.empty?
     end
 
-    def export(tweets)
-
-
-
-
-
-
-
-
+    def export(name, tweets)
+      options['format'].split(',').map(&:strip).each do |format|
+        file = build_output_name(format, options)
+        Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
+
+        if format == 'json'
+          File.write(file, generate_json(tweets))
+        elsif format == 'html'
+          File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
+        else
+          puts "Invalid format #{format}"
+        end
       end
     end
 
@@ -69,6 +73,8 @@ module Twitterscraper
       'daily_limit:',
       'order:',
       'threads:',
+      'threads_granularity:',
+      'chart_grouping:',
       'output:',
       'format:',
       'cache:',
@@ -82,10 +88,10 @@ module Twitterscraper
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
       options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
-      options['threads'] = (options['threads'] ||
+      options['threads'] = (options['threads'] || 10).to_i
+      options['threads_granularity'] ||= 'auto'
       options['format'] ||= 'json'
       options['order'] ||= 'desc'
-      options['output'] ||= "tweets.#{options['format']}"
 
       options['cache'] = options['cache'] != 'false'
       options['proxy'] = options['proxy'] != 'false'
@@ -93,6 +99,13 @@ module Twitterscraper
       options
     end
 
+    def build_output_name(format, options)
+      query = options['query'].gsub(/[ :?#&]/, '_')
+      date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
+      file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
+      File.join('out', file)
+    end
+
     def initialize_logger
       Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
     end
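To make the new naming scheme concrete, a hedged sketch of what `build_output_name` yields for sample options (values are illustrative; note the fixed `--output` flag is gone and files now land under `out/`):

    options = {
      'type'       => 'search',
      'query'      => 'ruby lang',
      'start_date' => '2020-07-01',
      'end_date'   => '2020-07-10',
    }
    build_output_name('json', options)
    # => "out/search_tweets_2020-07-01_2020-07-10_ruby_lang.json"

Spaces and the `:?#&` characters in the query are replaced with underscores, so operator-laden queries still produce valid file names.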
data/lib/twitterscraper/client.rb
CHANGED
@@ -2,9 +2,31 @@ module Twitterscraper
   class Client
     include Query
 
+    USER_AGENT_LIST = [
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+    ]
+
     def initialize(cache: true, proxy: true)
+      @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+      Twitterscraper.logger.info "Headers #{@request_headers}"
+
       @cache = cache
-
+
+      if (@proxy = proxy)
+        @proxies = Proxy::Pool.new
+        Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
+      else
+        @proxies = []
+        Twitterscraper.logger.debug 'Proxy disabled'
+      end
+    end
+
+    def request_headers
+      @request_headers
     end
 
     def cache_enabled?
@@ -14,5 +36,9 @@ module Twitterscraper
     def proxy_enabled?
       @proxy
     end
+
+    def proxies
+      @proxies
+    end
   end
 end
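Net effect: User-Agent sampling and proxy-pool setup move from each `query_tweets` call into `Client#initialize`, and `Query` (mixed into `Client`) reads them back through the new `request_headers` and `proxies` accessors. A hedged sketch of the resulting behavior (return values are illustrative):

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    client.request_headers  # => {'User-Agent': <sampled from USER_AGENT_LIST>, 'X-Requested-With': 'XMLHttpRequest'}
    client.proxies          # => [] because proxy: false
    client.search('ruby', start_date: '2020-07-01', end_date: '2020-07-03')

One design consequence: a client now keeps a single sampled User-Agent for its whole lifetime instead of resampling on every `query_tweets` call.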
data/lib/twitterscraper/query.rb
CHANGED
@@ -10,14 +10,6 @@ module Twitterscraper
   module Query
     include Logger
 
-    USER_AGENT_LIST = [
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
-      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
-      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
-      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
-    ]
-
     INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
@@ -43,13 +35,13 @@ module Twitterscraper
       end
     end
 
-    def get_single_page(url,
+    def get_single_page(url, timeout = 6, retries = 30)
       return nil if stop_requested?
-
+      if proxy_enabled?
         proxy = proxies.sample
         logger.info("Using proxy #{proxy}")
       end
-      Http.get(url,
+      Http.get(url, request_headers, proxy, timeout)
     rescue => e
       logger.debug "get_single_page: #{e.inspect}"
       if (retries -= 1) > 0
@@ -69,30 +61,33 @@ module Twitterscraper
       else
         json_resp = JSON.parse(text)
         items_html = json_resp['items_html'] || ''
-        logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
       end
 
       [items_html, json_resp]
     end
 
-    def query_single_page(query, lang, type, pos
+    def query_single_page(query, lang, type, pos)
       logger.info "Querying #{query}"
-
+      encoded_query = ERB::Util.url_encode(query)
 
-      url = build_query_url(
+      url = build_query_url(encoded_query, lang, type, pos)
       http_request = lambda do
-        logger.debug "Scraping tweets from
-        get_single_page(url
+        logger.debug "Scraping tweets from url=#{url}"
+        get_single_page(url)
       end
 
       if cache_enabled?
         client = Cache.new
         if (response = client.read(url))
-          logger.debug
+          logger.debug "Fetching tweets from cache url=#{url}"
         else
          response = http_request.call
          client.write(url, response) unless stop_requested?
        end
+        if @queries && query == @queries.last && pos.nil?
+          logger.debug "Delete a cache query=#{query}"
+          client.delete(url)
+        end
       else
         response = http_request.call
       end
@@ -100,6 +95,12 @@ module Twitterscraper
 
       html, json_resp = parse_single_page(response, pos.nil?)
 
+      if json_resp && json_resp['message']
+        logger.warn json_resp['message'] # Sorry, you are rate limited.
+        @stop_requested = true
+        Cache.new.delete(url) if cache_enabled?
+      end
+
       tweets = Tweet.from_html(html)
 
       if tweets.empty?
@@ -130,126 +131,151 @@ module Twitterscraper
       if start_date && end_date
         if start_date == end_date
           raise Error.new('Please specify different values for :start_date and :end_date.')
-        elsif start_date > end_date
+        elsif Date.parse(start_date) > Date.parse(end_date)
           raise Error.new(':start_date must occur before :end_date.')
         end
       end
 
       if start_date
-        if start_date < OLDEST_DATE
+        if Date.parse(start_date) < OLDEST_DATE
           raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
+    end
 
-
-
-
-
-
+    def build_queries(query, start_date, end_date, threads_granularity, type)
+      if type.search?
+        start_date = Date.parse(start_date) if start_date.is_a?(String)
+        end_date = Date.parse(end_date) if end_date.is_a?(String)
+      elsif type.user?
+        start_date = nil
+        end_date = nil
      end
-    end
 
-    def build_queries(query, start_date, end_date)
       if start_date && end_date
-
-
+        if threads_granularity == 'auto'
+          threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
+        end
+
+        if threads_granularity == 'day'
+          date_range = start_date.upto(end_date - 1)
+          queries = date_range.map { |date| query + " since:#{date}_00:00:00_UTC until:#{date + 1}_00:00:00_UTC" }
+        elsif threads_granularity == 'hour'
+          time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
+          end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
+          queries = []
+
+          while true
+            if time < Time.now.utc
+              queries << (query + " since:#{time.strftime('%Y-%m-%d_%H')}:00:00_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H')}:00:00_UTC")
+            end
+            time += 3600
+            break if time >= end_time
+          end
+        else
+          raise Error.new("Invalid :threads_granularity value=#{threads_granularity}")
+        end
+
+        @queries = queries
+
       elsif start_date
-        [query + " since:#{start_date}"]
+        [query + " since:#{start_date}_00:00:00_UTC"]
       elsif end_date
-        [query + " until:#{end_date}"]
+        [query + " until:#{end_date}_00:00:00_UTC"]
       else
         [query]
       end
     end
 
-    def main_loop(query, lang, type, limit, daily_limit
+    def main_loop(query, lang, type, limit, daily_limit)
       pos = nil
-
+      tmp_tweets = []
 
       while true
-        new_tweets, new_pos = query_single_page(query, lang, type, pos
+        new_tweets, new_pos = query_single_page(query, lang, type, pos)
         unless new_tweets.empty?
-
-
+          tmp_tweets.concat(new_tweets)
+          tmp_tweets.uniq! { |t| t.tweet_id }
+        end
 
-
-
-
-
+        @results_counter[Parallel.worker_number] = tmp_tweets.size
+        total_size = @all_tweets.size + @results_counter.values.sum
+        logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
+
+        if !@stop_requested && total_size >= limit
+          logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
+          @stop_requested = true
        end
-        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
        break unless new_pos
-        break if
+        break if @stop_requested
+        break if daily_limit && tmp_tweets.size >= daily_limit
        break if @all_tweets.size >= limit
 
        pos = new_pos
      end
 
-
-        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
-        @stop_requested = true
-      end
+      tmp_tweets
    end
 
    def stop_requested?
      @stop_requested
    end
 
-    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads:
-      start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
-      end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
-      queries = build_queries(query, start_date, end_date)
+    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
      type = Type.new(type)
+      queries = build_queries(query, start_date, end_date, threads_granularity, type)
      if threads > queries.size
-        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
        threads = queries.size
      end
-      if proxy_enabled?
-        proxies = Proxy::Pool.new
-        logger.debug "Fetch #{proxies.size} proxies"
-      else
-        proxies = []
-        logger.debug 'Proxy disabled'
-      end
      logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
-
      validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
 
+      logger.info "The number of queries #{queries.size}"
      logger.info "The number of threads #{threads}"
 
-      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info "Headers #{headers}"
-
      @all_tweets = []
-      @mutex = Mutex.new
      @stop_requested = false
+      @results_counter = {}
 
      if threads > 1
+        @mutex = Mutex.new
        Thread.abort_on_exception = true
        logger.debug "Set 'Thread.abort_on_exception' to true"
 
        Parallel.each(queries, in_threads: threads) do |query|
-
+          @results_counter[Parallel.worker_number] = 0
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @mutex.synchronize {
+            @all_tweets.concat(tmp_tweets)
+            @all_tweets.uniq! { |t| t.tweet_id }
+          }
+          @results_counter[Parallel.worker_number] = 0
+
          raise Parallel::Break if stop_requested?
        end
      else
        queries.each do |query|
-          main_loop(query, lang, type, limit, daily_limit
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @all_tweets.concat(tmp_tweets)
+          @all_tweets.uniq! { |t| t.tweet_id }
+
          break if stop_requested?
        end
      end
 
+      logger.info "Return #{@all_tweets.size} tweets"
+
      @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
    end
 
-    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads:
-      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
+      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
    end
 
    def user_timeline(screen_name, limit: 100, order: 'desc')
-      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
+      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
    end
  end
 end
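To see what the granularity slicing produces, a hedged sketch (assuming `type = Twitterscraper::Type.new('search')`; `auto` would resolve to hour here because the window is shorter than 28 days, so day is forced explicitly):

    build_queries('ruby', '2020-07-01', '2020-07-03', 'day', type)
    # => ["ruby since:2020-07-01_00:00:00_UTC until:2020-07-02_00:00:00_UTC",
    #     "ruby since:2020-07-02_00:00:00_UTC until:2020-07-03_00:00:00_UTC"]

Each element becomes an independent query that `Parallel.each` can hand to its own thread, which is why `threads` is capped at `queries.size`.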
data/lib/twitterscraper/template.rb
CHANGED
@@ -1,48 +1,58 @@
 module Twitterscraper
-
-
+  class Template
+    def tweets_embedded_html(name, tweets, options)
+      path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
+      template = ERB.new(File.read(path))
 
-
-
-
+      tweets = tweets.sort_by { |t| t.created_at.to_i }
+      grouping = options['chart_grouping'] || 'auto'
+
+      template.result_with_hash(
+        chart_name: name,
+        chart_data: chart_data(tweets, grouping: grouping).to_json,
+        first_tweet: tweets[0],
+        last_tweet: tweets[-1],
+        tweets: tweets,
+        convert_limit: 30,
+      )
     end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def chart_data(tweets, grouping: 'auto')
+      if grouping && tweets.size > 100
+        if grouping == 'auto'
+          month = 28 * 24 * 60 * 60 # 28 days
+          duration = tweets[-1].created_at - tweets[0].created_at
+
+          if duration > 3 * month
+            grouping = 'day'
+          elsif duration > month || tweets.size > 10000
+            grouping = 'hour'
+          else
+            grouping = 'minute'
+          end
+        end
+      end
+
+      Twitterscraper.logger.info "Chart grouping #{grouping}"
+
+      data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
+        t = tweet.created_at
+
+        if grouping == 'day'
+          time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
+        elsif grouping == 'hour'
+          time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
+        elsif grouping == 'minute'
+          time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
+        else
+          time = t
+        end
+        memo[time.to_i] += 1
+      end
+
+      data.sort_by { |k, _| k }.map do |timestamp, count|
+        [timestamp * 1000, count]
+      end
+    end
   end
 end
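A hedged sketch of the shape `chart_data` returns (assuming `tweets` holds three Tweet objects created at 12:00:10, 12:00:40 and 12:01:05 UTC on 2020-07-15; timestamps below follow from those assumed times):

    Template.new.chart_data(tweets, grouping: 'minute')
    # => [[1594814400000, 2], [1594814460000, 1]]

Counts are bucketed per minute (or hour/day), keyed by the bucket's epoch time in milliseconds, which is the [timestamp, value] pair format the Highcharts stockChart in the template consumes directly.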
data/lib/twitterscraper/template/tweets.html.erb
ADDED
@@ -0,0 +1,112 @@
+<!DOCTYPE html>
+<html lang="ja">
+<head>
+  <meta charset="UTF-8">
+
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
+  <script src="https://code.highcharts.com/stock/highstock.js"></script>
+  <script>
+    function updateTweets() {
+      window.twttr = (function (d, s, id) {
+        var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+        if (d.getElementById(id)) return t;
+        js = d.createElement(s);
+        js.id = id;
+        js.src = "https://platform.twitter.com/widgets.js";
+        fjs.parentNode.insertBefore(js, fjs);
+
+        t._e = [];
+        t.ready = function (f) {
+          t._e.push(f);
+        };
+
+        return t;
+      }(document, "script", "twitter-wjs"));
+    }
+
+    function drawChart() {
+      Highcharts.setOptions({
+        time: {
+          timezone: moment.tz.guess()
+        }
+      });
+
+      var data = <%= chart_data %>;
+      var config = {
+        title: {
+          text: '<%= tweets.size %> tweets of <%= chart_name %>'
+        },
+        subtitle: {
+          text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
+        },
+        series: [{
+          data: data
+        }],
+        rangeSelector: {enabled: false},
+        scrollbar: {enabled: false},
+        navigator: {enabled: false},
+        exporting: {enabled: false},
+        credits: {enabled: false}
+      };
+
+      Highcharts.stockChart('chart-container', config);
+    }
+
+    document.addEventListener("DOMContentLoaded", function () {
+      drawChart();
+      updateTweets();
+    });
+  </script>
+
+  <style type=text/css>
+    #chart-container {
+      max-width: 1200px;
+      height: 675px;
+      margin: 0 auto;
+      border: 1px solid rgb(204, 214, 221);
+      display: flex;
+      justify-content: center;
+      align-items: center;
+    }
+    .tweets-container {
+      max-width: 550px;
+      margin: 0 auto 0 auto;
+    }
+
+    .twitter-tweet {
+      margin: 15px 0 15px 0 !important;
+    }
+  </style>
+</head>
+<body>
+<div id="chart-container"><div style="color: gray;">Loading...</div></div>
+
+<div class="tweets-container">
+  <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
+    <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
+    <% if i < convert_limit %>
+      <blockquote class="twitter-tweet">
+    <% else %>
+      <div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
+    <% end %>
+
+    <div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
+      <div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
+      <div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
+      <div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
+    </div>
+
+    <div><%= tweet.text %></div>
+    <div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
+
+    <% if i < convert_limit %>
+      </blockquote>
+    <% else %>
+      </div>
+    <% end %>
+  <% end %>
+</div>
+
+</body>
+</html>
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
       :screen_name,
       :name,
       :user_id,
+      :profile_image_url,
       :tweet_id,
       :text,
       :links,
@@ -51,6 +52,11 @@ module Twitterscraper
        end
      end
 
+      # .js-stream-item
+      #   .js-stream-tweet{data: {screen-name:, tweet-id:}}
+      #     .stream-item-header
+      #     .js-tweet-text-container
+      #     .stream-item-footer
      def from_html(text)
        html = Nokogiri::HTML(text)
        from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -72,6 +78,8 @@ module Twitterscraper
        end
 
        inner_html = Nokogiri::HTML(html.inner_html)
+
+        profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
        text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
        links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
        image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -99,6 +107,7 @@ module Twitterscraper
          screen_name: screen_name,
          name: html.attr('data-name'),
          user_id: html.attr('data-user-id').to_i,
+          profile_image_url: profile_image_url,
          tweet_id: tweet_id,
          text: text,
          links: links,
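The avatar extraction stores the full-size image: timeline HTML serves a smaller `_bigger` thumbnail variant, and the `gsub` strips that suffix to recover the original upload. Illustrative (URL is the placeholder from the README sample):

    'https://pbs.twimg.com/profile_images/1826000000/0000_bigger.png'.gsub(/_bigger/, '')
    # => "https://pbs.twimg.com/profile_images/1826000000/0000.png"

This matches the `profile_image_url` value shown in the README's sample JSON above.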
data/lib/twitterscraper/type.rb
CHANGED
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.15.2
+  version: 0.20.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-
+date: 2020-07-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -72,6 +72,7 @@ files:
 - lib/twitterscraper/proxy.rb
 - lib/twitterscraper/query.rb
 - lib/twitterscraper/template.rb
+- lib/twitterscraper/template/tweets.html.erb
 - lib/twitterscraper/tweet.rb
 - lib/twitterscraper/type.rb
 - lib/version.rb