twitterscraper-ruby 0.16.0 → 0.20.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +1 -1
- data/README.md +4 -0
- data/lib/twitterscraper/cache.rb +7 -1
- data/lib/twitterscraper/cli.rb +23 -10
- data/lib/twitterscraper/client.rb +27 -1
- data/lib/twitterscraper/query.rb +84 -69
- data/lib/twitterscraper/template.rb +37 -9
- data/lib/twitterscraper/template/tweets.html.erb +61 -31
- data/lib/twitterscraper/tweet.rb +9 -0
- data/lib/twitterscraper/type.rb +4 -0
- data/lib/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8cb289da12a175a02664132b076349edf457585141b4bd196f1e2fb78ea69587
+  data.tar.gz: e9b3c55ee7e096b26746473d0b0dd12b7e259331daa6cae304571f902de72dd9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 49a0d32d438c6c202257b733a877624429771cd6a57d3981716df5e0d946fc4b1af87f18d9029d5b43fb9df65a2a4a06579851d979514302999b85612d01f3e5
+  data.tar.gz: a59d26670417db7c57d203486b04798510a5f1c85468dde63aa1e7875c4151bc56169a7090ad745b14a08d473e8882a24e75b9ab0d04ae4bcece5912c8c2a3a5
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -98,6 +98,7 @@ end
     "screen_name": "@name",
     "name": "Name",
     "user_id": 12340000,
+    "profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
     "tweet_id": 1234000000000000,
     "text": "Thanks Twitter!",
     "links": [],
@@ -122,6 +123,7 @@ end
 - screen_name
 - name
 - user_id
+- profile_image_url
 - tweet_id
 - text
 - links
@@ -173,6 +175,8 @@ Search operators documentation is in [Standard search operators](https://develop
 | `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
 | `--order` | string | Sort a order of the results. | desc(default) or asc |
 | `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
+| `--threads_granularity` | string | day or hour | auto |
+| `--chart_grouping` | string | day, hour or minute | auto |
 | `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
 | `--cache` | boolean | Enable caching. | true(default) or false |
 | `--format` | string | The format of the output. | json(default) or html |
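For context, a minimal Ruby sketch of how the two new search options and the new profile_image_url attribute fit together; the require name, query, and dates are placeholders rather than values taken from this diff (--chart_grouping only affects the HTML report, so it is not a search argument):

require 'twitterscraper-ruby' # entry-point name assumed

client = Twitterscraper::Client.new(cache: true, proxy: false)

# threads_granularity controls how the start/end date range is split into
# per-thread sub-queries ('day' or 'hour'); 'auto' picks one from the range length.
tweets = client.search('ruby',
                       start_date: '2021-03-01',
                       end_date: '2021-03-07',
                       threads: 10,
                       threads_granularity: 'day')

puts tweets.first&.profile_image_url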
data/lib/twitterscraper/cache.rb
CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
 module Twitterscraper
   class Cache
     def initialize()
-      @ttl = 86400 #
+      @ttl = 86400 * 3 # 3 day
       @dir = 'cache'
      Dir.mkdir(@dir) unless File.exist?(@dir)
     end
@@ -25,6 +25,12 @@ module Twitterscraper
       File.write(file, entry.to_json)
     end
 
+    def exist?(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.exist?(file)
+    end
+
     def delete(key)
       key = cache_key(key)
       file = File.join(@dir, key)
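For readers unfamiliar with the class, a standalone sketch of the same file-backed cache pattern; the MD5 key hashing and the JSON envelope are assumptions for illustration, not necessarily the gem's exact on-disk format:

require 'digest/md5'
require 'json'

# File-backed cache in the same shape as Twitterscraper::Cache:
# keys are hashed into file names, entries expire after a TTL.
class FileCache
  def initialize(dir: 'cache', ttl: 86400 * 3) # 3 days, matching the new default
    @dir = dir
    @ttl = ttl
    Dir.mkdir(@dir) unless File.exist?(@dir)
  end

  def write(key, value)
    File.write(path(key), JSON.generate('time' => Time.now.to_i, 'value' => value))
  end

  def read(key)
    return nil unless exist?(key)
    entry = JSON.parse(File.read(path(key)))
    Time.now.to_i - entry['time'] <= @ttl ? entry['value'] : nil
  end

  def exist?(key)
    File.exist?(path(key))
  end

  private

  def path(key)
    File.join(@dir, Digest::MD5.hexdigest(key))
  end
end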
data/lib/twitterscraper/cli.rb
CHANGED
@@ -24,6 +24,7 @@ module Twitterscraper
         daily_limit: options['daily_limit'],
         order: options['order'],
         threads: options['threads'],
+        threads_granularity: options['threads_granularity'],
       }
       client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
@@ -31,14 +32,17 @@ module Twitterscraper
     end
 
     def export(name, tweets)
-
-
-
-
-
-
-
-
+      options['format'].split(',').map(&:strip).each do |format|
+        file = build_output_name(format, options)
+        Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
+
+        if format == 'json'
+          File.write(file, generate_json(tweets))
+        elsif format == 'html'
+          File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
+        else
+          puts "Invalid format #{format}"
+        end
       end
     end
 
@@ -69,6 +73,8 @@ module Twitterscraper
       'daily_limit:',
       'order:',
       'threads:',
+      'threads_granularity:',
+      'chart_grouping:',
       'output:',
       'format:',
       'cache:',
@@ -82,10 +88,10 @@ module Twitterscraper
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
       options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
-      options['threads'] = (options['threads'] ||
+      options['threads'] = (options['threads'] || 10).to_i
+      options['threads_granularity'] ||= 'auto'
       options['format'] ||= 'json'
       options['order'] ||= 'desc'
-      options['output'] ||= "tweets.#{options['format']}"
 
       options['cache'] = options['cache'] != 'false'
       options['proxy'] = options['proxy'] != 'false'
@@ -93,6 +99,13 @@ module Twitterscraper
       options
     end
 
+    def build_output_name(format, options)
+      query = options['query'].gsub(/[ :?#&]/, '_')
+      date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
+      file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
+      File.join('out', file)
+    end
+
     def initialize_logger
       Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
     end
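To make the new naming scheme concrete, here is the same construction as build_output_name, inlined with placeholder option values:

# Placeholder options; build_output_name derives the output path from them.
options = {
  'type' => 'search',
  'query' => 'ruby lang:ja',
  'start_date' => '2021-03-01',
  'end_date' => '2021-03-07',
}
format = 'json'

query = options['query'].gsub(/[ :?#&]/, '_')
date  = [options['start_date'], options['end_date']].select { |v| v && !v.empty? }.join('_')
file  = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format

puts File.join('out', file)
# => out/search_tweets_2021-03-01_2021-03-07_ruby_lang_ja.json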
data/lib/twitterscraper/client.rb
CHANGED
@@ -2,9 +2,31 @@ module Twitterscraper
   class Client
     include Query
 
+    USER_AGENT_LIST = [
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+    ]
+
     def initialize(cache: true, proxy: true)
+      @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+      Twitterscraper.logger.info "Headers #{@request_headers}"
+
       @cache = cache
-
+
+      if (@proxy = proxy)
+        @proxies = Proxy::Pool.new
+        Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
+      else
+        @proxies = []
+        Twitterscraper.logger.debug 'Proxy disabled'
+      end
+    end
+
+    def request_headers
+      @request_headers
     end
 
     def cache_enabled?
@@ -14,5 +36,9 @@ module Twitterscraper
     def proxy_enabled?
       @proxy
     end
+
+    def proxies
+      @proxies
+    end
   end
 end
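A short usage sketch of the accessors added here, which the query module now uses instead of building headers and a proxy pool itself; the require name is an assumption:

require 'twitterscraper-ruby' # entry-point name assumed

# proxy: false skips fetching a proxy pool; a User-Agent is sampled per client.
client = Twitterscraper::Client.new(cache: true, proxy: false)
p client.request_headers # {:"User-Agent"=>"Mozilla/5.0 ...", :"X-Requested-With"=>"XMLHttpRequest"}
p client.proxies         # => [] while the proxy pool is disabled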
data/lib/twitterscraper/query.rb
CHANGED
@@ -10,14 +10,6 @@ module Twitterscraper
   module Query
     include Logger
 
-    USER_AGENT_LIST = [
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
-      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
-      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
-      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
-    ]
-
     INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
@@ -43,13 +35,13 @@ module Twitterscraper
       end
     end
 
-    def get_single_page(url,
+    def get_single_page(url, timeout = 6, retries = 30)
       return nil if stop_requested?
-
+      if proxy_enabled?
         proxy = proxies.sample
         logger.info("Using proxy #{proxy}")
       end
-      Http.get(url,
+      Http.get(url, request_headers, proxy, timeout)
     rescue => e
       logger.debug "get_single_page: #{e.inspect}"
       if (retries -= 1) > 0
@@ -74,24 +66,28 @@ module Twitterscraper
       [items_html, json_resp]
     end
 
-    def query_single_page(query, lang, type, pos
+    def query_single_page(query, lang, type, pos)
       logger.info "Querying #{query}"
-
+      encoded_query = ERB::Util.url_encode(query)
 
-      url = build_query_url(
+      url = build_query_url(encoded_query, lang, type, pos)
       http_request = lambda do
-        logger.debug "Scraping tweets from
-        get_single_page(url
+        logger.debug "Scraping tweets from url=#{url}"
+        get_single_page(url)
       end
 
       if cache_enabled?
         client = Cache.new
         if (response = client.read(url))
-          logger.debug
+          logger.debug "Fetching tweets from cache url=#{url}"
         else
           response = http_request.call
           client.write(url, response) unless stop_requested?
         end
+        if @queries && query == @queries.last && pos.nil?
+          logger.debug "Delete a cache query=#{query}"
+          client.delete(url)
+        end
       else
         response = http_request.call
       end
@@ -135,132 +131,151 @@ module Twitterscraper
       if start_date && end_date
         if start_date == end_date
           raise Error.new('Please specify different values for :start_date and :end_date.')
-        elsif start_date > end_date
+        elsif Date.parse(start_date) > Date.parse(end_date)
           raise Error.new(':start_date must occur before :end_date.')
         end
       end
 
       if start_date
-        if start_date < OLDEST_DATE
+        if Date.parse(start_date) < OLDEST_DATE
           raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
     end
 
-    def build_queries(query, start_date, end_date)
-      if
-
-
+    def build_queries(query, start_date, end_date, threads_granularity, type)
+      if type.search?
+        start_date = Date.parse(start_date) if start_date.is_a?(String)
+        end_date = Date.parse(end_date) if end_date.is_a?(String)
+      elsif type.user?
+        start_date = nil
+        end_date = nil
+      end
 
-
-
-
+      if start_date && end_date
+        if threads_granularity == 'auto'
+          threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
+        end
 
-
-
-
+        if threads_granularity == 'day'
+          date_range = start_date.upto(end_date - 1)
+          queries = date_range.map { |date| query + " since:#{date}_00:00:00_UTC until:#{date + 1}_00:00:00_UTC" }
+        elsif threads_granularity == 'hour'
+          time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
+          end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
+          queries = []
+
+          while true
+            if time < Time.now.utc
+              queries << (query + " since:#{time.strftime('%Y-%m-%d_%H')}:00:00_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H')}:00:00_UTC")
+            end
+            time += 3600
+            break if time >= end_time
          end
-
-
+        else
+          raise Error.new("Invalid :threads_granularity value=#{threads_granularity}")
         end
 
-        queries
+        @queries = queries
 
       elsif start_date
-        [query + " since:#{start_date}"]
+        [query + " since:#{start_date}_00:00:00_UTC"]
      elsif end_date
-        [query + " until:#{end_date}"]
+        [query + " until:#{end_date}_00:00:00_UTC"]
       else
         [query]
       end
     end
 
-    def main_loop(query, lang, type, limit, daily_limit
+    def main_loop(query, lang, type, limit, daily_limit)
       pos = nil
-
+      tmp_tweets = []
 
       while true
-        new_tweets, new_pos = query_single_page(query, lang, type, pos
+        new_tweets, new_pos = query_single_page(query, lang, type, pos)
         unless new_tweets.empty?
-
-
+          tmp_tweets.concat(new_tweets)
+          tmp_tweets.uniq! { |t| t.tweet_id }
+        end
 
-
-
-
-
+        @results_counter[Parallel.worker_number] = tmp_tweets.size
+        total_size = @all_tweets.size + @results_counter.values.sum
+        logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
+
+        if !@stop_requested && total_size >= limit
+          logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
+          @stop_requested = true
         end
-        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
-        break if
+        break if @stop_requested
+        break if daily_limit && tmp_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-
-        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
-        @stop_requested = true
-      end
+      tmp_tweets
     end
 
     def stop_requested?
       @stop_requested
     end
 
-    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads:
-      start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
-      end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
-      queries = build_queries(query, start_date, end_date)
+    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
       type = Type.new(type)
+      queries = build_queries(query, start_date, end_date, threads_granularity, type)
       if threads > queries.size
         threads = queries.size
       end
-      if proxy_enabled?
-        proxies = Proxy::Pool.new
-        logger.debug "Fetch #{proxies.size} proxies"
-      else
-        proxies = []
-        logger.debug 'Proxy disabled'
-      end
       logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
       validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
 
+      logger.info "The number of queries #{queries.size}"
       logger.info "The number of threads #{threads}"
 
-      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info "Headers #{headers}"
-
       @all_tweets = []
-      @mutex = Mutex.new
       @stop_requested = false
+      @results_counter = {}
 
       if threads > 1
+        @mutex = Mutex.new
         Thread.abort_on_exception = true
         logger.debug "Set 'Thread.abort_on_exception' to true"
 
         Parallel.each(queries, in_threads: threads) do |query|
-
+          @results_counter[Parallel.worker_number] = 0
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @mutex.synchronize {
+            @all_tweets.concat(tmp_tweets)
+            @all_tweets.uniq! { |t| t.tweet_id }
+          }
+          @results_counter[Parallel.worker_number] = 0
+
          raise Parallel::Break if stop_requested?
        end
       else
         queries.each do |query|
-          main_loop(query, lang, type, limit, daily_limit
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @all_tweets.concat(tmp_tweets)
+          @all_tweets.uniq! { |t| t.tweet_id }
+
           break if stop_requested?
         end
       end
 
+      logger.info "Return #{@all_tweets.size} tweets"
+
       @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
     end
 
-    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads:
-      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
+      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
     end
 
     def user_timeline(screen_name, limit: 100, order: 'desc')
-      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
+      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
     end
   end
 end
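As an illustration of the new splitting behaviour, a standalone sketch of the 'day' granularity branch of build_queries; query and dates are placeholders:

require 'date'

# One sub-query per day, bounded by since:/until: operators in UTC,
# mirroring the 'day' branch of build_queries.
query      = 'ruby'
start_date = Date.parse('2021-03-01')
end_date   = Date.parse('2021-03-04')

queries = start_date.upto(end_date - 1).map do |date|
  query + " since:#{date}_00:00:00_UTC until:#{date + 1}_00:00:00_UTC"
end

puts queries
# ruby since:2021-03-01_00:00:00_UTC until:2021-03-02_00:00:00_UTC
# ruby since:2021-03-02_00:00:00_UTC until:2021-03-03_00:00:00_UTC
# ruby since:2021-03-03_00:00:00_UTC until:2021-03-04_00:00:00_UTC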
data/lib/twitterscraper/template.rb
CHANGED
@@ -4,25 +4,53 @@ module Twitterscraper
       path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
       template = ERB.new(File.read(path))
 
+      tweets = tweets.sort_by { |t| t.created_at.to_i }
+      grouping = options['chart_grouping'] || 'auto'
+
       template.result_with_hash(
         chart_name: name,
-        chart_data: chart_data(tweets).to_json,
-        first_tweet: tweets
-        last_tweet: tweets
-
-
+        chart_data: chart_data(tweets, grouping: grouping).to_json,
+        first_tweet: tweets[0],
+        last_tweet: tweets[-1],
+        tweets: tweets,
+        convert_limit: 30,
       )
     end
 
-    def chart_data(tweets)
+    def chart_data(tweets, grouping: 'auto')
+      if grouping && tweets.size > 100
+        if grouping == 'auto'
+          month = 28 * 24 * 60 * 60 # 28 days
+          duration = tweets[-1].created_at - tweets[0].created_at
+
+          if duration > 3 * month
+            grouping = 'day'
+          elsif duration > month || tweets.size > 10000
+            grouping = 'hour'
+          else
+            grouping = 'minute'
+          end
+        end
+      end
+
+      Twitterscraper.logger.info "Chart grouping #{grouping}"
+
       data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
         t = tweet.created_at
-
-
+
+        if grouping == 'day'
+          time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
+        elsif grouping == 'hour'
+          time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
+        elsif grouping == 'minute'
+          time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
+        else
+          time = t
+        end
         memo[time.to_i] += 1
       end
 
-      data.sort_by { |k,
+      data.sort_by { |k, _| k }.map do |timestamp, count|
         [timestamp * 1000, count]
       end
     end
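A standalone sketch of the bucketing that chart_data now performs, using the 'hour' grouping on a few hard-coded timestamps:

# Timestamps are truncated to the chosen granularity, counted, and emitted
# as [epoch_milliseconds, count] pairs for the Highcharts series.
timestamps = [
  Time.utc(2021, 3, 1, 10, 15),
  Time.utc(2021, 3, 1, 10, 48),
  Time.utc(2021, 3, 1, 11, 5),
]

data = timestamps.each_with_object(Hash.new(0)) do |t, memo|
  time = Time.utc(t.year, t.month, t.day, t.hour) # 'hour' grouping
  memo[time.to_i] += 1
end

p data.sort_by { |k, _| k }.map { |ts, count| [ts * 1000, count] }
# => [[1614592800000, 2], [1614596400000, 1]]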
data/lib/twitterscraper/template/tweets.html.erb
CHANGED
@@ -1,27 +1,30 @@
-
+<!DOCTYPE html>
+<html lang="ja">
 <head>
-  <
-    window.twttr = (function (d, s, id) {
-      var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
-      if (d.getElementById(id)) return t;
-      js = d.createElement(s);
-      js.id = id;
-      js.src = "https://platform.twitter.com/widgets.js";
-      fjs.parentNode.insertBefore(js, fjs);
-
-      t._e = [];
-      t.ready = function (f) {
-        t._e.push(f);
-      };
-
-      return t;
-    }(document, "script", "twitter-wjs"));
-  </script>
+  <meta charset="UTF-8">
 
   <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
   <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
   <script src="https://code.highcharts.com/stock/highstock.js"></script>
   <script>
+    function updateTweets() {
+      window.twttr = (function (d, s, id) {
+        var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+        if (d.getElementById(id)) return t;
+        js = d.createElement(s);
+        js.id = id;
+        js.src = "https://platform.twitter.com/widgets.js";
+        fjs.parentNode.insertBefore(js, fjs);
+
+        t._e = [];
+        t.ready = function (f) {
+          t._e.push(f);
+        };
+
+        return t;
+      }(document, "script", "twitter-wjs"));
+    }
+
     function drawChart() {
       Highcharts.setOptions({
         time: {
@@ -29,30 +32,43 @@
         }
       });
 
-
+      var data = <%= chart_data %>;
+      var config = {
         title: {
-          text: '<%=
+          text: '<%= tweets.size %> tweets of <%= chart_name %>'
         },
         subtitle: {
-          text: 'since:<%= first_tweet.created_at.localtime %> until:<%= last_tweet.created_at.localtime %>'
+          text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
         },
         series: [{
-          data:
+          data: data
         }],
         rangeSelector: {enabled: false},
         scrollbar: {enabled: false},
         navigator: {enabled: false},
         exporting: {enabled: false},
         credits: {enabled: false}
-      }
+      };
+
+      Highcharts.stockChart('chart-container', config);
     }
 
     document.addEventListener("DOMContentLoaded", function () {
       drawChart();
+      updateTweets();
     });
   </script>
 
   <style type=text/css>
+    #chart-container {
+      max-width: 1200px;
+      height: 675px;
+      margin: 0 auto;
+      border: 1px solid rgb(204, 214, 221);
+      display: flex;
+      justify-content: center;
+      align-items: center;
+    }
     .tweets-container {
       max-width: 550px;
       margin: 0 auto 0 auto;
@@ -64,17 +80,31 @@
   </style>
 </head>
 <body>
-  <div id="chart"></div>
+  <div id="chart-container"><div style="color: gray;">Loading...</div></div>
 
   <div class="tweets-container">
-    <% tweets.each do |tweet| %>
-
-
-
-
+    <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
+      <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
+      <% if i < convert_limit %>
+        <blockquote class="twitter-tweet">
+      <% else %>
+        <div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
+      <% end %>
+
+      <div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
+        <div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
+        <div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
+        <div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
+      </div>
+
+      <div><%= tweet.text %></div>
+      <div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
 
-
-
+      <% if i < convert_limit %>
+        </blockquote>
+      <% else %>
+        </div>
+      <% end %>
     <% end %>
   </div>
 
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
       :screen_name,
       :name,
       :user_id,
+      :profile_image_url,
       :tweet_id,
       :text,
       :links,
@@ -51,6 +52,11 @@ module Twitterscraper
         end
       end
 
+      # .js-stream-item
+      # .js-stream-tweet{data: {screen-name:, tweet-id:}}
+      # .stream-item-header
+      # .js-tweet-text-container
+      # .stream-item-footer
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -72,6 +78,8 @@ module Twitterscraper
         end
 
         inner_html = Nokogiri::HTML(html.inner_html)
+
+        profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
         text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
         links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
         image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -99,6 +107,7 @@ module Twitterscraper
           screen_name: screen_name,
           name: html.attr('data-name'),
           user_id: html.attr('data-user-id').to_i,
+          profile_image_url: profile_image_url,
          tweet_id: tweet_id,
           text: text,
           links: links,
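To show what the new avatar extraction does, a small Nokogiri sketch against a hand-written fixture; the markup is an assumption that only mimics the class the xpath targets:

require 'nokogiri'

html = Nokogiri::HTML(<<~HTML)
  <div>
    <img class="avatar js-action-profile-avatar"
         src="https://pbs.twimg.com/profile_images/1826000000/0000_bigger.png">
  </div>
HTML

# Same xpath and "_bigger" rewrite as the profile_image_url extraction above.
node = html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first
puts node.attr('src').gsub(/_bigger/, '')
# => https://pbs.twimg.com/profile_images/1826000000/0000.png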
data/lib/twitterscraper/type.rb
CHANGED
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.16.0
+  version: 0.20.1
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2021-04-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri