twitterscraper-ruby 0.15.1 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +1 -1
- data/README.md +3 -0
- data/lib/twitterscraper.rb +1 -0
- data/lib/twitterscraper/cache.rb +13 -1
- data/lib/twitterscraper/cli.rb +24 -12
- data/lib/twitterscraper/client.rb +27 -1
- data/lib/twitterscraper/query.rb +92 -67
- data/lib/twitterscraper/template.rb +51 -42
- data/lib/twitterscraper/template/tweets.html.erb +112 -0
- data/lib/twitterscraper/tweet.rb +9 -0
- data/lib/twitterscraper/type.rb +19 -0
- data/lib/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2056b4a3d9fe7af49429e35b3a1688256fb31b74cabab841a4dd2376a79889d5
+  data.tar.gz: aaaf949da2ba2ae07a0d66e981aebc635c18120de06be705f96c19c92c309911
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c60824e4c1c0021a3e27451b1708a77bd2e15dd6258fce63ac1b95111d0230c8ab7317bcd76c2faf14d02ebe75ab8d7453924e01eee7d3fcb46eef374f16c575
+  data.tar.gz: 984204bd430b41b76a2d9108df4e778e2bb242010ebd18569bcb662473496826644ba5693db1d475d565bff49a3de7f0eb95fd4c9a3da9e5ed4d6a6219ebb62e
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -98,6 +98,7 @@ end
   "screen_name": "@name",
   "name": "Name",
   "user_id": 12340000,
+  "profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
   "tweet_id": 1234000000000000,
   "text": "Thanks Twitter!",
   "links": [],
@@ -122,6 +123,7 @@ end
 - screen_name
 - name
 - user_id
+- profile_image_url
 - tweet_id
 - text
 - links
@@ -173,6 +175,7 @@
 | `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
 | `--order` | string | Sort a order of the results. | desc(default) or asc |
 | `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
+| `--threads_granularity` | string | | auto |
 | `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
 | `--cache` | boolean | Enable caching. | true(default) or false |
 | `--format` | string | The format of the output. | json(default) or html |
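The `--threads_granularity` row ships with an empty description; judging from the query.rb changes below, it controls how the start/end date range is split into per-thread sub-queries ('day', 'hour', or 'auto'). A hedged example invocation, assuming the gem's twitterscraper executable and illustrative dates:

    $ twitterscraper --query ruby --start_date 2020-07-01 --end_date 2020-07-10 \
        --threads_granularity day --format json,html

Note also that the `--output` default is removed in this release; output paths are now derived by build_output_name in cli.rb and written under out/.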
data/lib/twitterscraper.rb
CHANGED
data/lib/twitterscraper/cache.rb
CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
 module Twitterscraper
   class Cache
     def initialize()
-      @ttl =
+      @ttl = 86400 * 3 # 3 days
       @dir = 'cache'
       Dir.mkdir(@dir) unless File.exist?(@dir)
     end
@@ -25,6 +25,18 @@ module Twitterscraper
       File.write(file, entry.to_json)
     end

+    def exist?(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.exist?(file)
+    end
+
+    def delete(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.delete(file) if File.exist?(file)
+    end
+
     def fetch(key, &block)
       if (value = read(key))
         value
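A minimal sketch of the expanded cache API (the cache key is the request URL, as in query.rb; the stored value here is illustrative):

    cache = Twitterscraper::Cache.new

    url = 'https://twitter.com/i/search/timeline?q=ruby'
    cache.write(url, '{"items_html": ""}')     # persisted as JSON under an MD5-derived file name
    cache.exist?(url)                          # => true
    cache.fetch(url) { '{"items_html": ""}' }  # returns the cached value; the block runs only on a miss
    cache.delete(url)                          # removes the file, so the next fetch re-runs the block

These exist?/delete hooks are what query.rb below uses to evict rate-limited responses and the final, still-growing query from the cache.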
data/lib/twitterscraper/cli.rb
CHANGED
@@ -24,21 +24,25 @@ module Twitterscraper
         daily_limit: options['daily_limit'],
         order: options['order'],
         threads: options['threads'],
+        threads_granularity: options['threads_granularity'],
       }
       client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
-      export(tweets) unless tweets.empty?
+      export(options['query'], tweets) unless tweets.empty?
     end

-    def export(tweets)
+    def export(name, tweets)
+      options['format'].split(',').map(&:strip).each do |format|
+        file = build_output_name(format, options)
+        Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
+
+        if format == 'json'
+          File.write(file, generate_json(tweets))
+        elsif format == 'html'
+          File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
+        else
+          puts "Invalid format #{format}"
+        end
       end
     end

@@ -69,6 +73,7 @@ module Twitterscraper
       'daily_limit:',
       'order:',
       'threads:',
+      'threads_granularity:',
       'output:',
       'format:',
       'cache:',
@@ -82,10 +87,10 @@ module Twitterscraper
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
       options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
-      options['threads'] = (options['threads'] ||
+      options['threads'] = (options['threads'] || 10).to_i
+      options['threads_granularity'] ||= 'auto'
       options['format'] ||= 'json'
       options['order'] ||= 'desc'
-      options['output'] ||= "tweets.#{options['format']}"

       options['cache'] = options['cache'] != 'false'
       options['proxy'] = options['proxy'] != 'false'
@@ -93,6 +98,13 @@ module Twitterscraper
       options
     end

+    def build_output_name(format, options)
+      query = options['query'].gsub(/[ :?#&]/, '_')
+      date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
+      file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
+      File.join('out', file)
+    end
+
    def initialize_logger
       Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
    end
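For illustration, given a hypothetical run with --query "ruby lang", --start_date 2020-07-01, and --end_date 2020-07-10 (no 'type' option is set by the parser shown here, so compact drops it), build_output_name yields:

    build_output_name('json', options)
    # => "out/tweets_2020-07-01_2020-07-10_ruby_lang.json"  (spaces and :?#& become '_')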
data/lib/twitterscraper/client.rb
CHANGED
@@ -2,9 +2,31 @@ module Twitterscraper
   class Client
     include Query

+    USER_AGENT_LIST = [
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+    ]
+
     def initialize(cache: true, proxy: true)
+      @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+      Twitterscraper.logger.info "Headers #{@request_headers}"
+
       @cache = cache
+
+      if (@proxy = proxy)
+        @proxies = Proxy::Pool.new
+        Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
+      else
+        @proxies = []
+        Twitterscraper.logger.debug 'Proxy disabled'
+      end
+    end
+
+    def request_headers
+      @request_headers
     end

     def cache_enabled?
@@ -14,5 +36,9 @@ module Twitterscraper
     def proxy_enabled?
       @proxy
     end
+
+    def proxies
+      @proxies
+    end
   end
 end
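Since the User-Agent is now sampled once per Client instance rather than per query_tweets call, a quick sketch of the resulting accessors:

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    client.request_headers  # e.g. {:"User-Agent"=>"Opera/9.80 (X11; Linux i686; Ubuntu/14.10) ...", :"X-Requested-With"=>"XMLHttpRequest"}
    client.proxies          # => [] here; a Proxy::Pool is fetched instead when proxy: true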
data/lib/twitterscraper/query.rb
CHANGED
@@ -10,14 +10,6 @@ module Twitterscraper
   module Query
     include Logger

-    USER_AGENT_LIST = [
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
-      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
-      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
-      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
-    ]
-
     INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
@@ -28,7 +20,7 @@ module Twitterscraper
         'max_position=__POS__&reset_error_state=false'

     def build_query_url(query, lang, type, pos)
-      if type
+      if type.user?
         if pos
           RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
         else
@@ -43,13 +35,13 @@ module Twitterscraper
       end
     end

-    def get_single_page(url,
+    def get_single_page(url, timeout = 6, retries = 30)
       return nil if stop_requested?
+      if proxy_enabled?
         proxy = proxies.sample
         logger.info("Using proxy #{proxy}")
       end
-      Http.get(url,
+      Http.get(url, request_headers, proxy, timeout)
     rescue => e
       logger.debug "get_single_page: #{e.inspect}"
       if (retries -= 1) > 0
@@ -69,30 +61,33 @@ module Twitterscraper
       else
         json_resp = JSON.parse(text)
         items_html = json_resp['items_html'] || ''
-        logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
       end

       [items_html, json_resp]
     end

-    def query_single_page(query, lang, type, pos
+    def query_single_page(query, lang, type, pos)
       logger.info "Querying #{query}"
+      encoded_query = ERB::Util.url_encode(query)

-      url = build_query_url(
+      url = build_query_url(encoded_query, lang, type, pos)
       http_request = lambda do
-        logger.debug "Scraping tweets from
-        get_single_page(url
+        logger.debug "Scraping tweets from url=#{url}"
+        get_single_page(url)
       end

       if cache_enabled?
         client = Cache.new
         if (response = client.read(url))
-          logger.debug
+          logger.debug "Fetching tweets from cache url=#{url}"
         else
           response = http_request.call
           client.write(url, response) unless stop_requested?
         end
+        if @queries && query == @queries.last && pos.nil?
+          logger.debug "Delete a cache query=#{query}"
+          client.delete(url)
+        end
       else
         response = http_request.call
       end
@@ -100,6 +95,12 @@ module Twitterscraper

       html, json_resp = parse_single_page(response, pos.nil?)

+      if json_resp && json_resp['message']
+        logger.warn json_resp['message'] # Sorry, you are rate limited.
+        @stop_requested = true
+        Cache.new.delete(url) if cache_enabled?
+      end
+
       tweets = Tweet.from_html(html)

       if tweets.empty?
@@ -108,7 +109,7 @@ module Twitterscraper

       if json_resp
         [tweets, json_resp['min_position']]
-      elsif type
+      elsif type.user?
         [tweets, tweets[-1].tweet_id]
       else
         [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
@@ -140,19 +141,33 @@ module Twitterscraper
         raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
       end
     end
-
-      if end_date
-        today = Date.today
-        if end_date > Date.today
-          raise Error.new(":end_date must be less than or equal to today(#{today})")
-        end
-      end
     end

-    def build_queries(query, start_date, end_date)
+    def build_queries(query, start_date, end_date, threads_granularity)
       if start_date && end_date
+        if threads_granularity == 'auto'
+          threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
+        end
+
+        if threads_granularity == 'day'
+          date_range = start_date.upto(end_date - 1)
+          queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
+        elsif threads_granularity == 'hour'
+          time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
+          end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
+          queries = []
+
+          while true
+            if time < Time.now.utc
+              queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
+            end
+            time += 3600
+            break if time >= end_time
+          end
+        end
+
+        @queries = queries
+
       elsif start_date
         [query + " since:#{start_date}"]
       elsif end_date
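To make the granularity concrete: a hypothetical three-day range with threads_granularity 'day' (with fewer than 28 days, 'auto' would actually choose 'hour') expands into one sub-query per day:

    build_queries('ruby', Date.parse('2020-07-01'), Date.parse('2020-07-04'), 'day')
    # => ["ruby since:2020-07-01 until:2020-07-02",
    #     "ruby since:2020-07-02 until:2020-07-03",
    #     "ruby since:2020-07-03 until:2020-07-04"]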
@@ -162,93 +177,103 @@ module Twitterscraper
       end
     end

-    def main_loop(query, lang, type, limit, daily_limit
+    def main_loop(query, lang, type, limit, daily_limit)
       pos = nil
+      tmp_tweets = []

       while true
-        new_tweets, new_pos = query_single_page(query, lang, type, pos
+        new_tweets, new_pos = query_single_page(query, lang, type, pos)
         unless new_tweets.empty?
+          tmp_tweets.concat(new_tweets)
+          tmp_tweets.uniq! { |t| t.tweet_id }
+        end

+        @results_counter[Parallel.worker_number] = tmp_tweets.size
+        total_size = @all_tweets.size + @results_counter.values.sum
+        logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
+
+        if !@stop_requested && total_size >= limit
+          logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
+          @stop_requested = true
         end
-        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"

         break unless new_pos
-        break if
+        break if @stop_requested
+        break if daily_limit && tmp_tweets.size >= daily_limit
         break if @all_tweets.size >= limit

         pos = new_pos
       end

-        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
-        @stop_requested = true
-      end
+      tmp_tweets
     end

     def stop_requested?
       @stop_requested
     end

-    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads:
+    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
+      type = Type.new(type)
+      if type.search?
+        start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
+        end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
+      elsif type.user?
+        start_date = nil
+        end_date = nil
+      end
+
+      queries = build_queries(query, start_date, end_date, threads_granularity)
       if threads > queries.size
-        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
         threads = queries.size
       end
-      if proxy_enabled?
-        proxies = Proxy::Pool.new
-        logger.debug "Fetch #{proxies.size} proxies"
-      else
-        proxies = []
-        logger.debug 'Proxy disabled'
-      end
       logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"

       validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)

+      logger.info "The number of queries #{queries.size}"
       logger.info "The number of threads #{threads}"

-      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info "Headers #{headers}"
-
       @all_tweets = []
-      @mutex = Mutex.new
       @stop_requested = false
+      @results_counter = {}

       if threads > 1
+        @mutex = Mutex.new
         Thread.abort_on_exception = true
         logger.debug "Set 'Thread.abort_on_exception' to true"

         Parallel.each(queries, in_threads: threads) do |query|
+          @results_counter[Parallel.worker_number] = 0
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @mutex.synchronize {
+            @all_tweets.concat(tmp_tweets)
+            @all_tweets.uniq! { |t| t.tweet_id }
+          }
+          @results_counter[Parallel.worker_number] = 0
+
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, type, limit, daily_limit
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @all_tweets.concat(tmp_tweets)
+          @all_tweets.uniq! { |t| t.tweet_id }

           break if stop_requested?
         end
       end

+      logger.info "Return #{@all_tweets.size} tweets"
+
       @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
     end

-    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads:
-      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
+      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
     end

     def user_timeline(screen_name, limit: 100, order: 'desc')
-      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
+      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
     end
   end
 end
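Putting the new keyword arguments together, a hedged sketch of the public API after this change (handle and dates are illustrative):

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    tweets = client.search('#ruby', start_date: '2020-07-01', end_date: '2020-07-10',
                           limit: 500, threads: 10, threads_granularity: 'auto')
    client.user_timeline('ts_3156', limit: 100)  # user timelines always run with threads: 1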
data/lib/twitterscraper/template.rb
CHANGED
@@ -1,48 +1,57 @@
 module Twitterscraper
+  class Template
+    def tweets_embedded_html(name, tweets, options)
+      path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
+      template = ERB.new(File.read(path))

+      tweets = tweets.sort_by { |t| t.created_at.to_i }
+
+      template.result_with_hash(
+        chart_name: name,
+        chart_data: chart_data(tweets).to_json,
+        first_tweet: tweets[0],
+        last_tweet: tweets[-1],
+        tweets: tweets,
+        convert_limit: 30,
+      )
     end

+    def chart_data(tweets, grouping: 'auto')
+      if grouping && tweets.size > 100
+        if grouping == 'auto'
+          month = 28 * 24 * 60 * 60 # 28 days
+          duration = tweets[-1].created_at - tweets[0].created_at
+
+          if duration > 3 * month
+            grouping = 'day'
+          elsif duration > month || tweets.size > 10000
+            grouping = 'hour'
+          else
+            grouping = 'minute'
+          end
+        end
+      end
+
+      Twitterscraper.logger.info "Chart grouping #{grouping}"
+
+      data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
+        t = tweet.created_at
+
+        if grouping == 'day'
+          time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
+        elsif grouping == 'hour'
+          time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
+        elsif grouping == 'minute'
+          time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
+        else
+          time = t
+        end
+        memo[time.to_i] += 1
+      end
+
+      data.sort_by { |k, _| k }.map do |timestamp, count|
+        [timestamp * 1000, count]
+      end
+    end
   end
 end
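chart_data buckets tweet timestamps by day, hour, or minute and returns Highcharts-ready pairs of [epoch milliseconds, count]; a hypothetical hour-grouped result:

    Template.new.chart_data(tweets)
    # => [[1593561600000, 12], [1593565200000, 30], ...]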
data/lib/twitterscraper/template/tweets.html.erb
ADDED
@@ -0,0 +1,112 @@
+<!DOCTYPE html>
+<html lang="ja">
+<head>
+  <meta charset="UTF-8">
+
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
+  <script src="https://code.highcharts.com/stock/highstock.js"></script>
+  <script>
+    function updateTweets() {
+      window.twttr = (function (d, s, id) {
+        var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+        if (d.getElementById(id)) return t;
+        js = d.createElement(s);
+        js.id = id;
+        js.src = "https://platform.twitter.com/widgets.js";
+        fjs.parentNode.insertBefore(js, fjs);
+
+        t._e = [];
+        t.ready = function (f) {
+          t._e.push(f);
+        };
+
+        return t;
+      }(document, "script", "twitter-wjs"));
+    }
+
+    function drawChart() {
+      Highcharts.setOptions({
+        time: {
+          timezone: moment.tz.guess()
+        }
+      });
+
+      var data = <%= chart_data %>;
+      var config = {
+        title: {
+          text: '<%= tweets.size %> tweets of <%= chart_name %>'
+        },
+        subtitle: {
+          text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
+        },
+        series: [{
+          data: data
+        }],
+        rangeSelector: {enabled: false},
+        scrollbar: {enabled: false},
+        navigator: {enabled: false},
+        exporting: {enabled: false},
+        credits: {enabled: false}
+      };
+
+      Highcharts.stockChart('chart-container', config);
+    }
+
+    document.addEventListener("DOMContentLoaded", function () {
+      drawChart();
+      updateTweets();
+    });
+  </script>
+
+  <style type=text/css>
+    #chart-container {
+      max-width: 1200px;
+      height: 675px;
+      margin: 0 auto;
+      border: 1px solid rgb(204, 214, 221);
+      display: flex;
+      justify-content: center;
+      align-items: center;
+    }
+    .tweets-container {
+      max-width: 550px;
+      margin: 0 auto 0 auto;
+    }
+
+    .twitter-tweet {
+      margin: 15px 0 15px 0 !important;
+    }
+  </style>
+</head>
+<body>
+<div id="chart-container"><div style="color: gray;">Loading...</div></div>
+
+<div class="tweets-container">
+  <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
+    <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
+    <% if i < convert_limit %>
+      <blockquote class="twitter-tweet">
+    <% else %>
+      <div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
+    <% end %>
+
+    <div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
+      <div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
+      <div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
+      <div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
+    </div>
+
+    <div><%= tweet.text %></div>
+    <div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
+
+    <% if i < convert_limit %>
+      </blockquote>
+    <% else %>
+      </div>
+    <% end %>
+  <% end %>
+</div>
+
+</body>
+</html>
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
       :screen_name,
       :name,
       :user_id,
+      :profile_image_url,
       :tweet_id,
       :text,
       :links,
@@ -51,6 +52,11 @@ module Twitterscraper
         end
       end

+      # .js-stream-item
+      #   .js-stream-tweet{data: {screen-name:, tweet-id:}}
+      #     .stream-item-header
+      #     .js-tweet-text-container
+      #     .stream-item-footer
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -72,6 +78,8 @@ module Twitterscraper
         end

         inner_html = Nokogiri::HTML(html.inner_html)
+
+        profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
         text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
         links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
         image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -99,6 +107,7 @@ module Twitterscraper
           screen_name: screen_name,
           name: html.attr('data-name'),
           user_id: html.attr('data-user-id').to_i,
+          profile_image_url: profile_image_url,
           tweet_id: tweet_id,
           text: text,
           links: links,
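The avatar URL is taken from the js-action-profile-avatar image, with the _bigger size suffix stripped; for a hypothetical URL:

    src = 'https://pbs.twimg.com/profile_images/1826000000/0000_bigger.png'
    src.gsub(/_bigger/, '')
    # => "https://pbs.twimg.com/profile_images/1826000000/0000.png"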
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.15.1
+  version: 0.19.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-
+date: 2020-07-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -72,7 +72,9 @@ files:
 - lib/twitterscraper/proxy.rb
 - lib/twitterscraper/query.rb
 - lib/twitterscraper/template.rb
+- lib/twitterscraper/template/tweets.html.erb
 - lib/twitterscraper/tweet.rb
+- lib/twitterscraper/type.rb
 - lib/version.rb
 - twitterscraper-ruby.gemspec
 homepage: https://github.com/ts-3156/twitterscraper-ruby