twitterscraper-ruby 0.14.0 → 0.15.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +25 -16
- data/lib/twitterscraper/cli.rb +3 -0
- data/lib/twitterscraper/query.rb +35 -26
- data/lib/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a950fb24329aaa1020441e258a8a2144100d732142b6c227bb9b026b8bb73996
|
4
|
+
data.tar.gz: 1f64f31e43189e2ee439f5ef6f6d54bc6ea58895adbed67cb8ddbe91af07681a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8573affbc9a5faa05e5e489364bb2ba0da1aa4f12af35445e5de8b1f8c399eb0575cc9f408b2ba96c3d7fd8b2a74b7dd703229053a33c1f8a883856818033cb9
|
7
|
+
data.tar.gz: 2b2b3ad0b2dd9d089a7b6127ed1b0db21e7f4fa5f0c31e6b366d9b5ae444e2244d4200c813b7a3257f43702d2caa9f264515e701602c24f4482a746b89d41328
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -33,26 +33,39 @@ $ gem install twitterscraper-ruby
|
|
33
33
|
Command-line interface:
|
34
34
|
|
35
35
|
```shell script
|
36
|
-
|
37
|
-
|
36
|
+
# Returns a collection of relevant tweets matching a specified query.
|
37
|
+
$ twitterscraper --type search --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
38
|
+
--limit 100 --threads 10 --output tweets.json
|
39
|
+
```
|
40
|
+
|
41
|
+
```shell script
|
42
|
+
# Returns a collection of the most recent tweets posted by the user indicated by the screen_name
|
43
|
+
$ twitterscraper --type user --query SCREEN_NAME --limit 100 --output tweets.json
|
38
44
|
```
|
39
45
|
|
40
46
|
From Within Ruby:
|
41
47
|
|
42
48
|
```ruby
|
43
49
|
require 'twitterscraper'
|
50
|
+
client = Twitterscraper::Client.new(cache: true, proxy: true)
|
51
|
+
```
|
44
52
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
53
|
+
```ruby
|
54
|
+
# Returns a collection of relevant tweets matching a specified query.
|
55
|
+
tweets = client.search(KEYWORD, start_date: '2020-06-01', end_date: '2020-06-30', lang: 'ja', limit: 100, threads: 10)
|
56
|
+
```
|
57
|
+
|
58
|
+
```ruby
|
59
|
+
# Returns a collection of the most recent tweets posted by the user indicated by the screen_name
|
60
|
+
tweets = client.user_timeline(SCREEN_NAME, limit: 100)
|
61
|
+
```
|
52
62
|
|
53
|
-
client = Twitterscraper::Client.new(cache: true, proxy: true)
|
54
|
-
tweets = client.query_tweets(KEYWORD, options)
|
55
63
|
|
64
|
+
## Attributes
|
65
|
+
|
66
|
+
### Tweet
|
67
|
+
|
68
|
+
```ruby
|
56
69
|
tweets.each do |tweet|
|
57
70
|
puts tweet.tweet_id
|
58
71
|
puts tweet.text
|
@@ -64,11 +77,6 @@ tweets.each do |tweet|
|
|
64
77
|
end
|
65
78
|
```
|
66
79
|
|
67
|
-
|
68
|
-
## Attributes
|
69
|
-
|
70
|
-
### Tweet
|
71
|
-
|
72
80
|
- screen_name
|
73
81
|
- name
|
74
82
|
- user_id
|
@@ -136,6 +144,7 @@ $ cat tweets.json | jq . | less
|
|
136
144
|
| Option | Description | Default |
|
137
145
|
| ------------- | ------------- | ------------- |
|
138
146
|
| `-h`, `--help` | This option displays a summary of twitterscraper. | |
|
147
|
+
| `--type` | Specify a search type. | search |
|
139
148
|
| `--query` | Specify a keyword used during the search. | |
|
140
149
|
| `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
|
141
150
|
| `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -16,6 +16,7 @@ module Twitterscraper
|
|
16
16
|
print_version || return if print_version?
|
17
17
|
|
18
18
|
query_options = {
|
19
|
+
type: options['type'],
|
19
20
|
start_date: options['start_date'],
|
20
21
|
end_date: options['end_date'],
|
21
22
|
lang: options['lang'],
|
@@ -59,6 +60,7 @@ module Twitterscraper
|
|
59
60
|
'help',
|
60
61
|
'v',
|
61
62
|
'version',
|
63
|
+
'type:',
|
62
64
|
'query:',
|
63
65
|
'start_date:',
|
64
66
|
'end_date:',
|
@@ -75,6 +77,7 @@ module Twitterscraper
|
|
75
77
|
'verbose',
|
76
78
|
)
|
77
79
|
|
80
|
+
options['type'] ||= 'search'
|
78
81
|
options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
|
79
82
|
options['lang'] ||= ''
|
80
83
|
options['limit'] = (options['limit'] || 100).to_i
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -22,23 +22,24 @@ module Twitterscraper
|
|
22
22
|
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
|
23
23
|
'default&include_available_features=1&include_entities=1&' +
|
24
24
|
'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
|
25
|
-
INIT_URL_USER = 'https://twitter.com/
|
26
|
-
RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/
|
25
|
+
INIT_URL_USER = 'https://twitter.com/__USER__'
|
26
|
+
RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
|
27
27
|
'include_available_features=1&include_entities=1&' +
|
28
|
-
'max_position=
|
29
|
-
|
30
|
-
def build_query_url(query, lang,
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
# end
|
38
|
-
if pos
|
39
|
-
RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
|
28
|
+
'max_position=__POS__&reset_error_state=false'
|
29
|
+
|
30
|
+
def build_query_url(query, lang, from_user, pos)
|
31
|
+
if from_user
|
32
|
+
if pos
|
33
|
+
RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
|
34
|
+
else
|
35
|
+
INIT_URL_USER.sub('__USER__', query)
|
36
|
+
end
|
40
37
|
else
|
41
|
-
|
38
|
+
if pos
|
39
|
+
RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
|
40
|
+
else
|
41
|
+
INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
|
42
|
+
end
|
42
43
|
end
|
43
44
|
end
|
44
45
|
|
@@ -74,11 +75,11 @@ module Twitterscraper
|
|
74
75
|
[items_html, json_resp]
|
75
76
|
end
|
76
77
|
|
77
|
-
def query_single_page(query, lang,
|
78
|
+
def query_single_page(query, lang, type, pos, headers: [], proxies: [])
|
78
79
|
logger.info "Querying #{query}"
|
79
80
|
query = ERB::Util.url_encode(query)
|
80
81
|
|
81
|
-
url = build_query_url(query, lang,
|
82
|
+
url = build_query_url(query, lang, type == 'user', pos)
|
82
83
|
http_request = lambda do
|
83
84
|
logger.debug "Scraping tweets from #{url}"
|
84
85
|
get_single_page(url, headers, proxies)
|
@@ -107,8 +108,8 @@ module Twitterscraper
|
|
107
108
|
|
108
109
|
if json_resp
|
109
110
|
[tweets, json_resp['min_position']]
|
110
|
-
elsif
|
111
|
-
|
111
|
+
elsif type
|
112
|
+
[tweets, tweets[-1].tweet_id]
|
112
113
|
else
|
113
114
|
[tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
|
114
115
|
end
|
@@ -116,7 +117,7 @@ module Twitterscraper
|
|
116
117
|
|
117
118
|
OLDEST_DATE = Date.parse('2006-03-21')
|
118
119
|
|
119
|
-
def validate_options!(queries, start_date:, end_date:, lang:, limit:, threads:)
|
120
|
+
def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
|
120
121
|
query = queries[0]
|
121
122
|
if query.nil? || query == ''
|
122
123
|
raise Error.new('Please specify a search query.')
|
@@ -161,12 +162,12 @@ module Twitterscraper
|
|
161
162
|
end
|
162
163
|
end
|
163
164
|
|
164
|
-
def main_loop(query, lang, limit, daily_limit, headers, proxies)
|
165
|
+
def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
165
166
|
pos = nil
|
166
167
|
daily_tweets = []
|
167
168
|
|
168
169
|
while true
|
169
|
-
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
170
|
+
new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
|
170
171
|
unless new_tweets.empty?
|
171
172
|
daily_tweets.concat(new_tweets)
|
172
173
|
daily_tweets.uniq! { |t| t.tweet_id }
|
@@ -195,7 +196,7 @@ module Twitterscraper
|
|
195
196
|
@stop_requested
|
196
197
|
end
|
197
198
|
|
198
|
-
def query_tweets(query, start_date: nil, end_date: nil, lang:
|
199
|
+
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
|
199
200
|
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
200
201
|
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
201
202
|
queries = build_queries(query, start_date, end_date)
|
@@ -213,7 +214,7 @@ module Twitterscraper
|
|
213
214
|
logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
|
214
215
|
|
215
216
|
|
216
|
-
validate_options!(queries, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
217
|
+
validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
217
218
|
|
218
219
|
logger.info "The number of threads #{threads}"
|
219
220
|
|
@@ -229,17 +230,25 @@ module Twitterscraper
|
|
229
230
|
logger.debug "Set 'Thread.abort_on_exception' to true"
|
230
231
|
|
231
232
|
Parallel.each(queries, in_threads: threads) do |query|
|
232
|
-
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
233
|
+
main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
233
234
|
raise Parallel::Break if stop_requested?
|
234
235
|
end
|
235
236
|
else
|
236
237
|
queries.each do |query|
|
237
|
-
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
238
|
+
main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
238
239
|
break if stop_requested?
|
239
240
|
end
|
240
241
|
end
|
241
242
|
|
242
243
|
@all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
|
243
244
|
end
|
245
|
+
|
246
|
+
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
|
247
|
+
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
|
248
|
+
end
|
249
|
+
|
250
|
+
def user_timeline(screen_name, limit: 100, order: 'desc')
|
251
|
+
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
|
252
|
+
end
|
244
253
|
end
|
245
254
|
end
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|